https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93449
--- Comment #2 from Jens Seifert <jens.seifert at de dot ibm.com> --- #include <memory.h> typedef float _Decimal128 __attribute__((mode(TD))); _Decimal128 bcdtodpd(vector double v) { _Decimal128 res; memcpy(&res, &v, sizeof(res)); res = __builtin_denbcdq(0, res); return res; } _Decimal128 bcdtodpd_opt(vector double bcd) { _Decimal128 res; __asm__ volatile("xxlor 4,%x1,%x1;\n" "xxpermdi 5,%x1,%x1,3;\n" "denbcdq 0,%0,4":"=d"(res):"v"(bcd):"vs36","vs37"); return res; } vector double dpdtobcd(_Decimal128 dpd) { _Decimal128 bcd = __builtin_ddedpdq(0, dpd); vector double res; memcpy(&res, &bcd, sizeof(res)); return res; } vector double dpdtobcd_opt(_Decimal128 dpd) { vector double res; __asm__ volatile("ddedpdq 0,4,%1;\n" "xxpermdi %x0,4,5,0":"=v"(res):"d"(dpd):"vs36","vs37"); return res; } The non-inline assembly show store/load (very slow). The assembly version does the conversion from vector to _Decimal128 with optimal sequence for Power7 and above.