Hi, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79261 records that the interface vec_xxpermdi isn't implemented in a bi-endian fashion; instead, it produces results appropriate for big-endian vector element numbering even when run on a little endian machine. This is not part of the "official vector API" from the ELFv2 ABI document, but should still have appropriate bi-endian behavior.
The problem appears to have arisen because some of GCC's internal usages of the xxpermdi instruction wanted to use them in big-endian form, which is natural to the original hardware instruction. The define_expand used for this (vsx_xxpermdi_<mode>) is also the one that is used for the built-in forms. Since these two uses require different semantics, I've created a new vsx_xxpermdi_<mode>_be define_expand to be used for the internal uses, and re-directed those usages there. That version retains the original semantics, while the semantics for the built-ins are changed to be endian-sensitive. I've added a runtime test to verify that the vec_xxpermdi intrinsics produce correct results for the various vector modes in both big- and little-endian environments. Bootstrapped and tested on powerpc64[le]-unknown-linux-gnu with no regressions in either environment. Is this ok for trunk, and after a suitable period, for backport to GCC 5 and 6? Thanks, Bill [gcc] 2017-02-16 Bill Schmidt <wschm...@linux.vnet.ibm.com> PR target/79261 * config/rs6000/rs6000.c (rs6000_expand_ternop_builtin): Add support for CODE_FOR_vsx_xxpermdi_v2d[fi]_be. * config/rs6000/rs6000.md (reload_gpr_from_vsx<mode>): Call generator for vsx_xxpermdi_<mode>_be. * config/rs6000/vsx.md (vsx_xxpermdi_<mode>): Remove logic to force big-endian semantics. (vsx_xxpermdi_<mode>_be): New define_expand with same implementation as previous version of vsx_xxpermdi_<mode>. [gcc/testsuite] 2017-02-16 Bill Schmidt <wschm...@linux.vnet.ibm.com> PR target/79261 * gcc.target/powerpc/vec-xxpermdi.c: New file. Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 245490) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -15436,6 +15436,8 @@ rs6000_expand_ternop_builtin (enum insn_code icode } else if (icode == CODE_FOR_vsx_xxpermdi_v2df || icode == CODE_FOR_vsx_xxpermdi_v2di + || icode == CODE_FOR_vsx_xxpermdi_v2df_be + || icode == CODE_FOR_vsx_xxpermdi_v2di_be || icode == CODE_FOR_vsx_xxsldwi_v16qi || icode == CODE_FOR_vsx_xxsldwi_v8hi || icode == CODE_FOR_vsx_xxsldwi_v4si Index: gcc/config/rs6000/rs6000.md =================================================================== --- gcc/config/rs6000/rs6000.md (revision 245490) +++ gcc/config/rs6000/rs6000.md (working copy) @@ -8419,7 +8419,7 @@ rtx gpr_lo_reg = gen_lowpart (DFmode, dest); emit_insn (gen_p8_mfvsrd_3_<mode> (gpr_hi_reg, src)); - emit_insn (gen_vsx_xxpermdi_<mode> (tmp, src, src, GEN_INT (3))); + emit_insn (gen_vsx_xxpermdi_<mode>_be (tmp, src, src, GEN_INT (3))); emit_insn (gen_p8_mfvsrd_3_<mode> (gpr_lo_reg, tmp)); DONE; } Index: gcc/config/rs6000/vsx.md =================================================================== --- gcc/config/rs6000/vsx.md (revision 245490) +++ gcc/config/rs6000/vsx.md (working copy) @@ -2461,6 +2461,38 @@ op1 = gen_lowpart (V2DImode, op1); } } + emit_insn (gen (target, op0, op1, perm0, perm1)); + DONE; +}) + +;; Special version of xxpermdi that retains big-endian semantics. +(define_expand "vsx_xxpermdi_<mode>_be" + [(match_operand:VSX_L 0 "vsx_register_operand" "") + (match_operand:VSX_L 1 "vsx_register_operand" "") + (match_operand:VSX_L 2 "vsx_register_operand" "") + (match_operand:QI 3 "u5bit_cint_operand" "")] + "VECTOR_MEM_VSX_P (<MODE>mode)" +{ + rtx target = operands[0]; + rtx op0 = operands[1]; + rtx op1 = operands[2]; + int mask = INTVAL (operands[3]); + rtx perm0 = GEN_INT ((mask >> 1) & 1); + rtx perm1 = GEN_INT ((mask & 1) + 2); + rtx (*gen) (rtx, rtx, rtx, rtx, rtx); + + if (<MODE>mode == V2DFmode) + gen = gen_vsx_xxpermdi2_v2df_1; + else + { + gen = gen_vsx_xxpermdi2_v2di_1; + if (<MODE>mode != V2DImode) + { + target = gen_lowpart (V2DImode, target); + op0 = gen_lowpart (V2DImode, op0); + op1 = gen_lowpart (V2DImode, op1); + } + } /* In little endian mode, vsx_xxpermdi2_<mode>_1 will perform a transformation we don't want; it is necessary for rs6000_expand_vec_perm_const_1 but not for this use. So we @@ -3870,7 +3902,7 @@ { rtx op1 = operands[1]; rtx v4si_tmp = gen_reg_rtx (V4SImode); - emit_insn (gen_vsx_xxpermdi_v4si (v4si_tmp, op1, op1, const1_rtx)); + emit_insn (gen_vsx_xxpermdi_v4si_be (v4si_tmp, op1, op1, const1_rtx)); operands[1] = v4si_tmp; operands[3] = GEN_INT (12 - INTVAL (operands[3])); } Index: gcc/testsuite/gcc.target/powerpc/vec-xxpermdi.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/vec-xxpermdi.c (nonexistent) +++ gcc/testsuite/gcc.target/powerpc/vec-xxpermdi.c (working copy) @@ -0,0 +1,68 @@ +/* { dg-do run { target { powerpc64*-*-* && vsx_hw } } } */ +/* { dg-options "-O2 -mvsx" } */ + +/* Added for PR79261 to test that vec_xxpermdi works correctly for + both BE and LE targets. */ + +#include <altivec.h> +void abort (void); + +vector double vdx = { 0.0, 1.0 }; +vector double vdy = { 2.0, 3.0 }; +vector double vdz; + +vector signed long long vsllx = { 0, 1 }; +vector signed long long vslly = { 2, 3 }; +vector signed long long vsllz; + +vector float vfx = { 0.0, 1.0, 2.0, 3.0 }; +vector float vfy = { 4.0, 5.0, 6.0, 7.0 }; +vector float vfz; + +vector signed int vsix = { 0, 1, 2, 3 }; +vector signed int vsiy = { 4, 5, 6, 7 }; +vector signed int vsiz; + +vector signed short vssx = { 0, 1, 2, 3, 4, 5, 6, 7 }; +vector signed short vssy = { 8, 9, 10, 11, 12, 13, 14, 15 }; +vector signed short vssz; + +vector signed char vscx = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; +vector signed char vscy = { 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 }; +vector signed char vscz; + +int +main () +{ + vdz = vec_xxpermdi (vdx, vdy, 0b01); + if (vdz[0] != 0.0 || vdz[1] != 3.0) + abort (); + + vsllz = vec_xxpermdi (vsllx, vslly, 0b10); + if (vsllz[0] != 1 || vsllz[1] != 2) + abort (); + + vfz = vec_xxpermdi (vfx, vfy, 0b01); + if (vfz[0] != 0.0 || vfz[1] != 1.0 || vfz[2] != 6.0 || vfz[3] != 7.0) + abort (); + + vsiz = vec_xxpermdi (vsix, vsiy, 0b10); + if (vsiz[0] != 2 || vsiz[1] != 3 || vsiz[2] != 4 || vsiz[3] != 5) + abort (); + + vssz = vec_xxpermdi (vssx, vssy, 0b00); + if (vssz[0] != 0 || vssz[1] != 1 || vssz[2] != 2 || vssz[3] != 3 + || vssz[4] != 8 || vssz[5] != 9 || vssz[6] != 10 || vssz[7] != 11) + abort (); + + vscz = vec_xxpermdi (vscx, vscy, 0b11); + if (vscz[0] != 8 || vscz[1] != 9 || vscz[2] != 10 || vscz[3] != 11 + || vscz[4] != 12 || vscz[5] != 13 || vscz[6] != 14 || vscz[7] != 15 + || vscz[8] != 24 || vscz[9] != 25 || vscz[10] != 26 || vscz[11] != 27 + || vscz[12] != 28 || vscz[13] != 29 || vscz[14] != 30 || vscz[15] != 31) + abort (); + + return 0; +}