The native RTL expression for vec_mrghw should be the same for BE and LE, because it is register- and endian-independent. So both BE and LE need to generate exactly the same RTL, with index [0 4 1 5], when expanding vec_mrghw with vec_select and vec_concat.
(set (reg:V4SI 141) (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 139) 0) (subreg:V4SI (reg:V16QI 140) 0)) [const_int 0 4 1 5])) Then the combine pass can perform the nested vec_select optimization in simplify-rtx.c:simplify_binary_operation_1 on both BE and LE: 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel [0 4 1 5]) 24: {r151:SI=vec_select(r150:V4SI,parallel [const_int 3]);} => 21: r150:V4SI=vec_select(vec_concat(r141:V4SI,r146:V4SI),parallel) 24: {r151:SI=vec_select(r146:V4SI,parallel [const_int 1]);} The endianness check is then needed only once, at final ASM generation. The generated ASM is better because the nested vec_select is simplified to a simple scalar load. Regression tested on Power8{LE,BE}{32,64} and Power{9,10}LE{32,64} Linux (thanks to Kewen). OK for master? Or should we revert r12-4496 to restore the UNSPEC implementation? gcc/ChangeLog: PR target/106069 * config/rs6000/altivec.md (altivec_vmrghb): Emit same native RTL for BE and LE. (altivec_vmrghh): Likewise. (altivec_vmrghw): Likewise. (*altivec_vmrghsf): Adjust. (altivec_vmrglb): Likewise. (altivec_vmrglh): Likewise. (altivec_vmrglw): Likewise. (*altivec_vmrglsf): Adjust. (altivec_vmrghb_direct): Emit different ASM for BE and LE. (altivec_vmrghh_direct): Likewise. (altivec_vmrghw_direct_<mode>): Likewise. (altivec_vmrglb_direct): Likewise. (altivec_vmrglh_direct): Likewise. (altivec_vmrglw_direct_<mode>): Likewise. (vec_widen_smult_hi_v16qi): Adjust. (vec_widen_smult_lo_v16qi): Adjust. (vec_widen_umult_hi_v16qi): Adjust. (vec_widen_umult_lo_v16qi): Adjust. (vec_widen_smult_hi_v8hi): Adjust. (vec_widen_smult_lo_v8hi): Adjust. (vec_widen_umult_hi_v8hi): Adjust. (vec_widen_umult_lo_v8hi): Adjust. * config/rs6000/rs6000.cc (altivec_expand_vec_perm_const): Emit same native RTL for BE and LE. * config/rs6000/vsx.md (vsx_xxmrghw_<mode>): Likewise. (vsx_xxmrglw_<mode>): Likewise. gcc/testsuite/ChangeLog: PR target/106069 * gcc.target/powerpc/pr106069.C: New test. 
Signed-off-by: Xionghu Luo <xionghu...@tencent.com> --- gcc/config/rs6000/altivec.md | 122 ++++++++++++-------- gcc/config/rs6000/rs6000.cc | 36 +++--- gcc/config/rs6000/vsx.md | 16 +-- gcc/testsuite/gcc.target/powerpc/pr106069.C | 118 +++++++++++++++++++ 4 files changed, 209 insertions(+), 83 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr106069.C diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2c4940f2e21..8d9c0109559 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1144,11 +1144,7 @@ (define_expand "altivec_vmrghb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrghb_direct - : gen_altivec_vmrglb_direct; - if (!BYTES_BIG_ENDIAN) - std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn (gen_altivec_vmrghb_direct (operands[0], operands[1], operands[2])); DONE; }) @@ -1167,7 +1163,12 @@ (define_insn "altivec_vmrghb_direct" (const_int 6) (const_int 22) (const_int 7) (const_int 23)])))] "TARGET_ALTIVEC" - "vmrghb %0,%1,%2" + { + if (BYTES_BIG_ENDIAN) + return "vmrghb %0,%1,%2"; + else + return "vmrglb %0,%2,%1"; + } [(set_attr "type" "vecperm")]) (define_expand "altivec_vmrghh" @@ -1176,11 +1177,7 @@ (define_expand "altivec_vmrghh" (use (match_operand:V8HI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? 
gen_altivec_vmrghh_direct - : gen_altivec_vmrglh_direct; - if (!BYTES_BIG_ENDIAN) - std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn (gen_altivec_vmrghh_direct (operands[0], operands[1], operands[2])); DONE; }) @@ -1195,7 +1192,12 @@ (define_insn "altivec_vmrghh_direct" (const_int 2) (const_int 10) (const_int 3) (const_int 11)])))] "TARGET_ALTIVEC" - "vmrghh %0,%1,%2" + { + if (BYTES_BIG_ENDIAN) + return "vmrghh %0,%1,%2"; + else + return "vmrglh %0,%2,%1"; + } [(set_attr "type" "vecperm")]) (define_expand "altivec_vmrghw" @@ -1204,12 +1206,8 @@ (define_expand "altivec_vmrghw" (use (match_operand:V4SI 2 "register_operand"))] "VECTOR_MEM_ALTIVEC_P (V4SImode)" { - rtx (*fun) (rtx, rtx, rtx); - fun = BYTES_BIG_ENDIAN ? gen_altivec_vmrghw_direct_v4si - : gen_altivec_vmrglw_direct_v4si; - if (!BYTES_BIG_ENDIAN) - std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn ( + gen_altivec_vmrghw_direct_v4si (operands[0], operands[1], operands[2])); DONE; }) @@ -1222,9 +1220,22 @@ (define_insn "altivec_vmrghw_direct_<mode>" (parallel [(const_int 0) (const_int 4) (const_int 1) (const_int 5)])))] "TARGET_ALTIVEC" - "@ - xxmrghw %x0,%x1,%x2 - vmrghw %0,%1,%2" + { + if (which_alternative == 0) + { + if (BYTES_BIG_ENDIAN) + return "xxmrghw %x0,%x1,%x2"; + else + return "xxmrglw %x0,%x2,%x1"; + } + else + { + if (BYTES_BIG_ENDIAN) + return "vmrghw %0,%1,%2"; + else + return "vmrglw %0,%2,%1"; + } + } [(set_attr "type" "vecperm")]) (define_insn "*altivec_vmrghsf" @@ -1250,11 +1261,7 @@ (define_expand "altivec_vmrglb" (use (match_operand:V16QI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? 
gen_altivec_vmrglb_direct - : gen_altivec_vmrghb_direct; - if (!BYTES_BIG_ENDIAN) - std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn (gen_altivec_vmrglb_direct (operands[0], operands[1], operands[2])); DONE; }) @@ -1273,7 +1280,12 @@ (define_insn "altivec_vmrglb_direct" (const_int 14) (const_int 30) (const_int 15) (const_int 31)])))] "TARGET_ALTIVEC" - "vmrglb %0,%1,%2" + { + if (BYTES_BIG_ENDIAN) + return "vmrglb %0,%1,%2"; + else + return "vmrghb %0,%2,%1"; + } [(set_attr "type" "vecperm")]) (define_expand "altivec_vmrglh" @@ -1282,11 +1294,7 @@ (define_expand "altivec_vmrglh" (use (match_operand:V8HI 2 "register_operand"))] "TARGET_ALTIVEC" { - rtx (*fun) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN ? gen_altivec_vmrglh_direct - : gen_altivec_vmrghh_direct; - if (!BYTES_BIG_ENDIAN) - std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn (gen_altivec_vmrglh_direct (operands[0], operands[1], operands[2])); DONE; }) @@ -1301,7 +1309,12 @@ (define_insn "altivec_vmrglh_direct" (const_int 6) (const_int 14) (const_int 7) (const_int 15)])))] "TARGET_ALTIVEC" - "vmrglh %0,%1,%2" + { + if (BYTES_BIG_ENDIAN) + return "vmrglh %0,%1,%2"; + else + return "vmrghh %0,%2,%1"; + } [(set_attr "type" "vecperm")]) (define_expand "altivec_vmrglw" @@ -1310,12 +1323,8 @@ (define_expand "altivec_vmrglw" (use (match_operand:V4SI 2 "register_operand"))] "VECTOR_MEM_ALTIVEC_P (V4SImode)" { - rtx (*fun) (rtx, rtx, rtx); - fun = BYTES_BIG_ENDIAN ? 
gen_altivec_vmrglw_direct_v4si - : gen_altivec_vmrghw_direct_v4si; - if (!BYTES_BIG_ENDIAN) - std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn ( + gen_altivec_vmrglw_direct_v4si (operands[0], operands[1], operands[2])); DONE; }) @@ -1328,9 +1337,22 @@ (define_insn "altivec_vmrglw_direct_<mode>" (parallel [(const_int 2) (const_int 6) (const_int 3) (const_int 7)])))] "TARGET_ALTIVEC" - "@ - xxmrglw %x0,%x1,%x2 - vmrglw %0,%1,%2" + { + if (which_alternative == 0) + { + if (BYTES_BIG_ENDIAN) + return "xxmrglw %x0,%x1,%x2"; + else + return "xxmrghw %x0,%x2,%x1"; + } + else + { + if (BYTES_BIG_ENDIAN) + return "vmrglw %0,%1,%2"; + else + return "vmrghw %0,%2,%1"; + } + } [(set_attr "type" "vecperm")]) (define_insn "*altivec_vmrglsf" @@ -3705,7 +3727,7 @@ (define_expand "vec_widen_umult_hi_v16qi" { emit_insn (gen_altivec_vmuloub (ve, operands[1], operands[2])); emit_insn (gen_altivec_vmuleub (vo, operands[1], operands[2])); - emit_insn (gen_altivec_vmrghh_direct (operands[0], vo, ve)); + emit_insn (gen_altivec_vmrglh_direct (operands[0], ve, vo)); } DONE; }) @@ -3730,7 +3752,7 @@ (define_expand "vec_widen_umult_lo_v16qi" { emit_insn (gen_altivec_vmuloub (ve, operands[1], operands[2])); emit_insn (gen_altivec_vmuleub (vo, operands[1], operands[2])); - emit_insn (gen_altivec_vmrglh_direct (operands[0], vo, ve)); + emit_insn (gen_altivec_vmrghh_direct (operands[0], ve, vo)); } DONE; }) @@ -3755,7 +3777,7 @@ (define_expand "vec_widen_smult_hi_v16qi" { emit_insn (gen_altivec_vmulosb (ve, operands[1], operands[2])); emit_insn (gen_altivec_vmulesb (vo, operands[1], operands[2])); - emit_insn (gen_altivec_vmrghh_direct (operands[0], vo, ve)); + emit_insn (gen_altivec_vmrglh_direct (operands[0], ve, vo)); } DONE; }) @@ -3780,7 +3802,7 @@ (define_expand "vec_widen_smult_lo_v16qi" { emit_insn (gen_altivec_vmulosb (ve, operands[1], operands[2])); emit_insn (gen_altivec_vmulesb (vo, operands[1], operands[2])); - emit_insn 
(gen_altivec_vmrglh_direct (operands[0], vo, ve)); + emit_insn (gen_altivec_vmrghh_direct (operands[0], ve, vo)); } DONE; }) @@ -3805,7 +3827,7 @@ (define_expand "vec_widen_umult_hi_v8hi" { emit_insn (gen_altivec_vmulouh (ve, operands[1], operands[2])); emit_insn (gen_altivec_vmuleuh (vo, operands[1], operands[2])); - emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], vo, ve)); + emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], ve, vo)); } DONE; }) @@ -3830,7 +3852,7 @@ (define_expand "vec_widen_umult_lo_v8hi" { emit_insn (gen_altivec_vmulouh (ve, operands[1], operands[2])); emit_insn (gen_altivec_vmuleuh (vo, operands[1], operands[2])); - emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], vo, ve)); + emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo)); } DONE; }) @@ -3855,7 +3877,7 @@ (define_expand "vec_widen_smult_hi_v8hi" { emit_insn (gen_altivec_vmulosh (ve, operands[1], operands[2])); emit_insn (gen_altivec_vmulesh (vo, operands[1], operands[2])); - emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], vo, ve)); + emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], ve, vo)); } DONE; }) @@ -3880,7 +3902,7 @@ (define_expand "vec_widen_smult_lo_v8hi" { emit_insn (gen_altivec_vmulosh (ve, operands[1], operands[2])); emit_insn (gen_altivec_vmulesh (vo, operands[1], operands[2])); - emit_insn (gen_altivec_vmrglw_direct_v4si (operands[0], vo, ve)); + emit_insn (gen_altivec_vmrghw_direct_v4si (operands[0], ve, vo)); } DONE; }) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index df491bee2ea..018bea9f2f8 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -22941,29 +22941,17 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1, {OPTION_MASK_ALTIVEC, CODE_FOR_altivec_vpkuwum_direct, {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31}}, - {OPTION_MASK_ALTIVEC, - BYTES_BIG_ENDIAN ? 
CODE_FOR_altivec_vmrghb_direct - : CODE_FOR_altivec_vmrglb_direct, + {OPTION_MASK_ALTIVEC, CODE_FOR_altivec_vmrghb_direct, {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}}, - {OPTION_MASK_ALTIVEC, - BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghh_direct - : CODE_FOR_altivec_vmrglh_direct, + {OPTION_MASK_ALTIVEC, CODE_FOR_altivec_vmrghh_direct, {0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23}}, - {OPTION_MASK_ALTIVEC, - BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrghw_direct_v4si - : CODE_FOR_altivec_vmrglw_direct_v4si, + {OPTION_MASK_ALTIVEC, CODE_FOR_altivec_vmrghw_direct_v4si, {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}}, - {OPTION_MASK_ALTIVEC, - BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglb_direct - : CODE_FOR_altivec_vmrghb_direct, + {OPTION_MASK_ALTIVEC, CODE_FOR_altivec_vmrglb_direct, {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}}, - {OPTION_MASK_ALTIVEC, - BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglh_direct - : CODE_FOR_altivec_vmrghh_direct, + {OPTION_MASK_ALTIVEC, CODE_FOR_altivec_vmrglh_direct, {8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31}}, - {OPTION_MASK_ALTIVEC, - BYTES_BIG_ENDIAN ? CODE_FOR_altivec_vmrglw_direct_v4si - : CODE_FOR_altivec_vmrghw_direct_v4si, + {OPTION_MASK_ALTIVEC, CODE_FOR_altivec_vmrglw_direct_v4si, {8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31}}, {OPTION_MASK_P8_VECTOR, BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgew_v4sf_direct @@ -23146,9 +23134,15 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1, /* For little-endian, the two input operands must be swapped (or swapped back) to ensure proper right-to-left numbering - from 0 to 2N-1. */ - if (swapped ^ !BYTES_BIG_ENDIAN - && icode != CODE_FOR_vsx_xxpermdi_v16qi) + from 0 to 2N-1. Excludes the vmrg[lh][bhw] and xxpermdi ops. 
*/ + if (swapped ^ !BYTES_BIG_ENDIAN) + if (!(icode == CODE_FOR_altivec_vmrghb_direct + || icode == CODE_FOR_altivec_vmrglb_direct + || icode == CODE_FOR_altivec_vmrghh_direct + || icode == CODE_FOR_altivec_vmrglh_direct + || icode == CODE_FOR_altivec_vmrghw_direct_v4si + || icode == CODE_FOR_altivec_vmrglw_direct_v4si + || icode == CODE_FOR_vsx_xxpermdi_v16qi)) std::swap (op0, op1); if (imode != V16QImode) { diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index e226a93bbe5..b84f667e4b2 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -4688,12 +4688,8 @@ (define_expand "vsx_xxmrghw_<mode>" (const_int 1) (const_int 5)])))] "VECTOR_MEM_VSX_P (<MODE>mode)" { - rtx (*fun) (rtx, rtx, rtx); - fun = BYTES_BIG_ENDIAN ? gen_altivec_vmrghw_direct_<mode> - : gen_altivec_vmrglw_direct_<mode>; - if (!BYTES_BIG_ENDIAN) - std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn ( + gen_altivec_vmrghw_direct_v4si (operands[0], operands[1], operands[2])); DONE; } [(set_attr "type" "vecperm")]) @@ -4708,12 +4704,8 @@ (define_expand "vsx_xxmrglw_<mode>" (const_int 3) (const_int 7)])))] "VECTOR_MEM_VSX_P (<MODE>mode)" { - rtx (*fun) (rtx, rtx, rtx); - fun = BYTES_BIG_ENDIAN ? 
gen_altivec_vmrglw_direct_<mode> - : gen_altivec_vmrghw_direct_<mode>; - if (!BYTES_BIG_ENDIAN) - std::swap (operands[1], operands[2]); - emit_insn (fun (operands[0], operands[1], operands[2])); + emit_insn ( + gen_altivec_vmrglw_direct_v4si (operands[0], operands[1], operands[2])); DONE; } [(set_attr "type" "vecperm")]) diff --git a/gcc/testsuite/gcc.target/powerpc/pr106069.C b/gcc/testsuite/gcc.target/powerpc/pr106069.C new file mode 100644 index 00000000000..56219a74692 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr106069.C @@ -0,0 +1,118 @@ +/* { dg-do run } */ + +extern "C" void * +memcpy (void *, const void *, unsigned long); +typedef __attribute__ ((altivec (vector__))) unsigned native_simd_type; + +union +{ + native_simd_type V; + int R[4]; +} store_le_vec; + +struct S +{ + S () = default; + S (unsigned B0) + { + native_simd_type val{B0}; + m_simd = val; + } + void store_le (unsigned int out[]) + { + store_le_vec.V = m_simd; + unsigned int x0 = store_le_vec.R[0]; + memcpy (out, &x0, 1); + } + S rotl (unsigned int r) + { + native_simd_type rot{r}; + return __builtin_vec_rl (m_simd, rot); + } + void operator+= (S other) + { + m_simd = __builtin_vec_add (m_simd, other.m_simd); + } + void operator^= (S other) + { + m_simd = __builtin_vec_xor (m_simd, other.m_simd); + } + static void transpose (S &B0, S B1, S B2, S B3) + { + native_simd_type T0 = __builtin_vec_mergeh (B0.m_simd, B2.m_simd); + native_simd_type T1 = __builtin_vec_mergeh (B1.m_simd, B3.m_simd); + native_simd_type T2 = __builtin_vec_mergel (B0.m_simd, B2.m_simd); + native_simd_type T3 = __builtin_vec_mergel (B1.m_simd, B3.m_simd); + B0 = __builtin_vec_mergeh (T0, T1); + B3 = __builtin_vec_mergel (T2, T3); + } + S (native_simd_type x) : m_simd (x) {} + native_simd_type m_simd; +}; + +void +foo (unsigned int output[], unsigned state[]) +{ + S R00 = state[0]; + S R01 = state[0]; + S R02 = state[2]; + S R03 = state[0]; + S R05 = state[5]; + S R06 = state[6]; + S R07 = state[7]; + S R08 = 
state[8]; + S R09 = state[9]; + S R10 = state[10]; + S R11 = state[11]; + S R12 = state[12]; + S R13 = state[13]; + S R14 = state[4]; + S R15 = state[15]; + for (int r = 0; r != 10; ++r) + { + R09 += R13; + R11 += R15; + R05 ^= R09; + R06 ^= R10; + R07 ^= R11; + R07 = R07.rotl (7); + R00 += R05; + R01 += R06; + R02 += R07; + R15 ^= R00; + R12 ^= R01; + R13 ^= R02; + R00 += R05; + R01 += R06; + R02 += R07; + R15 ^= R00; + R12 = R12.rotl (8); + R13 = R13.rotl (8); + R10 += R15; + R11 += R12; + R08 += R13; + R09 += R14; + R05 ^= R10; + R06 ^= R11; + R07 ^= R08; + R05 = R05.rotl (7); + R06 = R06.rotl (7); + R07 = R07.rotl (7); + } + R00 += state[0]; + S::transpose (R00, R01, R02, R03); + R00.store_le (output); +} + +unsigned int res[1]; +unsigned main_state[]{1634760805, 60878, 2036477234, 6, + 0, 825562964, 1471091955, 1346092787, + 506976774, 4197066702, 518848283, 118491664, + 0, 0, 0, 0}; +int +main () +{ + foo (res, main_state); + if (res[0] != 0x41fcef98) + __builtin_abort (); +} -- 2.27.0