https://gcc.gnu.org/g:37ebd552056613aa6dac190fcc6e6d2b6963b3b6
commit 37ebd552056613aa6dac190fcc6e6d2b6963b3b6
Author: Michael Meissner <meiss...@linux.ibm.com>
Date:   Tue Sep 24 22:16:22 2024 -0400

    PR 89213: Address review comments.

    PR 99293: Optimize splat of a V2DF/V2DI extract with constant element

    We had optimizations for splat of a vector extract for the other vector
    types, but we missed having one for V2DI and V2DF.  This patch adds a
    combiner insn to do this optimization.

    In looking at the source, we had similar optimizations for V4SI and V4SF
    extract and splats, but we missed doing V2DI/V2DF.

    Without the patch, for the code:

        vector long long
        splat_dup_l_0 (vector long long v)
        {
          return __builtin_vec_splats (__builtin_vec_extract (v, 0));
        }

    the compiler generates (on a little endian power9):

        splat_dup_l_0:
                mfvsrld 9,34
                mtvsrdd 34,9,9
                blr

    Now it generates:

        splat_dup_l_0:
                xxpermdi 34,34,34,3
                blr

    PR 89213: Add better support for shifting vectors with 64-bit elements

    This patch fixes PR target/89213 to allow better code to be generated
    for constant shifts of V2DI vectors.  Previously GCC would do constant
    shifts of vectors with 64-bit elements by using:

        XXSPLTIB 32,4
        VEXTSB2D 0,0
        VSRAD 2,2,0

    That is, the PowerPC does not have a VSPLTISD instruction to load
    -15..14 for the 64-bit shift count in one instruction.  Instead, it
    would need to load a byte and then convert it to 64-bit.

    With this patch, GCC now realizes that the vector shift instructions
    only look at the bottom 6 bits of each element for the shift count, so
    it can use either a VSPLTISW or XXSPLTIB instruction to load the shift
    count.

    2024-09-17  Michael Meissner  <meiss...@linux.ibm.com>

    gcc/

        PR target/89213
        * config/rs6000/altivec.md (altivec_<mode>_shift_const): Remove
        extra ()'s.

    gcc/testsuite/

        PR target/89213
        * gcc.target/powerpc/pr89213.c: Allow running test on 32-bit.

    2024-09-12  Michael Meissner  <meiss...@linux.ibm.com>

    gcc/

        * config/rs6000/vsx.md (vsx_splat_extract_<mode>): New insn.

    gcc/testsuite/

        * gcc.target/powerpc/builtins-1.c: Adjust insn count.
        * gcc.target/powerpc/pr99293.c: New test.

    2024-09-12  Michael Meissner  <meiss...@linux.ibm.com>

    gcc/

        PR target/89213
        * config/rs6000/altivec.md (UNSPEC_VECTOR_SHIFT): New unspec.
        (VSHIFT_MODE): New mode iterator.
        (vshift_code): New code iterator.
        (vshift_attr): New code attribute.
        (altivec_<mode>_<vshift_attr>_const): New pattern to optimize
        vector long long/int shifts by a constant.
        (altivec_<mode>_shift_const): New helper insn to load up a
        constant used by the shift operation.
        * config/rs6000/predicates.md (vector_shift_constant): New
        predicate.

    gcc/testsuite/

        PR target/89213
        * gcc.target/powerpc/pr89213.c: New test.
        * gcc.target/powerpc/vec-rlmi-rlnm.c: Update instruction count.
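As a sanity check on the bottom-6-bits argument, the following host-side
sketch (an illustration written for this summary under that assumption, not
part of the patch or its tests) models a doubleword whose two words were both
set to 29 the way VSPLTISW would set them, and shows that the 6-bit count a
doubleword shift extracts from it is still 29:

    #include <stdio.h>
    #include <stdint.h>

    int
    main (void)
    {
      /* VSPLTISW 29 splats the word value 29, so each doubleword of the
         register holds 0x0000001d0000001d rather than the value 29.  */
      uint64_t dword = 0x0000001d0000001dULL;

      /* VSRAD and the other doubleword shifts only consume the low 6 bits
         of each doubleword as the count, so the effective count is 29.  */
      printf ("effective shift count = %llu\n",
              (unsigned long long) (dword & 63));
      return 0;
    }

The same reasoning holds for XXSPLTIB, which splats 29 into every byte: the
low 6 bits of each doubleword are again 29.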
Diff:
---
 gcc/config/rs6000/altivec.md                     |  51 +++++++++++
 gcc/config/rs6000/predicates.md                  |  63 ++++++++++++++
 gcc/config/rs6000/vsx.md                         |  18 ++++
 gcc/testsuite/gcc.target/powerpc/builtins-1.c    |   2 +-
 gcc/testsuite/gcc.target/powerpc/pr89213.c       | 106 +++++++++++++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr99293.c       |  22 +++++
 gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c |   4 +-
 7 files changed, 263 insertions(+), 3 deletions(-)

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 1f5489b974f6..e4576c6d0967 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -170,6 +170,7 @@
    UNSPEC_VSTRIL
    UNSPEC_SLDB
    UNSPEC_SRDB
+   UNSPEC_VECTOR_SHIFT
   ])
 
 (define_c_enum "unspecv"
@@ -2176,6 +2177,56 @@
   "vsro %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
+;; Optimize V2DI shifts by constants.  This relies on the shift instructions
+;; only looking at the bits needed to do the shift.  This means we can use
+;; VSPLTISW or XXSPLTIB to load up the constant, and not worry about the bits
+;; that the vector shift instructions will not use.
+(define_mode_iterator VSHIFT_MODE [(V4SI "TARGET_P9_VECTOR")
+                                   (V2DI "TARGET_P8_VECTOR")])
+
+(define_code_iterator vshift_code [ashift ashiftrt lshiftrt])
+(define_code_attr vshift_attr [(ashift "ashift")
+                               (ashiftrt "ashiftrt")
+                               (lshiftrt "lshiftrt")])
+
+(define_insn_and_split "*altivec_<mode>_<vshift_attr>_const"
+  [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v")
+        (vshift_code:VSHIFT_MODE
+         (match_operand:VSHIFT_MODE 1 "register_operand" "v")
+         (match_operand:VSHIFT_MODE 2 "vector_shift_constant" "")))
+   (clobber (match_scratch:VSHIFT_MODE 3 "=&v"))]
+  "((<MODE>mode == V2DImode && TARGET_P8_VECTOR)
+    || (<MODE>mode == V4SImode && TARGET_P9_VECTOR))"
+  "#"
+  "&& 1"
+  [(set (match_dup 3)
+        (unspec:VSHIFT_MODE [(match_dup 4)] UNSPEC_VECTOR_SHIFT))
+   (set (match_dup 0)
+        (vshift_code:VSHIFT_MODE (match_dup 1)
+                                 (match_dup 3)))]
+{
+  if (GET_CODE (operands[3]) == SCRATCH)
+    operands[3] = gen_reg_rtx (<MODE>mode);
+
+  operands[4] = GET_CODE (operands[2]) == CONST_VECTOR
+                ? CONST_VECTOR_ELT (operands[2], 0)
+                : XEXP (operands[2], 0);
+})
+
+(define_insn "*altivec_<mode>_shift_const"
+  [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v")
+        (unspec:VSHIFT_MODE [(match_operand 1 "const_int_operand" "n")]
+                            UNSPEC_VECTOR_SHIFT))]
+  "TARGET_P8_VECTOR"
+{
+  if (UINTVAL (operands[1]) <= 15)
+    return "vspltisw %0,%1";
+
+  else if (TARGET_P9_VECTOR)
+    return "xxspltib %x0,%1";
+
+  else
+    gcc_unreachable ();
+})
+
 (define_insn "altivec_vsum4ubs"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v")
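To see both arms of *altivec_<mode>_shift_const in action, a minimal
reproducer (an illustration, not one of the new tests; the typedef and
function names are made up) compiled with -O2 -mcpu=power9 should pick
vspltisw for a shift count of 4 and xxspltib for a count of 29:

    /* Uses the GNU C vector extension; compile with -O2 -mcpu=power9.  */
    typedef long long v2di __attribute__ ((vector_size (16)));

    v2di
    sra_4 (v2di v)
    {
      return v >> 4;            /* expected: vspltisw + vsrad */
    }

    v2di
    sra_29 (v2di v)
    {
      return v >> 29;           /* expected: xxspltib + vsrad */
    }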
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 7f0b4ab61e65..0b78901e94be 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -861,6 +861,69 @@
   return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
 })
 
+;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element
+;; is the same constant, and the constant can be used for a shift operation.
+;; This is to prevent sub-optimal code that needs to load up the constant and
+;; then zero extend it to a 32- or 64-bit vector, or load the constant from
+;; the literal pool.
+;;
+;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts
+;; by 1..15 can be handled by the normal VSPLTISW and vector shift
+;; instructions.  For V2DImode, we do this all of the time, since there is no
+;; convenient instruction to load up a vector long long splatted constant.
+;;
+;; If we can use XXSPLTIB, then allow constants up to 63.  If not, restrict
+;; the constant to 0..15, which can be loaded with VSPLTISW.  V4SI shifts are
+;; only optimized for ISA 3.0 when the shift value is >= 16 and <= 31; values
+;; between 0 and 15 can use a normal VSPLTISW to load the value and don't
+;; need this optimization.
+(define_predicate "vector_shift_constant"
+  (match_code "const_vector,vec_duplicate")
+{
+  unsigned HOST_WIDE_INT min_value;
+
+  if (mode == V2DImode)
+    {
+      min_value = 0;
+      if (!TARGET_P8_VECTOR)
+	return 0;
+    }
+  else if (mode == V4SImode)
+    {
+      min_value = 16;
+      if (!TARGET_P9_VECTOR)
+	return 0;
+    }
+  else
+    return 0;
+
+  unsigned HOST_WIDE_INT max_value = TARGET_P9_VECTOR ? 63 : 15;
+
+  if (GET_CODE (op) == CONST_VECTOR)
+    {
+      unsigned HOST_WIDE_INT first = UINTVAL (CONST_VECTOR_ELT (op, 0));
+      unsigned nunits = GET_MODE_NUNITS (mode);
+      unsigned i;
+
+      if (!IN_RANGE (first, min_value, max_value))
+	return 0;
+
+      for (i = 1; i < nunits; i++)
+	if (first != UINTVAL (CONST_VECTOR_ELT (op, i)))
+	  return 0;
+
+      return 1;
+    }
+  else
+    {
+      rtx op0 = XEXP (op, 0);
+      if (!CONST_INT_P (op0))
+	return 0;
+
+      return IN_RANGE (UINTVAL (op0), min_value, max_value);
+    }
+})
+
 ;; Return 1 if operand is 0.0.
 (define_predicate "zero_fp_constant"
   (and (match_code "const_double")

diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index b2fc39acf4e8..73f20a86e56a 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -4796,6 +4796,24 @@
   "lxvdsx %x0,%y1"
   [(set_attr "type" "vecload")])
 
+;; Optimize SPLAT of an extract from a V2DF/V2DI vector with a constant element
+(define_insn "*vsx_splat_extract_<mode>"
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
+	(vec_duplicate:VSX_D
+	 (vec_select:<VEC_base>
+	  (match_operand:VSX_D 1 "vsx_register_operand" "wa")
+	  (parallel [(match_operand 2 "const_0_to_1_operand" "n")]))))]
+  "VECTOR_MEM_VSX_P (<MODE>mode)"
+{
+  int which_word = INTVAL (operands[2]);
+  if (!BYTES_BIG_ENDIAN)
+    which_word = 1 - which_word;
+
+  operands[3] = GEN_INT (which_word ? 3 : 0);
+  return "xxpermdi %x0,%x1,%x1,%3";
+}
+  [(set_attr "type" "vecperm")])
+
 ;; V4SI splat support
 (define_insn "vsx_splat_v4si"
   [(set (match_operand:V4SI 0 "vsx_register_operand" "=wa,wa")
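For readers following the "which_word ? 3 : 0" immediate above, here is a
scalar model of xxpermdi's doubleword selection (a sketch based on our
reading of the Power ISA, not GCC code; all names here are made up).  Each
bit of the 2-bit DM field picks a doubleword from the corresponding source:

    #include <stdint.h>

    typedef struct { uint64_t dw[2]; } v2di_model;

    /* xxpermdi XT,XA,XB,DM: doubleword 0 of the result comes from XA and
       doubleword 1 from XB; the high DM bit selects XA's doubleword, the
       low DM bit selects XB's.  */
    static v2di_model
    xxpermdi_model (v2di_model xa, v2di_model xb, int dm)
    {
      v2di_model xt;
      xt.dw[0] = xa.dw[(dm >> 1) & 1];
      xt.dw[1] = xb.dw[dm & 1];
      return xt;
    }

With both inputs the same register, xxpermdi_model (v, v, 0) yields
{v.dw[0], v.dw[0]} and xxpermdi_model (v, v, 3) yields {v.dw[1], v.dw[1]},
which is exactly the splat the insn emits (DM = 2 would be the familiar
doubleword swap).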
diff --git a/gcc/testsuite/gcc.target/powerpc/builtins-1.c b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
index 8410a5fd4319..4e7e5384675f 100644
--- a/gcc/testsuite/gcc.target/powerpc/builtins-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/builtins-1.c
@@ -1035,4 +1035,4 @@ foo156 (vector unsigned short usa)
 /* { dg-final { scan-assembler-times {\mvmrglb\M} 3 } } */
 /* { dg-final { scan-assembler-times {\mvmrgew\M} 4 } } */
 /* { dg-final { scan-assembler-times {\mvsplth|xxsplth\M} 4 } } */
-/* { dg-final { scan-assembler-times {\mxxpermdi\M} 44 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 42 } } */

diff --git a/gcc/testsuite/gcc.target/powerpc/pr89213.c b/gcc/testsuite/gcc.target/powerpc/pr89213.c
new file mode 100644
index 000000000000..90a8b1b5787b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89213.c
@@ -0,0 +1,106 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_vsx } */
+/* { dg-options "-mcpu=power9 -O2" } */
+
+/* Optimize vector shifts by constants.  */
+
+#include <altivec.h>
+
+typedef vector long long vi64_t;
+typedef vector unsigned long long vui64_t;
+
+typedef vector int vi32_t;
+typedef vector unsigned int vui32_t;
+
+vi64_t
+shiftra_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi64_t
+shiftra_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi32_t
+shiftra_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+vi32_t
+shiftra_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 6 } } */
+/* { dg-final { scan-assembler-times {\mvsld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvslw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvspltisw\M} 6 } } */
+/* { dg-final { scan-assembler-times {\mvsrd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsrw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsrad\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsraw\M} 2 } } */

diff --git a/gcc/testsuite/gcc.target/powerpc/pr99293.c b/gcc/testsuite/gcc.target/powerpc/pr99293.c
new file mode 100644
index 000000000000..20adc1f27f65
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr99293.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target powerpc*-*-* } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+/* Test for PR 99293, which wants to do:
+
+	__builtin_vec_splats (__builtin_vec_extract (v, n))
+
+   where v is a V2DF or V2DI vector and n is either 0 or 1.  Previously
+   the compiler would do a direct move to the GPR registers to select the
+   item and a direct move from the GPR registers to do the splat.  */
+
+vector long long
+splat_dup_l_0 (vector long long v)
+{
+  return __builtin_vec_splats (__builtin_vec_extract (v, 0));
+}
+
+vector long long
+splat_dup_l_1 (vector long long v)
+{
+  return __builtin_vec_splats (__builtin_vec_extract (v, 1));
+}
+
+/* { dg-final { scan-assembler-times "xxpermdi" 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c b/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c
index 6834733b1bf3..01fa0a99d465 100644
--- a/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c
+++ b/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c
@@ -54,12 +54,12 @@ rlnm_test_2 (vector unsigned long long x, vector unsigned long long y,
    - For rlnm_test_1: vspltisw, vslw, xxlor, vrlwnm.
    - For rlnm_test_2: xxspltib, vextsb2d, vsld, xxlor, vrldnm.
    There is a choice of splat instructions in both cases, so we
-   just check for "splt".  */
+   just check for "splt".  In the past vextsb2d would be generated for
+   rlnm_test_2, but the compiler no longer generates it.  */
 
 /* { dg-final { scan-assembler-times "vrlwmi" 1 } } */
 /* { dg-final { scan-assembler-times "vrldmi" 1 } } */
 /* { dg-final { scan-assembler-times "splt" 2 } } */
-/* { dg-final { scan-assembler-times "vextsb2d" 1 } } */
 /* { dg-final { scan-assembler-times "vslw" 1 } } */
 /* { dg-final { scan-assembler-times "vsld" 1 } } */
 /* { dg-final { scan-assembler-times "xxlor" 4 } } */