https://gcc.gnu.org/g:bbd50ab9d9c6aac588b3e7a3889b2cf317cf18c6

commit bbd50ab9d9c6aac588b3e7a3889b2cf317cf18c6
Author: Michael Meissner <meiss...@linux.ibm.com>
Date:   Mon Aug 12 17:37:25 2024 -0400

    Optimize vec_splats of vec_extract for V2DI/V2DF (PR target/99293)
    
    This patch optimizes cases like:
    
            vector double v1, v2;
            /* ... */
            v2 = vec_splats (vec_extract (v1, 0));  /* or  */
            v2 = vec_splats (vec_extract (v1, 1));
    
    Previously:
    
            vector long long
            splat_dup_l_0 (vector long long v)
            {
              return __builtin_vec_splats (__builtin_vec_extract (v, 0));
            }
    
    would generate:
    
            mfvsrld 9,34
            mtvsrdd 34,9,9
            blr
    
    With this patch, GCC generates:
    
            xxpermdi 34,34,34,3
            blr
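
    For reference, the element 1 case (a hypothetical analog of the function
    above; the exact xxpermdi permute immediate depends on which element is
    selected and on the endianness):

        vector long long
        splat_dup_l_1 (vector long long v)
        {
          return __builtin_vec_splats (__builtin_vec_extract (v, 1));
        }

    should likewise collapse to a single xxpermdi and blr.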
    
    I have tested this patch on the following systems and there was no
    degradation.
    Can I check it into the trunk branch?
    
        *   Power10, LE, --with-cpu=power10, IBM 128-bit long double
        *   Power9,  LE, --with-cpu=power9,  IBM 128-bit long double
        *   Power9,  LE, --with-cpu=power9,  IEEE 128-bit long double
        *   Power9,  LE, --with-cpu=power9,  64-bit default long double
        *   Power9,  BE, --with-cpu=power9,  IBM 128-bit long double
        *   Power8,  BE, --with-cpu=power8,  IBM 128-bit long double
    
    2024-08-12  Michael Meissner  <meiss...@linux.ibm.com>
    
    gcc/
    
            PR target/99293
            * config/rs6000/vsx.md (vsx_splat_extract_<mode>): New combiner
            insn.
    
    gcc/testsuite/
    
            PR target/99293
            * gcc.target/powerpc/pr99293.c: New test.
            * gcc.target/powerpc/builtins-1.c: Update insn count.

Diff:
---
 gcc/config/rs6000/altivec.md               |  51 ++++++++++++++
 gcc/config/rs6000/predicates.md            |  63 +++++++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr89213.c | 107 +++++++++++++++++++++++++++++
 3 files changed, 221 insertions(+)

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index aa9d8fffc901..8a7926eb369a 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -170,6 +170,7 @@
    UNSPEC_VSTRIL
    UNSPEC_SLDB
    UNSPEC_SRDB
+   UNSPEC_VECTOR_SHIFT
 ])
 
 (define_c_enum "unspecv"
@@ -2176,6 +2177,56 @@
   "vsro %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
+;; Optimize V2DI shifts by constants.  This relies on the shift instructions
+;; only looking at the bits needed to do the shift.  This means we can use
+;; VSPLTISW or XXSPLTIB to load up the constant, and not worry about the bits
+;; that the vector shift instructions will not use.
+(define_mode_iterator VSHIFT_MODE      [(V4SI "TARGET_P9_VECTOR")
+                                        (V2DI "TARGET_P8_VECTOR")])
+
+(define_code_iterator vshift_code      [ashift ashiftrt lshiftrt])
+(define_code_attr vshift_attr          [(ashift   "ashift")
+                                        (ashiftrt "ashiftrt")
+                                        (lshiftrt "lshiftrt")])
+
+(define_insn_and_split "*altivec_<mode>_<vshift_attr>_const"
+  [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v")
+       (vshift_code:VSHIFT_MODE
+        (match_operand:VSHIFT_MODE 1 "register_operand" "v")
+        (match_operand:VSHIFT_MODE 2 "vector_shift_constant" "")))
+   (clobber (match_scratch:VSHIFT_MODE 3 "=&v"))]
+  "((<MODE>mode == V2DImode && TARGET_P8_VECTOR)
+    || (<MODE>mode == V4SImode && TARGET_P9_VECTOR))"
+  "#"
+  "&& 1"
+  [(set (match_dup 3)
+       (unspec:VSHIFT_MODE [(match_dup 4)] UNSPEC_VECTOR_SHIFT))
+   (set (match_dup 0)
+       (vshift_code:VSHIFT_MODE (match_dup 1)
+                                (match_dup 3)))]
+{
+  if (GET_CODE (operands[3]) == SCRATCH)
+    operands[3] = gen_reg_rtx (<MODE>mode);
+
+  operands[4] = ((GET_CODE (operands[2]) == CONST_VECTOR)
+                ? CONST_VECTOR_ELT (operands[2], 0)
+                : XEXP (operands[2], 0));
+})
+
+(define_insn "*altivec_<mode>_shift_const"
+  [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v")
+       (unspec:VSHIFT_MODE [(match_operand 1 "const_int_operand" "n")]
+                           UNSPEC_VECTOR_SHIFT))]
+  "TARGET_P8_VECTOR"
+{
+  if (UINTVAL (operands[1]) <= 15)
+    return "vspltisw %0,%1";
+  else if (TARGET_P9_VECTOR)
+    return "xxspltib %x0,%1";
+  else
+    gcc_unreachable ();
+})
+
 (define_insn "altivec_vsum4ubs"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v")
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index d23ce9a77a3f..4fca37bc9345 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -861,6 +861,69 @@
     return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
 })
 
+;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element
+;; is the same constant, and the constant can be used for a shift operation.
+;; This is to prevent sub-optimal code that needs to load up the constant and
+;; then zero extend it to 32- or 64-bit vectors, or load the constant from
+;; the literal pool.
+;;
+;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts by
+;; 1..15 can be handled by the normal VSPLTISW and vector shift instruction.
+;; For V2DImode, we do this all of the time, since there is no convenient
+;; instruction to load up a vector long long splatted constant.
+;;
+;; If we can use XXSPLTIB, then allow constants up to 63.  If not, we restrict
+;; the constant to 0..15 that can be loaded with VSPLTISW.  V4SI shifts are
+;; only optimized for ISA 3.0 when the shift value is >= 16 and <= 31.  Values
+;; between 0 and 15 can use a normal VSPLTISW to load the value, and it doesn't
+;; need this optimization.
+(define_predicate "vector_shift_constant"
+  (match_code "const_vector,vec_duplicate")
+{
+  unsigned HOST_WIDE_INT min_value;
+
+  if (mode == V2DImode)
+    {
+      min_value = 0;
+      if (!TARGET_P8_VECTOR)
+       return 0;
+    }
+  else if (mode == V4SImode)
+    {
+      min_value = 16;
+      if (!TARGET_P9_VECTOR)
+       return 0;
+    }
+  else
+    return 0;
+
+  unsigned HOST_WIDE_INT max_value = TARGET_P9_VECTOR ? 63 : 15;
+
+  if (GET_CODE (op) == CONST_VECTOR)
+    {
+      unsigned HOST_WIDE_INT first = UINTVAL (CONST_VECTOR_ELT (op, 0));
+      unsigned nunits = GET_MODE_NUNITS (mode);
+      unsigned i;
+
+      if (!IN_RANGE (first, min_value, max_value))
+       return 0;
+
+      for (i = 1; i < nunits; i++)
+       if (first != UINTVAL (CONST_VECTOR_ELT (op, i)))
+         return 0;
+
+      return 1;
+    }
+  else
+    {
+      rtx op0 = XEXP (op, 0);
+      if (!CONST_INT_P (op0))
+       return 0;
+
+      return IN_RANGE (UINTVAL (op0), min_value, max_value);
+    }
+})
+
 ;; Return 1 if operand is 0.0.
 (define_predicate "zero_fp_constant"
   (and (match_code "const_double")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr89213.c b/gcc/testsuite/gcc.target/powerpc/pr89213.c
new file mode 100644
index 000000000000..601f9166d6eb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89213.c
@@ -0,0 +1,107 @@
+/* { dg-do compile { target { powerpc64*-*-* && lp64 } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mcpu=power9 -O2" } */
+
+/* Optimize vector shifts by constants.  */
+
+#include <altivec.h>
+
+typedef vector long long vi64_t;
+typedef vector unsigned long long vui64_t;
+
+typedef vector int vi32_t;
+typedef vector unsigned int vui32_t;
+
+vi64_t
+shiftra_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi64_t
+shiftra_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi32_t
+shiftra_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+vi32_t
+shiftra_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+/* { dg-final { scan-assembler-times {\mxxspltib\M}  6 } } */
+/* { dg-final { scan-assembler-times {\mvsld\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvslw\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvspltisw\M}  6 } } */
+/* { dg-final { scan-assembler-times {\mvsrd\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvsrw\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvsrad\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mvsraw\M}     2 } } */
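
For illustration (not part of the patch), a minimal sketch of the kind of
source the new combiner pattern targets, written with GCC's generic vector
shift syntax.  The function names are hypothetical; the expected instruction
selection follows the predicate and insn comments above, assuming
-mcpu=power9 -O2:

    #include <altivec.h>

    /* Shift count 4 is in the 0..15 range that VSPLTISW can splat
       directly.  */
    vector unsigned long long
    srl_v2di_4 (vector unsigned long long a)
    {
      return a >> 4;            /* expect vspltisw + vsrd  */
    }

    /* Shift count 29 is outside the VSPLTISW range used here, so the
       ISA 3.0 XXSPLTIB instruction splats the count instead.  */
    vector unsigned long long
    srl_v2di_29 (vector unsigned long long a)
    {
      return a >> 29;           /* expect xxspltib + vsrd  */
    }

The trick is safe because VSRD only reads the low 6 bits of each doubleword
of the shift-count register, so splatting the count into every word or byte
leaves those bits correct.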
