https://gcc.gnu.org/g:3ca6fd8cbb8fc89b552ba220773bb3c972a5b77d

commit 3ca6fd8cbb8fc89b552ba220773bb3c972a5b77d
Author: Michael Meissner <meiss...@linux.ibm.com>
Date:   Mon Jun 16 16:40:52 2025 -0400

    PR target/108958 -- simplify mtvsrdd to zero extend GPR DImode to VSX TImode
    
    Before this patch GCC would zero extend a DImode GPR value to TImode by first
    zero extending the DImode value into a GPR TImode register pair, and then do a
    MTVSRDD to move this value to a VSX register.
    
    For example, consider the following code:
    
            #ifndef TYPE
            #define TYPE unsigned long long
            #endif
    
            void
            gpr_to_vsx (TYPE x, __uint128_t *p)
            {
              __uint128_t y = x;
              __asm__ (" # %x0" : "+wa" (y));
              *p = y;
            }
    
    Currently GCC generates:
    
            gpr_to_vsx:
                    mr 10,3
                    li 11,0
                    mtvsrdd 0,11,10
            #APP
                     # 0
            #NO_APP
                    stxv 0,0(4)
                    blr
    
    I.e. the mr and li instructions create the zero extended TImode value in a GPR,
    and then the mtvsrdd instruction moves both registers into a single vector
    register.
    
    Instead, GCC should generate the following code, since the mtvsrdd instruction
    will clear the upper 64 bits if the 2nd argument is 0 (non-zero values are a GPR
    to put in the upper 64 bits):
    
            gpr_to_vsx:
                    mtvsrdd 0,0,3
            #APP
                     # 0
            #NO_APP
                    stxv 0,0(4)
                    blr
    
    Originally, I posted a patch that added the zero_extendsiti2 insn.  I got some
    pushback about using reload_completed in the split portion of the
    define_insn_and_split.  However, this is a case where you absolutely have to use
    the reload_completed test, because if you split the code before register
    allocation to handle the normal case, the split insns will not be compiled to
    generate the appropriate mtvsrdd without creating the TImode value in the GPR
    register.  I can imagine there might be concern about favoring generating code
    using the vector registers instead of using the GPR registers if the code does
    not require the TImode value to be in a vector register.
    
    I completely rewrote the patch.  This patch creates a peephole2 to catch this
    case, and it eliminates creating the TImode variable.  Instead it just does the
    MTVSRDD instruction directly.  That way it will not influence register
    allocation, and the code will only be generated in the specific case where we
    need the TImode value in a vector register.
    
    I have built GCC with the patches in this patch set applied on both little and
    big endian PowerPC systems and there were no regressions.  Can I apply this
    patch to GCC 16?
    
    2025-06-16  Michael Meissner  <meiss...@linux.ibm.com>
    
    gcc/
    
            PR target/108958
            * config/rs6000/rs6000.md (UNSPEC_ZERO_EXTEND): New unspec.
            (zero_extendsiti2 peephole2): Add a peephole2 to simplify zero extend
            between DImode value in a GPR to a TImode target in a vector register.
            (zero_extendsiti2_vsx): New insn.
    
    gcc/testsuite/
    
            PR target/108958
            * gcc.target/powerpc/pr108958.c: New test.

Diff:
---
 gcc/config/rs6000/rs6000.md                 | 26 ++++++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr108958.c | 47 +++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 63d881ca1e0c..1503bc3fbbd1 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -173,6 +173,7 @@
    UNSPEC_XXSPLTIW_CONST
    UNSPEC_FMAX
    UNSPEC_FMIN
+   UNSPEC_ZERO_EXTEND
   ])
 
 ;;
@@ -969,6 +970,31 @@
    (set_attr "dot" "yes")
    (set_attr "length" "4,8")])
 
+;; Optimize zero_extendsiti2 from a GPR to a GPR and then moving the GPR to a
+;; VSX register
+(define_peephole2
+  [(set (match_operand:DI 0 "int_reg_operand")
+       (match_operand:DI 1 "int_reg_operand"))
+   (set (match_operand:DI 2 "int_reg_operand")
+       (const_int 0))
+   (set (match_operand:TI 3 "vsx_register_operand")
+       (match_operand:TI 4 "int_reg_operand"))]
+  "TARGET_DIRECT_MOVE_64BIT
+   && (reg_or_subregno (operands[0])
+       == reg_or_subregno (operands[4]) + !!WORDS_BIG_ENDIAN)
+   && (reg_or_subregno (operands[2])
+       == reg_or_subregno (operands[4]) + !WORDS_BIG_ENDIAN)
+   && peep2_reg_dead_p (3, operands[4])"
+  [(set (match_dup 3)
+       (unspec:TI [(match_dup 1)] UNSPEC_ZERO_EXTEND))])
+
+(define_insn "*zero_extendsiti2_vsx"
+  [(set (match_operand:TI 0 "vsx_register_operand" "=wa")
+       (unspec:TI [(match_operand:DI 1 "int_reg_operand" "r")]
+                  UNSPEC_ZERO_EXTEND))]
+  "TARGET_DIRECT_MOVE_64BIT"
+  "mtvsrdd %x0,0,%1"
+  [(set_attr "type" "mtvsr")])
 
 (define_insn "zero_extendsi<mode>2"
   [(set (match_operand:EXTSI 0 "gpc_reg_operand" "=r,r,d,wa,wa,r,wa")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108958.c b/gcc/testsuite/gcc.target/powerpc/pr108958.c
new file mode 100644
index 000000000000..21b3f2766918
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108958.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
+
+#ifndef TYPE
+#define TYPE unsigned long long
+#endif
+
+/* PR target/108958, when zero extending a DImode to a TImode, and the TImode variable is in a VSX register, generate:
+
+       mtvsrdd vreg,0,gpr
+
+   instead of:
+
+       mr tmp,gpr
+       li tmp+1,0
+       mtvsrdd vreg,tmp+1,tmp.  */
+
+void
+gpr_to_vsx (TYPE x, __uint128_t *p)
+{
+  /* mtvsrdd 0,0,3
+     stxv 0,0(4)  */
+
+  __uint128_t y = x;
+  __asm__ (" # %x0" : "+wa" (y));
+  *p = y;
+}
+
+void
+gpr_to_gpr (TYPE x, __uint128_t *p)
+{
+  /* mr 2,3
+     li 3,0
+     std 2,0(4)
+     std 3,8(4)  */
+
+  __uint128_t y = x;
+  __asm__ (" # %0" : "+r" (y));
+  *p = y;
+}
+
+/* { dg-final { scan-assembler-times {\mli\M}              1 } } */
+/* { dg-final { scan-assembler-times {\mmtvsrdd .*,0,.*\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mstd\M}             2 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M}            1 } } */

Reply via email to