https://gcc.gnu.org/g:7e60829c19ac18dd39071d8877f5df4258531e8d

commit 7e60829c19ac18dd39071d8877f5df4258531e8d
Author: Michael Meissner <meiss...@linux.ibm.com>
Date:   Mon Jun 16 16:44:46 2025 -0400

    PR target/120528 -- Simplify zero extend from memory to VSX register on power10
    
    Previously GCC would zero extend a DImode value in memory to a TImode target
    in a vector register by first zero extending the DImode value into a GPR
    TImode register pair, and then doing a MTVSRDD to move this value to a VSX
    register.
    
    For example, consider the following code:
    
            #ifndef TYPE
            #define TYPE unsigned long long
            #endif
    
            void
            mem_to_vsx (TYPE *p, __uint128_t *q)
            {
              /* lxvrdx 0,0,3
                 stxv 0,0(4)  */
    
              __uint128_t x = *p;
              __asm__ (" # %x0" : "+wa" (x));
              *q = x;
            }
    
    It currently generates the following code on power10:
    
            mem_to_vsx:
                    ld 10,0(3)
                    li 11,0
                    mtvsrdd 0,11,10
            #APP
                     # 0
            #NO_APP
                    stxv 0,0(4)
                    blr
    
    Instead it could generate:
    
            mem_to_vsx:
                    lxvrdx 0,0,3
            #APP
                     # 0
            #NO_APP
                    stxv 0,0(4)
                    blr
    
    The lxvr{b,h,w,d}x instructions were added in power10, and they load up a
    vector register with a byte, half-word, word, or double-word value in the
    right most bits, and fill the remaining bits with 0.  I noticed this code
    when working on PR target/108958 (for which I just posted a patch).
    
    This patch creates a peephole2 to catch this case, and it eliminates
    creating the TImode variable.  Instead it just does the LXVR{B,H,W,D}X
    instruction directly.
    
    I have built GCC with the patches in this patch set applied on both little
    and big endian PowerPC systems and there were no regressions.  Can I apply
    this patch to GCC 16?
    
    2025-06-16  Michael Meissner  <meiss...@linux.ibm.com>
    
    gcc/
    
            PR target/120528
            * config/rs6000/rs6000.md (zero_extend??ti2 peephole2): Add a
            peephole2 to simplify zero extending a QI/HI/SI/DImode value in
            memory to a TImode target in a vector register to use the
            LXVR{B,H,W,D}X instructions.
    
    gcc/testsuite/
    
            PR target/120528
            * gcc.target/powerpc/pr120528.c: New test.

Diff:
---
 gcc/config/rs6000/rs6000.md                 | 69 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr120528.c | 91 +++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 1503bc3fbbd1..696f743ec066 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -996,6 +996,75 @@
   "mtvsrdd %x0,0,%1"
   [(set_attr "type" "mtvsr")])
 
+;; On power10, optimize zero extending a QI/HI/SI/DImode value from memory that
+;; is going to a vector register target by generating a LXVR{B,H,W,D}X
+;; instruction without creating the TImode value in a GPR and using MTVSRDD to
+;; move it to the vector register.
+;;
+;; This peephole2 handles a DImode memory source.  It matches three
+;; consecutive insns:
+;;   1) a DImode load from memory (operand 1) into an integer register
+;;      (operand 0);
+;;   2) setting a base register (operand 2) to 0;
+;;   3) a TImode move from an integer register (operand 4) to a VSX register
+;;      (operand 3).
+;;
+;; The condition requires that operands 0 and 2 be the two consecutive
+;; registers starting at operand 4 (which of the two comes first depends on
+;; WORDS_BIG_ENDIAN), that operand 4 be dead once the three insns are done,
+;; and that the memory address be a REG, SUBREG or PLUS.  The three insns are
+;; replaced by a single zero_extend of the memory into operand 3.
+(define_peephole2
+  [(set (match_operand:DI 0 "int_reg_operand")
+       (match_operand:DI 1 "memory_operand"))
+   (set (match_operand:DI 2 "base_reg_operand")
+       (const_int 0))
+   (set (match_operand:TI 3 "vsx_register_operand")
+       (match_operand:TI 4 "int_reg_operand"))]
+  "TARGET_POWER10 && TARGET_POWERPC64
+   && (reg_or_subregno (operands[0])
+       == reg_or_subregno (operands[4]) + !!WORDS_BIG_ENDIAN)
+   && (reg_or_subregno (operands[2])
+       == reg_or_subregno (operands[4]) + !WORDS_BIG_ENDIAN)
+   && peep2_reg_dead_p (3, operands[4])
+   && (REG_P (XEXP (operands[1], 0))
+       || SUBREG_P (XEXP (operands[1], 0))
+       || GET_CODE (XEXP (operands[1], 0)) == PLUS)"
+  [(set (match_dup 3)
+       (zero_extend:TI (match_dup 5)))]
+{
+  rtx mem = operands[1];
+  rtx addr = XEXP (mem, 0);
+
+  /* If the address is already an indexed or indirect address, the original
+     memory reference can be used unchanged.  */
+  if (indexed_or_indirect_address (addr, DImode))
+    operands[5] = mem;
+  else
+    {
+      /* Otherwise compute the address into operand 2 and address the memory
+         through that register.  Operand 2 is one of the registers of
+         operand 4, which the condition has checked is dead, so it is free
+         to be reused here.  */
+      rtx op2 = operands[2];
+      emit_insn (gen_rtx_SET (op2, addr));
+      operands[5] = change_address (mem, DImode, op2);
+    }
+})
+
+;; On power10, optimize zero extending a QImode, HImode or SImode value from
+;; memory that is going to a TImode vector register target by generating a
+;; LXVR{B,H,W}X instruction directly.
+;;
+;; This peephole2 matches three consecutive insns:
+;;   1) a load from memory (operand 1) that is zero extended to DImode into
+;;      an integer register (operand 0);
+;;   2) setting a base register (operand 2) to 0;
+;;   3) a TImode move from an integer register (operand 4) to a VSX register
+;;      (operand 3).
+;;
+;; The condition requires that operands 0 and 2 be the two consecutive
+;; registers starting at operand 4 (which of the two comes first depends on
+;; WORDS_BIG_ENDIAN), that operand 4 be dead once the three insns are done,
+;; and that the memory address be a REG, SUBREG or PLUS.  The three insns are
+;; replaced by a single zero_extend of the memory into operand 3.
+(define_peephole2
+  [(set (match_operand:DI 0 "int_reg_operand")
+       (zero_extend:DI
+        (match_operand:QHSI 1 "memory_operand")))
+   (set (match_operand:DI 2 "base_reg_operand")
+       (const_int 0))
+   (set (match_operand:TI 3 "vsx_register_operand")
+       (match_operand:TI 4 "int_reg_operand"))]
+  "TARGET_POWER10 && TARGET_POWERPC64
+   && (reg_or_subregno (operands[0])
+       == reg_or_subregno (operands[4]) + !!WORDS_BIG_ENDIAN)
+   && (reg_or_subregno (operands[2])
+       == reg_or_subregno (operands[4]) + !WORDS_BIG_ENDIAN)
+   && peep2_reg_dead_p (3, operands[4])
+   && (REG_P (XEXP (operands[1], 0))
+       || SUBREG_P (XEXP (operands[1], 0))
+       || GET_CODE (XEXP (operands[1], 0)) == PLUS)"
+  [(set (match_dup 3)
+       (zero_extend:TI (match_dup 5)))]
+{
+  rtx mem = operands[1];
+  rtx addr = XEXP (mem, 0);
+
+  /* If the address is already an indexed or indirect address, the original
+     memory reference can be used unchanged.  The address itself is a DImode
+     value since TARGET_POWERPC64 is required.  */
+  if (indexed_or_indirect_address (addr, DImode))
+    operands[5] = mem;
+  else
+    {
+      /* Otherwise compute the address into operand 2 and address the memory
+         through that register.  Operand 2 is one of the registers of
+         operand 4, which the condition has checked is dead, so it is free
+         to be reused here.
+
+         The new memory reference must keep the mode of the original one
+         (QImode, HImode or SImode).  Using DImode here would turn the 1, 2
+         or 4 byte zero extending load into an 8 byte load.  */
+      rtx op2 = operands[2];
+      emit_insn (gen_rtx_SET (op2, addr));
+      operands[5] = change_address (mem, <MODE>mode, op2);
+    }
+})
+
 (define_insn "zero_extendsi<mode>2"
   [(set (match_operand:EXTSI 0 "gpc_reg_operand" "=r,r,d,wa,wa,r,wa")
        (zero_extend:EXTSI (match_operand:SI 1 "reg_or_mem_operand" 
"m,r,?Z,?Z,r,wa,wa")))]
diff --git a/gcc/testsuite/gcc.target/powerpc/pr120528.c b/gcc/testsuite/gcc.target/powerpc/pr120528.c
new file mode 100644
index 000000000000..476725eaa4fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr120528.c
@@ -0,0 +1,91 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <stddef.h>
+
+#ifndef TYPE
+#define TYPE unsigned long long
+#endif
+
+void
+mem_to_vsx (TYPE *p, __uint128_t *q)
+{
+  /* Zero extend *p to 128 bits with the result forced into a VSX register
+     by the "wa" constraint.  The address is a plain register, so the
+     expected code is:
+
+     lxvrdx 0,0,3
+     stxv 0,0(4)  */
+
+  __uint128_t x = *p;
+  __asm__ (" # %x0" : "+wa" (x));
+  *q = x;
+}
+
+void
+memx_to_vsx (TYPE *p, size_t n, __uint128_t *q)
+{
+  /* Zero extend p[n] to 128 bits with the result forced into a VSX register
+     by the "wa" constraint.  The address is register + register, so the
+     expected code is (q being the third argument, after p and n):
+
+     sldi 4,4,3
+     lxvrdx 0,3,4
+     stxv 0,0(5)  */
+
+  __uint128_t x = p[n];
+  __asm__ (" # %x0" : "+wa" (x));
+  *q = x;
+}
+
+void
+mem3_to_vsx (TYPE *p, __uint128_t *q)
+{
+  /* Zero extend p[3] to 128 bits with the result forced into a VSX register
+     by the "wa" constraint.  The address is register + constant offset, so
+     the address is expected to be formed with an ADDI before the load.  The
+     register numbers below are only illustrative:
+
+     addi 2,3,24
+     lxvrdx 0,0,2
+     stxv 0,0(4)  */
+
+  __uint128_t x = p[3];
+  __asm__ (" # %x0" : "+wa" (x));
+  *q = x;
+}
+
+void
+mem_to_gpr (TYPE *p, __uint128_t *q)
+{
+  /* Zero extend *p to 128 bits with the result kept in general purpose
+     registers by the "r" constraint, so no LXVRDX is expected.  The
+     register numbers below are only illustrative:
+
+     ld 2,0(3)
+     li 3,0
+     std 2,0(4)
+     std 3,8(4)  */
+
+  __uint128_t x = *p;
+  __asm__ (" # %0" : "+r" (x));
+  *q = x;
+}
+
+void
+memx_to_gpr (TYPE *p, size_t n, __uint128_t *q)
+{
+  /* Zero extend p[n] to 128 bits with the result kept in general purpose
+     registers by the "r" constraint, so no LXVRDX is expected.  The
+     register numbers below are only illustrative (q being the third
+     argument, after p and n):
+
+     sldi 4,4,3
+     ldx 2,3,4
+     li 3,0
+     std 2,0(5)
+     std 3,8(5)  */
+
+  __uint128_t x = p[n];
+  __asm__ (" # %0" : "+r" (x));
+  *q = x;
+}
+
+void
+mem3_to_gpr (TYPE *p, __uint128_t *q)
+{
+  /* Zero extend p[3] to 128 bits with the result kept in general purpose
+     registers by the "r" constraint, so no LXVRDX is expected.  The
+     register numbers below are only illustrative:
+
+     ld 2,24(3)
+     li 3,0
+     std 2,0(4)
+     std 3,8(4)  */
+
+  __uint128_t x = p[3];
+  __asm__ (" # %0" : "+r" (x));
+  *q = x;
+}
+
+/* { dg-final { scan-assembler-times {\maddi\M}   1 } } */
+/* { dg-final { scan-assembler-times {\mli\M}     3 } } */
+/* { dg-final { scan-assembler-times {\mlxvrdx\M} 3 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M}   3 } } */

Reply via email to