https://gcc.gnu.org/g:ed8cd42d138fa048e0c0eff1ea28b39f5abe1c29

commit r15-6836-ged8cd42d138fa048e0c0eff1ea28b39f5abe1c29
Author: Maciej W. Rozycki <ma...@orcam.me.uk>
Date:   Sun Jan 12 16:48:54 2025 +0000

    Alpha: Fix a block move pessimisation with zero-extension after LDWU
    
    For the BWX case we have a pessimisation in `alpha_expand_block_move'
    for HImode loads where we place the data loaded into a HImode register
    as well, therefore losing information that indeed the data loaded has
    already been zero-extended to the full DImode width of the register.
    Later on when we store this data in QImode quantities into an unaligned
    destination, we zero-extend it again for the purpose of right-shifting,
    such as with the test case included producing code at `-O2' as follows:
    
            ldah $2,unaligned_src_hi($29)           !gprelhigh
            lda $1,unaligned_src_hi($2)             !gprellow
            ldwu $6,unaligned_src_hi($2)            !gprellow
            ldwu $5,2($1)
            ldwu $4,4($1)
            bis $31,$31,$31
            zapnot $6,3,$3                          # Redundant!
            ldbu $7,6($1)
            zapnot $5,3,$2                          # Redundant!
            stb $6,0($16)
            zapnot $4,3,$1                          # Redundant!
            stb $5,2($16)
            srl $3,8,$3
            stb $4,4($16)
            srl $2,8,$2
            stb $3,1($16)
            srl $1,8,$1
            stb $2,3($16)
            stb $1,5($16)
            stb $7,6($16)
    
    The non-BWX case is unaffected, because there we use byte insertion, so
    we don't care that data is held in a HImode register.
    
    Address this by making the holding RTX a HImode subreg of the original
    DImode register, which the RTL passes can then see through and eliminate
    the zero-extension where otherwise required, resulting in this shortened
    code:
    
            ldah $2,unaligned_src_hi($29)           !gprelhigh
            lda $1,unaligned_src_hi($2)             !gprellow
            ldwu $4,unaligned_src_hi($2)            !gprellow
            ldwu $3,2($1)
            ldwu $2,4($1)
            bis $31,$31,$31
            srl $4,8,$6
            ldbu $1,6($1)
            srl $3,8,$5
            stb $4,0($16)
            stb $6,1($16)
            srl $2,8,$4
            stb $3,2($16)
            stb $5,3($16)
            stb $2,4($16)
            stb $4,5($16)
            stb $1,6($16)
    
    While at it reformat the enclosing do-while statement according to the
    GNU Coding Standards, observing that in this case it does not obfuscate
    the change owing to the odd original indentation.
    
            gcc/
            * config/alpha/alpha.cc (alpha_expand_block_move): Use a HImode
            subreg of a DImode register to hold data from an aligned HImode
            load.

Diff:
---
 gcc/config/alpha/alpha.cc                               | 17 +++++++++++------
 .../gcc.target/alpha/memcpy-hi-unaligned-dst.c          | 16 ++++++++++++++++
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/gcc/config/alpha/alpha.cc b/gcc/config/alpha/alpha.cc
index 8ec9e8c5d399..6965ece16d0b 100644
--- a/gcc/config/alpha/alpha.cc
+++ b/gcc/config/alpha/alpha.cc
@@ -3999,14 +3999,19 @@ alpha_expand_block_move (rtx operands[])
   if (bytes >= 2)
     {
       if (src_align >= 16)
-       {
-         do {
-           data_regs[nregs++] = tmp = gen_reg_rtx (HImode);
-           emit_move_insn (tmp, adjust_address (orig_src, HImode, ofs));
+       do
+         {
+           tmp = gen_reg_rtx (DImode);
+           emit_move_insn (tmp,
+                           expand_simple_unop (DImode, SET,
+                                               adjust_address (orig_src,
+                                                               HImode, ofs),
+                                               NULL_RTX, 1));
+           data_regs[nregs++] = gen_rtx_SUBREG (HImode, tmp, 0);
            bytes -= 2;
            ofs += 2;
-         } while (bytes >= 2);
-       }
+         }
+       while (bytes >= 2);
       else if (! TARGET_BWX)
        {
          data_regs[nregs++] = tmp = gen_reg_rtx (HImode);
diff --git a/gcc/testsuite/gcc.target/alpha/memcpy-hi-unaligned-dst.c 
b/gcc/testsuite/gcc.target/alpha/memcpy-hi-unaligned-dst.c
new file mode 100644
index 000000000000..4e3c02f5b906
--- /dev/null
+++ b/gcc/testsuite/gcc.target/alpha/memcpy-hi-unaligned-dst.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mbwx" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+unsigned short unaligned_src_hi[4];
+
+void
+memcpy_unaligned_dst_hi (void *dst)
+{
+  __builtin_memcpy (dst, unaligned_src_hi, 7);
+}
+
+/* { dg-final { scan-assembler-times "\\sldwu\\s" 3 } } */
+/* { dg-final { scan-assembler-times "\\sldbu\\s" 1 } } */
+/* { dg-final { scan-assembler-times "\\sstb\\s" 7 } } */
+/* { dg-final { scan-assembler-not "\\szapnot\\s" } } */

Reply via email to