Correct the offset adjustment made in the multi-word unaligned access 
helpers such that it is actually used by the unaligned load and store 
instructions, fixing a bug introduced with commit 1eb356b98df2 ("alpha 
gprel optimizations")[1] back in 2001.  That commit replaced address 
adjustments made directly according to the argument of the MEM 
expression passed with ones made according to an address previously 
extracted from said MEM expression.  However the address is extracted 
from said MEM before it has been adjusted for the offset supplied, so 
the extracted address is stale whenever the offset is nonzero.
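
In outline, the load helper used to do (the store helper is 
analogous; a simplified extract of the pre-existing code, see the 
hunks below for the real thing):

        smema = XEXP (smem, 0);   /* Address extracted here already...  */
        ...
        if (ofs != 0)
          smem = adjust_address (smem, GET_MODE (smem), ofs);
        /* ... so SMEMA never sees the adjustment made for OFS.  */

and the fix just moves the address extraction until after the 
`adjust_address' call, so that the aligned-address arithmetic emitted 
for the LDQ_U/STQ_U sequences is based on the adjusted address.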

This bug is usually covered by the fact that our block move and clear 
operations are hardly ever provided with correct block alignment data 
and we also usually fail to fetch that information from the MEM supplied 
(although PR target/115459 shows it does happen sometimes).  Instead 
the bit alignment of 8 is usually conservatively used, meaning that a 
zero offset is passed to `alpha_expand_unaligned_store_words', and 
then code has been written such that neither 
`alpha_expand_unaligned_load_words' nor 
`alpha_expand_unaligned_store_words' can ever be called with nonzero 
offset from `alpha_expand_block_move'.

The only situation where `alpha_expand_unaligned_store_words' can be 
called with nonzero offset is from `alpha_expand_block_clear' with a BWX 
target for a misaligned block that has been embedded in a data object of 
a higher alignment such that there is a small unaligned prefix our code 
decides to handle so as to align further stores.

For instance it happens when a block clear is called for a block of 9 
bytes embedded at offset 1 in a structure aligned to a 2-byte word, as 
illustrated by the test case included and by the layout sketch below.  
Note that this test case does not work without the change that comes 
next applied, because the backend cannot see the word alignment of 
the struct and uses the bit alignment of 8 instead.
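
For reference, the kind of layout at issue looks like this in C (a 
hypothetical reduction; the test case included uses a union to obtain 
the same placement):

        struct s
          {
            unsigned char c;    /* Offset 0: leading byte.  */
            unsigned char v[9]; /* Offsets 1-9: the block cleared.  */
          } __attribute__ ((aligned (2)));

where clearing `v' on a BWX target is expected to expand to an STB 
for the unaligned byte at the offset of 1 followed by a pair of 
unaligned quadword stores covering the remaining 8 bytes at the 
offsets of 2-9.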

Should this change be swapped with the next one, incorrect code such as:

        stb $31,1($16)
        lda $3,1($16)
        ldq_u $2,8($16)
        ldq_u $1,1($16)
        mskqh $2,$3,$2
        stq_u $2,8($16)
        mskql $1,$3,$1
        stq_u $1,1($16)

would be produced, where the unadjusted offsets of 1/8 can be seen 
with the LDQ_U/STQ_U operations, along with byte masks calculated 
accordingly, rather than the expected offsets of 2/9.  As a result 
the byte at the offset of 9 fails to get cleared; see the 
byte-coverage sketch below.  In these circumstances this would also 
show as execution failures with the memclr.c test:

FAIL: gcc.c-torture/execute/memclr.c   -O1  execution test
FAIL: gcc.c-torture/execute/memclr.c   -Os  execution test

-- not at `-O0' though, as the higher alignment cannot be retrieved in 
that case, and then not at `-O2' or higher optimization levels either, 
because then we choose to open-code this block clear instead:

        ldbu $1,0($16)
        stw $31,8($16)
        stq $1,0($16)

avoiding the bug in `alpha_expand_unaligned_store_words'.
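
To make the failure concrete, here is a byte-coverage sketch of this 
9-byte clear, assuming for illustration that $16 happens to be 8-byte 
aligned at run time (the byte masks adapt to the actual alignment of 
$16, but not to the missing offset adjustment):

        offset:        0 1 2 3 4 5 6 7 8 9
        STB:             x
        expected pair:     x x x x x x x x
        buggy pair:      x x x x x x x x

Either way the STQ_U pair stores 8 bytes of zeroes, but with the 
unadjusted offsets of 1/8 it starts at the offset of 1, already 
cleared by the STB, and stops short of the offset of 9, which is 
consequently left intact.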

I am leaving the pattern-match test case XFAIL-ed here for 
documentation purposes, and it will be un-XFAIL-ed along with the fix 
to retrieve the correct alignment.  The run test is of course never 
expected to fail.

References:

[1] <https://inbox.sourceware.org/gcc-patches/20010909020708.a29...@twiddle.net/>

        gcc/
        * config/alpha/alpha.cc (alpha_expand_unaligned_load_words):
        Move address extraction until after the MEM referred to has
        been adjusted for the offset supplied.
        (alpha_expand_unaligned_store_words): Likewise.

        gcc/testsuite/
        * gcc.target/alpha/memclr-a2-o1-c9-ptr.c: New file.
        * gcc.target/alpha/memclr-a2-o1-c9-run.c: New file.
---
 gcc/config/alpha/alpha.cc                            |   16 +++---
 gcc/testsuite/gcc.target/alpha/memclr-a2-o1-c9-ptr.c |   50 +++++++++++++++++++
 gcc/testsuite/gcc.target/alpha/memclr-a2-o1-c9-run.c |   25 +++++++++
 3 files changed, 83 insertions(+), 8 deletions(-)

gcc-alpha-unaligned-words-adjust-address.diff
Index: gcc/gcc/config/alpha/alpha.cc
===================================================================
--- gcc.orig/gcc/config/alpha/alpha.cc
+++ gcc/gcc/config/alpha/alpha.cc
@@ -3625,10 +3625,6 @@ alpha_expand_unaligned_load_words (rtx *
   rtx sreg, areg, tmp, smema;
   HOST_WIDE_INT i;
 
-  smema = XEXP (smem, 0);
-  if (GET_CODE (smema) == LO_SUM)
-    smema = force_reg (Pmode, smema);
-
   /* Generate all the tmp registers we need.  */
   for (i = 0; i < words; ++i)
     {
@@ -3640,6 +3636,10 @@ alpha_expand_unaligned_load_words (rtx *
   if (ofs != 0)
     smem = adjust_address (smem, GET_MODE (smem), ofs);
 
+  smema = XEXP (smem, 0);
+  if (GET_CODE (smema) == LO_SUM)
+    smema = force_reg (Pmode, smema);
+
   /* Load up all of the source data.  */
   for (i = 0; i < words; ++i)
     {
@@ -3698,10 +3698,6 @@ alpha_expand_unaligned_store_words (rtx
   rtx st_addr_1, st_addr_2, dmema;
   HOST_WIDE_INT i;
 
-  dmema = XEXP (dmem, 0);
-  if (GET_CODE (dmema) == LO_SUM)
-    dmema = force_reg (Pmode, dmema);
-
   /* Generate all the tmp registers we need.  */
   if (data_regs != NULL)
     for (i = 0; i < words; ++i)
@@ -3712,6 +3708,10 @@ alpha_expand_unaligned_store_words (rtx
   if (ofs != 0)
     dmem = adjust_address (dmem, GET_MODE (dmem), ofs);
 
+  dmema = XEXP (dmem, 0);
+  if (GET_CODE (dmema) == LO_SUM)
+    dmema = force_reg (Pmode, dmema);
+
   st_addr_2 = change_address (dmem, DImode,
                              gen_rtx_AND (DImode,
                                           plus_constant (DImode, dmema,
Index: gcc/gcc/testsuite/gcc.target/alpha/memclr-a2-o1-c9-ptr.c
===================================================================
--- /dev/null
+++ gcc/gcc/testsuite/gcc.target/alpha/memclr-a2-o1-c9-ptr.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-mbwx" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+typedef unsigned int __attribute__ ((mode (QI))) int08_t;
+typedef unsigned int __attribute__ ((mode (HI))) int16_t;
+
+typedef struct
+  {
+    int08_t v[9];
+  }
+s_t;
+
+typedef union
+  {
+    struct
+      {
+       int08_t c[1];
+       s_t s;
+       int08_t d[6];
+      };
+    int16_t a;
+  }
+u_t;
+
+void __attribute__ ((noinline))
+memclr_a2_o1_c9 (u_t *u)
+{
+  u->s = (s_t) { 0 };
+}
+
+/* Expect assembly such as:
+
+       ldq_u $2,9($16)
+       stb $31,1($16)
+       lda $3,2($16)
+       ldq_u $1,2($16)
+       mskqh $2,$3,$2
+       stq_u $2,9($16)
+       mskql $1,$3,$1
+       stq_u $1,2($16)
+
+   that is with a byte store at offset 1 and with two unaligned load/store
+   pairs at offsets 2 and 9 each.  */
+
+/* { dg-final { scan-assembler-times "\\sldq_u\\s\\\$\[0-9\]+,2\\\(\\\$16\\\)\\s" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "\\sldq_u\\s\\\$\[0-9\]+,9\\\(\\\$16\\\)\\s" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "\\sstb\\s\\\$31,1\\\(\\\$16\\\)\\s" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "\\sstq_u\\s\\\$\[0-9\]+,2\\\(\\\$16\\\)\\s" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "\\sstq_u\\s\\\$\[0-9\]+,9\\\(\\\$16\\\)\\s" 1 { xfail *-*-* } } } */
Index: gcc/gcc/testsuite/gcc.target/alpha/memclr-a2-o1-c9-run.c
===================================================================
--- /dev/null
+++ gcc/gcc/testsuite/gcc.target/alpha/memclr-a2-o1-c9-run.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "" } */
+
+#include "memclr-a2-o1-c9-ptr.c"
+
+u_t u = { { { [0] = 0xaa }, {{ [0 ... 8] = 0xaa }}, { [0 ... 5] = 0xaa } } };
+
+int
+main (void)
+{
+  int i;
+
+  memclr_a2_o1_c9 (&u);
+  asm ("" : : : "memory");
+  for (i = 0; i < sizeof (u.c); i++)
+    if (u.c[i] != 0xaa)
+      __builtin_abort ();
+  for (i = 0; i < sizeof (u.s.v); i++)
+    if (u.s.v[i] != 0x00)
+      __builtin_abort ();
+  for (i = 0; i < sizeof (u.d); i++)
+    if (u.d[i] != 0xaa)
+      __builtin_abort ();
+  return 0;
+}
