In determination of base block alignment we only examine a COMPONENT_REF 
tree node at hand without ever checking if its ultimate alignment has 
been reduced by the combined offset going back to the outermost object.  
Consequently cases have been observed where quadword accesses have been 
produced for a memory location referring a nested struct member only 
aligned to the longword boundary, causing emulation to trigger.

Address this issue by recursing into COMPONENT_REF tree nodes until the 
outermost one has been reached, which is supposed to be a MEM_REF one, 
accumulating the offset as we go, fixing a commit e0dae4da4c45 ("Alpha: 
Also use tree information to get base block alignment") regression.

Bail out and refrain from using tree information for alignment if we end 
up at something different or we are unable to calculate the offset at 
any point.

        gcc/
        * config/alpha/alpha.cc 
        (alpha_get_mem_rtx_alignment_and_offset): Recurse into 
        COMPONENT_REF nodes.

        gcc/testsuite/
        * gcc.target/alpha/memcpy-nested-offset-long.c: New file.
        * gcc.target/alpha/memcpy-nested-offset-quad.c: New file.
---
Hi,

 This was observed as non-zero kernel-mode unaligned emulation count with 
Linux built by GCC with said commit applied in the course of implementing 
unaligned LDx_L/STx_C emulation[1] required to complement our `-msafe-bwa' 
and `-msafe-partial' support[2].  Previously the count tended to stay at 
zero, as expected with good code.

 I find it rather odd that no other backend has such code already, or at 
least I couldn't find it, but perhaps they just don't strive so hard to 
use the widest load/store operations available.  After all there's nothing 
wrong semantics-wise from using narrower operations, they're just less 
efficient.  And non-strict-alignment targets of course don't care and can 
always go for the widest access feasible.

 Anyway this change has removed the kernel regression for me and is in the 
course of verification with my EV45 system, which should complete sometime 
tomorrow.  OK to apply if no regressions?

 NB I mean to sort out the outstanding matters with the original series[2] 
in the next few days, as from this coming weekend I'm going to be enjoying 
my holiday lasting a fortnight during which time it may not be feasible if 
indeed desirable for me to touch any of this stuff.  I do hope for it to 
land in GCC 15 regardless.

References:

[1] "Alpha: Emulate unaligned LDx_L/STx_C for data consistency", 
    
<https://lore.kernel.org/r/alpine.deb.2.21.2502181912230.65...@angie.orcam.me.uk/>

[2] "Fix data races with sub-longword accesses on Alpha",
    
<https://inbox.sourceware.org/gcc-patches/alpine.deb.2.21.2501050246590.49...@angie.orcam.me.uk/>

  Maciej
---
 gcc/config/alpha/alpha.cc                                  |   23 +--
 gcc/testsuite/gcc.target/alpha/memcpy-nested-offset-long.c |   76 +++++++++++++
 gcc/testsuite/gcc.target/alpha/memcpy-nested-offset-quad.c |   64 ++++++++++
 3 files changed, 150 insertions(+), 13 deletions(-)

gcc-alpha-mem-object-alignment-recursive.diff
Index: gcc/gcc/config/alpha/alpha.cc
===================================================================
--- gcc.orig/gcc/config/alpha/alpha.cc
+++ gcc/gcc/config/alpha/alpha.cc
@@ -3806,14 +3806,10 @@ alpha_get_mem_rtx_alignment_and_offset (
 
   tree mem = MEM_EXPR (expr);
   if (mem != NULL_TREE)
-    switch (TREE_CODE (mem))
-      {
-      case MEM_REF:
-       tree_offset = mem_ref_offset (mem).force_shwi ();
-       tree_align = get_object_alignment (get_base_address (mem));
-       break;
+    {
+      HOST_WIDE_INT comp_offset = 0;
 
-      case COMPONENT_REF:
+      for (; TREE_CODE (mem) == COMPONENT_REF; mem = TREE_OPERAND (mem, 0))
        {
          tree byte_offset = component_ref_field_offset (mem);
          tree bit_offset = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (mem, 1));
@@ -3822,14 +3818,15 @@ alpha_get_mem_rtx_alignment_and_offset (
              || !poly_int_tree_p (byte_offset, &offset)
              || !tree_fits_shwi_p (bit_offset))
            break;
-         tree_offset = offset + tree_to_shwi (bit_offset) / BITS_PER_UNIT;
+         comp_offset += offset + tree_to_shwi (bit_offset) / BITS_PER_UNIT;
        }
-       tree_align = get_object_alignment (get_base_address (mem));
-       break;
 
-      default:
-       break;
-      }
+      if (TREE_CODE (mem) == MEM_REF)
+       {
+         tree_offset = comp_offset + mem_ref_offset (mem).force_shwi ();
+         tree_align = get_object_alignment (get_base_address (mem));
+       }
+    }
 
   if (reg_align > mem_align)
     {
Index: gcc/gcc/testsuite/gcc.target/alpha/memcpy-nested-offset-long.c
===================================================================
--- /dev/null
+++ gcc/gcc/testsuite/gcc.target/alpha/memcpy-nested-offset-long.c
@@ -0,0 +1,76 @@
+/* { dg-do compile } */
+/* { dg-options "" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+typedef unsigned int __attribute__ ((mode (DI))) int64_t;
+typedef unsigned int __attribute__ ((mode (SI))) int32_t;
+
+typedef union
+  {
+    int32_t l[8];
+  }
+val;
+
+typedef struct
+  {
+    int32_t l[2];
+    val v;
+  }
+tre;
+
+typedef struct
+  {
+    int32_t l[3];
+    tre t;
+  }
+due;
+
+typedef struct
+  {
+    val v;
+    int64_t q;
+    int32_t l[2];
+    due d;
+  }
+uno;
+
+void
+memcpy_nested_offset_long (uno *u)
+{
+  u->d.t.v = u->v;
+}
+
+/* Expect assembly such as:
+
+       ldq $4,0($16)
+       ldq $3,8($16)
+       ldq $2,16($16)
+       srl $4,32,$7
+       ldq $1,24($16)
+       srl $3,32,$6
+       stl $4,68($16)
+       srl $2,32,$5
+       stl $7,72($16)
+       srl $1,32,$4
+       stl $3,76($16)
+       stl $6,80($16)
+       stl $2,84($16)
+       stl $5,88($16)
+       stl $1,92($16)
+       stl $4,96($16)
+
+   that is with four quadword loads at offsets 0, 8, 16, 24 each and
+   eight longword stores at offsets 68, 72, 76, 80, 84, 88, 92, 96 each.  */
+
+/* { dg-final { scan-assembler-times 
"\\sldq\\s\\\$\[0-9\]+,0\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sldq\\s\\\$\[0-9\]+,8\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sldq\\s\\\$\[0-9\]+,16\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sldq\\s\\\$\[0-9\]+,24\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstl\\s\\\$\[0-9\]+,68\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstl\\s\\\$\[0-9\]+,72\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstl\\s\\\$\[0-9\]+,76\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstl\\s\\\$\[0-9\]+,80\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstl\\s\\\$\[0-9\]+,84\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstl\\s\\\$\[0-9\]+,88\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstl\\s\\\$\[0-9\]+,92\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstl\\s\\\$\[0-9\]+,96\\\(\\\$16\\\)\\s" 1 } } */
Index: gcc/gcc/testsuite/gcc.target/alpha/memcpy-nested-offset-quad.c
===================================================================
--- /dev/null
+++ gcc/gcc/testsuite/gcc.target/alpha/memcpy-nested-offset-quad.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+typedef unsigned int __attribute__ ((mode (DI))) int64_t;
+typedef unsigned int __attribute__ ((mode (SI))) int32_t;
+
+typedef union
+  {
+    int32_t l[8];
+  }
+val;
+
+typedef struct
+  {
+    int32_t l[2];
+    val v;
+  }
+tre;
+
+typedef struct
+  {
+    int32_t l[3];
+    tre t;
+  }
+due;
+
+typedef struct
+  {
+    val v;
+    int64_t q;
+    int32_t l[3];
+    due d;
+  }
+uno;
+
+void
+memcpy_nested_offset_quad (uno *u)
+{
+  u->d.t.v = u->v;
+}
+
+/* Expect assembly such as:
+
+       ldq $4,0($16)
+       ldq $3,8($16)
+       ldq $2,16($16)
+       ldq $1,24($16)
+       stq $4,72($16)
+       stq $3,80($16)
+       stq $2,88($16)
+       stq $1,96($16)
+
+   that is with four quadword loads at offsets 0, 8, 16, 24 each
+   and four quadword stores at offsets 72, 80, 88, 96 each.  */
+
+/* { dg-final { scan-assembler-times 
"\\sldq\\s\\\$\[0-9\]+,0\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sldq\\s\\\$\[0-9\]+,8\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sldq\\s\\\$\[0-9\]+,16\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sldq\\s\\\$\[0-9\]+,24\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstq\\s\\\$\[0-9\]+,72\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstq\\s\\\$\[0-9\]+,80\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstq\\s\\\$\[0-9\]+,88\\\(\\\$16\\\)\\s" 1 } } */
+/* { dg-final { scan-assembler-times 
"\\sstq\\s\\\$\[0-9\]+,96\\\(\\\$16\\\)\\s" 1 } } */

Reply via email to