From: Pan Li <pan2...@intel.com>

The vector strided load pattern doesn't include the (mem:BLK (scratch))
that makes it alias all other memory.  As a result, alias analysis only
considers the base address of the strided load, and only the store to
that base address is kept before the strided load.  For example, see below:

  #define STEP 10

  char d[225];
  int e[STEP];

  int main() {
    // store 0, 10, 20, 30, 40, 50, 60, 70, 80, 90
    for (long h = 0; h < STEP; ++h)
      d[h * STEP] = 9;

    // load 30, 40, 50, 60, 70, 80, 90
    // store 3,  4,  5,  6,  7,  8,  9
    for (int h = 3; h < STEP; h += 1)
      e[h] = d[h * STEP];

    if (e[5] != 9) {
      __builtin_abort ();
    }

    return 0;
  }

The asm dump will be:
main:
        lui     a5,%hi(.LANCHOR0)
        addi    a5,a5,%lo(.LANCHOR0)
        li      a4,9
        sb      a4,30(a5)
        addi    a3,a5,30
        vsetivli        zero,7,e32,m1,ta,ma
        li      a2,10
        vlse8.v v2,0(a3),a2 // depends on 30(a5), 40(a5), ... 90(a5) but
                            // only the store to 30(a5) is placed before
                            // the vlse.  The other stores wrongly come
                            // after the load.
        addi    a3,a5,252
        sb      a4,0(a5)
        sb      a4,10(a5)
        sb      a4,20(a5)
        sb      a4,40(a5)
        vzext.vf4       v1,v2
        sb      a4,50(a5)
        sb      a4,60(a5)
        vse32.v v1,0(a3)
        li      a0,0
        sb      a4,70(a5)
        sb      a4,80(a5)
        sb      a4,90(a5)
        lw      a5,260(a5)
        beq     a5,a4,.L4
        li      a0,123

After this patch:
main:
        vsetivli        zero,4,e32,m1,ta,ma
        vmv.v.i v1,9
        lui     a5,%hi(.LANCHOR0)
        addi    a5,a5,%lo(.LANCHOR0)
        addi    a4,a5,244
        vse32.v v1,0(a4)
        li      a4,9
        sb      a4,0(a5)
        sb      a4,10(a5)
        sb      a4,20(a5)
        sb      a4,30(a5)
        sb      a4,40(a5)
        sb      a4,50(a5)
        sb      a4,60(a5)
        sb      a4,70(a5)
        sb      a4,80(a5)
        sb      a4,90(a5)
        vsetivli        zero,3,e32,m1,ta,ma
        addi    a4,a5,70
        li      a3,10
        vlse8.v v2,0(a4),a3
        addi    a5,a5,260
        li      a0,0
        vzext.vf4       v1,v2
        vse32.v v1,0(a5)
        ret

The following test suite passed with this patch:
* The rv64gcv full regression test.

        PR target/117990

gcc/ChangeLog:

        * config/riscv/vector.md: Add the (mem:BLK (scratch)) to the
        vector strided load.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/base/pr117990-run-1.c: New test.

Signed-off-by: Pan Li <pan2...@intel.com>
---
 gcc/config/riscv/vector.md                    |  1 +
 .../riscv/rvv/base/pr117990-run-1.c           | 24 +++++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr117990-run-1.c

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index a3b46ba751e..58406f3d17c 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -2381,6 +2381,7 @@ (define_insn "@pred_strided_load<mode>"
             (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
          (unspec:V_VLS
            [(match_operand:V_VLS 3 "memory_operand"         "     m,     m,    
 m,    m,     m,     m")
+            (mem:BLK (scratch))
             (match_operand 4 "<V_VLS:stride_predicate>"     
"<V_VLS:stride_load_constraint>")] UNSPEC_STRIDED)
          (match_operand:V_VLS 2 "vector_merge_operand"      "     0,    vu,    
vu,    0,    vu,    vu")))]
   "TARGET_VECTOR"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr117990-run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr117990-run-1.c
new file mode 100644
index 00000000000..414bebdadbe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr117990-run-1.c
@@ -0,0 +1,24 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99" } */
+
+#define STEP 10
+
+char d[225];
+int e[STEP];
+
+int main() {
+  // store 0, 10, 20, 30, 40, 50, 60, 70, 80, 90
+  for (long h = 0; h < STEP; ++h)
+    d[h * STEP] = 9;
+
+  // load 30, 40, 50, 60, 70, 80, 90
+  // store 3,  4,  5,  6,  7,  8,  9
+  for (int h = 3; h < STEP; h += 1)
+    e[h] = d[h * STEP];
+
+  if (e[5] != 9) {
+    __builtin_abort ();
+  }
+
+  return 0;
+}
-- 
2.43.0

Reply via email to