From: Juzhe-Zhong <juzhe.zh...@rivai.ai>

This patch optimizes the codegen of RVV VLS auto-vectorization with respect to
alignment.

void __attribute__((noinline, noclone))
f3 (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}

Before this patch:
f3:
        ble     a3,zero,.L1
        srli    a5,a1,2
        negw    a5,a5
        andi    a4,a5,3
        sext.w  a3,a3
        beq     a4,zero,.L3
        lw      a7,0(a1)
        lw      a6,0(a2)
        andi    a5,a5,2
        addw    a6,a6,a7
        sw      a6,0(a0)
        beq     a5,zero,.L3
        lw      a7,4(a1)
        lw      a5,4(a2)
        li      a6,3
        addw    a5,a5,a7
        sw      a5,4(a0)
        bne     a4,a6,.L3
        lw      a6,8(a2)
        lw      a5,8(a1)
        addw    a5,a5,a6
        sw      a5,8(a0)
.L3:
        subw    a3,a3,a4
        slli    a6,a4,2
        slli    a5,a3,32
        srli    a5,a5,32
        add     a1,a1,a6
        add     a2,a2,a6
        add     a0,a0,a6
        li      a3,4
.L6:
        mv      a4,a5
        bleu    a5,a3,.L5
        li      a4,4
.L5:
        vsetvli zero,a4,e32,m1,ta,ma
        vle32.v v1,0(a1)
        vle32.v v2,0(a2)
        vsetivli        zero,4,e32,m1,ta,ma
        sub     a5,a5,a4
        vadd.vv v1,v1,v2
        vsetvli zero,a4,e32,m1,ta,ma
        vse32.v v1,0(a0)
        addi    a1,a1,16
        addi    a2,a2,16
        addi    a0,a0,16
        bne     a5,zero,.L6
.L1:
        ret

After this patch:
f3:
        ble     a3,zero,.L1
        li      a4,4
.L4:
        mv      a5,a3
        bleu    a3,a4,.L3
        li      a5,4
.L3:
        vsetvli zero,a5,e32,m1,ta,ma
        vle32.v v2,0(a1)
        vle32.v v1,0(a2)
        vsetivli        zero,4,e32,m1,ta,ma
        sub     a3,a3,a5
        vadd.vv v1,v1,v2
        vsetvli zero,a5,e32,m1,ta,ma
        vse32.v v1,0(a0)
        addi    a2,a2,16
        addi    a0,a0,16
        addi    a1,a1,16
        bne     a3,zero,.L4
.L1:
        ret

The TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE implementation comes directly from the ARM SVE port.

The TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST implementation is the same as in
the GCN port, which vectorizes
all cases by default. We will need to support an accurate vector cost model in
the future.

gcc/ChangeLog:

        * config/riscv/riscv.cc (riscv_simd_vector_alignment_reachable): New 
function.
        (riscv_vectorization_cost): New function.
        (TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE): New target hook.
        (TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST): New target hook.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/align-2.c: New test.

---
 gcc/config/riscv/riscv.cc                     | 39 +++++++++++++++++++
 .../gcc.target/riscv/rvv/autovec/align-2.c    | 12 ++++++
 2 files changed, 51 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index a5776a550b2..54306327cb3 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7517,6 +7517,39 @@ riscv_vectorize_preferred_vector_alignment (const_tree 
type)
   return TYPE_ALIGN (type);
 }
 
+/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
+
+static bool
+riscv_simd_vector_alignment_reachable (const_tree type, bool is_packed)
+{
+  if (is_packed)
+    return false;
+
+  /* For fixed-length vectors, check that the vectorizer will aim for
+     full-vector alignment.  This isn't true for generic GCC vectors
+     that are wider than the ABI maximum of 128 bits.  */
+  poly_uint64 preferred_alignment
+    = riscv_vectorize_preferred_vector_alignment (type);
+  if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
+      && maybe_ne (wi::to_widest (TYPE_SIZE (type)), preferred_alignment))
+    return false;
+
+  /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
+  return true;
+}
+
+/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST.  */
+
+int
+riscv_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
+                         tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
+{
+  /* TODO: Always vectorize. The vectorization COST model is not accurate,
+     we will need to support accurate vectorization COST model according
+     to '-mtune' in the future.  */
+  return 1;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -7792,6 +7825,12 @@ riscv_vectorize_preferred_vector_alignment (const_tree 
type)
 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
   riscv_vectorize_preferred_vector_alignment
+#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
+#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
+  riscv_simd_vector_alignment_reachable
+#undef  TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
+  riscv_vectorization_cost
 
 struct gcc_target targetm = TARGET_INITIALIZER;
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
new file mode 100644
index 00000000000..812584e9d25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param 
riscv-autovec-preference=fixed-vlmax" } */
+
+void __attribute__((noinline, noclone))
+f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
+{
+  for (int i = 0; i < count; ++i)
+    dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "sw" } } */
-- 
2.36.1

Reply via email to