From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>

Currently we use the pred_mov pattern to do the codegen for the vse intrinsics.
However, it generates inferior codegen, which I noticed while testing the AVL
model of the VSETVL pass with vse intrinsics.

Consider the following code:
void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
{
  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
  vbool64_t mask = *(vbool64_t*)mask_in;
  for (int i = 0; i < n; i++)
    {
      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);

      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);

      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);

      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);

      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);

      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);

      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
    }
}

Before this patch:
        csrr    t2,vlenb
        srli    t2,t2,1
        slli    s0,t2,2
        vsetvli zero,19,e16,mf2,ta,ma
        sub     s0,s0,t2
        csrr    t2,vlenb
        vle16.v v24,0(a3)
        mv      a4,a3
        vse16.v v24,0(a1)
        srli    t2,t2,1
        add     a2,a3,t6
        add     s0,s0,sp
        vsetvli zero,19,e32,mf2,ta,ma
        addi    a3,a3,4
        vle32.v v24,0(a3)
        vsetvli zero,t0,e32,mf2,ta,ma
        vse32.v v24,0(s0)
        slli    s0,t2,2
        sub     s0,s0,t2
        add     s0,s0,sp
        vsetvli t0,zero,e32,mf2,ta,ma
        vle32.v v24,0(s0)
        mv      s0,t2
        slli    t2,t2,2
        mv      a5,a1
        vsetvli zero,19,e32,mf2,ta,ma
        addi    a1,a1,4
        sub     t2,t2,s0
        vse32.v v24,0(a1)
        add     t2,t2,sp
        vsetvli t0,zero,e32,mf2,ta,ma
        addi    t1,a5,796
        vle32.v v24,0(t2)
        addi    t5,a4,1196
        addi    a7,a5,1196
        addi    t4,a4,1996
        addi    a6,a5,1996
        vsetvli zero,13,e32,mf2,ta,ma
        add     a4,a4,t3
        vse32.v v24,0(t1)
        add     a5,a5,t3
        vsetvli zero,11,e64,m1,tu,mu
        vle64.v v24,0(t5),v0.t
        vse64.v v24,0(a7)
        vle64.v v24,0(t4),v0.t
        vse64.v v24,0(a6)
        vle64.v v24,0(a4),v0.t
        vse64.v v24,0(a5),v0.t
        vsetvli zero,11,e8,mf4,ta,ma
        vle8.v  v24,0(a2)
        vse8.v  v24,0(a2)
        bne     a0,a3,.L8
        csrr    t0,vlenb
        slli    t1,t0,1
        add     sp,sp,t1
        lw      s0,12(sp)
        addi    sp,sp,16
        jr      ra

We generate redundant spill code: to reuse pred_mov for stores we have to feed
the memory operand back in as an input (to model the "+m" constraint), which the
register allocator handles poorly.  Here we introduce a dedicated pred_store
pattern for the vse intrinsics, like maskstore in ARM SVE.
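
To make the semantics concrete, here is a minimal scalar sketch of what a
masked unit-stride vse store does (illustrative only; the helper name and the
bit-per-element mask layout are my assumptions, not GCC code).  Active elements
overwrite memory and inactive elements keep the old memory contents, which is
what the (if_then_else ... (match_dup 0)) form of the new pattern expresses
without passing the memory back in as a separate input operand:

#include <stdint.h>
#include <stddef.h>

/* Hypothetical scalar model of a masked, unit-stride vse store:
   MASK is the mask register image (bit i controls element i),
   V is the vector register and VL the active vector length.  */
static inline void
model_masked_vse (const uint8_t *mask, double *mem, const double *v,
		  size_t vl)
{
  for (size_t i = 0; i < vl; i++)
    if (mask[i / 8] & (1u << (i % 8)))
      mem[i] = v[i];	/* Active element: store the new value.  */
    /* Inactive element: mem[i] keeps its previous value
       (the match_dup 0 arm of the pattern).  */
}

The unmasked vse intrinsics go through the same pattern with an all-ones mask
operand, which use_contiguous_store_insn supplies via add_all_one_mask_operand.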

After this patch:
        vsetvli zero,19,e16,mf2,ta,ma
        mv      a5,a4
        vle16.v v24,0(a0)
        mv      a3,a0
        vse16.v v24,0(a4)
        addi    t1,a4,796
        vsetvli zero,19,e32,mf2,ta,ma
        addi    a0,a0,4
        addi    a4,a4,4
        vle32.v v24,0(a0)
        addi    t0,a3,1196
        vse32.v v24,0(a4)
        addi    a7,a5,1196
        addi    t6,a3,1996
        addi    a6,a5,1996
        add     t5,a3,t4
        vsetvli zero,13,e32,mf2,ta,ma
        add     a2,a5,t4
        vse32.v v24,0(t1)
        add     a3,a3,t3
        vsetvli zero,11,e64,m1,tu,mu
        add     a5,a5,t3
        vle64.v v24,0(t0),v0.t
        vse64.v v24,0(a7)
        vle64.v v24,0(t6),v0.t
        vse64.v v24,0(a6)
        vle64.v v24,0(t5),v0.t
        vse64.v v24,0(a2),v0.t
        vsetvli zero,11,e8,mf4,ta,ma
        vle8.v  v24,0(a3)
        vse8.v  v24,0(a5)
        bne     a1,a4,.L8
.L6:
        ret

gcc/ChangeLog:

        * config/riscv/riscv-vector-builtins-bases.cc (class loadstore):
        Use pred_store for vse.
        * config/riscv/riscv-vector-builtins.cc
        (function_expander::add_mem_operand): Refine function.
        (function_expander::use_contiguous_load_insn): Adjust new
        implementation.
        (function_expander::use_contiguous_store_insn): Ditto.
        * config/riscv/riscv-vector-builtins.h: Refine function.
        * config/riscv/vector.md (@pred_store<mode>): New pattern.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/base/vse-constraint-1.c: New test.

---
 .../riscv/riscv-vector-builtins-bases.cc      |  2 +-
 gcc/config/riscv/riscv-vector-builtins.cc     | 22 +----
 gcc/config/riscv/riscv-vector-builtins.h      |  8 +-
 gcc/config/riscv/vector.md                    | 23 ++++-
 .../riscv/rvv/base/vse-constraint-1.c         | 97 +++++++++++++++++++
 5 files changed, 128 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 10373e5ccf2..af66b016b49 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -106,7 +106,7 @@ class loadstore : public function_base
   rtx expand (function_expander &e) const override
   {
     if (STORE_P)
-      return e.use_contiguous_store_insn (code_for_pred_mov (e.vector_mode ()));
+      return e.use_contiguous_store_insn (code_for_pred_store (e.vector_mode ()));
     else
       return e.use_contiguous_load_insn (code_for_pred_mov (e.vector_mode ()));
   }
diff --git a/gcc/config/riscv/riscv-vector-builtins.cc b/gcc/config/riscv/riscv-vector-builtins.cc
index e39bfea9636..47e01b647f8 100644
--- a/gcc/config/riscv/riscv-vector-builtins.cc
+++ b/gcc/config/riscv/riscv-vector-builtins.cc
@@ -845,15 +845,15 @@ function_expander::add_vundef_operand (machine_mode mode)
 }
 
 /* Add a memory operand with mode MODE and address ADDR.  */
-rtx
-function_expander::add_mem_operand (machine_mode mode, rtx addr)
+void
+function_expander::add_mem_operand (machine_mode mode, unsigned argno)
 {
   gcc_assert (VECTOR_MODE_P (mode));
+  rtx addr = expand_normal (CALL_EXPR_ARG (exp, argno));
   rtx mem = gen_rtx_MEM (mode, memory_address (mode, addr));
   /* The memory is only guaranteed to be element-aligned.  */
   set_mem_align (mem, GET_MODE_ALIGNMENT (GET_MODE_INNER (mode)));
   add_fixed_operand (mem);
-  return mem;
 }
 
 /* Use contiguous load INSN.  */
@@ -878,9 +878,7 @@ function_expander::use_contiguous_load_insn (insn_code icode)
   else
     add_vundef_operand (mode);
 
-  tree addr_arg = CALL_EXPR_ARG (exp, arg_offset++);
-  rtx addr = expand_normal (addr_arg);
-  add_mem_operand (mode, addr);
+  add_mem_operand (mode, arg_offset++);
 
   for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
     add_input_operand (argno);
@@ -904,27 +902,17 @@ function_expander::use_contiguous_store_insn (insn_code icode)
   /* Record the offset to get the argument.  */
   int arg_offset = 0;
 
-  int addr_loc = use_real_mask_p (pred) ? 1 : 0;
-  tree addr_arg = CALL_EXPR_ARG (exp, addr_loc);
-  rtx addr = expand_normal (addr_arg);
-  rtx mem = add_mem_operand (mode, addr);
+  add_mem_operand (mode, use_real_mask_p (pred) ? 1 : 0);
 
   if (use_real_mask_p (pred))
     add_input_operand (arg_offset++);
   else
     add_all_one_mask_operand (mask_mode);
 
-  /* To model "+m" constraint, we include memory operand into input.  */
-  add_input_operand (mode, mem);
-
   arg_offset++;
   for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
     add_input_operand (argno);
 
-  add_input_operand (Pmode, get_tail_policy_for_pred (pred));
-  add_input_operand (Pmode, get_mask_policy_for_pred (pred));
-  add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
-
   return generate_insn (icode);
 }
 
diff --git a/gcc/config/riscv/riscv-vector-builtins.h b/gcc/config/riscv/riscv-vector-builtins.h
index c13df99cb5b..58d8d78043c 100644
--- a/gcc/config/riscv/riscv-vector-builtins.h
+++ b/gcc/config/riscv/riscv-vector-builtins.h
@@ -317,12 +317,12 @@ public:
   rtx expand ();
 
   void add_input_operand (machine_mode, rtx);
-  void add_input_operand (unsigned argno);
+  void add_input_operand (unsigned);
   void add_output_operand (machine_mode, rtx);
-  void add_all_one_mask_operand (machine_mode mode);
-  void add_vundef_operand (machine_mode mode);
+  void add_all_one_mask_operand (machine_mode);
+  void add_vundef_operand (machine_mode);
   void add_fixed_operand (rtx);
-  rtx add_mem_operand (machine_mode, rtx);
+  void add_mem_operand (machine_mode, unsigned);
 
   machine_mode vector_mode (void) const;
 
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 89810b183fc..3d0174f98a2 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -209,7 +209,7 @@
 
 ;; The index of operand[] to get the merge op.
 (define_attr "merge_op_idx" ""
-       (cond [(eq_attr "type" "vlde,vste,vimov,vfmov,vldm,vstm,vlds,vmalu")
+       (cond [(eq_attr "type" "vlde,vimov,vfmov,vldm,vstm,vlds,vmalu")
         (const_int 2)]
        (const_int INVALID_ATTRIBUTE)))
 
@@ -647,7 +647,7 @@
             (reg:SI VL_REGNUM)
             (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
         (match_operand:V 3 "vector_move_operand"       "    m,     m,    vr,    vr, viWc0")
-         (match_operand:V 2 "vector_merge_operand"      "    0,    vu,   vu0,   vu0,   vu0")))]
+         (match_operand:V 2 "vector_merge_operand"      "    0,    vu,    vu,   vu0,   vu0")))]
   "TARGET_VECTOR"
   "@
    vle<sew>.v\t%0,%3%p1
@@ -663,6 +663,25 @@
   [(set_attr "type" "vlde,vlde,vste,vimov,vimov")
    (set_attr "mode" "<MODE>")])
 
+;; Dedicated pattern for vse.v instruction since we can't reuse pred_mov pattern to include
+;; memory operand as input which will produce inferior codegen.
+(define_insn "@pred_store<mode>"
+  [(set (match_operand:V 0 "memory_operand"                 "+m")
+       (if_then_else:V
+         (unspec:<VM>
+           [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1")
+            (match_operand 3 "vector_length_operand"    "   rK")
+            (reg:SI VL_REGNUM)
+            (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+         (match_operand:V 2 "register_operand"         "    vr")
+         (match_dup 0)))]
+  "TARGET_VECTOR"
+  "vse<sew>.v\t%2,%0%p1"
+  [(set_attr "type" "vste")
+   (set_attr "mode" "<MODE>")
+   (set (attr "avl_type") (symbol_ref "riscv_vector::NONVLMAX"))
+   (set_attr "vl_op_idx" "3")])
+
 ;; vlm.v/vsm.v/vmclr.m/vmset.m.
 ;; constraint alternative 0 match vlm.v.
 ;; constraint alternative 1 match vsm.v.
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
new file mode 100644
index 00000000000..5b8b9b41c7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vse-constraint-1.c
@@ -0,0 +1,97 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3" } */
+
+#include "riscv_vector.h"
+
+void f (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+  vbool64_t mask = *(vbool64_t*)mask_in;
+  for (int i = 0; i < n; i++)
+    {
+      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v3, 13);
+
+      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+    }
+}
+
+void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+  vbool64_t mask = *(vbool64_t*)mask_in;
+  for (int i = 0; i < n; i++)
+    {
+      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);
+
+      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+    }
+}
+
+void f3 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+  __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+  vbool64_t mask = *(vbool64_t*)mask_in;
+  for (int i = 0; i < n; i++)
+    {
+      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+      *(vint32mf2_t*)(out + i + 200) = v3;
+
+      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+    }
+}
+
+/* It should not have redundant vector register spills, which produce csrr vlenb instructions to allocate stack.  */
+/* { dg-final { scan-assembler-not {csrr\s+[a-x0-9]+,\s*vlenb} } } */
-- 
2.36.1
