Re: [PATCH] RISC-V: Support widening register overlap for vf4/vf8

Kito Cheng Wed, 29 Nov 2023 23:08:52 -0800

LGTM, thanks :)


On Thu, Nov 30, 2023 at 2:49 PM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote:
>
>
> size_t
> foo (char const *buf, size_t len)
> {
>   size_t sum = 0;
>   size_t vl = __riscv_vsetvlmax_e8m8 ();
>   size_t step = vl * 4;
>   const char *it = buf, *end = buf + len;
>   for (; it + step <= end;)
>     {
>       vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl);
>       it += vl;
>       vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl);
>       it += vl;
>       vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl);
>       it += vl;
>       vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl);
>       it += vl;
>
>       asm volatile("nop" ::: "memory");
>       vint64m8_t vw0 = __riscv_vsext_vf8_i64m8 (v0, vl);
>       vint64m8_t vw1 = __riscv_vsext_vf8_i64m8 (v1, vl);
>       vint64m8_t vw2 = __riscv_vsext_vf8_i64m8 (v2, vl);
>       vint64m8_t vw3 = __riscv_vsext_vf8_i64m8 (v3, vl);
>
>       asm volatile("nop" ::: "memory");
>       size_t sum0 = __riscv_vmv_x_s_i64m8_i64 (vw0);
>       size_t sum1 = __riscv_vmv_x_s_i64m8_i64 (vw1);
>       size_t sum2 = __riscv_vmv_x_s_i64m8_i64 (vw2);
>       size_t sum3 = __riscv_vmv_x_s_i64m8_i64 (vw3);
>
>       sum += sumation (sum0, sum1, sum2, sum3);
>     }
>   return sum;
> }
>
> Before this patch:
>
>         add     a3,s0,s1
>         add     a4,s6,s1
>         add     a5,s7,s1
>         vsetvli zero,s0,e64,m8,ta,ma
>         vle8.v  v4,0(s1)
>         vle8.v  v3,0(a3)
>         mv      s1,s2
>         vle8.v  v2,0(a4)
>         vle8.v  v1,0(a5)
>         nop
>         vsext.vf8       v8,v4
>         vsext.vf8       v16,v2
>         vs8r.v  v8,0(sp)
>         vsext.vf8       v24,v1
>         vsext.vf8       v8,v3
>         nop
>         vmv.x.s a1,v8
>         vl8re64.v       v8,0(sp)
>         vmv.x.s a3,v24
>         vmv.x.s a2,v16
>         vmv.x.s a0,v8
>         add     s2,s2,s5
>         call    sumation
>         add     s3,s3,a0
>         bgeu    s4,s2,.L5
>
> After this patch:
>
>         add     a3,s0,s1
>         add     a4,s6,s1
>         add     a5,s7,s1
>         vsetvli zero,s0,e64,m8,ta,ma
>         vle8.v  v15,0(s1)
>         vle8.v  v23,0(a3)
>         mv      s1,s2
>         vle8.v  v31,0(a4)
>         vle8.v  v7,0(a5)
>         vsext.vf8       v8,v15
>         vsext.vf8       v16,v23
>         vsext.vf8       v24,v31
>         vsext.vf8       v0,v7
>         vmv.x.s a3,v0
>         vmv.x.s a2,v24
>         vmv.x.s a1,v16
>         vmv.x.s a0,v8
>         add     s2,s2,s5
>         call    sumation
>         add     s3,s3,a0
>         bgeu    s4,s2,.L5
>
>         PR target/112431
>
> gcc/ChangeLog:
>
>         * config/riscv/vector.md: Add widening overlap of vf4/vf8.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/base/pr112431-16.c: New test.
>         * gcc.target/riscv/rvv/base/pr112431-17.c: New test.
>         * gcc.target/riscv/rvv/base/pr112431-18.c: New test.
>
> ---
>  gcc/config/riscv/vector.md                    | 38 ++++++-----
>  .../gcc.target/riscv/rvv/base/pr112431-16.c   | 68 +++++++++++++++++++
>  .../gcc.target/riscv/rvv/base/pr112431-17.c   | 51 ++++++++++++++
>  .../gcc.target/riscv/rvv/base/pr112431-18.c   | 51 ++++++++++++++
>  4 files changed, 190 insertions(+), 18 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-16.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-17.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-18.c
>
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index 6b891c11324..e5d62c6e58b 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -3704,43 +3704,45 @@
>
>  ;; Vector Quad-Widening Sign-extend and Zero-extend.
>  (define_insn "@pred_<optab><mode>_vf4"
> -  [(set (match_operand:VQEXTI 0 "register_operand"          "=&vr,&vr")
> +  [(set (match_operand:VQEXTI 0 "register_operand"               "=vr,   vr, 
>   vr,   vr, ?&vr, ?&vr")
>         (if_then_else:VQEXTI
>           (unspec:<VM>
> -           [(match_operand:<VM> 1 "vector_mask_operand"       "vmWc1,vmWc1")
> -            (match_operand 4 "vector_length_operand"          "   rK,   rK")
> -            (match_operand 5 "const_int_operand"              "    i,    i")
> -            (match_operand 6 "const_int_operand"              "    i,    i")
> -            (match_operand 7 "const_int_operand"              "    i,    i")
> +           [(match_operand:<VM> 1 "vector_mask_operand"       
> "vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1")
> +            (match_operand 4 "vector_length_operand"          "   rK,   rK,  
>  rK,   rK,   rK,   rK")
> +            (match_operand 5 "const_int_operand"              "    i,    i,  
>   i,    i,    i,    i")
> +            (match_operand 6 "const_int_operand"              "    i,    i,  
>   i,    i,    i,    i")
> +            (match_operand 7 "const_int_operand"              "    i,    i,  
>   i,    i,    i,    i")
>              (reg:SI VL_REGNUM)
>              (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
>           (any_extend:VQEXTI
> -           (match_operand:<V_QUAD_TRUNC> 3 "register_operand" "   vr,   vr"))
> -         (match_operand:VQEXTI 2 "vector_merge_operand"       "   vu,    
> 0")))]
> +           (match_operand:<V_QUAD_TRUNC> 3 "register_operand" "  W43,  W43,  
> W86,  W86,   vr,   vr"))
> +         (match_operand:VQEXTI 2 "vector_merge_operand"       "   vu,    0,  
>  vu,    0,   vu,    0")))]
>    "TARGET_VECTOR"
>    "v<sz>ext.vf4\t%0,%3%p1"
>    [(set_attr "type" "vext")
> -   (set_attr "mode" "<MODE>")])
> +   (set_attr "mode" "<MODE>")
> +   (set_attr "group_overlap" "W43,W43,W86,W86,none,none")])
>
>  ;; Vector Oct-Widening Sign-extend and Zero-extend.
>  (define_insn "@pred_<optab><mode>_vf8"
> -  [(set (match_operand:VOEXTI 0 "register_operand"         "=&vr,&vr")
> +  [(set (match_operand:VOEXTI 0 "register_operand"              "=vr,   vr, 
> ?&vr, ?&vr")
>         (if_then_else:VOEXTI
>           (unspec:<VM>
> -           [(match_operand:<VM> 1 "vector_mask_operand"      "vmWc1,vmWc1")
> -            (match_operand 4 "vector_length_operand"         "   rK,   rK")
> -            (match_operand 5 "const_int_operand"             "    i,    i")
> -            (match_operand 6 "const_int_operand"             "    i,    i")
> -            (match_operand 7 "const_int_operand"             "    i,    i")
> +           [(match_operand:<VM> 1 "vector_mask_operand"      
> "vmWc1,vmWc1,vmWc1,vmWc1")
> +            (match_operand 4 "vector_length_operand"         "   rK,   rK,   
> rK,   rK")
> +            (match_operand 5 "const_int_operand"             "    i,    i,   
>  i,    i")
> +            (match_operand 6 "const_int_operand"             "    i,    i,   
>  i,    i")
> +            (match_operand 7 "const_int_operand"             "    i,    i,   
>  i,    i")
>              (reg:SI VL_REGNUM)
>              (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
>           (any_extend:VOEXTI
> -           (match_operand:<V_OCT_TRUNC> 3 "register_operand" "   vr,   vr"))
> -         (match_operand:VOEXTI 2 "vector_merge_operand"      "   vu,    
> 0")))]
> +           (match_operand:<V_OCT_TRUNC> 3 "register_operand" "  W87,  W87,   
> vr,   vr"))
> +         (match_operand:VOEXTI 2 "vector_merge_operand"      "   vu,    0,   
> vu,    0")))]
>    "TARGET_VECTOR"
>    "v<sz>ext.vf8\t%0,%3%p1"
>    [(set_attr "type" "vext")
> -   (set_attr "mode" "<MODE>")])
> +   (set_attr "mode" "<MODE>")
> +   (set_attr "group_overlap" "W87,W87,none,none")])
>
>  ;; Vector Widening Add/Subtract/Multiply.
>  (define_insn "@pred_dual_widen_<any_widen_binop:optab><any_extend:su><mode>"
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-16.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-16.c
> new file mode 100644
> index 00000000000..98f42458883
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-16.c
> @@ -0,0 +1,68 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +size_t __attribute__ ((noinline))
> +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
> +         size_t sum5, size_t sum6, size_t sum7)
> +{
> +  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7;
> +}
> +
> +size_t
> +foo (char const *buf, size_t len)
> +{
> +  size_t sum = 0;
> +  size_t vl = __riscv_vsetvlmax_e8m8 ();
> +  size_t step = vl * 4;
> +  const char *it = buf, *end = buf + len;
> +  for (; it + step <= end;)
> +    {
> +      vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v4 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v5 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v6 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v7 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +
> +      asm volatile("nop" ::: "memory");
> +      vint32m4_t vw0 = __riscv_vsext_vf4_i32m4 (v0, vl);
> +      vint32m4_t vw1 = __riscv_vsext_vf4_i32m4 (v1, vl);
> +      vint32m4_t vw2 = __riscv_vsext_vf4_i32m4 (v2, vl);
> +      vint32m4_t vw3 = __riscv_vsext_vf4_i32m4 (v3, vl);
> +      vint32m4_t vw4 = __riscv_vsext_vf4_i32m4 (v4, vl);
> +      vint32m4_t vw5 = __riscv_vsext_vf4_i32m4 (v5, vl);
> +      vint32m4_t vw6 = __riscv_vsext_vf4_i32m4 (v6, vl);
> +      vint32m4_t vw7 = __riscv_vsext_vf4_i32m4 (v7, vl);
> +
> +      asm volatile("nop" ::: "memory");
> +      size_t sum0 = __riscv_vmv_x_s_i32m4_i32 (vw0);
> +      size_t sum1 = __riscv_vmv_x_s_i32m4_i32 (vw1);
> +      size_t sum2 = __riscv_vmv_x_s_i32m4_i32 (vw2);
> +      size_t sum3 = __riscv_vmv_x_s_i32m4_i32 (vw3);
> +      size_t sum4 = __riscv_vmv_x_s_i32m4_i32 (vw4);
> +      size_t sum5 = __riscv_vmv_x_s_i32m4_i32 (vw5);
> +      size_t sum6 = __riscv_vmv_x_s_i32m4_i32 (vw6);
> +      size_t sum7 = __riscv_vmv_x_s_i32m4_i32 (vw7);
> +
> +      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7);
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmv1r} } } */
> +/* { dg-final { scan-assembler-not {vmv2r} } } */
> +/* { dg-final { scan-assembler-not {vmv4r} } } */
> +/* { dg-final { scan-assembler-not {vmv8r} } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-17.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-17.c
> new file mode 100644
> index 00000000000..9b60005344d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-17.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +size_t __attribute__ ((noinline))
> +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3)
> +{
> +  return sum0 + sum1 + sum2 + sum3;
> +}
> +
> +size_t
> +foo (char const *buf, size_t len)
> +{
> +  size_t sum = 0;
> +  size_t vl = __riscv_vsetvlmax_e8m8 ();
> +  size_t step = vl * 4;
> +  const char *it = buf, *end = buf + len;
> +  for (; it + step <= end;)
> +    {
> +      vint8m2_t v0 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v1 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v2 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v3 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +
> +      asm volatile("nop" ::: "memory");
> +      vint32m8_t vw0 = __riscv_vsext_vf4_i32m8 (v0, vl);
> +      vint32m8_t vw1 = __riscv_vsext_vf4_i32m8 (v1, vl);
> +      vint32m8_t vw2 = __riscv_vsext_vf4_i32m8 (v2, vl);
> +      vint32m8_t vw3 = __riscv_vsext_vf4_i32m8 (v3, vl);
> +
> +      asm volatile("nop" ::: "memory");
> +      size_t sum0 = __riscv_vmv_x_s_i32m8_i32 (vw0);
> +      size_t sum1 = __riscv_vmv_x_s_i32m8_i32 (vw1);
> +      size_t sum2 = __riscv_vmv_x_s_i32m8_i32 (vw2);
> +      size_t sum3 = __riscv_vmv_x_s_i32m8_i32 (vw3);
> +
> +      sum += sumation (sum0, sum1, sum2, sum3);
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmv1r} } } */
> +/* { dg-final { scan-assembler-not {vmv2r} } } */
> +/* { dg-final { scan-assembler-not {vmv4r} } } */
> +/* { dg-final { scan-assembler-not {vmv8r} } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-18.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-18.c
> new file mode 100644
> index 00000000000..dd65b2fa098
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-18.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +size_t __attribute__ ((noinline))
> +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3)
> +{
> +  return sum0 + sum1 + sum2 + sum3;
> +}
> +
> +size_t
> +foo (char const *buf, size_t len)
> +{
> +  size_t sum = 0;
> +  size_t vl = __riscv_vsetvlmax_e8m8 ();
> +  size_t step = vl * 4;
> +  const char *it = buf, *end = buf + len;
> +  for (; it + step <= end;)
> +    {
> +      vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +
> +      asm volatile("nop" ::: "memory");
> +      vint64m8_t vw0 = __riscv_vsext_vf8_i64m8 (v0, vl);
> +      vint64m8_t vw1 = __riscv_vsext_vf8_i64m8 (v1, vl);
> +      vint64m8_t vw2 = __riscv_vsext_vf8_i64m8 (v2, vl);
> +      vint64m8_t vw3 = __riscv_vsext_vf8_i64m8 (v3, vl);
> +
> +      asm volatile("nop" ::: "memory");
> +      size_t sum0 = __riscv_vmv_x_s_i64m8_i64 (vw0);
> +      size_t sum1 = __riscv_vmv_x_s_i64m8_i64 (vw1);
> +      size_t sum2 = __riscv_vmv_x_s_i64m8_i64 (vw2);
> +      size_t sum3 = __riscv_vmv_x_s_i64m8_i64 (vw3);
> +
> +      sum += sumation (sum0, sum1, sum2, sum3);
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmv1r} } } */
> +/* { dg-final { scan-assembler-not {vmv2r} } } */
> +/* { dg-final { scan-assembler-not {vmv4r} } } */
> +/* { dg-final { scan-assembler-not {vmv8r} } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> --
> 2.36.3
>

Re: [PATCH] RISC-V: Support widening register overlap for vf4/vf8

Reply via email to