LGTM, thanks :)
On Thu, Nov 30, 2023 at 2:49 PM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote: > > > size_t > foo (char const *buf, size_t len) > { > size_t sum = 0; > size_t vl = __riscv_vsetvlmax_e8m8 (); > size_t step = vl * 4; > const char *it = buf, *end = buf + len; > for (; it + step <= end;) > { > vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl); > it += vl; > vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl); > it += vl; > vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl); > it += vl; > vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl); > it += vl; > > asm volatile("nop" ::: "memory"); > vint64m8_t vw0 = __riscv_vsext_vf8_i64m8 (v0, vl); > vint64m8_t vw1 = __riscv_vsext_vf8_i64m8 (v1, vl); > vint64m8_t vw2 = __riscv_vsext_vf8_i64m8 (v2, vl); > vint64m8_t vw3 = __riscv_vsext_vf8_i64m8 (v3, vl); > > asm volatile("nop" ::: "memory"); > size_t sum0 = __riscv_vmv_x_s_i64m8_i64 (vw0); > size_t sum1 = __riscv_vmv_x_s_i64m8_i64 (vw1); > size_t sum2 = __riscv_vmv_x_s_i64m8_i64 (vw2); > size_t sum3 = __riscv_vmv_x_s_i64m8_i64 (vw3); > > sum += sumation (sum0, sum1, sum2, sum3); > } > return sum; > } > > Before this patch: > > add a3,s0,s1 > add a4,s6,s1 > add a5,s7,s1 > vsetvli zero,s0,e64,m8,ta,ma > vle8.v v4,0(s1) > vle8.v v3,0(a3) > mv s1,s2 > vle8.v v2,0(a4) > vle8.v v1,0(a5) > nop > vsext.vf8 v8,v4 > vsext.vf8 v16,v2 > vs8r.v v8,0(sp) > vsext.vf8 v24,v1 > vsext.vf8 v8,v3 > nop > vmv.x.s a1,v8 > vl8re64.v v8,0(sp) > vmv.x.s a3,v24 > vmv.x.s a2,v16 > vmv.x.s a0,v8 > add s2,s2,s5 > call sumation > add s3,s3,a0 > bgeu s4,s2,.L5 > > After this patch: > > add a3,s0,s1 > add a4,s6,s1 > add a5,s7,s1 > vsetvli zero,s0,e64,m8,ta,ma > vle8.v v15,0(s1) > vle8.v v23,0(a3) > mv s1,s2 > vle8.v v31,0(a4) > vle8.v v7,0(a5) > vsext.vf8 v8,v15 > vsext.vf8 v16,v23 > vsext.vf8 v24,v31 > vsext.vf8 v0,v7 > vmv.x.s a3,v0 > vmv.x.s a2,v24 > vmv.x.s a1,v16 > vmv.x.s a0,v8 > add s2,s2,s5 > call sumation > add s3,s3,a0 > bgeu s4,s2,.L5 > > PR target/112431 > > gcc/ChangeLog: > > * 
config/riscv/vector.md: Add widening overlap of vf4/vf8. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/base/pr112431-16.c: New test. > * gcc.target/riscv/rvv/base/pr112431-17.c: New test. > * gcc.target/riscv/rvv/base/pr112431-18.c: New test. > > --- > gcc/config/riscv/vector.md | 38 ++++++----- > .../gcc.target/riscv/rvv/base/pr112431-16.c | 68 +++++++++++++++++++ > .../gcc.target/riscv/rvv/base/pr112431-17.c | 51 ++++++++++++++ > .../gcc.target/riscv/rvv/base/pr112431-18.c | 51 ++++++++++++++ > 4 files changed, 190 insertions(+), 18 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-16.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-17.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-18.c > > diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md > index 6b891c11324..e5d62c6e58b 100644 > --- a/gcc/config/riscv/vector.md > +++ b/gcc/config/riscv/vector.md > @@ -3704,43 +3704,45 @@ > > ;; Vector Quad-Widening Sign-extend and Zero-extend. 
> (define_insn "@pred_<optab><mode>_vf4" > - [(set (match_operand:VQEXTI 0 "register_operand" "=&vr,&vr") > + [(set (match_operand:VQEXTI 0 "register_operand" "=vr, vr, > vr, vr, ?&vr, ?&vr") > (if_then_else:VQEXTI > (unspec:<VM> > - [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1,vmWc1") > - (match_operand 4 "vector_length_operand" " rK, rK") > - (match_operand 5 "const_int_operand" " i, i") > - (match_operand 6 "const_int_operand" " i, i") > - (match_operand 7 "const_int_operand" " i, i") > + [(match_operand:<VM> 1 "vector_mask_operand" > "vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1") > + (match_operand 4 "vector_length_operand" " rK, rK, > rK, rK, rK, rK") > + (match_operand 5 "const_int_operand" " i, i, > i, i, i, i") > + (match_operand 6 "const_int_operand" " i, i, > i, i, i, i") > + (match_operand 7 "const_int_operand" " i, i, > i, i, i, i") > (reg:SI VL_REGNUM) > (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) > (any_extend:VQEXTI > - (match_operand:<V_QUAD_TRUNC> 3 "register_operand" " vr, vr")) > - (match_operand:VQEXTI 2 "vector_merge_operand" " vu, > 0")))] > + (match_operand:<V_QUAD_TRUNC> 3 "register_operand" " W43, W43, > W86, W86, vr, vr")) > + (match_operand:VQEXTI 2 "vector_merge_operand" " vu, 0, > vu, 0, vu, 0")))] > "TARGET_VECTOR" > "v<sz>ext.vf4\t%0,%3%p1" > [(set_attr "type" "vext") > - (set_attr "mode" "<MODE>")]) > + (set_attr "mode" "<MODE>") > + (set_attr "group_overlap" "W43,W43,W86,W86,none,none")]) > > ;; Vector Oct-Widening Sign-extend and Zero-extend. 
> (define_insn "@pred_<optab><mode>_vf8" > - [(set (match_operand:VOEXTI 0 "register_operand" "=&vr,&vr") > + [(set (match_operand:VOEXTI 0 "register_operand" "=vr, vr, > ?&vr, ?&vr") > (if_then_else:VOEXTI > (unspec:<VM> > - [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1,vmWc1") > - (match_operand 4 "vector_length_operand" " rK, rK") > - (match_operand 5 "const_int_operand" " i, i") > - (match_operand 6 "const_int_operand" " i, i") > - (match_operand 7 "const_int_operand" " i, i") > + [(match_operand:<VM> 1 "vector_mask_operand" > "vmWc1,vmWc1,vmWc1,vmWc1") > + (match_operand 4 "vector_length_operand" " rK, rK, > rK, rK") > + (match_operand 5 "const_int_operand" " i, i, > i, i") > + (match_operand 6 "const_int_operand" " i, i, > i, i") > + (match_operand 7 "const_int_operand" " i, i, > i, i") > (reg:SI VL_REGNUM) > (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) > (any_extend:VOEXTI > - (match_operand:<V_OCT_TRUNC> 3 "register_operand" " vr, vr")) > - (match_operand:VOEXTI 2 "vector_merge_operand" " vu, > 0")))] > + (match_operand:<V_OCT_TRUNC> 3 "register_operand" " W87, W87, > vr, vr")) > + (match_operand:VOEXTI 2 "vector_merge_operand" " vu, 0, > vu, 0")))] > "TARGET_VECTOR" > "v<sz>ext.vf8\t%0,%3%p1" > [(set_attr "type" "vext") > - (set_attr "mode" "<MODE>")]) > + (set_attr "mode" "<MODE>") > + (set_attr "group_overlap" "W87,W87,none,none")]) > > ;; Vector Widening Add/Subtract/Multiply. 
> (define_insn "@pred_dual_widen_<any_widen_binop:optab><any_extend:su><mode>" > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-16.c > b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-16.c > new file mode 100644 > index 00000000000..98f42458883 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-16.c > @@ -0,0 +1,68 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ > + > +#include "riscv_vector.h" > + > +size_t __attribute__ ((noinline)) > +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4, > + size_t sum5, size_t sum6, size_t sum7) > +{ > + return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7; > +} > + > +size_t > +foo (char const *buf, size_t len) > +{ > + size_t sum = 0; > + size_t vl = __riscv_vsetvlmax_e8m8 (); > + size_t step = vl * 4; > + const char *it = buf, *end = buf + len; > + for (; it + step <= end;) > + { > + vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + vint8m1_t v4 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + vint8m1_t v5 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + vint8m1_t v6 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + vint8m1_t v7 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + > + asm volatile("nop" ::: "memory"); > + vint32m4_t vw0 = __riscv_vsext_vf4_i32m4 (v0, vl); > + vint32m4_t vw1 = __riscv_vsext_vf4_i32m4 (v1, vl); > + vint32m4_t vw2 = __riscv_vsext_vf4_i32m4 (v2, vl); > + vint32m4_t vw3 = __riscv_vsext_vf4_i32m4 (v3, vl); > + vint32m4_t vw4 = __riscv_vsext_vf4_i32m4 (v4, vl); > + vint32m4_t vw5 = __riscv_vsext_vf4_i32m4 (v5, vl); > + vint32m4_t vw6 = __riscv_vsext_vf4_i32m4 (v6, vl); > + vint32m4_t vw7 = __riscv_vsext_vf4_i32m4 (v7, vl); > 
+ > + asm volatile("nop" ::: "memory"); > + size_t sum0 = __riscv_vmv_x_s_i32m4_i32 (vw0); > + size_t sum1 = __riscv_vmv_x_s_i32m4_i32 (vw1); > + size_t sum2 = __riscv_vmv_x_s_i32m4_i32 (vw2); > + size_t sum3 = __riscv_vmv_x_s_i32m4_i32 (vw3); > + size_t sum4 = __riscv_vmv_x_s_i32m4_i32 (vw4); > + size_t sum5 = __riscv_vmv_x_s_i32m4_i32 (vw5); > + size_t sum6 = __riscv_vmv_x_s_i32m4_i32 (vw6); > + size_t sum7 = __riscv_vmv_x_s_i32m4_i32 (vw7); > + > + sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7); > + } > + return sum; > +} > + > +/* { dg-final { scan-assembler-not {vmv1r} } } */ > +/* { dg-final { scan-assembler-not {vmv2r} } } */ > +/* { dg-final { scan-assembler-not {vmv4r} } } */ > +/* { dg-final { scan-assembler-not {vmv8r} } } */ > +/* { dg-final { scan-assembler-not {csrr} } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-17.c > b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-17.c > new file mode 100644 > index 00000000000..9b60005344d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-17.c > @@ -0,0 +1,51 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ > + > +#include "riscv_vector.h" > + > +size_t __attribute__ ((noinline)) > +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3) > +{ > + return sum0 + sum1 + sum2 + sum3; > +} > + > +size_t > +foo (char const *buf, size_t len) > +{ > + size_t sum = 0; > + size_t vl = __riscv_vsetvlmax_e8m8 (); > + size_t step = vl * 4; > + const char *it = buf, *end = buf + len; > + for (; it + step <= end;) > + { > + vint8m2_t v0 = __riscv_vle8_v_i8m2 ((void *) it, vl); > + it += vl; > + vint8m2_t v1 = __riscv_vle8_v_i8m2 ((void *) it, vl); > + it += vl; > + vint8m2_t v2 = __riscv_vle8_v_i8m2 ((void *) it, vl); > + it += vl; > + vint8m2_t v3 = __riscv_vle8_v_i8m2 ((void *) it, vl); > + it += vl; > + > + asm volatile("nop" ::: "memory"); > + vint32m8_t vw0 = __riscv_vsext_vf4_i32m8 (v0, vl); > + vint32m8_t vw1 = 
__riscv_vsext_vf4_i32m8 (v1, vl); > + vint32m8_t vw2 = __riscv_vsext_vf4_i32m8 (v2, vl); > + vint32m8_t vw3 = __riscv_vsext_vf4_i32m8 (v3, vl); > + > + asm volatile("nop" ::: "memory"); > + size_t sum0 = __riscv_vmv_x_s_i32m8_i32 (vw0); > + size_t sum1 = __riscv_vmv_x_s_i32m8_i32 (vw1); > + size_t sum2 = __riscv_vmv_x_s_i32m8_i32 (vw2); > + size_t sum3 = __riscv_vmv_x_s_i32m8_i32 (vw3); > + > + sum += sumation (sum0, sum1, sum2, sum3); > + } > + return sum; > +} > + > +/* { dg-final { scan-assembler-not {vmv1r} } } */ > +/* { dg-final { scan-assembler-not {vmv2r} } } */ > +/* { dg-final { scan-assembler-not {vmv4r} } } */ > +/* { dg-final { scan-assembler-not {vmv8r} } } */ > +/* { dg-final { scan-assembler-not {csrr} } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-18.c > b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-18.c > new file mode 100644 > index 00000000000..dd65b2fa098 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-18.c > @@ -0,0 +1,51 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ > + > +#include "riscv_vector.h" > + > +size_t __attribute__ ((noinline)) > +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3) > +{ > + return sum0 + sum1 + sum2 + sum3; > +} > + > +size_t > +foo (char const *buf, size_t len) > +{ > + size_t sum = 0; > + size_t vl = __riscv_vsetvlmax_e8m8 (); > + size_t step = vl * 4; > + const char *it = buf, *end = buf + len; > + for (; it + step <= end;) > + { > + vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl); > + it += vl; > + > + asm volatile("nop" ::: "memory"); > + vint64m8_t vw0 = __riscv_vsext_vf8_i64m8 (v0, vl); > + vint64m8_t vw1 = __riscv_vsext_vf8_i64m8 (v1, vl); > + vint64m8_t vw2 = __riscv_vsext_vf8_i64m8 
(v2, vl); > + vint64m8_t vw3 = __riscv_vsext_vf8_i64m8 (v3, vl); > + > + asm volatile("nop" ::: "memory"); > + size_t sum0 = __riscv_vmv_x_s_i64m8_i64 (vw0); > + size_t sum1 = __riscv_vmv_x_s_i64m8_i64 (vw1); > + size_t sum2 = __riscv_vmv_x_s_i64m8_i64 (vw2); > + size_t sum3 = __riscv_vmv_x_s_i64m8_i64 (vw3); > + > + sum += sumation (sum0, sum1, sum2, sum3); > + } > + return sum; > +} > + > +/* { dg-final { scan-assembler-not {vmv1r} } } */ > +/* { dg-final { scan-assembler-not {vmv2r} } } */ > +/* { dg-final { scan-assembler-not {vmv4r} } } */ > +/* { dg-final { scan-assembler-not {vmv8r} } } */ > +/* { dg-final { scan-assembler-not {csrr} } } */ > -- > 2.36.3 >