Nice catch, LGTM :)
On Thu, Jan 4, 2024 at 4:28 PM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote:
>
> Consider the following case:
>
> void
> f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
> {
>   for (int i = 0; i < n; i++)
>     {
>       int tmp = b[i] + 15;
>       int tmp2 = tmp + b[i];
>       c[i] = tmp2 + b[i];
>       d[i] = tmp + tmp2 + b[i];
>     }
> }
>
> The current dynamic LMUL cost model chooses LMUL = 4 because we count the "15"
> as consuming 1 vector register group, which is not accurate.
>
> We teach the dynamic LMUL cost model to be aware of the potential vi-variant
> instruction transformation, so that we can choose LMUL = 8 according to a more
> accurate cost model.
>
> After this patch:
>
> f:
>         ble     a4,zero,.L5
> .L3:
>         vsetvli a5,a4,e32,m8,ta,ma
>         slli    a0,a5,2
>         vle32.v v16,0(a1)
>         vadd.vi v24,v16,15
>         vadd.vv v8,v24,v16
>         vadd.vv v0,v8,v16
>         vse32.v v0,0(a2)
>         vadd.vv v8,v8,v24
>         vadd.vv v8,v8,v16
>         vse32.v v8,0(a3)
>         add     a1,a1,a0
>         add     a2,a2,a0
>         add     a3,a3,a0
>         sub     a4,a4,a5
>         bne     a4,zero,.L3
> .L5:
>         ret
>
> Tested on both RV32 and RV64 with no regressions. OK for trunk?
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-vector-costs.cc (variable_vectorized_p): Teach
>         vi variant.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c: New test.
>
> ---
>  gcc/config/riscv/riscv-vector-costs.cc        | 30 ++++++--
>  .../costmodel/riscv/rvv/dynamic-lmul8-13.c    | 74 +++++++++++++++++++
>  2 files changed, 97 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c
>
> diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
> index 21f8a81c89c..7f083b04edd 100644
> --- a/gcc/config/riscv/riscv-vector-costs.cc
> +++ b/gcc/config/riscv/riscv-vector-costs.cc
> @@ -255,6 +255,29 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p)
>            return false;
>          }
>      }
> +  else if (is_gimple_assign (stmt))
> +    {
> +      tree_code tcode = gimple_assign_rhs_code (stmt);
> +      /* The vi variant doesn't need a vector register for such a statement.
> +         E.g. tmp_15 = _4 + 1; will be transformed into vadd.vi,
> +         so the INTEGER_CST '1' doesn't need a vector register.  */
> +      switch (tcode)
> +       {
> +       case PLUS_EXPR:
> +       case BIT_IOR_EXPR:
> +       case BIT_XOR_EXPR:
> +       case BIT_AND_EXPR:
> +         return TREE_CODE (var) != INTEGER_CST
> +                || !IN_RANGE (tree_to_shwi (var), -16, 15);
> +       case MINUS_EXPR:
> +         return TREE_CODE (var) != INTEGER_CST
> +                || !IN_RANGE (tree_to_shwi (var), -16, 15)
> +                || gimple_assign_rhs1 (stmt) != var;
> +       default:
> +         break;
> +       }
> +    }
> +
>    if (lhs_p)
>      return is_gimple_reg (var)
>            && (!POINTER_TYPE_P (TREE_TYPE (var))
> @@ -331,13 +354,6 @@ compute_local_live_ranges (
>        for (i = 0; i < gimple_num_args (stmt); i++)
>          {
>            tree var = gimple_arg (stmt, i);
> -          /* Both IMM and REG are included since a VECTOR_CST may be
> -             potentially held in a vector register.  However, it's not
> -             accurate, since a PLUS_EXPR can be vectorized into vadd.vi
> -             if IMM is -16 ~ 15.
> -
> -             TODO: We may elide the cases that the unnecessary IMM in
> -             the future.  */
>            if (variable_vectorized_p (program_point.stmt_info, var, false))
>              {
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c
> new file mode 100644
> index 00000000000..baef4e39014
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c
> @@ -0,0 +1,74 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */
> +
> +void
> +f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      int tmp = b[i] + 15;
> +      int tmp2 = tmp + b[i];
> +      c[i] = tmp2 + b[i];
> +      d[i] = tmp + tmp2 + b[i];
> +    }
> +}
> +
> +void
> +f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      int tmp = 15 - b[i];
> +      int tmp2 = tmp * b[i];
> +      c[i] = tmp2 * b[i];
> +      d[i] = tmp * tmp2 * b[i];
> +    }
> +}
> +
> +void
> +f3 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      int tmp = b[i] & 15;
> +      int tmp2 = tmp * b[i];
> +      c[i] = tmp2 * b[i];
> +      d[i] = tmp * tmp2 * b[i];
> +    }
> +}
> +
> +void
> +f4 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      int tmp = b[i] | 15;
> +      int tmp2 = tmp * b[i];
> +      c[i] = tmp2 * b[i];
> +      d[i] = tmp * tmp2 * b[i];
> +    }
> +}
> +
> +void
> +f5 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      int tmp = b[i] ^ 15;
> +      int tmp2 = tmp * b[i];
> +      c[i] = tmp2 * b[i];
> +      d[i] = tmp * tmp2 * b[i];
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {e32,m8} 5 } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> +/* { dg-final { scan-assembler-not {jr} } } */
> +/* { dg-final { scan-assembler-not {e32,m4} } } */
> +/* { dg-final { scan-assembler-not {e32,m2} } } */
> +/* { dg-final { scan-assembler-not {e32,m1} } } */
> +/* { dg-final { scan-assembler-times {ret} 5 } } */
> +/* { dg-final { scan-tree-dump-not "Preferring smaller LMUL loop because it has unexpected spills" "vect" } } */
> +/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 5 "vect" } } */
> --
> 2.36.3
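
For readers skimming the thread: the -16 ~ 15 window in the new check is the
5-bit signed immediate range of the RVV .vi instruction forms, so only
constants in that window can be folded into vadd.vi/vand.vi/vor.vi/vxor.vi
(and constant-minus-variable into vrsub.vi, hence the rhs1 check for
MINUS_EXPR). A minimal sketch of the boundary, not part of the committed
testcase -- the function names and the constant 100 are only for illustration:

void
imm_in_range (int *restrict b, int *restrict c, int n)
{
  /* 15 fits the 5-bit signed immediate range (-16 ~ 15), so the add can
     become vadd.vi and the new variable_vectorized_p logic no longer
     counts the constant as occupying a vector register group.  */
  for (int i = 0; i < n; i++)
    c[i] = b[i] + 15;
}

void
imm_out_of_range (int *restrict b, int *restrict c, int n)
{
  /* 100 does not fit the .vi immediate range, so the cost model keeps
     treating the constant as potentially needing a vector register,
     exactly as it did before the patch.  */
  for (int i = 0; i < n; i++)
    c[i] = b[i] + 100;
}

There is no separate case for b[i] - 15 because GIMPLE generally
canonicalizes subtraction of a constant into addition of its negation, so
the PLUS_EXPR case already covers it; only the 15 - b[i] form needs the
rhs1 check.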