Nice catch, LGTM :)

On Thu, Jan 4, 2024 at 4:28 PM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote:
>
> Consider the following case:
>
> void
> f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
> {
>   for (int i = 0; i < n; i++)
>     {
>       int tmp = b[i] + 15;
>       int tmp2 = tmp + b[i];
>       c[i] = tmp2 + b[i];
>       d[i] = tmp + tmp2 + b[i];
>     }
> }
>
> The current dynamic LMUL cost model chooses LMUL = 4 because we count the "15"
> as consuming 1 vector register group, which is not accurate.
>
> We teach the dynamic LMUL cost model to be aware of the potential
> transformation into vi variant instructions, so that we can choose LMUL = 8
> according to a more accurate cost model.
>
> After this patch:
>
> f:
>         ble     a4,zero,.L5
> .L3:
>         vsetvli a5,a4,e32,m8,ta,ma
>         slli    a0,a5,2
>         vle32.v v16,0(a1)
>         vadd.vi v24,v16,15
>         vadd.vv v8,v24,v16
>         vadd.vv v0,v8,v16
>         vse32.v v0,0(a2)
>         vadd.vv v8,v8,v24
>         vadd.vv v8,v8,v16
>         vse32.v v8,0(a3)
>         add     a1,a1,a0
>         add     a2,a2,a0
>         add     a3,a3,a0
>         sub     a4,a4,a5
>         bne     a4,zero,.L3
> .L5:
>         ret
>
> Tested on both RV32 and RV64 with no regressions. OK for trunk?
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-vector-costs.cc (variable_vectorized_p): Teach 
> vi variant.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c: New test.
>
> ---
>  gcc/config/riscv/riscv-vector-costs.cc        | 30 ++++++--
>  .../costmodel/riscv/rvv/dynamic-lmul8-13.c    | 74 +++++++++++++++++++
>  2 files changed, 97 insertions(+), 7 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c
>
> diff --git a/gcc/config/riscv/riscv-vector-costs.cc 
> b/gcc/config/riscv/riscv-vector-costs.cc
> index 21f8a81c89c..7f083b04edd 100644
> --- a/gcc/config/riscv/riscv-vector-costs.cc
> +++ b/gcc/config/riscv/riscv-vector-costs.cc
> @@ -255,6 +255,29 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree 
> var, bool lhs_p)
>             return false;
>         }
>      }
> +  else if (is_gimple_assign (stmt))
> +    {
> +      tree_code tcode = gimple_assign_rhs_code (stmt);
> +      /* vi variant doesn't need to allocate such statement.
> +        E.g. tmp_15 = _4 + 1; will be transformed into vadd.vi
> +        so the INTEGER_CST '1' doesn't need a vector register.  */
> +      switch (tcode)
> +       {
> +       case PLUS_EXPR:
> +       case BIT_IOR_EXPR:
> +       case BIT_XOR_EXPR:
> +       case BIT_AND_EXPR:
> +         return TREE_CODE (var) != INTEGER_CST
> +                || !IN_RANGE (tree_to_shwi (var), -16, 15);
> +       case MINUS_EXPR:
> +         return TREE_CODE (var) != INTEGER_CST
> +                || !IN_RANGE (tree_to_shwi (var), -16, 15)
> +                || gimple_assign_rhs1 (stmt) != var;
> +       default:
> +         break;
> +       }
> +    }
> +
>    if (lhs_p)
>      return is_gimple_reg (var)
>            && (!POINTER_TYPE_P (TREE_TYPE (var))
> @@ -331,13 +354,6 @@ compute_local_live_ranges (
>               for (i = 0; i < gimple_num_args (stmt); i++)
>                 {
>                   tree var = gimple_arg (stmt, i);
> -                 /* Both IMM and REG are included since a VECTOR_CST may be
> -                    potentially held in a vector register.  However, it's not
> -                    accurate, since a PLUS_EXPR can be vectorized into 
> vadd.vi
> -                    if IMM is -16 ~ 15.
> -
> -                    TODO: We may elide the cases that the unnecessary IMM in
> -                    the future.  */
>                   if (variable_vectorized_p (program_point.stmt_info, var,
>                                              false))
>                     {
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c 
> b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c
> new file mode 100644
> index 00000000000..baef4e39014
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c
> @@ -0,0 +1,74 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param 
> riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */
> +
> +void
> +f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      int tmp = b[i] + 15;
> +      int tmp2 = tmp + b[i];
> +      c[i] = tmp2 + b[i];
> +      d[i] = tmp + tmp2 + b[i];
> +    }
> +}
> +
> +void
> +f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int 
> n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      int tmp = 15 - b[i];
> +      int tmp2 = tmp * b[i];
> +      c[i] = tmp2 * b[i];
> +      d[i] = tmp * tmp2 * b[i];
> +    }
> +}
> +
> +void
> +f3 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int 
> n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      int tmp = b[i] & 15;
> +      int tmp2 = tmp * b[i];
> +      c[i] = tmp2 * b[i];
> +      d[i] = tmp * tmp2 * b[i];
> +    }
> +}
> +
> +void
> +f4 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int 
> n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      int tmp = b[i] | 15;
> +      int tmp2 = tmp * b[i];
> +      c[i] = tmp2 * b[i];
> +      d[i] = tmp * tmp2 * b[i];
> +    }
> +}
> +
> +void
> +f5 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int 
> n)
> +{
> +  for (int i = 0; i < n; i++)
> +    {
> +      int tmp = b[i] ^ 15;
> +      int tmp2 = tmp * b[i];
> +      c[i] = tmp2 * b[i];
> +      d[i] = tmp * tmp2 * b[i];
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {e32,m8} 5 } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> +/* { dg-final { scan-assembler-not {jr} } } */
> +/* { dg-final { scan-assembler-not {e32,m4} } } */
> +/* { dg-final { scan-assembler-not {e32,m2} } } */
> +/* { dg-final { scan-assembler-not {e32,m1} } } */
> +/* { dg-final { scan-assembler-times {ret} 5 } } */
> +/* { dg-final { scan-tree-dump-not "Preferring smaller LMUL loop because it 
> has unexpected spills" "vect" } } */
> +/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 5 "vect" } } */
> --
> 2.36.3
>
>

Reply via email to