On 2/24/25 16:07, Edwin Lu wrote:
> See [1] thread for original patch which spawned this one.
>
> We are currently seeing the following code where we perform a vsetvl
> before a branching instruction against the avl.
>
>         vsetvli a5,a1,e32,m1,tu,ma
>         vle32.v v2,0(a0)
>         sub     a1,a1,a5 <-- a1 potentially set to 0
>         sh2add  a0,a5,a0
>         vfmacc.vv       v1,v2,v2
>         vsetvli a5,a1,e32,m1,tu,ma <-- incompatible vinfo. update vl to 0
>         beq     a1,zero,.L12 <-- check if avl is 0
>
> Since we are branching off of the avl, we don't need to update vl until
> after the branch is taken. Search the ready queue for vsetvls scheduled
> before branching instructions that branch off of the same regno and
> promote the branches to execute first. This can improve performance by
> potentially avoiding setting VL=0 which may be expensive on some uarches.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675622.html
>
>       PR/117974
>
> gcc/ChangeLog:
>
>       * config/riscv/riscv.cc (vsetvl_avl_regno): New helper function.
>       (insn_increases_zeroness_p): Ditto.
>       (riscv_promote_ready): Ditto.
>       (riscv_sched_reorder): Implement hook.
>       (TARGET_SCHED_REORDER): Define hook.
>       * config/riscv/riscv.opt: New flag.
>
> gcc/testsuite/ChangeLog:
>
>       * gcc.target/riscv/rvv/vsetvl/pr117974.c: New test.
>
> Signed-off-by: Edwin Lu <e...@rivosinc.com>
> Co-authored-by: Palmer Dabbelt <pal...@rivosinc.com>
> ---
>  gcc/config/riscv/riscv.cc                     | 103 ++++++++++++++++++
>  gcc/config/riscv/riscv.opt                    |   4 +
>  .../gcc.target/riscv/rvv/vsetvl/pr117974.c    |  15 +++
>  3 files changed, 122 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 89aa25d5da9..cf0866fa3fb 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -14035,6 +14035,106 @@ bool need_shadow_stack_push_pop_p ()
>    return is_zicfiss_p () && riscv_save_return_addr_reg_p ();
>  }
>
> +static int
> +vsetvl_avl_regno (rtx_insn *insn)
> +{
> +  if (recog_memoized (insn) < 0)
> +    return -1;
> +
> +  if (get_attr_type (insn) != TYPE_VSETVL
> +      && get_attr_type (insn) != TYPE_VSETVL_PRE)
> +    return -1;
> +
> +  extract_insn (insn);
> +  /* From vector.md, vsetvl operands are as follows:
> +      ;; operands[0]: VL.
> +      ;; operands[1]: AVL.
> +      ;; operands[2]: SEW
> +      ;; operands[3]: LMUL
> +      ;; operands[4]: Tail policy 0 or 1 (undisturbed/agnostic)
> +      ;; operands[5]: Mask policy 0 or 1 (undisturbed/agnostic)
> +     Return regno of avl operand.  */
> +  return REGNO (recog_data.operand[1]);
> +}
> +
> +static bool
> +insn_increases_zeroness_p (rtx_insn *insn, int regno)

Minor point, but this is just one case of introducing VL=0 zeroness, specific to
a branch, not the general case.
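The helper really answers "is this a conditional branch comparing REGNO against
zero", so a name scoped to that might read clearer than the generic zeroness
phrasing. Untested sketch, the name is made up by me:

  /* Return true if INSN is a conditional branch comparing hard register
     REGNO against zero.  */
  static bool
  riscv_branch_on_zero_p (rtx_insn *insn, unsigned int regno)
  {
    if (!JUMP_P (insn) || recog_memoized (insn) < 0)
      return false;

    extract_insn (insn);
    bool match_reg = false;
    bool comp_zero = false;
    for (int i = 0; i < recog_data.n_operands; i++)
      {
        rtx op = recog_data.operand[i];
        if (REG_P (op) && REGNO (op) == regno)
          match_reg = true;
        else if (CONST_INT_P (op) && INTVAL (op) == 0)
          comp_zero = true;
      }
    return match_reg && comp_zero;
  }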

> +{
> +  /* Check for branching against zero.  */
> +  if (JUMP_P (insn))
> +    {
> +      extract_insn (insn);
> +      bool match_reg = false;
> +      bool comp_zero = false;
> +      for (int i = 0; i < recog_data.n_operands; i++)
> +     {
> +       if (REG_P (recog_data.operand[i])
> +           && REGNO (recog_data.operand[i]) == regno)
> +         match_reg = true;
> +       /* A CONST_INT's value is accessed with INTVAL, not XINT.  */
> +       if (CONST_INT_P (recog_data.operand[i])
> +           && INTVAL (recog_data.operand[i]) == 0)
> +         comp_zero = true;
> +     }
> +      return match_reg && comp_zero;
> +    }
> +  return false;
> +}
> +
> +/* Copied from MIPS.  Removes the instruction at index LOWER from ready
> +   queue READY and reinserts it in front of the instruction at index
> +   HIGHER.  LOWER must be <= HIGHER.  */
> +static void
> +riscv_promote_ready (rtx_insn **ready, int lower, int higher)
> +{
> +  rtx_insn *new_head;
> +  int i;
> +
> +  new_head = ready[lower];
> +  for (i = lower; i < higher; i++)
> +    ready[i] = ready[i + 1];
> +  ready[i] = new_head;
> +}
> +
> +/* Attempt to avoid issuing VSETVL-type instructions before a branch that
> +   ensures the AVL is non-zero, as dynamically setting VL=0 can be slow.  */
> +static int
> +riscv_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
> +                  rtx_insn **ready, int *nreadyp, int cycle ATTRIBUTE_UNUSED)
> +{
> +    if (! TARGET_AVOID_VL_EQ_0)
> +      return riscv_issue_rate ();
> +
> +    for (int i = *nreadyp - 1; i >= 0; i--)
> +      {
> +     /* Find the vsetvl.  */
> +     int avl_regno = vsetvl_avl_regno (ready[i]);
> +     if (avl_regno == -1 || i == 0)
> +       continue;
> +     for (int j = i - 1; j >= 0; j--)
> +       {
> +         /* Exit if another vsetvl is found before finding a branch insn
> +            in the ready queue.  */
> +         if (recog_memoized (ready[j]) >= 0
> +             && (get_attr_type (ready[j]) == TYPE_VSETVL
> +                 || get_attr_type (ready[j]) == TYPE_VSETVL_PRE))
> +           break;
> +         /* Find branch.  */
> +         if (recog_memoized (ready[j]) >= 0
> +             && insn_increases_zeroness_p (ready[j], avl_regno))
> +           {
> +             /* Right now the only zeroness-increasing pattern we recognize
> +                is a branch-not-zero, so there's no sense in looking for any
> +                more zeroness at that point.  */
> +             riscv_promote_ready (ready, j, i);
> +             break;
> +           }
> +       }
> +      }
> +
> +    return riscv_issue_rate ();
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_ASM_ALIGNED_HI_OP
>  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> @@ -14430,6 +14530,9 @@ bool need_shadow_stack_push_pop_p ()
>  #undef TARGET_DOCUMENTATION_NAME
>  #define TARGET_DOCUMENTATION_NAME "RISC-V"
>
> +#undef TARGET_SCHED_REORDER
> +#define TARGET_SCHED_REORDER riscv_sched_reorder
> +
>  struct gcc_target targetm = TARGET_INITIALIZER;
>
>  #include "gt-riscv.h"
> diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
> index 7515c8ea13d..c6cab61fdc0 100644
> --- a/gcc/config/riscv/riscv.opt
> +++ b/gcc/config/riscv/riscv.opt
> @@ -681,3 +681,7 @@ Specifies whether the fence.tso instruction should be used.
>  mautovec-segment
>  Target Integer Var(riscv_mautovec_segment) Init(1)
>  Enable (default) or disable generation of vector segment load/store instructions.
> +
> +mavoid-vl0
> +Target Var(TARGET_AVOID_VL_EQ_0) Init(1)
> +Avoid (default) code that dynamically sets VL=0 where possible.

As stated above, this is not the general case of VL=0 avoidance but the
intersection of VL=0 and a branch, so it would be better to phrase the option
description that way.
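Maybe something along these lines (the wording is only a suggestion):

  mavoid-vl0
  Target Var(TARGET_AVOID_VL_EQ_0) Init(1)
  Avoid (default), where possible, scheduling a vsetvl that may set VL=0 ahead of the branch that checks its AVL against zero.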

Also, as others have said in earlier threads, this would be better as a CPU tune,
specifically part of the vector tuning, maybe enabled by default for the OoO tune.
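A rough sketch of what that could look like (the field name below is invented,
nothing like it exists in the tree today): add a bool to the tune parameters,
e.g. in struct riscv_tune_param

  bool sched_vsetvl_after_branch;  /* Invented name, for illustration only.  */

and have riscv_sched_reorder gate on the active tuning rather than a bare -m flag:

  if (!tune_param->sched_vsetvl_after_branch)
    return riscv_issue_rate ();

with the OoO tunings (e.g. generic-ooo) turning it on by default.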

Otherwise looks pretty cool!

-Vineet

> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
> new file mode 100644
> index 00000000000..275922eb0bf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -mrvv-vector-bits=zvl -Ofast" } */
> +
> +float g(float q[], int N){
> +    float dqnorm = 0.0;
> +
> +    #pragma GCC unroll 4
> +
> +    for (int i=0; i < N; i++) {
> +        dqnorm = dqnorm + q[i] * q[i];
> +    }
> +    return dqnorm;
> +}
> +
> +/* { dg-final { scan-assembler-times {beq\s+[a-x0-9]+,zero,.L12\s+vsetvli} 3 } } */
> --
> 2.43.0
>
