https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111470

            Bug ID: 111470
           Summary: RISC-V: autovec fma generate redundant vmerge
                    instruction
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: lehua.ding at rivai dot ai
  Target Milestone: ---

Consider this code:

/* Reproducer: for each i in [0, n), store a[i] * b[i] + c[i] into out[i]
   when pred[i] is nonzero, otherwise store merged[i].  All pointer
   parameters are restrict-qualified.  */
void foo (float *restrict a, float *restrict b, float *restrict c,
          float *restrict merged, int *restrict pred,
          float *restrict out, int n)
{
  for (int i = 0; i < n; i += 1)
    out[i] = pred[i] ? (a[i] * b[i] + c[i]) : merged[i];
}

Generated assembly (the second vmerge is redundant):

foo:
        ble     a6,zero,.L5
        vsetvli t4,zero,e32,m1,ta,ma
        vmv.v.i v7,0
.L3:
        vsetvli a7,a6,e32,m1,ta,ma
        vle32.v v0,0(a4)
        vsetvli t4,zero,e32,m1,ta,ma
        vmseq.vi        v1,v0,0
        vmsne.vi        v2,v0,0
        vmv1r.v v0,v2
        vsetvli zero,a7,e32,m1,ta,ma
        vle32.v v6,0(a0),v0.t
        vmv1r.v v0,v1
        vle32.v v1,0(a3),v0.t
        vmv1r.v v0,v2
        vle32.v v5,0(a1),v0.t
        vle32.v v4,0(a2),v0.t
        vmerge.vvm      v3,v7,v4,v0
        vsetvli zero,zero,e32,m1,tu,mu
        vfmacc.vv       v3,v6,v5,v0.t
        vsetvli t3,zero,e32,m1,ta,ma
        slli    t1,a7,2
        vmerge.vvm      v1,v1,v3,v0
        sub     a6,a6,a7
        vsetvli zero,a7,e32,m1,ta,ma
        vse32.v v1,0(a5)
        add     a4,a4,t1
        add     a0,a0,t1
        add     a1,a1,t1
        add     a2,a2,t1
        add     a3,a3,t1
        add     a5,a5,t1
        bne     a6,zero,.L3
.L5:
        ret

From the vect dump, the .COND_LEN_ADD and .VCOND_MASK can be combined into a
single .COND_LEN_ADD. For .COND_ADD and .VCOND_MASK, match.pd already has a
simplify pattern (shown below) that handles this combination. So I think we
should add .COND_LEN_ADD to match.pd too.

  /* Detect cases in which a VEC_COND_EXPR effectively replaces the
   "else" value of an IFN_COND_*.  */
  (for cond_op (COND_BINARY)
   (simplify
    (vec_cond @0 (view_convert? (cond_op @0 @1 @2 @3)) @4)
    (with { tree op_type = TREE_TYPE (@3); }
     (if (element_precision (type) == element_precision (op_type))
      (view_convert (cond_op @0 @1 @2 (view_convert:op_type @4))))))
   (simplify
    (vec_cond @0 @1 (view_convert? (cond_op @2 @3 @4 @5)))
    (with { tree op_type = TREE_TYPE (@5); }
     (if (inverse_conditions_p (@0, @2)
          && element_precision (type) == element_precision (op_type))
      (view_convert (cond_op @2 @3 @4 (view_convert:op_type @1)))))))

Reply via email to