https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116425

            Bug ID: 116425
           Summary: RISC-V missed optimization: vector lowering along lmul
                    boundaries
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ewlu at rivosinc dot com
  Target Milestone: ---

Godbolt: https://godbolt.org/z/4T68qnsjc

Flags: -O3 -march=rv64gcv_zvl256b -ftree-vectorize -mrvv-max-lmul=m1

Testcase:
```
/* Type definitions omitted in the report; assumed to be the usual
   fixed-length vectors of 32-bit int (8 and 16 elements).  */
typedef int vnx8si __attribute__ ((vector_size (32)));
typedef int vnx16si __attribute__ ((vector_size (64)));

__attribute__ ((noipa)) void permute_vnx8si (vnx8si values1, vnx8si values2,
                                             vnx8si *out) {
    vnx8si v =
        __builtin_shufflevector (values1, values2, 1, 1, 1, 1, 1, 1, 1, 1);
    *(vnx8si *) out = v;
}

__attribute__ ((noipa)) void permute_vnx16si (vnx16si values1, vnx16si values2,
                                              vnx16si *out) {
    vnx16si v =
        __builtin_shufflevector (values1, values2, 1, 1, 1, 1, 1, 1, 1, 1,
                                 1, 1, 1, 1, 1, 1, 1, 1);
    *(vnx16si *) out = v;
}
```

Assembly:
```
permute_vnx8si:
        vsetivli        zero,8,e32,m1,ta,ma
        vle32.v v2,0(a0)
        vrgather.vi     v1,v2,1
        vse32.v v1,0(a2)
        ret
permute_vnx16si:
        lw      a5,4(a0)
        sw      a5,0(a2)
        sw      a5,4(a2)
        sw      a5,8(a2)
        sw      a5,12(a2)
        sw      a5,16(a2)
        sw      a5,20(a2)
        sw      a5,24(a2)
        sw      a5,28(a2)
        sw      a5,32(a2)
        sw      a5,36(a2)
        sw      a5,40(a2)
        sw      a5,44(a2)
        sw      a5,48(a2)
        sw      a5,52(a2)
        sw      a5,56(a2)
        sw      a5,60(a2)
        ret
```

For permute_vnx16si, it should be possible to replace all of the scalar stores
with two vrgather.vi instructions, like:

permute_vnx16si:
        vsetivli        zero,8,e32,m1,ta,ma
        vle32.v v2,0(a0)
        vrgather.vi     v8,v2,1
        vrgather.vi     v9,v2,1
        vse32.v v8,0(a2)
        vse32.v v9,32(a2)
        ret

This essentially mimics what a larger LMUL would do. Since both gathers read
the same source with the same indices, the second vrgather could then be
eliminated as well, so we end up with something like:

permute_vnx16si:
        vsetivli        zero,8,e32,m1,ta,ma
        vle32.v v2,0(a0)
        vrgather.vi     v8,v2,1
        vse32.v v8,0(a2)
        vse32.v v8,32(a2)
        ret
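
For reference, here is a source-level sketch of the same split (the function
name is hypothetical and the typedefs are the assumed ones from the testcase
above). The 16-lane splat of element 1 only ever reads the low vnx8si half of
values1, so a single m1-sized gather can feed both halves of the output:

```
typedef int vnx8si __attribute__ ((vector_size (32)));
typedef int vnx16si __attribute__ ((vector_size (64)));

/* Hypothetical hand-split version of permute_vnx16si.  */
__attribute__ ((noipa)) void
permute_vnx16si_split (vnx16si values1, vnx16si values2, vnx16si *out)
{
  /* Low 8 lanes of values1; lane 1 of the vnx16si lives here.  */
  vnx8si lo = __builtin_shufflevector (values1, values1,
                                       0, 1, 2, 3, 4, 5, 6, 7);
  /* One 8-lane splat of element 1 stands in for both result halves.  */
  vnx8si v = __builtin_shufflevector (lo, lo, 1, 1, 1, 1, 1, 1, 1, 1);
  /* Two half-width stores at byte offsets 0 and 32.  */
  vnx8si *halves = (vnx8si *) out;
  halves[0] = v;
  halves[1] = v;
}
```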

Currently, the veclower pass lowers permute_vnx16si like so (every
BIT_FIELD_REF extracts the same 32-bit element at bit offset 32, since all of
the selector indices are 1):

__attribute__((noipa, noinline, noclone, no_icf))
void permute_vnx16si (vnx16si values1, vnx16si values2, vnx16si * out)
{
  vnx16si v;
  int _6;
  int _7;
  int _8;
  int _9;
  int _10;
  int _11;
  int _12;
  int _13;
  int _14;
  int _15;
  int _16;
  int _17;
  int _18;
  int _19;
  int _20;
  int _21;

  <bb 2> [local count: 1073741824]:
  _6 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _7 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _8 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _9 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _10 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _11 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _12 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _13 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _14 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _15 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _16 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _17 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _18 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _19 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _20 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  _21 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  v_2 = {_6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20,
_21};
  *out_4(D) = v_2;
  return;

}

I was thinking that modifying lower_vec_perm to check whether the vector can
be split, and if so to perform the split, would make the most sense.

This would allow the veclower pass to output something along the lines of

  vnx8si lo;
  vnx8si v1;
  vnx8si v2;

  <bb 2> [local count: 1073741824]:
  lo_1 = BIT_FIELD_REF <values1_1(D), 256, 0>;
  v1_2 = VEC_PERM_EXPR <lo_1, lo_1, { 1, 1, 1, 1, 1, 1, 1, 1 }>;
  v2_3 = VEC_PERM_EXPR <lo_1, lo_1, { 1, 1, 1, 1, 1, 1, 1, 1 }>;
  MEM[(vnx8si *)out_4(D)] = v1_2;
  MEM[(vnx8si *)out_4(D) + 32B] = v2_3;
  return;
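
The legality check for such a split is just index arithmetic. Below is a
standalone C sketch (not GCC internals; the names and layout are mine) of the
remapping each output chunk needs: an 8-lane output chunk can be emitted as a
single 8-lane VEC_PERM_EXPR as long as its selector lanes reference at most
two of the 8-lane halves of the concatenated inputs.

```
#include <stdio.h>

enum { NUNITS = 16, CHUNK = 8 };

int main (void)
{
  /* The selector from the permute_vnx16si testcase: splat lane 1.  */
  int sel[NUNITS] = { 1, 1, 1, 1, 1, 1, 1, 1,
                      1, 1, 1, 1, 1, 1, 1, 1 };

  for (int c = 0; c < NUNITS / CHUNK; c++)
    {
      printf ("output chunk %d:", c);
      for (int i = 0; i < CHUNK; i++)
        {
          int s = sel[c * CHUNK + i];
          /* Lane s of the concatenated inputs sits in 8-lane half
             s / CHUNK, at position s % CHUNK within that half.  */
          printf (" (half %d, lane %d)", s / CHUNK, s % CHUNK);
        }
      printf ("\n");
    }
  return 0;
}
```

Here both chunks reference only half 0, lane 1, which is what makes the two
half-width VEC_PERM_EXPRs above possible.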

However, getting past the !VECTOR_MODE_P (mode) check in can_vec_perm_var_p
would be rather messy, since the vector's mode here is E_BLKmode (with
zvl256b and -mrvv-max-lmul=m1, the 512-bit vnx16si is wider than any
available vector mode, so it never gets a vector machine mode).

The other possibility I was considering is modifying the forwprop pass, since
forwprop converts the vector constructor into:

__attribute__((noipa, noinline, noclone, no_icf))
void permute_vnx16si (vnx16si values1, vnx16si values2, vnx16si * out)
{
  vnx16si v;
  int _6;

  <bb 2> [local count: 1073741824]:
  _6 = BIT_FIELD_REF <values1_1(D), 32, 32>;
  BIT_FIELD_REF <*out_4(D), 32, 0> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 32> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 64> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 96> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 128> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 160> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 192> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 224> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 256> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 288> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 320> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 352> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 384> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 416> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 448> = _6;
  BIT_FIELD_REF <*out_4(D), 32, 480> = _6;
  return;

}

Ideally, the forwprop pass should be able to recognize that

v_2 = {_6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20,
_21};

could be split into two half-width vectors and end up with the same resulting
representation as the veclower approach above. I think this would be a less
invasive change, but it could be the wrong pass to add this to.
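
At the source level, the result I would hope forwprop reaches corresponds to
something like the following (hypothetical function, with the single extracted
element and the two half-width stores written out by hand):

```
typedef int vnx8si __attribute__ ((vector_size (32)));
typedef int vnx16si __attribute__ ((vector_size (64)));

/* Hypothetical source-level equivalent of the desired forwprop output.  */
__attribute__ ((noipa)) void
permute_vnx16si_ideal (vnx16si values1, vnx16si values2, vnx16si *out)
{
  int e = values1[1];               /* the lone BIT_FIELD_REF, _6 above */
  vnx8si splat = { e, e, e, e, e, e, e, e };
  vnx8si *halves = (vnx8si *) out;  /* two half-width vector stores */
  halves[0] = splat;
  halves[1] = splat;
}
```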

I might also be looking in the wrong location completely...
