Hi, Richard.

>> I'm a bit behind on email, but why isn't BIT_FIELD_REF enough for
>> the case that the patch is handling?  It seems that:

>>   .EXTRACT_LAST (len, vec)

>> is equivalent to:

>>   vec[len - 1]

>> I think eventually there'll be the temptation to lower/fold it like that.

The current BIT_FIELD_REF approach can't make use of LOOP_LEN: BIT_FIELD_REF
extracts at a fixed, compile-time position, whereas with length control the
index of the last element is only known at run time (the last .SELECT_VL
result).

Consider the following case:

#include <stdint.h>
#define EXTRACT_LAST(TYPE)                      \
  TYPE __attribute__ ((noinline, noclone))      \
  test_##TYPE (TYPE *x, int n, TYPE value)      \
  {                                             \
    TYPE last;                                  \
    for (int j = 0; j < n; ++j)                 \
      {                                         \
        last = x[j];                            \
        x[j] = last * value;                    \
      }                                         \
    return last;                                \
  }
#define TEST_ALL(T)                             \
  T (uint8_t)

TEST_ALL (EXTRACT_LAST)
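
For uint8_t, TEST_ALL (EXTRACT_LAST) expands to the function below; note that
`last` is live after the loop, which is what creates the extract-last pattern:

uint8_t __attribute__ ((noinline, noclone))
test_uint8_t (uint8_t *x, int n, uint8_t value)
{
  uint8_t last;
  for (int j = 0; j < n; ++j)
    {
      last = x[j];
      x[j] = last * value;
    }
  return last;
}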

The assembly before this patch:
https://godbolt.org/z/z1PPT948b

test_uint8_t:
        mv      a3,a0
        ble     a1,zero,.L10
        addiw   a5,a1,-1
        li      a4,14
        sext.w  a0,a1
        bleu    a5,a4,.L11
        srliw   a4,a0,4
        slli    a4,a4,4
        mv      a5,a3
        add     a4,a4,a3
        vsetivli        zero,16,e8,m1,ta,ma
        vmv.v.x v3,a2
.L4:
        vl1re8.v        v1,0(a5)
        vmul.vv v2,v1,v3
        vs1r.v  v2,0(a5)
        addi    a5,a5,16
        bne     a4,a5,.L4
        andi    a4,a1,-16
        mv      a5,a4
        vsetivli        zero,8,e8,mf2,ta,ma
        beq     a0,a4,.L16
.L3:
        subw    a0,a0,a4
        addiw   a7,a0,-1
        li      a6,6
        bleu    a7,a6,.L7
        slli    a4,a4,32
        srli    a4,a4,32
        add     a4,a3,a4
        andi    a6,a0,-8
        vle8.v  v2,0(a4)
        vmv.v.x v1,a2
        andi    a0,a0,7
        vmul.vv v1,v1,v2
        vse8.v  v1,0(a4)
        addw    a5,a6,a5
        beq     a0,zero,.L8
.L7:
        add     a6,a3,a5
        lbu     a0,0(a6)
        addiw   a4,a5,1
        mulw    a7,a0,a2
        sb      a7,0(a6)
        bge     a4,a1,.L14
        add     a4,a3,a4
        lbu     a0,0(a4)
        addiw   a6,a5,2
        mulw    a7,a2,a0
        sb      a7,0(a4)
        ble     a1,a6,.L14
        add     a6,a3,a6
        lbu     a0,0(a6)
        addiw   a4,a5,3
        mulw    a7,a2,a0
        sb      a7,0(a6)
        ble     a1,a4,.L14
        add     a4,a3,a4
        lbu     a0,0(a4)
        addiw   a6,a5,4
        mulw    a7,a2,a0
        sb      a7,0(a4)
        ble     a1,a6,.L14
        add     a6,a3,a6
        lbu     a0,0(a6)
        addiw   a4,a5,5
        mulw    a7,a2,a0
        sb      a7,0(a6)
        ble     a1,a4,.L14
        add     a4,a3,a4
        lbu     a0,0(a4)
        addiw   a5,a5,6
        mulw    a6,a2,a0
        sb      a6,0(a4)
        ble     a1,a5,.L14
        add     a3,a3,a5
        lbu     a0,0(a3)
        mulw    a2,a2,a0
        sb      a2,0(a3)
        ret
.L10:
        li      a0,0
.L14:
        ret
.L8:
        vslidedown.vi   v2,v2,7
        vmv.x.s a0,v2
        andi    a0,a0,0xff
        ret
.L11:
        li      a4,0
        li      a5,0
        vsetivli        zero,8,e8,mf2,ta,ma
        j       .L3
.L16:
        vsetivli        zero,16,e8,m1,ta,ma
        vslidedown.vi   v1,v1,15
        vmv.x.s a0,v1
        andi    a0,a0,0xff
        ret


This patch optimizes the RVV codegen by using length control. After the
patch:

GIMPLE IR:

  <bb 4> [local count: 955630224]:
  # vectp_x.6_22 = PHI <vectp_x.6_23(4), x_11(D)(3)>
  # vectp_x.10_30 = PHI <vectp_x.10_31(4), x_11(D)(3)>
  # ivtmp_34 = PHI <ivtmp_35(4), _33(3)>
  _36 = .SELECT_VL (ivtmp_34, 16);
  vect_last_12.8_24 = .MASK_LEN_LOAD (vectp_x.6_22, 8B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, _36, 0);
  vect__4.9_28 = vect_last_12.8_24 * vect_cst__27;
  .MASK_LEN_STORE (vectp_x.10_30, 8B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, _36, 0, vect__4.9_28);
  vectp_x.6_23 = vectp_x.6_22 + _36;
  vectp_x.10_31 = vectp_x.10_30 + _36;
  ivtmp_35 = ivtmp_34 - _36;
  if (ivtmp_35 != 0)
    goto <bb 4>; [89.00%]
  else
    goto <bb 5>; [11.00%]

  <bb 5> [local count: 105119324]:
  _26 = .EXTRACT_LAST (_36, vect_last_12.8_24); [tail call]
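
Here the final .EXTRACT_LAST takes the last .SELECT_VL length (_36) and the
vector from the final .MASK_LEN_LOAD, so its scalar semantics can be sketched
as follows (a hypothetical model for illustration, not the actual lowering):

#include <stdint.h>
#include <stddef.h>

/* Sketch of .EXTRACT_LAST (len, vec): the live-out value is the lane at
   index len - 1, i.e. the last active element of the final iteration.
   The vector's lanes are modelled as a plain array here.  */
static uint8_t
extract_last_model (const uint8_t *vec, size_t len)
{
  return vec[len - 1];
}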

The assembly after this patch:
test_uint8_t:
        ble     a1,zero,.L4
        mv      a4,a0
        vsetivli        zero,16,e8,m1,ta,ma
        vmv.v.x v3,a2
.L3:
        vsetvli a5,a1,e8,m1,ta,ma
        vle8.v  v1,0(a0)
        vsetivli        zero,16,e8,m1,ta,ma
        sub     a1,a1,a5
        vmul.vv v2,v1,v3
        vsetvli zero,a5,e8,m1,ta,ma
        vse8.v  v2,0(a4)
        add     a0,a0,a5
        add     a4,a4,a5
        bne     a1,zero,.L3
        addi    a5,a5,-1
        vsetivli        zero,16,e8,m1,ta,ma
        vslidedown.vx   v1,v1,a5
        vmv.x.s a0,v1
        andi    a0,a0,0xff
        ret
.L4:
        li      a0,0
        ret

I think the codegen is much better with this patch: the loop is fully
length-controlled and the long scalar epilogue is gone.
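
The tail sequence above (addi a5,a5,-1 / vslidedown.vx / vmv.x.s) corresponds
roughly to the intrinsics below (a sketch assuming the RVV v1.0 intrinsics
API; the vl of 16 mirrors the vsetivli in the assembly):

#include <stdint.h>
#include <stddef.h>
#include <riscv_vector.h>

/* Rough model of the extract-last tail code: slide lane len - 1 down to
   lane 0, then move lane 0 into a scalar register.  */
static uint8_t
extract_last_rvv (vuint8m1_t vec, size_t len)
{
  vec = __riscv_vslidedown_vx_u8m1 (vec, len - 1, 16);
  return __riscv_vmv_x_s_u8m1_u8 (vec);
}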

>> FWIW, I agree an IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN would be OK,
>> with a mask, vector, length and bias.  But even then, I think there'll
>> be a temptation to lower calls with all-1 masks to val[len - 1 - bias].
>> So I think the function only makes sense if we have a use case where
>> the mask might not be all-1s.

I am wondering: instead of adding
IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN, does it make sense to use
VEC_EXTRACT as follows?

Loop:
  loop_len_22 = .SELECT_VL (...)
  ...

Epilogue:
  new_loop_len_22 = PHI <loop_len_22>
  scalar_res = .VEC_EXTRACT (v, new_loop_len_22 - 1 - BIAS)
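
In scalar terms that epilogue would compute something like the sketch below
(assuming the usual len_load/len_store convention, where BIAS is 0 or -1):

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the proposed VEC_EXTRACT epilogue: pick the last
   active element, with the index adjusted by the bias.  */
static uint8_t
vec_extract_model (const uint8_t *v, size_t len, int bias)
{
  return v[len - 1 - bias];
}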

Feel free to correct me if I am wrong.

Thanks.


juzhe.zh...@rivai.ai
 
From: Richard Sandiford
Date: 2023-08-09 21:30
To: juzhe.zhong@rivai.ai
CC: rguenther; gcc-patches
Subject: Re: [PATCH] VECT: Support loop len control on EXTRACT_LAST vectorization
"juzhe.zh...@rivai.ai" <juzhe.zh...@rivai.ai> writes:
> Hi, Richi.
>
>>> that should be
>
>>>   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
>>>       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>
>>> I think.  It seems to imply that SLP isn't supported with
>>> masking/lengthing.
>
> Oh, yes.  At first glance, the original code is quite suspicious and your 
> comments make sense to me.
>
>>> Hum, how does CFN_EXTRACT_LAST handle both mask and length transparently?
>>> Don't you need some CFN_LEN_EXTRACT_LAST instead?
>
> I think CFN_EXTRACT_LAST always has either loop mask or loop len.
>
> When both mask and length are not needed, IMHO, I think current BIT_FIELD_REF 
> flow is good enough:
> https://godbolt.org/z/Yr5M9hcc6
 
I'm a bit behind on email, but why isn't BIT_FIELD_REF enough for
the case that the patch is handling?  It seems that:
 
  .EXTRACT_LAST (len, vec)
 
is equivalent to:
 
  vec[len - 1]
 
I think eventually there'll be the temptation to lower/fold it like that.
 
FWIW, I agree an IFN_LEN_EXTRACT_LAST/IFN_EXTRACT_LAST_LEN would be OK,
with a mask, vector, length and bias.  But even then, I think there'll
be a temptation to lower calls with all-1 masks to val[len - 1 - bias].
So I think the function only makes sense if we have a use case where
the mask might not be all-1s.
 
Thanks,
Richard
 
