On 2021/7/13 8:42 PM, Richard Biener wrote:
> On Tue, Jul 13, 2021 at 12:25 PM Kewen.Lin <[email protected]> wrote:
>>
>> Hi Richi,
>>
>> Thanks for the comments!
>>
>> On 2021/7/13 5:35 PM, Richard Biener wrote:
>>> On Tue, Jul 13, 2021 at 10:53 AM Kewen.Lin <[email protected]> wrote:
>>>>
>>>> Hi,
>>>>
>>>> When I added support for the multiply highpart instructions newly
>>>> introduced on Power10, I noticed that the vectorizer currently
>>>> doesn't try to vectorize the multiply highpart pattern; I hope
>>>> this isn't intentional?
>>>>
>>>> This patch extends the existing mulhs pattern handling to cover
>>>> multiply highpart. An alternative would be to recognize the
>>>> mul_highpart operation in a generic place for scalar code when
>>>> the target supports the optab for the scalar operation; that is
>>>> based on the assumption that a target which supports the vector
>>>> version of multiply highpart should also have the scalar version.
>>>> Since the function can_mult_highpart_p can check/handle
>>>> mult_highpart well even without mul_highpart optab support, I
>>>> think recognizing this pattern in the vectorizer is better. Is
>>>> it on the right track?
>>>
>>> I think it's on the right track, using IFN_LAST is a bit awkward
>>> in case yet another case pops up so maybe you can use
>>> a code_helper instance instead which unifies tree_code,
>>> builtin_code and internal_fn?
>>>
>>
>> If there is a new requirement which doesn't have/introduce any IFN
>> but does have an existing tree_code, can we add one more field of
>> type tree_code, so that for the IFN_LAST path we can check the
>> different requirements guarded by that tree_code variable?
>>
>>> I also notice that can_mult_highpart_p will return true if
>>> only vec_widen_[us]mult_{even,odd,hi,lo} are available,
>>> but then the result might be less optimal (or even not
>>> handled later)?
>>>
>>
>> I think it will always be handled?  The expander calls
>>
>> rtx
>> expand_mult_highpart (machine_mode mode, rtx op0, rtx op1,
>> rtx target, bool uns_p)
>>
>> which will further check with can_mult_highpart_p.
>>
>> For the below case,
>>
>> #define SHT_CNT 16
>>
>> __attribute__ ((noipa)) void
>> test ()
>> {
>> for (int i = 0; i < N; i++)
>> sh_c[i] = ((SI) sh_a[i] * (SI) sh_b[i]) >> 16;
>> }
>>
>> Without this patch, it uses widen_mult like below:
>>
>> vect__1.5_19 = MEM <vector(8) short int> [(short int *)&sh_a + ivtmp.18_24 * 1];
>> vect__3.8_14 = MEM <vector(8) short int> [(short int *)&sh_b + ivtmp.18_24 * 1];
>> vect_patt_22.9_13 = WIDEN_MULT_LO_EXPR <vect__3.8_14, vect__1.5_19>;
>> vect_patt_22.9_9 = WIDEN_MULT_HI_EXPR <vect__3.8_14, vect__1.5_19>;
>> vect__6.10_25 = vect_patt_22.9_13 >> 16;
>> vect__6.10_26 = vect_patt_22.9_9 >> 16;
>> vect__7.11_27 = VEC_PACK_TRUNC_EXPR <vect__6.10_25, vect__6.10_26>;
>> MEM <vector(8) short int> [(short int *)&sh_c + ivtmp.18_24 * 1] = vect__7.11_27;
>>
>> .L2:
>> lxvx 33,7,9
>> lxvx 32,8,9
>> vmulosh 13,0,1 // widen mult
>> vmulesh 0,0,1
>> xxmrglw 33,32,45 // merge
>> xxmrghw 32,32,45
>> vsraw 1,1,12 // shift
>> vsraw 0,0,12
>> vpkuwum 0,0,1 // pack
>> stxvx 32,10,9
>> addi 9,9,16
>> bdnz .L2
>>
>>
>> With this patch, it ends up with:
>>
>> vect__1.5_14 = MEM <vector(8) short int> [(short int *)&sh_a + ivtmp.17_24 * 1];
>> vect__3.8_8 = MEM <vector(8) short int> [(short int *)&sh_b + ivtmp.17_24 * 1];
>> vect_patt_21.9_25 = vect__3.8_8 h* vect__1.5_14;
>> MEM <vector(8) short int> [(short int *)&sh_c + ivtmp.17_24 * 1] = vect_patt_21.9_25;
>
> Yes, so I'm curious what it ends up with/without the patch on x86_64 which
> can do vec_widen_[us]mult_{even,odd} but not [us]mul_highpart.
>
For the test case:
```
#define N 32
typedef signed int bigType;
typedef signed short smallType;
#define SH_CNT 16
extern smallType small_a[N], small_b[N], small_c[N];
__attribute__((noipa)) void test_si(int n) {
for (int i = 0; i < n; i++)
small_c[i] = ((bigType)small_a[i] * (bigType)small_b[i]) >> SH_CNT;
}
```
on x86_64, with options: -mfpmath=sse -msse2 -O2 -ftree-vectorize
1) Without this patch, the pattern isn't recognized; the IR looks like:
<bb 4> [local count: 94607391]:
bnd.5_34 = niters.4_25 >> 3;
_13 = (sizetype) bnd.5_34;
_29 = _13 * 16;
<bb 5> [local count: 378429566]:
# ivtmp.34_4 = PHI <ivtmp.34_5(5), 0(4)>
vect__1.10_40 = MEM <vector(8) short int> [(short int *)&small_a + ivtmp.34_4 * 1];
vect__3.13_43 = MEM <vector(8) short int> [(short int *)&small_b + ivtmp.34_4 * 1];
vect_patt_18.14_44 = WIDEN_MULT_LO_EXPR <vect__1.10_40, vect__3.13_43>;
vect_patt_18.14_45 = WIDEN_MULT_HI_EXPR <vect__1.10_40, vect__3.13_43>;
vect__6.15_46 = vect_patt_18.14_44 >> 16;
vect__6.15_47 = vect_patt_18.14_45 >> 16;
vect__7.16_48 = VEC_PACK_TRUNC_EXPR <vect__6.15_46, vect__6.15_47>;
MEM <vector(8) short int> [(short int *)&small_c + ivtmp.34_4 * 1] = vect__7.16_48;
ivtmp.34_5 = ivtmp.34_4 + 16;
if (ivtmp.34_5 != _29)
goto <bb 5>; [75.00%]
else
goto <bb 6>; [25.00%]
...
*asm*:
.L4:
movdqu small_b(%rax), %xmm3
movdqu small_a(%rax), %xmm1
addq $16, %rax
movdqu small_a-16(%rax), %xmm2
pmullw %xmm3, %xmm1
pmulhw %xmm3, %xmm2
movdqa %xmm1, %xmm0
punpcklwd %xmm2, %xmm0
punpckhwd %xmm2, %xmm1
psrad $16, %xmm1
psrad $16, %xmm0
movdqa %xmm0, %xmm2
punpcklwd %xmm1, %xmm0
punpckhwd %xmm1, %xmm2
movdqa %xmm0, %xmm1
punpckhwd %xmm2, %xmm1
punpcklwd %xmm2, %xmm0
punpcklwd %xmm1, %xmm0
movups %xmm0, small_c-16(%rax)
cmpq %rdx, %rax
jne .L4
movl %edi, %eax
andl $-8, %eax
testb $7, %dil
je .L14
*insn dist in loop*:
1 addq
3 movdqa
3 movdqu
1 movups
1 pmulhw
1 pmullw
2 psrad
3 punpckhwd
4 punpcklwd
2) With this patch, but with the mul_highpart optab query made to return
false, the IR looks like:
<bb 4> [local count: 94607391]:
bnd.5_37 = niters.4_22 >> 3;
_13 = (sizetype) bnd.5_37;
_32 = _13 * 16;
<bb 5> [local count: 378429566]:
# ivtmp.33_4 = PHI <ivtmp.33_5(5), 0(4)>
vect__1.10_43 = MEM <vector(8) short int> [(short int *)&small_a + ivtmp.33_4 * 1];
vect__3.13_46 = MEM <vector(8) short int> [(short int *)&small_b + ivtmp.33_4 * 1];
vect_patt_26.14_47 = vect__1.10_43 h* vect__3.13_46;
MEM <vector(8) short int> [(short int *)&small_c + ivtmp.33_4 * 1] = vect_patt_26.14_47;
ivtmp.33_5 = ivtmp.33_4 + 16;
if (ivtmp.33_5 != _32)
goto <bb 5>; [75.00%]
else
goto <bb 6>; [25.00%]
*asm*:
.L4:
movdqu small_b(%rax), %xmm3
movdqu small_a(%rax), %xmm1
addq $16, %rax
movdqu small_a-16(%rax), %xmm2
pmullw %xmm3, %xmm1
pmulhw %xmm3, %xmm2
movdqa %xmm1, %xmm0
punpcklwd %xmm2, %xmm0
punpckhwd %xmm2, %xmm1
movdqa %xmm0, %xmm2
punpcklwd %xmm1, %xmm0
punpckhwd %xmm1, %xmm2
movdqa %xmm0, %xmm1
punpckhwd %xmm2, %xmm1
punpcklwd %xmm2, %xmm0
punpckhwd %xmm1, %xmm0
movups %xmm0, small_c-16(%rax)
cmpq %rdx, %rax
jne .L4
movl %edi, %eax
andl $-8, %eax
testb $7, %dil
je .L14
*insn dist*:
1 addq
3 movdqa
3 movdqu
1 movups
1 pmulhw
1 pmullw
4 punpckhwd
3 punpcklwd
I know little about i386 asm, but this looks better from the insn
distribution: the version without this patch uses two more psrad instructions.
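To make the difference concrete, here is a scalar C model of the two
lowerings being compared (just an illustration with made-up helper names,
not code from the patch); both compute the high 16 bits of the 16x16->32
product:
```
#include <stdint.h>

/* Direct highpart multiply: what the "h*" GIMPLE operation denotes.
   At vector level this can map to a single mul-high instruction
   (e.g. pmulhw) per vector.  */
static int16_t
mulh_direct (int16_t a, int16_t b)
{
  return (int16_t) (((int32_t) a * (int32_t) b) >> 16);
}

/* Widening-multiply lowering as in 1) above: form the full 32-bit
   product (WIDEN_MULT_{LO,HI}_EXPR at vector level), shift it right,
   then truncate back to 16 bits (VEC_PACK_TRUNC_EXPR) -- the shifts
   are where the two extra psrad instructions come from.  */
static int16_t
mulh_via_widen (int16_t a, int16_t b)
{
  int32_t wide = (int32_t) a * (int32_t) b;   /* widening multiply */
  int32_t shifted = wide >> 16;               /* arithmetic shift right */
  return (int16_t) shifted;                   /* pack/truncate */
}
```
The two functions return the same value for all inputs; recognizing the
pattern just lets the expander pick the cheaper lowering when the target
has one.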
3) FWIW, with this patch as well as the existing optab support, the IR looks like:
<bb 4> [local count: 94607391]:
bnd.5_40 = niters.4_19 >> 3;
_75 = (sizetype) bnd.5_40;
_91 = _75 * 16;
<bb 5> [local count: 378429566]:
# ivtmp.48_53 = PHI <ivtmp.48_45(5), 0(4)>
vect__1.10_48 = MEM <vector(8) short int> [(short int *)&small_a + ivtmp.48_53 * 1];
vect__3.13_51 = MEM <vector(8) short int> [(short int *)&small_b + ivtmp.48_53 * 1];
vect_patt_26.14_52 = vect__1.10_48 h* vect__3.13_51;
MEM <vector(8) short int> [(short int *)&small_c + ivtmp.48_53 * 1] = vect_patt_26.14_52;
ivtmp.48_45 = ivtmp.48_53 + 16;
if (ivtmp.48_45 != _91)
goto <bb 5>; [75.00%]
else
goto <bb 6>; [25.00%]
<bb 6> [local count: 94607391]:
niters_vector_mult_vf.6_41 = niters.4_19 & 4294967288;
tmp.7_42 = (int) niters_vector_mult_vf.6_41;
_61 = niters.4_19 & 7;
if (_61 == 0)
goto <bb 12>; [12.50%]
else
goto <bb 7>; [87.50%]
<bb 7> [local count: 93293400]:
# i_38 = PHI <tmp.7_42(6), 0(3)>
# _44 = PHI <niters_vector_mult_vf.6_41(6), 0(3)>
niters.18_60 = niters.4_19 - _44;
_76 = niters.18_60 + 4294967295;
if (_76 <= 2)
goto <bb 9>; [10.00%]
else
goto <bb 8>; [90.00%]
<bb 8> [local count: 167928121]:
_85 = (sizetype) _44;
_86 = _85 * 2;
vectp_small_a.23_84 = &small_a + _86;
vectp_small_b.26_90 = &small_b + _86;
vectp_small_c.31_98 = &small_c + _86;
vect__9.24_89 = MEM <vector(4) short int> [(short int *)vectp_small_a.23_84];
vect__28.27_95 = MEM <vector(4) short int> [(short int *)vectp_small_b.26_90];
vect_patt_23.28_96 = vect__9.24_89 h* vect__28.27_95;
MEM <vector(4) short int> [(short int *)vectp_small_c.31_98] = vect_patt_23.28_96;
niters_vector_mult_vf.20_80 = niters.18_60 & 4294967292;
_82 = (int) niters_vector_mult_vf.20_80;
tmp.21_81 = i_38 + _82;
_74 = niters.18_60 & 3;
if (_74 == 0)
goto <bb 11>; [25.00%]
else
goto <bb 9>; [75.00%]
...
*asm*:
.L4:
movdqu small_a(%rax), %xmm0
movdqu small_b(%rax), %xmm2
addq $16, %rax
pmulhw %xmm2, %xmm0
movups %xmm0, small_c-16(%rax)
cmpq %rdx, %rax
jne .L4
movl %ecx, %edx
andl $-8, %edx
movl %edx, %eax
testb $7, %cl
je .L19
.L3:
movl %ecx, %esi
subl %edx, %esi
leal -1(%rsi), %edi
cmpl $2, %edi
jbe .L6
movq small_a(%rdx,%rdx), %xmm0
movq small_b(%rdx,%rdx), %xmm1
pmulhw %xmm1, %xmm0
movq %xmm0, small_c(%rdx,%rdx)
movl %esi, %edx
andl $-4, %edx
addl %edx, %eax
andl $3, %esi
je .L1
I guess the proposed IFN would be directly mapped to [us]mul_highpart?
Or do you expect it to also cover what we support in can_mult_highpart_p?
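If it is the former, a minimal sketch of what I would expect (the IFN name
MULH and the exact entry below are only my assumption of how the proposal
could look, modelled on the existing MULHS/MULHRS entries, not committed
code):
```
/* Hypothetical internal-fn.def entry: one internal function that picks
   the signed or unsigned highpart-multiply optab according to the
   signedness of the operands.  */
DEF_INTERNAL_SIGNED_OPTAB_FN (MULH, ECF_CONST | ECF_NOTHROW, first,
                              smul_highpart, umul_highpart, binary)
```
Such an entry would only cover targets providing the [us]mul_highpart
optabs directly; the vec_widen_[us]mult_{even,odd,hi,lo} fallbacks that
can_mult_highpart_p knows about would not be reachable through it, hence
the question above.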
BR,
Kewen