Richard Biener <[email protected]> writes:
> On Thu, Sep 24, 2020 at 9:38 PM Segher Boessenkool
> <[email protected]> wrote:
>>
>> Hi!
>>
>> On Thu, Sep 24, 2020 at 04:55:21PM +0200, Richard Biener wrote:
>> > Btw, on x86_64 the following produces sth reasonable:
>> >
>> > #define N 32
>> > typedef int T;
>> > typedef T V __attribute__((vector_size(N)));
>> > V setg (V v, int idx, T val)
>> > {
>> >   V valv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
>> >   V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == valv);
>> >   v = (v & ~mask) | (valv & mask);
>> >   return v;
>> > }
>> >
>> >         vmovd %edi, %xmm1
>> >         vpbroadcastd %xmm1, %ymm1
>> >         vpcmpeqd .LC0(%rip), %ymm1, %ymm2
>> >         vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
>> >         ret
>> >
>> > I'm quite sure you could do sth similar on power?
>>
>> This only allows inserting aligned elements. Which is probably fine
>> of course (we don't allow elements that straddle vector boundaries
>> either, anyway).
>>
>> And yes, we can do that :-)
>>
>> That should be
>> #define N 32
>> typedef int T;
>> typedef T V __attribute__((vector_size(N)));
>> V setg (V v, int idx, T val)
>> {
>>   V valv = (V){val, val, val, val, val, val, val, val};
>>   V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
>>   V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
>>   v = (v & ~mask) | (valv & mask);
>>   return v;
>> }
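[On the Power side, the same splat/compare/select pattern can be written
with the AltiVec intrinsics from <altivec.h>.  An untested sketch of the
16-byte case, purely illustrative; the function name is made up:

#include <altivec.h>

/* Untested sketch: insert VAL at lane IDX of a 16-byte vector via
   splat, compare and select, mirroring the generic version above.  */
vector int
setg16 (vector int v, int idx, int val)
{
  vector int valv = vec_splats (val);                /* splat val         */
  vector int idxv = vec_splats (idx);                /* splat idx         */
  vector bool int mask
    = vec_cmpeq ((vector int){0, 1, 2, 3}, idxv);    /* lane == idx?      */
  return vec_sel (v, valv, mask);                    /* take val where set */
}
]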
>
> Whoops yeah, simplified it a bit too much ;)
>
>> after which I get (-march=znver2)
>>
>> setg:
>>         vmovd %edi, %xmm1
>>         vmovd %esi, %xmm2
>>         vpbroadcastd %xmm1, %ymm1
>>         vpbroadcastd %xmm2, %ymm2
>>         vpcmpeqd .LC0(%rip), %ymm1, %ymm1
>>         vpandn %ymm0, %ymm1, %ymm0
>>         vpand %ymm2, %ymm1, %ymm1
>>         vpor %ymm0, %ymm1, %ymm0
>>         ret
>
> I get with -march=znver2 -O2
>
>         vmovd %edi, %xmm1
>         vmovd %esi, %xmm2
>         vpbroadcastd %xmm1, %ymm1
>         vpbroadcastd %xmm2, %ymm2
>         vpcmpeqd .LC0(%rip), %ymm1, %ymm1
>         vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
>
> and with -mavx512vl
>
>         vpbroadcastd %edi, %ymm1
>         vpcmpd $0, .LC0(%rip), %ymm1, %k1
>         vpbroadcastd %esi, %ymm0{%k1}
>
> broadcast-with-mask - heh, it would be interesting to see whether we
> manage to combine v[idx1] = val; v[idx2] = val; ;)
>
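[The masked broadcast above maps to _mm256_mask_set1_epi32 in intrinsics
terms.  An untested sketch of the single insert and of the two-index
combine hinted at above; the combined form is hypothetical, and the
function names are made up:

#include <immintrin.h>

/* Untested sketch: the single insert via AVX-512VL intrinsics.  */
__m256i
setg_mask (__m256i v, int idx, int val)
{
  __m256i iota = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
  __mmask8 k = _mm256_cmpeq_epi32_mask (iota, _mm256_set1_epi32 (idx));
  return _mm256_mask_set1_epi32 (v, k, val);  /* broadcast val under k */
}

/* Two inserts of the same value could fold into one broadcast with a
   two-bit mask; hypothetical shape of the combined operation.  */
__m256i
setg2 (__m256i v, int idx1, int idx2, int val)
{
  __m256i iota = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
  __mmask8 k = _mm256_cmpeq_epi32_mask (iota, _mm256_set1_epi32 (idx1))
               | _mm256_cmpeq_epi32_mask (iota, _mm256_set1_epi32 (idx2));
  return _mm256_mask_set1_epi32 (v, k, val);
}
]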
> Now, with SSE4.2 the 16-byte case compiles to
>
> setg:
> .LFB0:
>         .cfi_startproc
>         movd %edi, %xmm3
>         movdqa %xmm0, %xmm1
>         movd %esi, %xmm4
>         pshufd $0, %xmm3, %xmm0
>         pcmpeqd .LC0(%rip), %xmm0
>         movdqa %xmm0, %xmm2
>         pandn %xmm1, %xmm2
>         pshufd $0, %xmm4, %xmm1
>         pand %xmm1, %xmm0
>         por %xmm2, %xmm0
>         ret
>
> since there's no blend with a variable mask IIRC.
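[For reference, an untested sketch of the 16-byte case spelled with SSE2
intrinsics, mirroring the pcmpeqd/pandn/pand/por sequence above; the
function name is made up:

#include <emmintrin.h>

/* Untested sketch: (~mask & v) | (mask & valv), as in the asm above.  */
__m128i
setg_sse (__m128i v, int idx, int val)
{
  __m128i mask = _mm_cmpeq_epi32 (_mm_setr_epi32 (0, 1, 2, 3),
                                  _mm_set1_epi32 (idx));            /* pcmpeqd */
  return _mm_or_si128 (_mm_andnot_si128 (mask, v),                  /* pandn   */
                       _mm_and_si128 (mask, _mm_set1_epi32 (val))); /* pand/por */
}
]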
>
> with aarch64 and SVE it doesn't handle the 32-byte case at all,
FWIW, the SVE version with -msve-vector-bits=256 is:
        ptrue p0.b, vl32
        mov z1.s, w1
        index z2.s, #0, #1
        ld1w z0.s, p0/z, [x0]
        cmpeq p1.s, p0/z, z1.s, z2.s
        mov z0.s, p1/m, w2
        st1w z0.s, p0, [x8]
where the ptrue, ld1w and st1w are there just because generic 256-bit
vectors are passed and returned in memory; the real operation is:
        mov z1.s, w1
        index z2.s, #0, #1
        cmpeq p1.s, p0/z, z1.s, z2.s
        mov z0.s, p1/m, w2
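For comparison, an untested sketch of that sequence using the ACLE
intrinsics from <arm_sve.h> (length-agnostic, so not tied to
-msve-vector-bits=256; the function name is made up):

#include <arm_sve.h>

/* Untested sketch: cmpeq builds the single-lane predicate and a
   merging dup inserts VAL, as in the "mov z0.s, p1/m, w2" above.  */
svint32_t
setg_sve (svint32_t v, int idx, int val)
{
  svbool_t pg = svptrue_b32 ();                  /* all .s lanes */
  svint32_t iota = svindex_s32 (0, 1);           /* 0, 1, 2, ... */
  svbool_t hit = svcmpeq_n_s32 (pg, iota, idx);  /* lane == idx  */
  return svdup_n_s32_m (v, hit, val);            /* merging dup  */
}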
Thanks,
Richard