https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88461

--- Comment #3 from Daniel Fruzynski <bugzi...@poradnik-webmastera.com> ---
Good catch, the mask should be 16-bit. Here is the fixed version:

[code]
#include <immintrin.h>
#include <stdint.h>

int test(uint16_t* data, int a)
{
    __m128i v = _mm_load_si128((const __m128i*)data); /* load 8 x uint16_t */
    __mmask16 m = _mm_testn_epi16_mask(v, v);          /* bit i set if element i == 0 */
    m = _kshiftli_mask16(m, 1);                        /* shift mask left by 1 */
    m = _kandn_mask16(m, a);                           /* (~m) & a */
    return m;
}
[/code]

[asm]
test(unsigned short*, int):
        vmovdqa64       xmm0, XMMWORD PTR [rdi]
        kmovw   k4, esi
        vptestnmw       k1, xmm0, xmm0
        kmovb   eax, k1
        kmovw   k2, eax
        kshiftlw        k0, k2, 1
        kandnw  k3, k0, k4
        kmovw   eax, k3
        ret
[/asm]

This can still be optimized: there is no need to move the value from k1 to eax and
then to k2, because vptestnmw already zeroes the upper bits of the k register.
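
For reference, here is a hand-written sketch of the shorter sequence one would expect once the redundant k1 -> eax -> k2 round trip is dropped (register allocation is illustrative only, this is not actual compiler output):

[asm]
test(unsigned short*, int):
        vmovdqa64       xmm0, XMMWORD PTR [rdi]
        kmovw   k1, esi
        vptestnmw       k0, xmm0, xmm0
        kshiftlw        k0, k0, 1
        kandnw  k0, k0, k1
        kmovw   eax, k0
        ret
[/asm]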
