https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125357

            Bug ID: 125357
           Summary: Fails to utilize vpermilps for dynamic shuffle
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: amonakov at gcc dot gnu.org
  Target Milestone: ---
            Target: x86_64-*-*, i?86-*-*

/* Both typedefs name the same type: a 16-byte GNU vector of int.
   `ivec` is used for the index operand and `vec` for the data operand. */
typedef int ivec __attribute__((vector_size(16)));
typedef int vec __attribute__((vector_size(16)));

/*
 * Dynamic (run-time index) shuffle: returns a vector whose element k is
 * x[i[k]], for k = 0..3.
 *
 * NOTE(review): the two branches below appear to agree only when every
 * index is in 0..3.  The clang branch subscripts x with the raw index,
 * whereas the -mavx listing for the GCC branch masks each index with 3
 * before shuffling -- confirm whether out-of-range indices matter here.
 */
vec shuf(vec x, ivec i)
{
#ifdef __clang__
        /* Element-wise form: build the result from four dynamic subscripts. */
        return (vec){x[i[0]], x[i[1]], x[i[2]], x[i[3]]};
#else
        /* GCC's generic vector shuffle builtin with a run-time selector. */
        return __builtin_shuffle(x, i);
#endif
}

gcc -O2 -mavx generates

        vpcmpeqd        xmm2, xmm2, xmm2
        vpsrld  xmm2, xmm2, 30
        vpand   xmm1, xmm1, xmm2
        vpslld  xmm1, xmm1, 2
        vpshufb xmm1, xmm1, XMMWORD PTR .LC1[rip]
        vpaddb  xmm1, xmm1, XMMWORD PTR .LC2[rip]
        vpshufb xmm0, xmm0, xmm1
        ret

gcc -O2 -mavx2 generates

        vinserti128     ymm1, ymm1, xmm1, 1
        vinserti128     ymm0, ymm0, xmm0, 1
        vpermd  ymm0, ymm1, ymm0
        vzeroupper
        ret

but both could have been simply

        vpermilps       xmm0, xmm0, xmm1
        ret

(Also, why is GCC duplicating the vectors in the AVX2 codegen? It would have
been cheaper to zero-extend them to 256 bits instead.)

Reply via email to