https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94908

            Bug ID: 94908
           Summary: Failure to optimally optimize certain shuffle patterns
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: gabravier at gmail dot com
  Target Milestone: ---

typedef float v4sf __attribute__((vector_size(16)));

v4sf g();

v4sf f(v4sf a, v4sf b)
{
    return (v4sf){g()[1], a[1], a[2], a[3]};
}

With -O3, LLVM outputs this:

f(float __vector(4), float __vector(4)): # @f(float __vector(4), float __vector(4))
  sub rsp, 24
  movaps xmmword ptr [rsp], xmm0 # 16-byte Spill
  call g()
  movaps xmm1, xmmword ptr [rsp] # 16-byte Reload
  shufps xmm0, xmm1, 17 # xmm0 = xmm0[1,0],xmm1[1,0]
  shufps xmm0, xmm1, 232 # xmm0 = xmm0[0,2],xmm1[2,3]
  add rsp, 24
  ret

GCC outputs this:

f(float __vector(4), float __vector(4)):
  sub rsp, 24
  movaps XMMWORD PTR [rsp], xmm0
  call g()
  movaps xmm1, XMMWORD PTR [rsp]
  add rsp, 24
  shufps xmm0, xmm0, 85
  movaps xmm2, xmm1
  shufps xmm2, xmm1, 85
  movaps xmm3, xmm2
  movaps xmm2, xmm1
  unpckhps xmm2, xmm1
  unpcklps xmm0, xmm3
  shufps xmm1, xmm1, 255
  unpcklps xmm2, xmm1
  movlhps xmm0, xmm2
  ret

This also seems to occur on powerpc64le, so I haven't marked it as
target-specific.

Reply via email to