https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61299

            Bug ID: 61299
           Summary: [4.9 Regression] Performance regression for the SIMD
                    rotate operation with GCC vector extensions
           Product: gcc
           Version: 4.9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: siarhei.siamashka at gmail dot com

A small test:

/**************/
/* GCC vector-extension type: a 16-byte vector of unsigned int elements. */
typedef unsigned int uint32x4 __attribute__ ((vector_size(16)));
/* Two vectors; foo() reads `b` and updates `a`. */
typedef struct { uint32x4 a, b; } prng_t;
/*
 * Reproducer for the regression: XORs into x->a the value of x->b with each
 * element rotated left by 17 bits. Only x->a is written; x->b is unchanged.
 */
void foo(prng_t *x)
{
        /*
         * Element-wise (b << 17) ^ (b >> 15). Assuming unsigned int is 32 bits
         * wide (as on the x86-64 target in this report), the two shifted
         * values share no set bits, so the XOR is a rotate left by 17.
         */
        x->a ^= ((x->b << 17) ^ (x->b >> (32 - 17)));
}
/**************/

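With GCC 4.9 (CFLAGS="-O3"), it gets compiled into the following slow code, which
moves each element to a general-purpose register, rotates it with a scalar
"ror", and rebuilds the vector through the stack: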

0000000000000000 <foo>:
   0:   66 0f 6f 47 10          movdqa 0x10(%rdi),%xmm0
   5:   66 0f 70 c8 55          pshufd $0x55,%xmm0,%xmm1
   a:   66 0f 7e c0             movd   %xmm0,%eax
   e:   c1 c8 0f                ror    $0xf,%eax
  11:   89 44 24 e8             mov    %eax,-0x18(%rsp)
  15:   66 0f 7e c8             movd   %xmm1,%eax
  19:   66 0f 6f c8             movdqa %xmm0,%xmm1
  1d:   c1 c8 0f                ror    $0xf,%eax
  20:   66 0f 6a c8             punpckhdq %xmm0,%xmm1
  24:   89 44 24 ec             mov    %eax,-0x14(%rsp)
  28:   66 0f 70 c0 ff          pshufd $0xff,%xmm0,%xmm0
  2d:   66 0f 6e 5c 24 ec       movd   -0x14(%rsp),%xmm3
  33:   66 0f 7e c8             movd   %xmm1,%eax
  37:   c1 c8 0f                ror    $0xf,%eax
  3a:   89 44 24 f0             mov    %eax,-0x10(%rsp)
  3e:   66 0f 7e c0             movd   %xmm0,%eax
  42:   66 0f 6e 44 24 e8       movd   -0x18(%rsp),%xmm0
  48:   66 0f 6e 4c 24 f0       movd   -0x10(%rsp),%xmm1
  4e:   c1 c8 0f                ror    $0xf,%eax
  51:   66 0f 62 c3             punpckldq %xmm3,%xmm0
  55:   89 44 24 f4             mov    %eax,-0xc(%rsp)
  59:   66 0f 6e 54 24 f4       movd   -0xc(%rsp),%xmm2
  5f:   66 0f 62 ca             punpckldq %xmm2,%xmm1
  63:   66 0f 6c c1             punpcklqdq %xmm1,%xmm0
  67:   66 0f ef 07             pxor   (%rdi),%xmm0
  6b:   0f 29 07                movaps %xmm0,(%rdi)
  6e:   c3                      retq   

It used to be a lot better with GCC 4.8 (CFLAGS="-O3"), which kept the whole
operation in SIMD registers using a pair of vector shifts and XORs:

0000000000000000 <foo>:
   0:   66 0f 6f 4f 10          movdqa 0x10(%rdi),%xmm1
   5:   66 0f 6f c1             movdqa %xmm1,%xmm0
   9:   66 0f 72 d1 0f          psrld  $0xf,%xmm1
   e:   66 0f 72 f0 11          pslld  $0x11,%xmm0
  13:   66 0f ef c1             pxor   %xmm1,%xmm0
  17:   66 0f ef 07             pxor   (%rdi),%xmm0
  1b:   66 0f 7f 07             movdqa %xmm0,(%rdi)
  1f:   c3                      retq

Reply via email to