[Bug tree-optimization/39821] New: 120% slowdown with vectorizer

ramiro86 at hotmail dot com Sun, 19 Apr 2009 17:23:16 -0700

The vectorizer produces horrible code with this testcase:

$ cat dotproduct.c 
#include "inttypes.h"


/*
 * Testcase kernel: dot product of two int32_t arrays, accumulated in 64 bits.
 *
 * v1, v2  - input arrays; each is read sequentially, one element per
 *           loop iteration.
 * order   - number of element pairs to process; the loop counts it down
 *           and stops when it reaches zero (presumably callers pass a
 *           non-negative value).
 *
 * Returns the sum of the 64-bit products v1[i] * v2[i]; 0 when order is 0.
 */
int64_t dotproduct(int32_t *v1, int32_t *v2, int order)
{
    int64_t accum = 0;
    /*
     * The cast applies to *v1++ only (it binds tighter than '*'), which
     * widens that operand to int64_t, so the multiplication itself is
     * carried out in 64 bits rather than 32.
     */
    while (order--)
        accum += (int64_t) *v1++ * *v2++;
    return accum;
}

/*
 * Fixed-length variant of the testcase: calls dotproduct() with a
 * constant element count of 4.
 *
 * Note that the 'order' parameter is accepted but not used; the literal 4
 * is passed instead, giving the compiler a known trip count.
 */
int64_t dotproduct_order4(int32_t *v1, int32_t *v2, int order)
{
    return dotproduct(v1, v2, 4);
}
$ gcc-4.4rc1 -o dotproduct.o -c dotproduct.c -O3
$ gcc-4.4rc1 -o dotproduct-no-vectorize.o -c dotproduct.c -O3 -fno-tree-vectorize
$ objdump -d dotproduct.o

dotproduct.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <dotproduct>:
   0:   31 c0                   xor    %eax,%eax
   2:   85 d2                   test   %edx,%edx
   4:   0f 84 4e 01 00 00       je     158 <dotproduct+0x158>
   a:   41 89 d0                mov    %edx,%r8d
   d:   44 8d 52 ff             lea    -0x1(%rdx),%r10d
  11:   41 c1 e8 02             shr    $0x2,%r8d
  15:   83 fa 03                cmp    $0x3,%edx
  18:   46 8d 0c 85 00 00 00    lea    0x0(,%r8,4),%r9d
  1f:   00 
  20:   76 05                   jbe    27 <dotproduct+0x27>
  22:   45 85 c9                test   %r9d,%r9d
  25:   75 09                   jne    30 <dotproduct+0x30>
  27:   31 c0                   xor    %eax,%eax
  29:   e9 fc 00 00 00          jmpq   12a <dotproduct+0x12a>
  2e:   66 90                   xchg   %ax,%ax
  30:   66 0f ef c0             pxor   %xmm0,%xmm0
  34:   31 c0                   xor    %eax,%eax
  36:   66 45 0f ef c9          pxor   %xmm9,%xmm9
  3b:   31 c9                   xor    %ecx,%ecx
  3d:   0f 1f 00                nopl   (%rax)
  40:   f3 0f 6f 14 07          movdqu (%rdi,%rax,1),%xmm2
  45:   83 c1 01                add    $0x1,%ecx
  48:   66 41 0f 6f d9          movdqa %xmm9,%xmm3
  4d:   f3 0f 6f 24 06          movdqu (%rsi,%rax,1),%xmm4
  52:   66 45 0f 6f c1          movdqa %xmm9,%xmm8
  57:   66 0f 6f ea             movdqa %xmm2,%xmm5
  5b:   48 83 c0 10             add    $0x10,%rax
  5f:   66 0f 66 dc             pcmpgtd %xmm4,%xmm3
  63:   66 0f 6f fc             movdqa %xmm4,%xmm7
  67:   66 44 0f 66 c2          pcmpgtd %xmm2,%xmm8
  6c:   41 39 c8                cmp    %ecx,%r8d
  6f:   66 0f 62 fb             punpckldq %xmm3,%xmm7
  73:   66 41 0f 62 e8          punpckldq %xmm8,%xmm5
  78:   66 0f 6a e3             punpckhdq %xmm3,%xmm4
  7c:   66 41 0f 6a d0          punpckhdq %xmm8,%xmm2
  81:   66 0f 6f cf             movdqa %xmm7,%xmm1
  85:   66 0f 6f f5             movdqa %xmm5,%xmm6
  89:   66 44 0f 6f d7          movdqa %xmm7,%xmm10
  8e:   66 0f f4 cd             pmuludq %xmm5,%xmm1
  92:   66 0f 6f da             movdqa %xmm2,%xmm3
  96:   66 0f 73 d6 20          psrlq  $0x20,%xmm6
  9b:   66 0f f4 f7             pmuludq %xmm7,%xmm6
  9f:   66 41 0f 73 d2 20       psrlq  $0x20,%xmm10
  a5:   66 0f 73 f6 20          psllq  $0x20,%xmm6
  aa:   66 41 0f f4 ea          pmuludq %xmm10,%xmm5
  af:   66 0f d4 ce             paddq  %xmm6,%xmm1
  b3:   66 0f 73 f5 20          psllq  $0x20,%xmm5
  b8:   66 0f d4 cd             paddq  %xmm5,%xmm1
  bc:   66 0f 6f ec             movdqa %xmm4,%xmm5
  c0:   66 0f d4 c8             paddq  %xmm0,%xmm1
  c4:   66 0f 73 d3 20          psrlq  $0x20,%xmm3
  c9:   66 0f 6f c4             movdqa %xmm4,%xmm0
  cd:   66 0f f4 dc             pmuludq %xmm4,%xmm3
  d1:   66 0f 73 f3 20          psllq  $0x20,%xmm3
  d6:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
  db:   66 0f f4 c2             pmuludq %xmm2,%xmm0
  df:   66 0f f4 d5             pmuludq %xmm5,%xmm2
  e3:   66 0f d4 c3             paddq  %xmm3,%xmm0
  e7:   66 0f 73 f2 20          psllq  $0x20,%xmm2
  ec:   66 0f d4 c2             paddq  %xmm2,%xmm0
  f0:   66 0f d4 c1             paddq  %xmm1,%xmm0
  f4:   0f 87 46 ff ff ff       ja     40 <dotproduct+0x40>
  fa:   42 8d 0c 8d 00 00 00    lea    0x0(,%r9,4),%ecx
 101:   00 
 102:   66 0f 6f c8             movdqa %xmm0,%xmm1
 106:   45 29 ca                sub    %r9d,%r10d
 109:   89 c9                   mov    %ecx,%ecx
 10b:   66 0f 73 d9 08          psrldq $0x8,%xmm1
 110:   66 0f d4 c1             paddq  %xmm1,%xmm0
 114:   48 01 cf                add    %rcx,%rdi
 117:   48 01 ce                add    %rcx,%rsi
 11a:   44 39 ca                cmp    %r9d,%edx
 11d:   66 0f d6 44 24 f8       movq   %xmm0,-0x8(%rsp)
 123:   48 8b 44 24 f8          mov    -0x8(%rsp),%rax
 128:   74 2e                   je     158 <dotproduct+0x158>
 12a:   45 89 d2                mov    %r10d,%r10d
 12d:   31 d2                   xor    %edx,%edx
 12f:   4e 8d 0c 95 04 00 00    lea    0x4(,%r10,4),%r9
 136:   00 
 137:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
 13e:   00 00 
 140:   48 63 0c 16             movslq (%rsi,%rdx,1),%rcx
 144:   4c 63 04 17             movslq (%rdi,%rdx,1),%r8
 148:   48 83 c2 04             add    $0x4,%rdx
 14c:   49 0f af c8             imul   %r8,%rcx
 150:   48 01 c8                add    %rcx,%rax
 153:   4c 39 ca                cmp    %r9,%rdx
 156:   75 e8                   jne    140 <dotproduct+0x140>
 158:   f3 c3                   repz retq 
 15a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

0000000000000160 <dotproduct_order4>:
 160:   66 0f ef c0             pxor   %xmm0,%xmm0
 164:   f3 0f 6f 0f             movdqu (%rdi),%xmm1
 168:   f3 0f 6f 1e             movdqu (%rsi),%xmm3
 16c:   66 0f 6f d0             movdqa %xmm0,%xmm2
 170:   66 0f 6f f1             movdqa %xmm1,%xmm6
 174:   66 0f 66 c1             pcmpgtd %xmm1,%xmm0
 178:   66 0f 6f fb             movdqa %xmm3,%xmm7
 17c:   66 0f 66 d3             pcmpgtd %xmm3,%xmm2
 180:   66 0f 62 f0             punpckldq %xmm0,%xmm6
 184:   66 0f 62 fa             punpckldq %xmm2,%xmm7
 188:   66 0f 6a da             punpckhdq %xmm2,%xmm3
 18c:   66 0f 6a c8             punpckhdq %xmm0,%xmm1
 190:   66 0f 6f ee             movdqa %xmm6,%xmm5
 194:   66 44 0f 6f c7          movdqa %xmm7,%xmm8
 199:   66 0f 6f e7             movdqa %xmm7,%xmm4
 19d:   66 0f 6f c3             movdqa %xmm3,%xmm0
 1a1:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
 1a6:   66 44 0f f4 c6          pmuludq %xmm6,%xmm8
 1ab:   66 0f f4 ef             pmuludq %xmm7,%xmm5
 1af:   66 0f 6f d1             movdqa %xmm1,%xmm2
 1b3:   66 0f 73 d4 20          psrlq  $0x20,%xmm4
 1b8:   66 0f 73 f5 20          psllq  $0x20,%xmm5
 1bd:   66 0f f4 e6             pmuludq %xmm6,%xmm4
 1c1:   66 41 0f d4 e8          paddq  %xmm8,%xmm5
 1c6:   66 0f 73 f4 20          psllq  $0x20,%xmm4
 1cb:   66 0f d4 e5             paddq  %xmm5,%xmm4
 1cf:   66 0f 6f eb             movdqa %xmm3,%xmm5
 1d3:   66 0f f4 c1             pmuludq %xmm1,%xmm0
 1d7:   66 0f 73 d2 20          psrlq  $0x20,%xmm2
 1dc:   66 0f f4 d3             pmuludq %xmm3,%xmm2
 1e0:   66 0f 73 f2 20          psllq  $0x20,%xmm2
 1e5:   66 0f d4 c2             paddq  %xmm2,%xmm0
 1e9:   66 0f 73 d5 20          psrlq  $0x20,%xmm5
 1ee:   66 0f f4 cd             pmuludq %xmm5,%xmm1
 1f2:   66 0f 73 f1 20          psllq  $0x20,%xmm1
 1f7:   66 0f d4 c1             paddq  %xmm1,%xmm0
 1fb:   66 0f d4 c4             paddq  %xmm4,%xmm0
 1ff:   66 0f 6f c8             movdqa %xmm0,%xmm1
 203:   66 0f 73 d9 08          psrldq $0x8,%xmm1
 208:   66 0f d4 c1             paddq  %xmm1,%xmm0
 20c:   66 0f d6 44 24 f8       movq   %xmm0,-0x8(%rsp)
 212:   48 8b 44 24 f8          mov    -0x8(%rsp),%rax
 217:   c3                      retq   
$ objdump -d dotproduct-no-vectorize.o

dotproduct-no-vectorize.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <dotproduct>:
   0:   31 c0                   xor    %eax,%eax
   2:   85 d2                   test   %edx,%edx
   4:   74 2a                   je     30 <dotproduct+0x30>
   6:   83 ea 01                sub    $0x1,%edx
   9:   4c 8d 0c 95 04 00 00    lea    0x4(,%rdx,4),%r9
  10:   00 
  11:   31 d2                   xor    %edx,%edx
  13:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  18:   48 63 0c 16             movslq (%rsi,%rdx,1),%rcx
  1c:   4c 63 04 17             movslq (%rdi,%rdx,1),%r8
  20:   48 83 c2 04             add    $0x4,%rdx
  24:   49 0f af c8             imul   %r8,%rcx
  28:   48 01 c8                add    %rcx,%rax
  2b:   4c 39 ca                cmp    %r9,%rdx
  2e:   75 e8                   jne    18 <dotproduct+0x18>
  30:   f3 c3                   repz retq 
  32:   66 66 66 66 66 2e 0f    nopw   %cs:0x0(%rax,%rax,1)
  39:   1f 84 00 00 00 00 00 

0000000000000040 <dotproduct_order4>:
  40:   48 63 07                movslq (%rdi),%rax
  43:   48 63 16                movslq (%rsi),%rdx
  46:   48 63 4f 04             movslq 0x4(%rdi),%rcx
  4a:   48 0f af d0             imul   %rax,%rdx
  4e:   48 63 46 04             movslq 0x4(%rsi),%rax
  52:   48 0f af c1             imul   %rcx,%rax
  56:   48 63 4f 08             movslq 0x8(%rdi),%rcx
  5a:   48 01 c2                add    %rax,%rdx
  5d:   48 63 46 08             movslq 0x8(%rsi),%rax
  61:   48 0f af c1             imul   %rcx,%rax
  65:   48 63 4f 0c             movslq 0xc(%rdi),%rcx
  69:   48 01 c2                add    %rax,%rdx
  6c:   48 63 46 0c             movslq 0xc(%rsi),%rax
  70:   48 0f af c1             imul   %rcx,%rax
  74:   48 01 d0                add    %rdx,%rax
  77:   c3                      retq


-- 
           Summary: 120% slowdown with vectorizer
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: ramiro86 at hotmail dot com
GCC target triplet: x86_64-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821

[Bug tree-optimization/39821] New: 120% slowdown with vectorizer

Reply via email to