http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51179
             Bug #: 51179
           Summary: poor vectorization on interlagos.
    Classification: Unclassified
           Product: gcc
           Version: 4.6.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: joost.vandevond...@mat.ethz.ch

The following code executes significantly faster when compiled with the cray
compiler (gcc: 43.4s, cray: 7.7s for 100000000 calls):

SUBROUTINE smm_dnn_4_10_10_4_1_2_1(A,B,C)
   REAL(KIND=KIND(0.0D0)) :: C(4,10), B(10,10), A(4,10)
   INTEGER :: i,j,l
   DO j= 1 , 10 , 2
   DO l= 1 , 10 , 1
   DO i= 1 , 4 , 1
     C(i+0,j+0)=C(i+0,j+0)+A(i+0,l+0)*B(l+0,j+0)
     C(i+0,j+1)=C(i+0,j+1)+A(i+0,l+0)*B(l+0,j+1)
   ENDDO
   ENDDO
   ENDDO
END SUBROUTINE

cray options: -h noomp -e m -F -ra -O2 -Oipa1 -v tst.f90
gfortran: -O3 -march=native -ffast-math
which yields for gfortran:
-march=bdver1 -mcx16 -msahf -mno-movbe -maes -mpclmul -mpopcnt -mabm -mlwp
-mno-fma -mfma4 -mxop -mno-bmi -mno-tbm -mavx -msse4.2 -msse4.1

The cray code looks nice:

0000000000000000 <smm_dnn_4_10_10_4_1_2_1_>:
   0:   48 89 7c 24 f8          mov    %rdi,-0x8(%rsp)
   5:   48 89 74 24 f0          mov    %rsi,-0x10(%rsp)
   a:   48 89 54 24 e8          mov    %rdx,-0x18(%rsp)
   f:   c5 fc 10 02             vmovups (%rdx),%ymm0
  13:   c5 fc 10 4a 20          vmovups 0x20(%rdx),%ymm1
  18:   c5 fc 10 52 40          vmovups 0x40(%rdx),%ymm2
  1d:   c5 fc 10 5a 60          vmovups 0x60(%rdx),%ymm3
  22:   c5 fc 10 a2 80 00 00    vmovups 0x80(%rdx),%ymm4
  29:   00
  2a:   c5 fc 10 aa a0 00 00    vmovups 0xa0(%rdx),%ymm5
  31:   00
  32:   c5 fc 10 b2 c0 00 00    vmovups 0xc0(%rdx),%ymm6
  39:   00
  3a:   c5 fc 10 ba e0 00 00    vmovups 0xe0(%rdx),%ymm7
  41:   00
  42:   c5 7c 10 82 00 01 00    vmovups 0x100(%rdx),%ymm8
  49:   00
  4a:   c5 7c 10 8a 20 01 00    vmovups 0x120(%rdx),%ymm9
  51:   00
  52:   31 c0                   xor    %eax,%eax
  54:   48 89 c1                mov    %rax,%rcx
  57:   66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
  5e:   00 00
  60:   c4 62 7d 19 94 c6 d0    vbroadcastsd 0x2d0(%rsi,%rax,8),%ymm10
  67:   02 00 00
  6a:   c5 7c 10 1c 0f          vmovups (%rdi,%rcx,1),%ymm11
  6f:   c4 43 a5 69 c9 a0       vfmaddpd %ymm9,%ymm10,%ymm11,%ymm9
  75:   c4 62 7d 19 94 c6 80    vbroadcastsd 0x280(%rsi,%rax,8),%ymm10
  7c:   02 00 00
  7f:   c4 43 a5 69 c0 a0       vfmaddpd %ymm8,%ymm10,%ymm11,%ymm8
  85:   c4 62 7d 19 94 c6 30    vbroadcastsd 0x230(%rsi,%rax,8),%ymm10
  8c:   02 00 00
  8f:   c4 e3 a5 69 ff a0       vfmaddpd %ymm7,%ymm10,%ymm11,%ymm7
  95:   c4 62 7d 19 94 c6 e0    vbroadcastsd 0x1e0(%rsi,%rax,8),%ymm10
  9c:   01 00 00
  9f:   c4 e3 a5 69 f6 a0       vfmaddpd %ymm6,%ymm10,%ymm11,%ymm6
  a5:   c4 62 7d 19 94 c6 90    vbroadcastsd 0x190(%rsi,%rax,8),%ymm10
  ac:   01 00 00
  af:   c4 e3 a5 69 ed a0       vfmaddpd %ymm5,%ymm10,%ymm11,%ymm5
  b5:   c4 62 7d 19 94 c6 40    vbroadcastsd 0x140(%rsi,%rax,8),%ymm10
  bc:   01 00 00
  bf:   c4 e3 a5 69 e4 a0       vfmaddpd %ymm4,%ymm10,%ymm11,%ymm4
  c5:   c4 62 7d 19 94 c6 f0    vbroadcastsd 0xf0(%rsi,%rax,8),%ymm10
  cc:   00 00 00
  cf:   c4 e3 a5 69 db a0       vfmaddpd %ymm3,%ymm10,%ymm11,%ymm3
  d5:   c4 62 7d 19 94 c6 a0    vbroadcastsd 0xa0(%rsi,%rax,8),%ymm10
  dc:   00 00 00
  df:   c4 e3 a5 69 d2 a0       vfmaddpd %ymm2,%ymm10,%ymm11,%ymm2
  e5:   c4 62 7d 19 54 c6 50    vbroadcastsd 0x50(%rsi,%rax,8),%ymm10
  ec:   c4 e3 a5 69 c9 a0       vfmaddpd %ymm1,%ymm10,%ymm11,%ymm1
  f2:   c4 62 7d 19 14 c6       vbroadcastsd (%rsi,%rax,8),%ymm10
  f8:   c4 e3 a5 69 c0 a0       vfmaddpd %ymm0,%ymm10,%ymm11,%ymm0
  fe:   48 83 c1 20             add    $0x20,%rcx
 102:   48 ff c0                inc    %rax
 105:   48 83 f8 0a             cmp    $0xa,%rax
 109:   0f 8c 51 ff ff ff       jl     60 <smm_dnn_4_10_10_4_1_2_1_+0x60>
 10f:   c5 78 11 8a 20 01 00    vmovups %xmm9,0x120(%rdx)
 116:   00
 117:   c4 63 7d 19 8a 30 01    vextractf128 $0x1,%ymm9,0x130(%rdx)
 11e:   00 00 01
 121:   c5 78 11 82 00 01 00    vmovups %xmm8,0x100(%rdx)
 128:   00
 129:   c4 63 7d 19 82 10 01    vextractf128 $0x1,%ymm8,0x110(%rdx)
 130:   00 00 01
 133:   c5 f8 11 ba e0 00 00    vmovups %xmm7,0xe0(%rdx)
 13a:   00
 13b:   c4 e3 7d 19 ba f0 00    vextractf128 $0x1,%ymm7,0xf0(%rdx)
 142:   00 00 01
 145:   c5 f8 11 b2 c0 00 00    vmovups %xmm6,0xc0(%rdx)
 14c:   00
 14d:   c4 e3 7d 19 b2 d0 00    vextractf128 $0x1,%ymm6,0xd0(%rdx)
 154:   00 00 01
 157:   c5 f8 11 aa a0 00 00    vmovups %xmm5,0xa0(%rdx)
 15e:   00
 15f:   c4 e3 7d 19 aa b0 00    vextractf128 $0x1,%ymm5,0xb0(%rdx)
 166:   00 00 01
 169:   c5 f8 11 a2 80 00 00    vmovups %xmm4,0x80(%rdx)
 170:   00
 171:   c4 e3 7d 19 a2 90 00    vextractf128 $0x1,%ymm4,0x90(%rdx)
 178:   00 00 01
 17b:   c5 f8 11 5a 60          vmovups %xmm3,0x60(%rdx)
 180:   c4 e3 7d 19 5a 70 01    vextractf128 $0x1,%ymm3,0x70(%rdx)
 187:   c5 f8 11 52 40          vmovups %xmm2,0x40(%rdx)
 18c:   c4 e3 7d 19 52 50 01    vextractf128 $0x1,%ymm2,0x50(%rdx)
 193:   c5 f8 11 4a 20          vmovups %xmm1,0x20(%rdx)
 198:   c4 e3 7d 19 4a 30 01    vextractf128 $0x1,%ymm1,0x30(%rdx)
 19f:   c5 f8 11 02             vmovups %xmm0,(%rdx)
 1a3:   c4 e3 7d 19 42 10 01    vextractf128 $0x1,%ymm0,0x10(%rdx)
 1aa:   c5 f8 77                vzeroupper
 1ad:   c3                      retq
 1ae:   66 90                   xchg   %ax,%ax

gcc's code looks more involved:

smm_dnn_4_10_10_4_1_2_1_:
.LFB0:
        pushq   %rbp
.LCFI0:
        movl    $1, %eax
        movq    %rsp, %rbp
.LCFI1:
        andq    $-32, %rsp
        subq    $616, %rsp
.LCFI2:
        vmovupd 96(%rdi), %ymm0
        vmovupd (%rdi), %ymm3
        vmovupd 32(%rdi), %ymm1
        vmovsd  280(%rdi), %xmm13
        vmovupd 64(%rdi), %ymm2
        vmovsd  288(%rdi), %xmm15
        vmovsd  256(%rdi), %xmm4
        vmovsd  264(%rdi), %xmm6
        vmovsd  272(%rdi), %xmm7
        vmovupd 128(%rdi), %ymm12
        vmovsd  %xmm13, 296(%rsp)
        vmovupd 160(%rdi), %ymm11
        vperm2f128 $32, %ymm1, %ymm3, %ymm13
        vmovsd  %xmm15, 288(%rsp)
        vperm2f128 $49, %ymm1, %ymm3, %ymm1
        vmovsd  %xmm4, 320(%rsp)
        vperm2f128 $32, %ymm0, %ymm2, %ymm15
        vmovsd  296(%rdi), %xmm4
        vperm2f128 $49, %ymm0, %ymm2, %ymm2
        vmovsd  %xmm6, 312(%rsp)
        vmovaps %ymm1, 40(%rsp)
        vunpcklpd %ymm1, %ymm13, %ymm1
        vmovsd  304(%rdi), %xmm6
        vunpcklpd %ymm2, %ymm15, %ymm0
        vmovsd  %xmm7, 304(%rsp)
        vmovsd  312(%rdi), %xmm7
        vmovaps %ymm2, -24(%rsp)
        vperm2f128 $32, %ymm0, %ymm1, %ymm2
        vmovupd 192(%rdi), %ymm10
        vperm2f128 $49, %ymm0, %ymm1, %ymm0
        vmovsd  %xmm4, 280(%rsp)
        vmovsd  %xmm6, 336(%rsp)
        vmovaps %ymm13, %ymm4
        vmovsd  %xmm7, 328(%rsp)
        vmovaps %ymm15, %ymm6
        vmovaps %ymm2, %ymm7
        vunpcklpd %ymm0, %ymm2, %ymm8
        vmovupd 224(%rdi), %ymm9
        vmovaps %ymm13, 72(%rsp)
        vmovaps %ymm15, 8(%rsp)
        vmovaps %ymm2, -56(%rsp)
        vmovaps %ymm0, -88(%rsp)
        vxorps  %xmm0, %xmm0, %xmm0
.L3:
        vunpckhpd 40(%rsp), %ymm4, %ymm3
        vmovupd (%rsi), %ymm4
        vunpckhpd -24(%rsp), %ymm6, %ymm1
        vunpckhpd -88(%rsp), %ymm7, %ymm5
        vperm2f128 $32, %ymm1, %ymm3, %ymm2
        vperm2f128 $49, %ymm1, %ymm3, %ymm1
        vfmaddpd %ymm0, %ymm5, %ymm4, %ymm15
        vfmaddpd %ymm0, %ymm8, %ymm4, %ymm3
        vunpcklpd %ymm1, %ymm2, %ymm6
        vunpckhpd %ymm1, %ymm2, %ymm2
        vmovupd 80(%rsi), %ymm1
        vfmaddpd %ymm0, %ymm6, %ymm4, %ymm13
        vfmaddpd %ymm0, %ymm2, %ymm4, %ymm4
        vmovaps %ymm15, 200(%rsp)
        vmovsd  320(%rsp), %xmm15
        vfmaddpd %ymm0, %ymm8, %ymm1, %ymm14
        vfmaddpd %ymm0, %ymm6, %ymm1, %ymm6
        vfmaddpd %ymm0, %ymm5, %ymm1, %ymm5
        vfmaddpd %ymm0, %ymm2, %ymm1, %ymm1
        vperm2f128 $32, %ymm11, %ymm12, %ymm2
        vmovaps %ymm13, -120(%rsp)
        vmovsd  64(%rsi), %xmm13
        vmovaps %ymm4, 136(%rsp)
        vmovaps %ymm6, 232(%rsp)
        vfmaddsd (%rdx), %xmm15, %xmm13, %xmm15
        vmovaps %ymm1, 104(%rsp)
        vperm2f128 $49, %ymm11, %ymm12, %ymm1
        vmovaps %ymm5, 168(%rsp)
        vperm2f128 $32, %ymm9, %ymm10, %ymm5
        vunpcklpd %ymm1, %ymm2, %ymm6
        vmovsd  %xmm13, 344(%rsp)
        vunpckhpd %ymm1, %ymm2, %ymm2
        vperm2f128 $49, %ymm9, %ymm10, %ymm1
        vunpcklpd %ymm1, %ymm5, %ymm4
        vmovsd  %xmm15, 352(%rsp)
        vunpckhpd %ymm1, %ymm5, %ymm1
        vperm2f128 $32, %ymm4, %ymm6, %ymm5
        vperm2f128 $49, %ymm4, %ymm6, %ymm4
        vunpcklpd %ymm4, %ymm5, %ymm7
        vunpckhpd %ymm4, %ymm5, %ymm5
        vperm2f128 $32, %ymm1, %ymm2, %ymm4
        vperm2f128 $49, %ymm1, %ymm2, %ymm1
        vmovupd 32(%rsi), %ymm2
        vunpcklpd %ymm1, %ymm4, %ymm6
        vunpckhpd %ymm1, %ymm4, %ymm4
        vmovupd 112(%rsi), %ymm1
        vfmaddpd %ymm3, %ymm7, %ymm2, %ymm3
        vfmaddpd %ymm14, %ymm7, %ymm1, %ymm7
        vhaddpd %ymm3, %ymm3, %ymm3
        vhaddpd %ymm7, %ymm7, %ymm7
        vperm2f128 $1, %ymm3, %ymm3, %ymm15
        vaddpd  %ymm15, %ymm3, %ymm3
        vmovaps %ymm3, 584(%rsp)
        vmovsd  352(%rsp), %xmm3
        vaddsd  584(%rsp), %xmm3, %xmm3
        vmovsd  144(%rsi), %xmm15
        vmovsd  %xmm3, 264(%rsp)
        vmovsd  320(%rsp), %xmm3
        vfmaddsd 32(%rdx), %xmm3, %xmm15, %xmm13
        vperm2f128 $1, %ymm7, %ymm7, %ymm3
        vaddpd  %ymm3, %ymm7, %ymm3
        vmovaps %ymm3, 552(%rsp)
        vmovsd  312(%rsp), %xmm3
        vaddsd  552(%rsp), %xmm13, %xmm13
        vmovsd  %xmm13, 272(%rsp)
        vmovsd  344(%rsp), %xmm13
        vfmaddsd 8(%rdx), %xmm3, %xmm13, %xmm7
        vfmaddpd -120(%rsp), %ymm6, %ymm2, %ymm13
        vhaddpd %ymm13, %ymm13, %ymm13
        vperm2f128 $1, %ymm13, %ymm13, %ymm3
        vaddpd  %ymm3, %ymm13, %ymm3
        vmovaps %ymm3, 520(%rsp)
        vaddsd  520(%rsp), %xmm7, %xmm7
        vmovsd  %xmm7, 352(%rsp)
        vfmaddpd 232(%rsp), %ymm6, %ymm1, %ymm6
        vmovsd  312(%rsp), %xmm13
        vfmaddsd 40(%rdx), %xmm13, %xmm15, %xmm7
        vhaddpd %ymm6, %ymm6, %ymm6
        vperm2f128 $1, %ymm6, %ymm6, %ymm3
        vaddpd  %ymm3, %ymm6, %ymm3
        vmovsd  304(%rsp), %xmm6
        vmovaps %ymm3, 488(%rsp)
        vmovsd  344(%rsp), %xmm3
        vaddsd  488(%rsp), %xmm7, %xmm13
        vfmaddsd 16(%rdx), %xmm6, %xmm3, %xmm7
        vfmaddpd 200(%rsp), %ymm5, %ymm2, %ymm3
        vfmaddpd 168(%rsp), %ymm5, %ymm1, %ymm5
        vfmaddpd 136(%rsp), %ymm4, %ymm2, %ymm2
        vfmaddpd 104(%rsp), %ymm4, %ymm1, %ymm1
        vmovsd  288(%rsp), %xmm4
        vhaddpd %ymm3, %ymm3, %ymm3
        vhaddpd %ymm5, %ymm5, %ymm5
        vhaddpd %ymm2, %ymm2, %ymm2
        vhaddpd %ymm1, %ymm1, %ymm1
        vperm2f128 $1, %ymm3, %ymm3, %ymm6
        vaddpd  %ymm6, %ymm3, %ymm3
        vmovaps %ymm3, 456(%rsp)
        vperm2f128 $1, %ymm5, %ymm5, %ymm3
        vaddpd  %ymm3, %ymm5, %ymm3
        vaddsd  456(%rsp), %xmm7, %xmm14
        vmovsd  304(%rsp), %xmm7
        vfmaddsd 48(%rdx), %xmm7, %xmm15, %xmm6
        vmovsd  296(%rsp), %xmm7
        vmovaps %ymm3, 424(%rsp)
        vmovsd  344(%rsp), %xmm3
        vfmaddsd 24(%rdx), %xmm7, %xmm3, %xmm5
        vperm2f128 $1, %ymm2, %ymm2, %ymm3
        vaddpd  %ymm3, %ymm2, %ymm2
        vaddsd  424(%rsp), %xmm6, %xmm6
        vmovaps %ymm2, 392(%rsp)
        vperm2f128 $1, %ymm1, %ymm1, %ymm2
        vaddpd  %ymm2, %ymm1, %ymm1
        vfmaddsd 56(%rdx), %xmm7, %xmm15, %xmm15
        vaddsd  392(%rsp), %xmm5, %xmm5
        vmovaps %ymm1, 360(%rsp)
        vmovsd  72(%rsi), %xmm2
        vmovsd  152(%rsi), %xmm1
        addq    $160, %rsi
        vaddsd  360(%rsp), %xmm15, %xmm15
        vfmaddsd 264(%rsp), %xmm4, %xmm2, %xmm3
        vmovsd  %xmm3, (%rdx)
        vfmaddsd 272(%rsp), %xmm4, %xmm1, %xmm3
        vmovsd  %xmm3, 32(%rdx)
        vmovsd  280(%rsp), %xmm3
        vfmaddsd 352(%rsp), %xmm3, %xmm2, %xmm7
        vmovsd  %xmm7, 8(%rdx)
        vfmaddsd %xmm13, %xmm3, %xmm1, %xmm7
        vfmaddsd %xmm6, 336(%rsp), %xmm1, %xmm6
        vfmaddsd %xmm5, 328(%rsp), %xmm2, %xmm5
        vfmaddsd %xmm15, 328(%rsp), %xmm1, %xmm1
        vmovsd  %xmm7, 40(%rdx)
        vfmaddsd %xmm14, 336(%rsp), %xmm2, %xmm7
        vmovsd  %xmm6, 48(%rdx)
        vmovsd  %xmm5, 24(%rdx)
        vmovsd  %xmm1, 56(%rdx)
        vmovsd  %xmm7, 16(%rdx)
        addq    $64, %rdx
        cmpl    $9, %eax
        je      .L1
        addl    $2, %eax
        vmovaps 72(%rsp), %ymm4
        vmovaps 8(%rsp), %ymm6
        vmovaps -56(%rsp), %ymm7
        jmp     .L3
        .p2align 5,,7
        .p2align 3
.L1:
        leave
.LCFI3:
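
For completeness, a driver along the following lines can be used to reproduce
the 100000000-call timing quoted above. This is only a sketch: the operand
initialization, the file names in the build line, and the use of CPU_TIME are
assumptions, not taken from the original measurement setup.

! Hypothetical timing driver (sketch only).
! Build, e.g.:  gfortran -O3 -march=native -ffast-math tst.f90 driver.f90
PROGRAM smm_driver
  IMPLICIT NONE
  REAL(KIND=KIND(0.0D0)) :: A(4,10), B(10,10), C(4,10), t1, t2
  INTEGER :: k
  ! Arbitrary nonzero input data; any values exercise the same code path.
  CALL RANDOM_NUMBER(A)
  CALL RANDOM_NUMBER(B)
  C = 0.0D0
  CALL CPU_TIME(t1)
  DO k = 1, 100000000
     CALL smm_dnn_4_10_10_4_1_2_1(A, B, C)
  ENDDO
  CALL CPU_TIME(t2)
  ! Print a checksum so the call loop cannot be optimized away.
  WRITE(*,*) 'time [s]:', t2 - t1, ' checksum:', SUM(C)
END PROGRAM smm_driver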