http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57169
Bug #: 57169
Summary: fully unrolled matrix multiplication not vectorized
Classification: Unclassified
Product: gcc
Version: 4.9.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: vincenzo.innoce...@cern.ch

A lot of legacy code still fully unrolls linear algebra for small dimensions.
As shown below, gcc fails to vectorize an unrolled 4x4 matrix multiplication,
while it vectorizes the corresponding loop version well.

sample code

alignas(32) float a[4][4];
alignas(32) float b[4][4];
alignas(32) float c[4][4];

void matmul() {
  for (int i=0;i!=4;++i)
    for (int j=0;j!=4;++j) {
      float sum=0;
      for (int k=0;k!=4;++k) sum += a[i][k]*b[k][j];
      c[i][j]=sum;
    }
}

alignas(32) float src1[4][4];
alignas(32) float src2[4][4];
alignas(32) float dest[4][4];

void matmulU(){
  dest[0][0] = src1[0][0] * src2[0][0] + src1[0][1] * src2[1][0] + src1[0][2] * src2[2][0] + src1[0][3] * src2[3][0];
  dest[0][1] = src1[0][0] * src2[0][1] + src1[0][1] * src2[1][1] + src1[0][2] * src2[2][1] + src1[0][3] * src2[3][1];
  dest[0][2] = src1[0][0] * src2[0][2] + src1[0][1] * src2[1][2] + src1[0][2] * src2[2][2] + src1[0][3] * src2[3][2];
  dest[0][3] = src1[0][0] * src2[0][3] + src1[0][1] * src2[1][3] + src1[0][2] * src2[2][3] + src1[0][3] * src2[3][3];
  dest[1][0] = src1[1][0] * src2[0][0] + src1[1][1] * src2[1][0] + src1[1][2] * src2[2][0] + src1[1][3] * src2[3][0];
  dest[1][1] = src1[1][0] * src2[0][1] + src1[1][1] * src2[1][1] + src1[1][2] * src2[2][1] + src1[1][3] * src2[3][1];
  dest[1][2] = src1[1][0] * src2[0][2] + src1[1][1] * src2[1][2] + src1[1][2] * src2[2][2] + src1[1][3] * src2[3][2];
  dest[1][3] = src1[1][0] * src2[0][3] + src1[1][1] * src2[1][3] + src1[1][2] * src2[2][3] + src1[1][3] * src2[3][3];
  dest[2][0] = src1[2][0] * src2[0][0] + src1[2][1] * src2[1][0] + src1[2][2] * src2[2][0] + src1[2][3] * src2[3][0];
  dest[2][1] = src1[2][0] * src2[0][1] + src1[2][1] * src2[1][1] + src1[2][2] * src2[2][1] + src1[2][3] * src2[3][1];
  dest[2][2] = src1[2][0] * src2[0][2] + src1[2][1] * src2[1][2] + src1[2][2] * src2[2][2] + src1[2][3] * src2[3][2];
  dest[2][3] = src1[2][0] * src2[0][3] + src1[2][1] * src2[1][3] + src1[2][2] * src2[2][3] + src1[2][3] * src2[3][3];
  dest[3][0] = src1[3][0] * src2[0][0] + src1[3][1] * src2[1][0] + src1[3][2] * src2[2][0] + src1[3][3] * src2[3][0];
  dest[3][1] = src1[3][0] * src2[0][1] + src1[3][1] * src2[1][1] + src1[3][2] * src2[2][1] + src1[3][3] * src2[3][1];
  dest[3][2] = src1[3][0] * src2[0][2] + src1[3][1] * src2[1][2] + src1[3][2] * src2[2][2] + src1[3][3] * src2[3][2];
  dest[3][3] = src1[3][0] * src2[0][3] + src1[3][1] * src2[1][3] + src1[3][2] * src2[2][3] + src1[3][3] * src2[3][3];
};

generated asm

c++ -v
Using built-in specs.
COLLECT_GCC=c++
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-apple-darwin12.3.0/4.9.0/lto-wrapper
Target: x86_64-apple-darwin12.3.0
Configured with: ./configure --disable-multilib --disable-bootstrap --enable-lto -disable-libitm --enable-languages=c,c++,fortran,lto --no-create --no-recursion
Thread model: posix
gcc version 4.9.0 20130428 (experimental) [trunk revision 198366] (GCC)

Vincenzos-MacBook-Pro:vectorize innocent$ c++ -O3 -march=corei7-avx -std=c++11 -S matmul.cc -mavx2 -mfma
Vincenzos-MacBook-Pro:vectorize innocent$ cat matmul.s
        .text
        .align 4,0x90
        .globl __Z6matmulv
__Z6matmulv:
LFB0:
        vmovss 8+_b(%rip), %xmm7
        vmovss 24+_b(%rip), %xmm1
        vinsertps $0x10, 12+_b(%rip), %xmm7, %xmm0
        vmovss _b(%rip), %xmm7
        vmovss 16+_b(%rip), %xmm2
        vinsertps $0x10, 4+_b(%rip), %xmm7, %xmm8
        vmovss 40+_b(%rip), %xmm3
        vmovlhps %xmm0, %xmm8, %xmm8
        vmovss 32+_b(%rip), %xmm4
        vinsertf128 $1, %xmm8, %ymm8, %ymm8
        vinsertps $0x10, 28+_b(%rip), %xmm1, %xmm0
        vmovss 56+_b(%rip), %xmm7
        vinsertps $0x10, 20+_b(%rip), %xmm2, %xmm6
        vmovlhps %xmm0, %xmm6, %xmm6
        vmovss 48+_b(%rip), %xmm1
        vinsertf128 $1, %xmm6, %ymm6, %ymm6
        vinsertps $0x10, 44+_b(%rip), %xmm3, %xmm0
        vinsertps $0x10, 36+_b(%rip), %xmm4, %xmm5
        vmovlhps %xmm0, %xmm5, %xmm5
        vinsertps $0x10, 60+_b(%rip), %xmm7, %xmm0
        vinsertps $0x10, 52+_b(%rip), %xmm1, %xmm4
        vmovlhps %xmm0, %xmm4, %xmm4
        vxorps %xmm7, %xmm7, %xmm7
        vmovaps _a(%rip), %ymm0
        vinsertf128 $1, %xmm5, %ymm5, %ymm5
        vinsertf128 $1, %xmm4, %ymm4, %ymm4
        vpermilps $255, %ymm0, %ymm1
        vpermilps $170, %ymm0, %ymm2
        vpermilps $85, %ymm0, %ymm3
        vpermilps $0, %ymm0, %ymm0
        vfmadd132ps %ymm8, %ymm7, %ymm0
        vfmadd132ps %ymm6, %ymm0, %ymm3
        vmovaps 32+_a(%rip), %ymm0
        vfmadd132ps %ymm5, %ymm3, %ymm2
        vfmadd132ps %ymm4, %ymm2, %ymm1
        vmovaps %ymm1, _c(%rip)
        vpermilps $170, %ymm0, %ymm2
        vpermilps $255, %ymm0, %ymm1
        vpermilps $85, %ymm0, %ymm3
        vpermilps $0, %ymm0, %ymm0
        vfmadd132ps %ymm8, %ymm7, %ymm0
        vfmadd132ps %ymm6, %ymm0, %ymm3
        vfmadd132ps %ymm5, %ymm3, %ymm2
        vfmadd132ps %ymm4, %ymm2, %ymm1
        vmovaps %ymm1, 32+_c(%rip)
        vzeroupper
        ret
LFE0:
        .align 4,0x90
        .globl __Z7matmulUv
__Z7matmulUv:
LFB1:
        vmovss 4+_src1(%rip), %xmm5
        vmovss 16+_src2(%rip), %xmm15
        vmovss _src1(%rip), %xmm4
        vmulss %xmm15, %xmm5, %xmm1
        vmovss 8+_src1(%rip), %xmm2
        vmovss 12+_src1(%rip), %xmm0
        vmovss _src2(%rip), %xmm14
        vmovss 32+_src2(%rip), %xmm13
        vmovss 48+_src2(%rip), %xmm12
        vfmadd231ss %xmm14, %xmm4, %xmm1
        vmovss 20+_src2(%rip), %xmm11
        vfmadd231ss %xmm13, %xmm2, %xmm1
        vfmadd231ss %xmm12, %xmm0, %xmm1
        vmovss %xmm1, _dest(%rip)
        vmovss 4+_src2(%rip), %xmm10
        vmulss %xmm11, %xmm5, %xmm1
        vmovss 36+_src2(%rip), %xmm9
        vmovss 52+_src2(%rip), %xmm8
        vmovss 24+_src2(%rip), %xmm7
        vmovss 28+_src2(%rip), %xmm6
        vfmadd231ss %xmm10, %xmm4, %xmm1
        vfmadd231ss %xmm9, %xmm2, %xmm1
        vfmadd231ss %xmm8, %xmm0, %xmm1
        vmovss %xmm1, 4+_dest(%rip)
        vmulss %xmm7, %xmm5, %xmm1
        vmovss 44+_src2(%rip), %xmm3
        vmulss %xmm6, %xmm5, %xmm5
        vfmadd231ss 8+_src2(%rip), %xmm4, %xmm1
        vfmadd231ss 40+_src2(%rip), %xmm2, %xmm1
        vfmadd231ss 56+_src2(%rip), %xmm0, %xmm1
        vfmadd231ss 12+_src2(%rip), %xmm4, %xmm5
        vfmadd231ss %xmm3, %xmm2, %xmm5
        vfmadd231ss 60+_src2(%rip), %xmm0, %xmm5
        vmovss %xmm5, 12+_dest(%rip)
        vmovss 20+_src1(%rip), %xmm5
        vmovss %xmm1, 8+_dest(%rip)
        vmovss 16+_src1(%rip), %xmm4
        vmulss %xmm5, %xmm15, %xmm1
        vmovss 24+_src1(%rip), %xmm2
        vmovss 28+_src1(%rip), %xmm0
        vfmadd231ss %xmm4, %xmm14, %xmm1
        vfmadd231ss %xmm2, %xmm13, %xmm1
        vfmadd231ss %xmm0, %xmm12, %xmm1
        vmovss %xmm1, 16+_dest(%rip)
        vmulss %xmm5, %xmm11, %xmm1
        vfmadd231ss %xmm4, %xmm10, %xmm1
        vfmadd231ss %xmm2, %xmm9, %xmm1
        vfmadd231ss %xmm0, %xmm8, %xmm1
        vmovss %xmm1, 20+_dest(%rip)
        vmulss %xmm5, %xmm7, %xmm1
        vmulss %xmm5, %xmm6, %xmm5
        vfmadd231ss 8+_src2(%rip), %xmm4, %xmm1
        vfmadd231ss 40+_src2(%rip), %xmm2, %xmm1
        vfmadd231ss 56+_src2(%rip), %xmm0, %xmm1
        vmovss %xmm1, 24+_dest(%rip)
        vfmadd231ss 12+_src2(%rip), %xmm4, %xmm5
        vfmadd231ss %xmm2, %xmm3, %xmm5
        vfmadd231ss 60+_src2(%rip), %xmm0, %xmm5
        vmovss %xmm5, 28+_dest(%rip)
        vmovss 36+_src1(%rip), %xmm5
        vmovss 32+_src1(%rip), %xmm4
        vmulss %xmm5, %xmm15, %xmm1
        vmovss 40+_src1(%rip), %xmm2
        vmovss 44+_src1(%rip), %xmm0
        vfmadd231ss %xmm4, %xmm14, %xmm1
        vfmadd231ss %xmm2, %xmm13, %xmm1
        vfmadd231ss %xmm0, %xmm12, %xmm1
        vmovss %xmm1, 32+_dest(%rip)
        vmulss %xmm5, %xmm11, %xmm1
        vfmadd231ss %xmm4, %xmm10, %xmm1
        vfmadd231ss %xmm2, %xmm9, %xmm1
        vfmadd231ss %xmm0, %xmm8, %xmm1
        vmovss %xmm1, 36+_dest(%rip)
        vmulss %xmm5, %xmm7, %xmm1
        vmulss %xmm5, %xmm6, %xmm5
        vfmadd231ss 8+_src2(%rip), %xmm4, %xmm1
        vfmadd231ss 40+_src2(%rip), %xmm2, %xmm1
        vfmadd231ss 56+_src2(%rip), %xmm0, %xmm1
        vfmadd231ss 12+_src2(%rip), %xmm4, %xmm5
        vfmadd231ss %xmm2, %xmm3, %xmm5
        vfmadd231ss 60+_src2(%rip), %xmm0, %xmm5
        vmovss %xmm5, 44+_dest(%rip)
        vmovss 52+_src1(%rip), %xmm5
        vmovss 48+_src1(%rip), %xmm4
        vmovss %xmm1, 40+_dest(%rip)
        vmulss %xmm5, %xmm15, %xmm15
        vmovss 56+_src1(%rip), %xmm2
        vmulss %xmm5, %xmm11, %xmm11
        vmovss 60+_src1(%rip), %xmm0
        vmulss %xmm5, %xmm7, %xmm7
        vmulss %xmm5, %xmm6, %xmm5
        vfmadd231ss %xmm4, %xmm14, %xmm15
        vfmadd231ss %xmm2, %xmm13, %xmm15
        vfmadd231ss %xmm0, %xmm12, %xmm15
        vfmadd132ss %xmm4, %xmm11, %xmm10
        vmovss %xmm15, 48+_dest(%rip)
        vfmadd132ss %xmm2, %xmm10, %xmm9
        vfmadd231ss 8+_src2(%rip), %xmm4, %xmm7
        vfmadd231ss %xmm0, %xmm8, %xmm9
        vfmadd231ss 40+_src2(%rip), %xmm2, %xmm7
        vfmadd132ss 12+_src2(%rip), %xmm5, %xmm4
        vfmadd132ss %xmm3, %xmm4, %xmm2
        vfmadd231ss 56+_src2(%rip), %xmm0, %xmm7
        vfmadd231ss 60+_src2(%rip), %xmm0, %xmm2
        vmovss %xmm9, 52+_dest(%rip)
        vmovss %xmm7, 56+_dest(%rip)
        vmovss %xmm2, 60+_dest(%rip)
        ret
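
For reference (not part of the original report): below is a minimal intrinsics sketch of the broadcast-plus-FMA pattern visible in the __Z6matmulv listing above, i.e. the shape one would hope the vectorizer could also recover from the unrolled matmulU. It assumes AVX and FMA are available (matching the -mavx2 -mfma flags used above) and, for simplicity, handles one row of c per iteration with 128-bit vectors, whereas the compiled loop version processes two rows at a time in 256-bit ymm registers. The helper name matmul_intrin is hypothetical.

#include <immintrin.h>

// Illustration only: compute c[i][:] = sum over k of a[i][k] * b[k][:]
// by broadcasting a[i][k] and doing an FMA with row k of b.
// Uses the same global arrays a, b, c declared in the sample code above.
void matmul_intrin() {  // hypothetical helper, not in the report
  for (int i = 0; i != 4; ++i) {
    __m128 acc = _mm_setzero_ps();
    for (int k = 0; k != 4; ++k) {
      __m128 aik = _mm_broadcast_ss(&a[i][k]);  // splat a[i][k] into all lanes
      __m128 bk  = _mm_load_ps(&b[k][0]);       // row k of b (arrays are 32-byte aligned)
      acc = _mm_fmadd_ps(aik, bk, acc);         // acc += a[i][k] * b[k][:]
    }
    _mm_store_ps(&c[i][0], acc);                // one 16-byte store per row of c
  }
}

matmulU computes exactly the same values, but as 16 independent scalar dot products, which is what the second listing shows: only vmulss/vfmadd231ss scalar operations and one vmovss store per element of dest.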