[Bug target/104188] New: gcc omitting AVX-512 broadcast instruction
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104188 Bug ID: 104188 Summary: gcc omitting AVX-512 broadcast instruction Product: gcc Version: 11.2.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: kvr000 at gmail dot com Target Milestone: --- Hi, there is a bug when generating AVX-512 instructions from intrinsics. The code is generated correctly in gcc-10 but gcc-11 completely omits the vbroadcastf32x4 . gcc version: 11.2.0-7ubuntu2 - 11.2.0 Source code of minimal working example: // Matrix 4*4 multiplication: #ifndef NO_VECTORIZE #ifdef __x86_64__ #include #include #endif #ifdef __aarch64__ #include #endif #endif union Mat44 { float m[4][4]; #ifndef NO_VECTORIZE #ifdef __x86_64__ __m128 row[4]; __m256 rowDuet[2]; __m512 rowQuad; #endif #ifdef __aarch64__ float32x4_t row[4]; #endif #endif }; void matmult_avx512(union Mat44 *out, union Mat44 *a, union Mat44 *b) { __m512 a0123 = _mm512_loadu_ps(a->m[0]); __m512 b = _mm512_broadcast_f32x4(b->row[0]); __m512 b = _mm512_broadcast_f32x4(b->row[1]); __m512 b = _mm512_broadcast_f32x4(b->row[2]); __m512 b = _mm512_broadcast_f32x4(b->row[3]); __m512 result = _mm512_mul_ps(_mm512_permute_ps(a0123, 0x00), b); result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0x55), b, result); result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xaa), b, result); result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xff), b, result); _mm512_storeu_ps(out->m[0], result); } gcc-10 (correct): endbr64 vmovups (%rsi), %zmm0 vbroadcastf32x4 (%rdx), %zmm6 // note here vpermilps $0, %zmm0, %zmm1 vmulps %zmm6, %zmm1, %zmm1 vbroadcastf32x4 16(%rdx), %zmm5 // note here vpermilps $85, %zmm0, %zmm2 vbroadcastf32x4 32(%rdx), %zmm4 // note here vbroadcastf32x4 48(%rdx), %zmm3 // note here vfmadd132ps %zmm5, %zmm1, %zmm2 vpermilps $170, %zmm0, %zmm1 vpermilps $255, %zmm0, %zmm0 vfmadd132ps %zmm4, %zmm2, %zmm1 vfmadd132ps %zmm3, %zmm1, %zmm0 vmovups %zmm0, (%rdi) vzeroupper ret gcc-11 (missing 
vbroadcastf32x4) : endbr64 vmovups (%rsi), %zmm0 vpermilps $0, %zmm0, %zmm1 vmulps (%rdx){1to16}, %zmm1, %zmm1 vpermilps $85, %zmm0, %zmm2 vfmadd132ps 16(%rdx){1to16}, %zmm1, %zmm2 vpermilps $170, %zmm0, %zmm1 vpermilps $255, %zmm0, %zmm0 vfmadd132ps 32(%rdx){1to16}, %zmm2, %zmm1 vfmadd132ps 48(%rdx){1to16}, %zmm1, %zmm0 vmovups %zmm0, (%rdi) vzeroupper ret
[Bug target/104188] gcc omitting AVX-512 broadcast instruction
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104188 --- Comment #2 from Zbynek Vyskovsky --- > {1to16} says to broadcast from first element to all 16. The vbroadcastf32x4 is supposed to copy first four elements to 4-7, 8-11 and 12-15. > Why do you think this is wrong code? It doesn't work. It produces the same number for each column for the same row, likely as a result of above as it uses single element to multiply instead of four different elements.
[Bug target/104188] gcc omitting AVX-512 broadcast instruction
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104188 --- Comment #4 from Zbynek Vyskovsky --- Sure, the code: #include #ifndef NO_VECTORIZE #ifdef __x86_64__ #include #include #endif #ifdef __aarch64__ #include #endif #endif typedef union Mat44 { float m[4][4]; #ifndef NO_VECTORIZE #ifdef __x86_64__ __m128 row[4]; __m256 rowDuet[2]; __m512 rowQuad; #endif #ifdef __aarch64__ float32x4_t row[4]; #endif #endif } Mat44; __attribute__((noipa)) void matmult_avx512(union Mat44 *out, const Mat44 *a, const Mat44 *b) { __m512 a0123 = _mm512_loadu_ps(a->m[0]); __m512 b = _mm512_broadcast_f32x4(b->row[0]); __m512 b = _mm512_broadcast_f32x4(b->row[1]); __m512 b = _mm512_broadcast_f32x4(b->row[2]); __m512 b = _mm512_broadcast_f32x4(b->row[3]); __m512 result = _mm512_mul_ps(_mm512_permute_ps(a0123, 0x00), b); result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0x55), b, result); result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xaa), b, result); result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xff), b, result); _mm512_storeu_ps(out->m[0], result); } __attribute__((noipa)) void matmult_ref(Mat44 *out, const Mat44 *a, const Mat44 *b) { Mat44 t; for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { t.m[i][j] = a->m[i][0]*b->m[0][j] + a->m[i][1]*b->m[1][j] + a->m[i][2]*b->m[2][j] + a->m[i][3]*b->m[3][j]; } } *out = t; } int main(void) { Mat44 in = { m: { { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 }, { 13, 14, 15, 16 } } }; Mat44 avx512_out; Mat44 ref_out; matmult_ref(&ref_out, &in, &in); matmult_avx512(&avx512_out, &in, &in); for (int r = 0; r < 4; ++r) { printf("%5.0f %5.0f %5.0f %5.0f %5.0f %5.0f %5.0f %5.0f\n", avx512_out.m[r][0], avx512_out.m[r][1], avx512_out.m[r][2], avx512_out.m[r][3], ref_out.m[r][0], ref_out.m[r][1], ref_out.m[r][2], ref_out.m[r][3]); } return 0; } Output (note the repeating first column on first side, caused by duplicating single element instead of four):
   90    90    90    90    90   100   110   120
  202   202   202   202   202   228   254   280
  314   314   314   314   314   356   398   440
  426   426   426   426   426   484   542   600
[Bug target/104188] [11/12 Regression] gcc omitting AVX-512 broadcast instruction
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104188 --- Comment #10 from Zbynek Vyskovsky --- Thanks for the quick fix!