[Bug target/104188] New: gcc omitting AVX-512 broadcast instruction

2022-01-22 Thread kvr000 at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104188

Bug ID: 104188
   Summary: gcc omitting AVX-512 broadcast instruction
   Product: gcc
   Version: 11.2.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: target
  Assignee: unassigned at gcc dot gnu.org
  Reporter: kvr000 at gmail dot com
  Target Milestone: ---

Hi,
there is a bug when generating AVX-512 instructions from intrinsics.  The code
is generated correctly by gcc-10, but gcc-11 completely omits the
vbroadcastf32x4 instructions.

gcc version: 11.2.0-7ubuntu2 - 11.2.0

Source code of minimal working example:
// Matrix 4*4 multiplication:

#ifndef NO_VECTORIZE
#ifdef __x86_64__
#include <xmmintrin.h>
#include <immintrin.h>
#endif
#ifdef __aarch64__
#include <arm_neon.h>
#endif
#endif

/*
 * 4x4 float matrix.  The scalar view m[4][4] is always available; unless
 * NO_VECTORIZE is defined, the same storage is additionally exposed through
 * target-specific SIMD vector members.
 */
union Mat44 {
    float m[4][4];
#if !defined(NO_VECTORIZE) && defined(__x86_64__)
    /* x86-64 views: as __m128, __m256 and __m512 vectors. */
    __m128 row[4];
    __m256 rowDuet[2];
    __m512 rowQuad;
#endif
#if !defined(NO_VECTORIZE) && defined(__aarch64__)
    /* AArch64 view: as float32x4_t vectors. */
    float32x4_t row[4];
#endif
};

/*
 * 4x4 matrix product computed in a single 512-bit register.
 *
 * out: receives the 16 result floats (stored unaligned at out->m[0]).
 * a:   left operand, all 16 floats loaded unaligned from a->m[0].
 * b:   right operand, read one 128-bit row (b->row[k]) at a time.
 *
 * Each row of b is broadcast to all four 128-bit lanes, and a is permuted so
 * that every lane holds one of its own elements four times; the four
 * products are then accumulated with fused multiply-adds.
 */
void matmult_avx512(union Mat44 *out, union Mat44 *a, union Mat44 *b)
{
    __m512 a0123 = _mm512_loadu_ps(a->m[0]);

    /*
     * One broadcast register per row of b.  Each needs its own name: a
     * shared name would redeclare the parameter b and make every product
     * below use the same operand.
     */
    __m512 b0 = _mm512_broadcast_f32x4(b->row[0]);
    __m512 b1 = _mm512_broadcast_f32x4(b->row[1]);
    __m512 b2 = _mm512_broadcast_f32x4(b->row[2]);
    __m512 b3 = _mm512_broadcast_f32x4(b->row[3]);

    /*
     * Permute controls 0x00 / 0x55 / 0xaa / 0xff replicate element 0 / 1 /
     * 2 / 3 of each 128-bit lane of a across that lane (per the Intel
     * intrinsics documentation); element k is paired with row k of b.
     */
    __m512 result = _mm512_mul_ps(_mm512_permute_ps(a0123, 0x00), b0);
    result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0x55), b1, result);
    result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xaa), b2, result);
    result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xff), b3, result);

    _mm512_storeu_ps(out->m[0], result);
}


gcc-10 (correct):

endbr64
vmovups (%rsi), %zmm0
vbroadcastf32x4 (%rdx), %zmm6   // note here
vpermilps   $0, %zmm0, %zmm1
vmulps  %zmm6, %zmm1, %zmm1
vbroadcastf32x4 16(%rdx), %zmm5 // note here
vpermilps   $85, %zmm0, %zmm2
vbroadcastf32x4 32(%rdx), %zmm4 // note here
vbroadcastf32x4 48(%rdx), %zmm3 // note here
vfmadd132ps %zmm5, %zmm1, %zmm2
vpermilps   $170, %zmm0, %zmm1
vpermilps   $255, %zmm0, %zmm0
vfmadd132ps %zmm4, %zmm2, %zmm1
vfmadd132ps %zmm3, %zmm1, %zmm0
vmovups %zmm0, (%rdi)
vzeroupper
ret


gcc-11 (missing vbroadcastf32x4):

endbr64
vmovups (%rsi), %zmm0
vpermilps   $0, %zmm0, %zmm1
vmulps  (%rdx){1to16}, %zmm1, %zmm1
vpermilps   $85, %zmm0, %zmm2
vfmadd132ps 16(%rdx){1to16}, %zmm1, %zmm2
vpermilps   $170, %zmm0, %zmm1
vpermilps   $255, %zmm0, %zmm0
vfmadd132ps 32(%rdx){1to16}, %zmm2, %zmm1
vfmadd132ps 48(%rdx){1to16}, %zmm1, %zmm0
vmovups %zmm0, (%rdi)
vzeroupper
ret

[Bug target/104188] gcc omitting AVX-512 broadcast instruction

2022-01-22 Thread kvr000 at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104188

--- Comment #2 from Zbynek Vyskovsky  ---
> {1to16} says to broadcast  from first element to all 16.

The vbroadcastf32x4 is supposed to copy the first four elements (0-3) to
elements 4-7, 8-11 and 12-15.

> Why do you think this is wrong code?

It doesn't work.  It produces the same number in every column of a given row,
likely as a result of the above: it multiplies by a single element instead of
four different elements.

[Bug target/104188] gcc omitting AVX-512 broadcast instruction

2022-01-22 Thread kvr000 at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104188

--- Comment #4 from Zbynek Vyskovsky  ---
Sure, the code:

#include <stdio.h>

#ifndef NO_VECTORIZE
#ifdef __x86_64__
#include <xmmintrin.h>
#include <immintrin.h>
#endif
#ifdef __aarch64__
#include <arm_neon.h>
#endif
#endif

/*
 * 4x4 float matrix, usable both as `union Mat44` and as `Mat44`.  The scalar
 * view m[4][4] is always available; unless NO_VECTORIZE is defined, the same
 * storage is additionally exposed through target-specific SIMD vector
 * members.
 */
typedef union Mat44 {
    float m[4][4];
#if !defined(NO_VECTORIZE) && defined(__x86_64__)
    /* x86-64 views: as __m128, __m256 and __m512 vectors. */
    __m128 row[4];
    __m256 rowDuet[2];
    __m512 rowQuad;
#endif
#if !defined(NO_VECTORIZE) && defined(__aarch64__)
    /* AArch64 view: as float32x4_t vectors. */
    float32x4_t row[4];
#endif
} Mat44;

__attribute__((noipa)) void matmult_avx512(union Mat44 *out, const Mat44 *a,
const Mat44 *b)
{
__m512 a0123 = _mm512_loadu_ps(a->m[0]);
__m512 b = _mm512_broadcast_f32x4(b->row[0]);
__m512 b = _mm512_broadcast_f32x4(b->row[1]);
__m512 b = _mm512_broadcast_f32x4(b->row[2]);
__m512 b = _mm512_broadcast_f32x4(b->row[3]);

__m512 result = _mm512_mul_ps(_mm512_permute_ps(a0123, 0x00), b);
result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0x55), b,
result);
result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xaa), b,
result);
result = _mm512_fmadd_ps(_mm512_permute_ps(a0123, 0xff), b,
result);

_mm512_storeu_ps(out->m[0], result);
}

/*
 * Scalar reference 4x4 matrix product: *out = (*a) * (*b).
 *
 * The product is built in a local temporary and copied to *out only at the
 * end, so no element of *out is written while a and b are still being read.
 */
__attribute__((noipa)) void matmult_ref(Mat44 *out, const Mat44 *a,
                                        const Mat44 *b)
{
    Mat44 t;

    for (int row = 0; row < 4; row++) {
        const float *lhs = a->m[row];

        for (int col = 0; col < 4; col++) {
            /* Same left-to-right four-term sum as a dot product of
             * row `row` of a with column `col` of b. */
            t.m[row][col] = lhs[0] * b->m[0][col] +
                            lhs[1] * b->m[1][col] +
                            lhs[2] * b->m[2][col] +
                            lhs[3] * b->m[3][col];
        }
    }

    *out = t;
}

int main(void)
{
Mat44 in = { m: { { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 }, {
13, 14, 15, 16 } } };
Mat44 avx512_out;
Mat44 ref_out;
matmult_ref(&ref_out, &in, &in);
matmult_avx512(&avx512_out, &in, &in);
for (int r = 0; r < 4; ++r) {
printf("%5.0f %5.0f %5.0f %5.0f      %5.0f %5.0f %5.0f
%5.0f\n", avx512_out.m[r][0], avx512_out.m[r][1], avx512_out.m[r][2],
avx512_out.m[r][3], ref_out.m[r][0], ref_out.m[r][1], ref_out.m[r][2],
ref_out.m[r][3]);
}
return 0;
}


Output (AVX-512 result on the left, reference result on the right; note that
the left side repeats its first column, caused by broadcasting a single
element instead of four):

   90    90    90    90         90   100   110   120
  202   202   202   202        202   228   254   280
  314   314   314   314        314   356   398   440
  426   426   426   426        426   484   542   600

[Bug target/104188] [11/12 Regression] gcc omitting AVX-512 broadcast instruction

2022-01-22 Thread kvr000 at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104188

--- Comment #10 from Zbynek Vyskovsky  ---
Thanks for the quick fix!