https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93930
Bug ID: 93930
Summary: Unnecessary broadcast instructions for AVX512
Product: gcc
Version: 9.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: fredrik987 at gmail dot com
Target Milestone: ---
Created attachment 47908
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=47908&action=edit
Test case
The code below generates unnecessary broadcast instructions for AVX512,
compiled with "-Ofast -march=skylake-avx512". This occurs for gcc trunk and
9.2/8.3 but not 7.5.
Most constants are read from memory via vbroadcastss except two, which are read
as scalars and then broadcast within the loop. For gcc 7.5 all constants are
read via vbroadcastss.
The problem seems to be more frequent for larger functions.
---
Compiler output for gcc 9.2:
...
.L3:
vmovaps zmm0, ZMMWORD PTR [rdi]
add rdi, 64
vmovaps zmm3, zmm0
vmovaps zmm1, zmm0
vmulps zmm2, zmm0, zmm0
vfmadd132ps zmm3, zmm11, zmm12
vfmadd132ps zmm1, zmm13, zmm14
vmovaps zmm4, zmm0
vfmadd132ps zmm4, zmm7, zmm8
sub rsi, -128
vfmadd132ps zmm1, zmm3, zmm2
vmovaps zmm3, zmm0
vfmadd132ps zmm3, zmm9, zmm10
vfmadd132ps zmm3, zmm4, zmm2
vbroadcastss zmm4, xmm15 <--- Broadcast within loop
vmulps zmm3, zmm3, zmm1
vmovaps ZMMWORD PTR [rsi-128], zmm3
vbroadcastss zmm3, xmm16 <--- Broadcast within loop
vfmadd132ps zmm3, zmm4, zmm0
vfmadd132ps zmm0, zmm5, zmm6
vfmadd132ps zmm0, zmm3, zmm2
vmulps zmm1, zmm1, zmm0
vmovaps ZMMWORD PTR [rsi-64], zmm1
cmp rdi, rax
jne .L3
...
---
#include <immintrin.h>
/*
 * Per-lane polynomial used by the reproducer:
 *   x*x * (11*x + 12) + (13*x + 14)
 * Each coefficient is splatted to all lanes with _mm512_set1_ps; these
 * splats are the constants whose loading the report is about.
 */
static __m512 f(__m512 x)
{
__m512 a = _mm512_set1_ps(11);
__m512 b = _mm512_set1_ps(12);
__m512 c = _mm512_set1_ps(13);
__m512 d = _mm512_set1_ps(14);
/* y = x * x */
__m512 y = _mm512_mul_ps(x, x);
/* Two inner fused multiply-adds combined by an outer one:
   y * (x * a + b) + (x * c + d). */
return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c,
d));
}
/*
 * Per-lane polynomial used by the reproducer:
 *   x*x * (21*x + 22) + (23*x + 24)
 * Each coefficient is splatted to all lanes with _mm512_set1_ps.
 */
static __m512 g(__m512 x)
{
__m512 a = _mm512_set1_ps(21);
__m512 b = _mm512_set1_ps(22);
__m512 c = _mm512_set1_ps(23);
__m512 d = _mm512_set1_ps(24);
/* y = x * x */
__m512 y = _mm512_mul_ps(x, x);
/* y * (x * a + b) + (x * c + d), built from three fused multiply-adds. */
return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c,
d));
}
/*
 * Per-lane polynomial used by the reproducer:
 *   x*x * (31*x + 32) + (33*x + 34)
 * Each coefficient is splatted to all lanes with _mm512_set1_ps.
 */
static __m512 h(__m512 x)
{
__m512 a = _mm512_set1_ps(31);
__m512 b = _mm512_set1_ps(32);
__m512 c = _mm512_set1_ps(33);
__m512 d = _mm512_set1_ps(34);
/* y = x * x */
__m512 y = _mm512_mul_ps(x, x);
/* y * (x * a + b) + (x * c + d), built from three fused multiply-adds. */
return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c,
d));
}
/*
 * Reproducer entry point. For each of the n input vectors read from x,
 * stores two result vectors to y: f(u) * h(u) followed by g(u) * h(u).
 *
 * x: input, n vectors are read.
 * y: output, 2 * n vectors are written.
 * n: iteration count; the loop body does not run when n <= 0.
 */
void test(__m512 *x, __m512 *y, int n)
{
for (int i = 0; i < n; i++) {
__m512 u = *x++;
/* h(u) is computed once and shared by both products below. */
__m512 v = h(u);
*y++ = _mm512_mul_ps(f(u), v);
*y++ = _mm512_mul_ps(g(u), v);
}
}