https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93078
Bug ID: 93078
Summary: Missing fma and round functions auto-vectorization
with x86-64 (sse2)
Product: gcc
Version: 9.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: diegoandres91b at hotmail dot com
Target Milestone: ---
The following code (compiled with -Ofast):
#include <cmath>
using namespace std;
float a[4], b[4], c[4];
// Element-wise fused multiply-add over the global float[4] arrays:
// c[i] = fma(a[i], b[i], c[i]) for i in 0..3.
// Per the listings in this report: with -Ofast alone this becomes four
// scalar calls to fmaf; with -mfma it becomes a single vfmadd132ps.
void vec_fma() {
for (int i = 0; i < 4; ++i) c[i] = fma(a[i], b[i], c[i]);
}
// Applies round() to each of the four floats in a and stores the results in c.
// Per the listings in this report: with -Ofast alone each element gets its
// own inline scalar sequence with a compare-and-branch; with -msse4.1 the
// loop becomes a packed andps/orps/addps sequence followed by roundps with
// immediate 3, rather than a single roundps.
void vec_round() {
for (int i = 0; i < 4; ++i) c[i] = round(a[i]);
}
// Applies floor() to each of the four floats in a and stores the results in c.
// Per the -msse4.1 listing in this report, this becomes one roundps with
// immediate 1. The plain -Ofast (SSE2-only) output is elided from the report.
void vec_floor() {
for (int i = 0; i < 4; ++i) c[i] = floor(a[i]);
}
// Applies ceil() to each of the four floats in a and stores the results in c.
// Per the -msse4.1 listing in this report, this becomes one roundps with
// immediate 2. The plain -Ofast (SSE2-only) output is elided from the report.
void vec_ceil() {
for (int i = 0; i < 4; ++i) c[i] = ceil(a[i]);
}
// Applies trunc() to each of the four floats in a and stores the results in c.
// Per the -msse4.1 listing in this report, this becomes one roundps with
// immediate 3. The plain -Ofast (SSE2-only) output is elided from the report.
void vec_trunc() {
for (int i = 0; i < 4; ++i) c[i] = trunc(a[i]);
}
// Applies rint() to each of the four floats in a and stores the results in c.
// Per the -msse4.1 listing in this report, this becomes one roundps with
// immediate 4. The plain -Ofast (SSE2-only) output is elided from the report.
void vec_rint() {
for (int i = 0; i < 4; ++i) c[i] = rint(a[i]);
}
// Applies nearbyint() to each of the four floats in a and stores the results in c.
// Per the listings in this report, this loop is never vectorized: with
// -Ofast alone it becomes four scalar calls to nearbyintf, and with
// -msse4.1 or -mfma it becomes four scalar roundss/vroundss instructions
// with immediate 12.
void vec_nearbyint() {
for (int i = 0; i < 4; ++i) c[i] = nearbyint(a[i]);
}
Compiles without auto-vectorization:
vec_fma():
sub rsp, 8
movss xmm2, DWORD PTR c[rip]
movss xmm1, DWORD PTR b[rip]
movss xmm0, DWORD PTR a[rip]
call fmaf
movss xmm2, DWORD PTR c[rip+4]
movss xmm1, DWORD PTR b[rip+4]
movss DWORD PTR c[rip], xmm0
movss xmm0, DWORD PTR a[rip+4]
call fmaf
movss xmm2, DWORD PTR c[rip+8]
movss xmm1, DWORD PTR b[rip+8]
movss DWORD PTR c[rip+4], xmm0
movss xmm0, DWORD PTR a[rip+8]
call fmaf
movss xmm2, DWORD PTR c[rip+12]
movss xmm1, DWORD PTR b[rip+12]
movss DWORD PTR c[rip+8], xmm0
movss xmm0, DWORD PTR a[rip+12]
call fmaf
movss DWORD PTR c[rip+12], xmm0
add rsp, 8
ret
vec_round():
movss xmm3, DWORD PTR a[rip]
movss xmm0, DWORD PTR .LC1[rip]
movss xmm2, DWORD PTR .LC0[rip]
movaps xmm4, xmm0
movaps xmm1, xmm3
andps xmm1, xmm0
comiss xmm2, xmm1
jbe .L5
addss xmm1, DWORD PTR .LC2[rip]
andnps xmm4, xmm3
movaps xmm3, xmm4
cvttss2si eax, xmm1
pxor xmm1, xmm1
cvtsi2ss xmm1, eax
orps xmm3, xmm1
.L5:
movss DWORD PTR c[rip], xmm3
movss xmm3, DWORD PTR a[rip+4]
movaps xmm4, xmm0
movaps xmm1, xmm3
andps xmm1, xmm0
comiss xmm2, xmm1
jbe .L6
addss xmm1, DWORD PTR .LC2[rip]
andnps xmm4, xmm3
movaps xmm3, xmm4
cvttss2si eax, xmm1
pxor xmm1, xmm1
cvtsi2ss xmm1, eax
orps xmm3, xmm1
.L6:
movss DWORD PTR c[rip+4], xmm3
movss xmm3, DWORD PTR a[rip+8]
movaps xmm4, xmm0
movaps xmm1, xmm3
andps xmm1, xmm0
comiss xmm2, xmm1
jbe .L7
addss xmm1, DWORD PTR .LC2[rip]
andnps xmm4, xmm3
movaps xmm3, xmm4
cvttss2si eax, xmm1
pxor xmm1, xmm1
cvtsi2ss xmm1, eax
orps xmm3, xmm1
.L7:
movss DWORD PTR c[rip+8], xmm3
movss xmm3, DWORD PTR a[rip+12]
movaps xmm1, xmm3
andps xmm1, xmm0
comiss xmm2, xmm1
jbe .L8
addss xmm1, DWORD PTR .LC2[rip]
andnps xmm0, xmm3
cvttss2si eax, xmm1
pxor xmm1, xmm1
cvtsi2ss xmm1, eax
movaps xmm3, xmm1
orps xmm3, xmm0
.L8:
movss DWORD PTR c[rip+12], xmm3
ret
...
vec_nearbyint():
sub rsp, 8
movss xmm0, DWORD PTR a[rip]
call nearbyintf
movss DWORD PTR c[rip], xmm0
movss xmm0, DWORD PTR a[rip+4]
call nearbyintf
movss DWORD PTR c[rip+4], xmm0
movss xmm0, DWORD PTR a[rip+8]
call nearbyintf
movss DWORD PTR c[rip+8], xmm0
movss xmm0, DWORD PTR a[rip+12]
call nearbyintf
movss DWORD PTR c[rip+12], xmm0
add rsp, 8
ret
In comparison, the icc compiler also fails to auto-vectorize fma in SSE2 mode
(without the native vfmadd132ps FMA instruction), but it does have vectorized
versions of the rounding functions (in SSE2 mode, without the native roundps
instruction from SSE4.1):
vec_round():
push rsi
movups xmm0, XMMWORD PTR a[rip]
call QWORD PTR [__svml_roundf4@GOTPCREL+rip]
movups XMMWORD PTR c[rip], xmm0
pop rcx
ret
...
vec_nearbyint():
push rsi
movups xmm0, XMMWORD PTR a[rip]
call QWORD PTR [__svml_nearbyintf4@GOTPCREL+rip]
movups XMMWORD PTR c[rip], xmm0
pop rcx
ret
Compiler Explorer Code: https://gcc.godbolt.org/z/xwKluO
With the -msse4.1 flag, gcc still fails to auto-vectorize fma and
nearbyint. I am also not sure why it does not auto-vectorize the round function
directly to "roundps xmm0, XMMWORD PTR a[rip], 0":
vec_round():
movaps xmm0, XMMWORD PTR a[rip]
andps xmm0, XMMWORD PTR .LC1[rip]
orps xmm0, XMMWORD PTR .LC0[rip]
addps xmm0, XMMWORD PTR a[rip]
roundps xmm0, xmm0, 3
movaps XMMWORD PTR c[rip], xmm0
ret
vec_floor():
roundps xmm0, XMMWORD PTR a[rip], 1
movaps XMMWORD PTR c[rip], xmm0
ret
vec_ceil():
roundps xmm0, XMMWORD PTR a[rip], 2
movaps XMMWORD PTR c[rip], xmm0
ret
vec_trunc():
roundps xmm0, XMMWORD PTR a[rip], 3
movaps XMMWORD PTR c[rip], xmm0
ret
vec_rint():
roundps xmm0, XMMWORD PTR a[rip], 4
movaps XMMWORD PTR c[rip], xmm0
ret
vec_nearbyint():
mov eax, OFFSET FLAT:a
movss xmm0, DWORD PTR [rax]
roundss xmm0, xmm0, 12
movss DWORD PTR c[rip], xmm0
movss xmm0, DWORD PTR [rax+4]
roundss xmm0, xmm0, 12
movss DWORD PTR c[rip+4], xmm0
movss xmm0, DWORD PTR [rax+8]
roundss xmm0, xmm0, 12
movss DWORD PTR c[rip+8], xmm0
movss xmm0, DWORD PTR [rax+12]
roundss xmm0, xmm0, 12
movss DWORD PTR c[rip+12], xmm0
ret
Compiler Explorer Code: https://gcc.godbolt.org/z/Rc63b9
With the -mfma flag, the nearbyint function is still not auto-vectorized:
vec_fma():
vmovaps xmm1, XMMWORD PTR c[rip]
vmovaps xmm0, XMMWORD PTR a[rip]
vfmadd132ps xmm0, xmm1, XMMWORD PTR b[rip]
vmovaps XMMWORD PTR c[rip], xmm0
ret
vec_round():
vmovaps xmm1, XMMWORD PTR a[rip]
vandps xmm0, xmm1, XMMWORD PTR .LC1[rip]
vorps xmm0, xmm0, XMMWORD PTR .LC0[rip]
vaddps xmm0, xmm0, xmm1
vroundps xmm0, xmm0, 3
vmovaps XMMWORD PTR c[rip], xmm0
ret
vec_floor():
vroundps xmm0, XMMWORD PTR a[rip], 1
vmovaps XMMWORD PTR c[rip], xmm0
ret
vec_ceil():
vroundps xmm0, XMMWORD PTR a[rip], 2
vmovaps XMMWORD PTR c[rip], xmm0
ret
vec_trunc():
vroundps xmm0, XMMWORD PTR a[rip], 3
vmovaps XMMWORD PTR c[rip], xmm0
ret
vec_rint():
vroundps xmm0, XMMWORD PTR a[rip], 4
vmovaps XMMWORD PTR c[rip], xmm0
ret
vec_nearbyint():
mov eax, OFFSET FLAT:a
vmovss xmm1, DWORD PTR [rax]
vmovss xmm2, DWORD PTR [rax+4]
vmovss xmm3, DWORD PTR [rax+8]
vmovss xmm4, DWORD PTR [rax+12]
vroundss xmm0, xmm1, xmm1, 12
vmovss DWORD PTR c[rip], xmm0
vroundss xmm0, xmm2, xmm2, 12
vmovss DWORD PTR c[rip+4], xmm0
vroundss xmm0, xmm3, xmm3, 12
vmovss DWORD PTR c[rip+8], xmm0
vroundss xmm0, xmm4, xmm4, 12
vmovss DWORD PTR c[rip+12], xmm0
ret
Compiler Explorer Code: https://gcc.godbolt.org/z/_WeniA