https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99412
Bug ID: 99412
Summary: s352 benchmark of TSVC is vectorized by clang and not by gcc
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
// Element type of the kernel's data; single-precision float here.
typedef float real_t;
// Base repetition count; main() repeats the kernel 8*iterations times.
#define iterations 100000
// Number of elements in each of the 1-D arrays a and b.
#define LEN_1D 32000
// Not referenced by any of the code shown in this reproducer.
#define LEN_2D 256
// The two input vectors of the dot product. They are file-scope objects with
// no initializer, so both start out zero-filled.
real_t a[LEN_1D],b[LEN_1D];
// Reproducer for the s352 kernel named in the report summary: a dot product
// of a and b whose loop body is manually unrolled five times.
// Returns the final value of dot converted from real_t to int.
int main ()
{
// loop rerolling
// unrolled dot product
// Assigned at the top of every outer iteration, before it is first read.
real_t dot;
// Repeat the whole kernel 8*iterations times.
for (int nl = 0; nl < 8*iterations; nl++) {
// Restart the sum on each pass, so only the last pass reaches the return.
dot = (real_t)0.;
// i advances by 5; LEN_1D (32000) is a multiple of 5, so the largest index
// touched is 31995 + 4 = 31999, the last valid element.
for (int i = 0; i < LEN_1D; i += 5) {
// Five products are added to dot in one left-to-right chain of float
// additions; the grouping is part of what the compiler has to handle.
dot = dot + a[i] * b[i] + a[i + 1] * b[i + 1] + a[i + 2]
* b[i + 2] + a[i + 3] * b[i + 3] + a[i + 4] * b[i + 4];
}
}
// Implicit float-to-int conversion of the result.
return dot;
}
clang does:
main: # @main
.cfi_startproc
# %bb.0:
xorl %eax, %eax
.p2align 4, 0x90
.LBB0_1: # =>This Loop Header: Depth=1
# Child Loop BB0_2 Depth 2
vxorps %xmm0, %xmm0, %xmm0
movq $-5, %rcx
.p2align 4, 0x90
.LBB0_2: # Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
vmovups b+20(,%rcx,4), %xmm1
vmovss b+36(,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
vmulps a+20(,%rcx,4), %xmm1, %xmm1
vpermilpd $1, %xmm1, %xmm3 # xmm3 = xmm1[1,0]
vaddps %xmm3, %xmm1, %xmm1
vmovshdup %xmm1, %xmm3 # xmm3 = xmm1[1,1,3,3]
vaddss %xmm3, %xmm1, %xmm1
vfmadd231ss a+36(,%rcx,4), %xmm2, %xmm1 # xmm1 = (xmm2 * mem) + xmm1
addq $5, %rcx
vaddss %xmm0, %xmm1, %xmm0
cmpq $31995, %rcx # imm = 0x7CFB
jb .LBB0_2
# %bb.3: # in Loop: Header=BB0_1 Depth=1
incl %eax
cmpl $800000, %eax # imm = 0xC3500
jne .LBB0_1
# %bb.4:
vcvttss2si %xmm0, %eax
retq