https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79830
Bug ID: 79830
Summary: GCC generates counterproductive code surrounding very
simple loops (improvement request)
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: kobalicek.petr at gmail dot com
Target Milestone: ---
It seems that GCC tries very hard to optimize loops, but in my case it's
counterproductive. I have illustrated the problem in the following C++ code and
disassembly.
Loops constructed this way need only one variable (`i`) as a loop counter and use
the sign flag to check whether the loop is done. Typically such a loop requires
only a simple check at the beginning (`sub` and `js`) and at the end. The point of
writing the loop this way is to save registers and to keep the code surrounding
the loop minimal.
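To make the idiom concrete, here is a minimal, self-contained sketch (a hypothetical
example of mine, not taken from the function below; the unroll factor 8 and all names
are arbitrary):

#include <stddef.h>
#include <stdint.h>

// Sums `length` doubles, 8 at a time, using the counter/sign-flag idiom
// described above. The hope is that each `i -= 8` compiles to a single
// `sub` whose sign flag feeds a `js`, with no extra trip-count setup.
double sum8(const double* p, size_t length) {
  double acc = 0.0;
  intptr_t i = static_cast<intptr_t>(length);
  while ((i -= 8) >= 0) {
    for (int k = 0; k < 8; k++)
      acc += p[k];
    p += 8;
  }
  i += 8; // number of leftover elements (0..7)
  while (--i >= 0)
    acc += *p++;
  return acc;
}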
However, GCC seems to convert such a loop into something else, and the conversion
requires a lot of extra operations, resulting in bigger and slower code.
When using `-Os`, GCC produces the code I would expect; however, I don't want to
optimize for size globally.
This is not a compiler bug, but I think that in this case the optimization doesn't
make sense and only adds to the executable/library size. I doubt it leads to any
improvement, and it would be nice if GCC could somehow recognize loops of this kind
and leave them alone.
Also, here is a Compiler Explorer URL for anyone who wants to compare:
https://godbolt.org/g/oeDGmy
Consider the following C++ code
-------------------------------
#include <stdint.h>
#if defined(_MSC_VER)
# include <intrin.h>
#else
# include <x86intrin.h>
#endif
void transform(double* dst, const double* src, const double* matrix, size_t length) {
  __m256d m_00_11 = _mm256_castpd128_pd256(_mm_set_pd(matrix[3], matrix[0]));
  __m256d m_10_01 = _mm256_castpd128_pd256(_mm_set_pd(matrix[1], matrix[2]));
  __m256d m_20_21 = _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(matrix + 4));

  m_00_11 = _mm256_permute2f128_pd(m_00_11, m_00_11, 0);
  m_10_01 = _mm256_permute2f128_pd(m_10_01, m_10_01, 0);

  intptr_t i = static_cast<intptr_t>(length);

  // Main loop: 8 points (16 doubles) per iteration.
  while ((i -= 8) >= 0) {
    __m256d s0 = _mm256_loadu_pd(src + 0);
    __m256d s1 = _mm256_loadu_pd(src + 4);
    __m256d s2 = _mm256_loadu_pd(src + 8);
    __m256d s3 = _mm256_loadu_pd(src + 12);

    __m256d a0 = _mm256_add_pd(_mm256_mul_pd(s0, m_00_11), m_20_21);
    __m256d a1 = _mm256_add_pd(_mm256_mul_pd(s1, m_00_11), m_20_21);
    __m256d a2 = _mm256_add_pd(_mm256_mul_pd(s2, m_00_11), m_20_21);
    __m256d a3 = _mm256_add_pd(_mm256_mul_pd(s3, m_00_11), m_20_21);

    __m256d b0 = _mm256_mul_pd(_mm256_shuffle_pd(s0, s0, 0x1), m_10_01);
    __m256d b1 = _mm256_mul_pd(_mm256_shuffle_pd(s1, s1, 0x1), m_10_01);
    __m256d b2 = _mm256_mul_pd(_mm256_shuffle_pd(s2, s2, 0x1), m_10_01);
    __m256d b3 = _mm256_mul_pd(_mm256_shuffle_pd(s3, s3, 0x1), m_10_01);

    _mm256_storeu_pd(dst + 0, _mm256_add_pd(a0, b0));
    _mm256_storeu_pd(dst + 4, _mm256_add_pd(a1, b1));
    _mm256_storeu_pd(dst + 8, _mm256_add_pd(a2, b2));
    _mm256_storeu_pd(dst + 12, _mm256_add_pd(a3, b3));

    dst += 16;
    src += 16;
  }
  i += 8;

  // Tail loop: 2 points (4 doubles) per iteration.
  while ((i -= 2) >= 0) {
    __m256d s0 = _mm256_loadu_pd(src);
    __m256d a0 = _mm256_add_pd(_mm256_mul_pd(s0, m_00_11), m_20_21);
    __m256d b0 = _mm256_mul_pd(_mm256_shuffle_pd(s0, s0, 0x1), m_10_01);
    _mm256_storeu_pd(dst, _mm256_add_pd(a0, b0));

    dst += 4;
    src += 4;
  }

  // Last point, if any.
  if (i & 1) {
    __m128d s0 = _mm_loadu_pd(src + 0);
    __m128d a0 = _mm_add_pd(_mm_mul_pd(s0, _mm256_castpd256_pd128(m_00_11)),
                            _mm256_castpd256_pd128(m_20_21));
    __m128d b0 = _mm_mul_pd(_mm_shuffle_pd(s0, s0, 0x1),
                            _mm256_castpd256_pd128(m_10_01));
    _mm_storeu_pd(dst + 0, _mm_add_pd(a0, b0));
  }
}
Which is compiled to the following
----------------------------------
(-O2 -mavx -fno-exceptions -fno-tree-vectorize)
See the comments marking what I didn't like.
transform(double*, double const*, double const*, unsigned long):
vmovsd xmm4, QWORD PTR [rdx]
mov r9, rcx
vmovsd xmm5, QWORD PTR [rdx+16]
sub r9, 8
vmovhpd xmm4, xmm4, QWORD PTR [rdx+24]
vbroadcastf128 ymm6, XMMWORD PTR [rdx+32]
mov r8, rcx
vmovhpd xmm5, xmm5, QWORD PTR [rdx+8]
vperm2f128 ymm4, ymm4, ymm4, 0
vperm2f128 ymm5, ymm5, ymm5, 0
js .L6
;; <----------------------- Weird
mov rax, r9
sub rcx, 16
mov r8, r9
and rax, -8
mov rdx, rsi
sub rcx, rax
mov rax, rdi
;; <----------------------- Weird
.L5:
vmovupd xmm3, XMMWORD PTR [rdx]
sub r8, 8
sub rax, -128
sub rdx, -128
vinsertf128 ymm3, ymm3, XMMWORD PTR [rdx-112], 0x1
vmovupd xmm2, XMMWORD PTR [rdx-96]
vmovupd xmm1, XMMWORD PTR [rdx-64]
vinsertf128 ymm2, ymm2, XMMWORD PTR [rdx-80], 0x1
vmovupd xmm0, XMMWORD PTR [rdx-32]
vshufpd ymm10, ymm3, ymm3, 1
vmulpd ymm3, ymm4, ymm3
vinsertf128 ymm1, ymm1, XMMWORD PTR [rdx-48], 0x1
vinsertf128 ymm0, ymm0, XMMWORD PTR [rdx-16], 0x1
vaddpd ymm3, ymm3, ymm6
vmulpd ymm10, ymm5, ymm10
vshufpd ymm9, ymm2, ymm2, 1
vshufpd ymm8, ymm1, ymm1, 1
vshufpd ymm7, ymm0, ymm0, 1
vaddpd ymm3, ymm3, ymm10
vmulpd ymm2, ymm4, ymm2
vmulpd ymm1, ymm4, ymm1
vmulpd ymm0, ymm4, ymm0
vmovups XMMWORD PTR [rax-128], xmm3
vextractf128 XMMWORD PTR [rax-112], ymm3, 0x1
vaddpd ymm2, ymm2, ymm6
vmulpd ymm3, ymm5, ymm9
vaddpd ymm1, ymm1, ymm6
vmulpd ymm8, ymm5, ymm8
vaddpd ymm0, ymm0, ymm6
vmulpd ymm7, ymm5, ymm7
vaddpd ymm2, ymm2, ymm3
vaddpd ymm1, ymm1, ymm8
vaddpd ymm0, ymm0, ymm7
vmovups XMMWORD PTR [rax-96], xmm2
vextractf128 XMMWORD PTR [rax-80], ymm2, 0x1
vmovups XMMWORD PTR [rax-64], xmm1
vextractf128 XMMWORD PTR [rax-48], ymm1, 0x1
vmovups XMMWORD PTR [rax-32], xmm0
vextractf128 XMMWORD PTR [rax-16], ymm0, 0x1
cmp rcx, r8
jne .L5
;; <----------------------- Weird
mov rax, r9
shr rax, 3
lea rdx, [rax+1]
neg rax
lea r8, [r9+rax*8]
sal rdx, 7
add rdi, rdx
add rsi, rdx
.L6:
mov rax, r8
sub rax, 2
js .L4
shr rax
lea rcx, [rax+1]
mov rdx, rax
xor eax, eax
sal rcx, 5
;; <----------------------- Weird
.L7:
vmovupd xmm0, XMMWORD PTR [rsi+rax]
vinsertf128 ymm0, ymm0, XMMWORD PTR [rsi+16+rax], 0x1
vshufpd ymm1, ymm0, ymm0, 1
vmulpd ymm0, ymm4, ymm0
vmulpd ymm1, ymm5, ymm1
vaddpd ymm0, ymm0, ymm6
vaddpd ymm0, ymm0, ymm1
vmovups XMMWORD PTR [rdi+rax], xmm0
vextractf128 XMMWORD PTR [rdi+16+rax], ymm0, 0x1
add rax, 32
;; <----------------------- Doesn't follow (i -= 2) >= 0
cmp rcx, rax
jne .L7
;; <----------------------- Weird
mov rax, rdx
add rdi, rcx
add rsi, rcx
neg rax
lea rax, [r8-4+rax*2]
;; <----------------------- Weird
.L4:
test al, 1
je .L14
vmovupd xmm0, XMMWORD PTR [rsi]
vmulpd xmm4, xmm4, xmm0
vshufpd xmm1, xmm0, xmm0, 1
vaddpd xmm6, xmm4, xmm6
vmulpd xmm5, xmm5, xmm1
vaddpd xmm5, xmm6, xmm5
vmovups XMMWORD PTR [rdi], xmm5
.L14:
vzeroupper
ret