https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79830

            Bug ID: 79830
           Summary: GCC generates counterproductive code surrounding very
                    simple loops (improvement request)
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: kobalicek.petr at gmail dot com
  Target Milestone: ---

It seems that GCC tries very hard to optimize loops, but in my case it's
counterproductive. I have illustrated the problem in the following C++ code and
disassembly.

Loops that are constructed this way need only one variable (`i`) as a loop
counter and use the sign flag to check whether the loop is done. Typically,
such a loop requires only a simple check at the beginning (`sub` and `js`) and
at the end. The purpose of such a loop is to save registers and to require only
minimal code surrounding the loop.

However, it seems that GCC tries to convert such loop into something else and
requires a lot of operations to do that, resulting in bigger and slower code.
When using `-Os` GCC produces code that I would expect, however, I don't want
to optimize for size globally.

It's not a compiler bug, but I think that in this case this optimization
doesn't make any sense and only adds to the executable/library size. I doubt
it leads to any improvement, and it would be nice if GCC could somehow detect
that it should not do this for these kinds of loops.

Also, here is a compiler explorer URL, for people wanting to compare:

  https://godbolt.org/g/oeDGmy



Consider the following C++ code
-------------------------------


#include <stdint.h>

#if defined(_MSC_VER)
# include <intrin.h>
#else
# include <x86intrin.h>
#endif

// Applies a 2x3 affine matrix to an array of 2D points (AVX intrinsics).
//
// dst    - output points, `length` (x, y) pairs (2 doubles per point)
// src    - input points, same layout; may equal dst? not shown — assumes
//          no problematic overlap (TODO confirm with caller)
// matrix - 6 doubles; presumably row-major [m00 m01 m10 m11 m20 m21] so
//          x' = x*m00 + y*m10 + m20, y' = x*m01 + y*m11 + m21
// length - number of points (NOT doubles); converted to signed below so the
//          sign flag can terminate the loops
void transform(double* dst, const double* src, const double* matrix, size_t
length) {
  // Pack diagonal terms {m00, m11} and cross terms {m10, m01}; the swapped
  // order in m_10_01 pairs each coefficient with the *other* coordinate
  // after the in-lane shuffle in the loops below.
  __m256d m_00_11 = _mm256_castpd128_pd256(_mm_set_pd(matrix[3], matrix[0]));
  __m256d m_10_01 = _mm256_castpd128_pd256(_mm_set_pd(matrix[1], matrix[2]));
  // Translation {m20, m21} broadcast to both 128-bit lanes.
  __m256d m_20_21 = _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(matrix
+ 4));

  // Duplicate the low 128-bit lane into the high lane so each ymm register
  // holds the same coefficient pair twice (two points per ymm).
  m_00_11 = _mm256_permute2f128_pd(m_00_11, m_00_11, 0);
  m_10_01 = _mm256_permute2f128_pd(m_10_01, m_10_01, 0);

  // Signed counter: each `(i -= N) >= 0` check is a single sub + sign test.
  intptr_t i = static_cast<intptr_t>(length);
  // Main loop: 8 points (16 doubles) per iteration, 4 ymm loads/stores.
  while ((i -= 8) >= 0) {
    // Each s register holds two points: {x0, y0, x1, y1}.
    __m256d s0 = _mm256_loadu_pd(src +  0);
    __m256d s1 = _mm256_loadu_pd(src +  4);
    __m256d s2 = _mm256_loadu_pd(src +  8);
    __m256d s3 = _mm256_loadu_pd(src + 12);

    // a = {x*m00 + m20, y*m11 + m21} per point.
    __m256d a0 = _mm256_add_pd(_mm256_mul_pd(s0, m_00_11), m_20_21);
    __m256d a1 = _mm256_add_pd(_mm256_mul_pd(s1, m_00_11), m_20_21);
    __m256d a2 = _mm256_add_pd(_mm256_mul_pd(s2, m_00_11), m_20_21);
    __m256d a3 = _mm256_add_pd(_mm256_mul_pd(s3, m_00_11), m_20_21);

    // shuffle 0x1 swaps x/y within each 128-bit lane: {y, x};
    // b = {y*m10, x*m01} per point.
    __m256d b0 = _mm256_mul_pd(_mm256_shuffle_pd(s0, s0, 0x1), m_10_01);
    __m256d b1 = _mm256_mul_pd(_mm256_shuffle_pd(s1, s1, 0x1), m_10_01);
    __m256d b2 = _mm256_mul_pd(_mm256_shuffle_pd(s2, s2, 0x1), m_10_01);
    __m256d b3 = _mm256_mul_pd(_mm256_shuffle_pd(s3, s3, 0x1), m_10_01);

    // a + b = {x*m00 + y*m10 + m20, x*m01 + y*m11 + m21}.
    _mm256_storeu_pd(dst +  0, _mm256_add_pd(a0, b0));
    _mm256_storeu_pd(dst +  4, _mm256_add_pd(a1, b1));
    _mm256_storeu_pd(dst +  8, _mm256_add_pd(a2, b2));
    _mm256_storeu_pd(dst + 12, _mm256_add_pd(a3, b3));

    dst += 16;
    src += 16;
  }
  // Undo the final overshooting subtraction; i is now the remaining point
  // count in [0, 7].
  i += 8;

  // Tail loop: 2 points (one ymm) per iteration.
  while ((i -= 2) >= 0) {
    __m256d s0 = _mm256_loadu_pd(src);

    __m256d a0 = _mm256_add_pd(_mm256_mul_pd(s0, m_00_11), m_20_21);
    __m256d b0 = _mm256_mul_pd(_mm256_shuffle_pd(s0, s0, 0x1), m_10_01);

    _mm256_storeu_pd(dst, _mm256_add_pd(a0, b0));

    dst += 4;
    src += 4;
  }

  // On exit i is -1 if exactly one point remains, -2 if none; under
  // two's-complement representation `i & 1` is therefore 1 only in the
  // one-point-left case. Handle it with 128-bit (xmm) ops.
  if (i & 1) {
    __m128d s0 = _mm_loadu_pd(src +  0);

    __m128d a0 = _mm_add_pd(_mm_mul_pd(s0, _mm256_castpd256_pd128(m_00_11)),
_mm256_castpd256_pd128(m_20_21));
    __m128d b0 = _mm_mul_pd(_mm_shuffle_pd(s0, s0, 0x1),
_mm256_castpd256_pd128(m_10_01));

    _mm_storeu_pd(dst +  0, _mm_add_pd(a0, b0));
  }
}



Which is compiled to the following
----------------------------------

(-O2 -mavx -fno-exceptions -fno-tree-vectorize)

See the comments describing what I didn't like.

transform(double*, double const*, double const*, unsigned long):
        vmovsd  xmm4, QWORD PTR [rdx]
        mov     r9, rcx
        vmovsd  xmm5, QWORD PTR [rdx+16]
        sub     r9, 8
        vmovhpd xmm4, xmm4, QWORD PTR [rdx+24]
        vbroadcastf128  ymm6, XMMWORD PTR [rdx+32]
        mov     r8, rcx
        vmovhpd xmm5, xmm5, QWORD PTR [rdx+8]
        vperm2f128      ymm4, ymm4, ymm4, 0
        vperm2f128      ymm5, ymm5, ymm5, 0
        js      .L6

        ;; <----------------------- Weird
        mov     rax, r9
        sub     rcx, 16
        mov     r8, r9
        and     rax, -8
        mov     rdx, rsi
        sub     rcx, rax
        mov     rax, rdi
        ;; <----------------------- Weird
.L5:
        vmovupd xmm3, XMMWORD PTR [rdx]
        sub     r8, 8
        sub     rax, -128
        sub     rdx, -128
        vinsertf128     ymm3, ymm3, XMMWORD PTR [rdx-112], 0x1
        vmovupd xmm2, XMMWORD PTR [rdx-96]
        vmovupd xmm1, XMMWORD PTR [rdx-64]
        vinsertf128     ymm2, ymm2, XMMWORD PTR [rdx-80], 0x1
        vmovupd xmm0, XMMWORD PTR [rdx-32]
        vshufpd ymm10, ymm3, ymm3, 1
        vmulpd  ymm3, ymm4, ymm3
        vinsertf128     ymm1, ymm1, XMMWORD PTR [rdx-48], 0x1
        vinsertf128     ymm0, ymm0, XMMWORD PTR [rdx-16], 0x1
        vaddpd  ymm3, ymm3, ymm6
        vmulpd  ymm10, ymm5, ymm10
        vshufpd ymm9, ymm2, ymm2, 1
        vshufpd ymm8, ymm1, ymm1, 1
        vshufpd ymm7, ymm0, ymm0, 1
        vaddpd  ymm3, ymm3, ymm10
        vmulpd  ymm2, ymm4, ymm2
        vmulpd  ymm1, ymm4, ymm1
        vmulpd  ymm0, ymm4, ymm0
        vmovups XMMWORD PTR [rax-128], xmm3
        vextractf128    XMMWORD PTR [rax-112], ymm3, 0x1
        vaddpd  ymm2, ymm2, ymm6
        vmulpd  ymm3, ymm5, ymm9
        vaddpd  ymm1, ymm1, ymm6
        vmulpd  ymm8, ymm5, ymm8
        vaddpd  ymm0, ymm0, ymm6
        vmulpd  ymm7, ymm5, ymm7
        vaddpd  ymm2, ymm2, ymm3
        vaddpd  ymm1, ymm1, ymm8
        vaddpd  ymm0, ymm0, ymm7
        vmovups XMMWORD PTR [rax-96], xmm2
        vextractf128    XMMWORD PTR [rax-80], ymm2, 0x1
        vmovups XMMWORD PTR [rax-64], xmm1
        vextractf128    XMMWORD PTR [rax-48], ymm1, 0x1
        vmovups XMMWORD PTR [rax-32], xmm0
        vextractf128    XMMWORD PTR [rax-16], ymm0, 0x1
        cmp     rcx, r8
        jne     .L5

        ;; <----------------------- Weird
        mov     rax, r9
        shr     rax, 3
        lea     rdx, [rax+1]
        neg     rax
        lea     r8, [r9+rax*8]
        sal     rdx, 7
        add     rdi, rdx
        add     rsi, rdx
.L6:
        mov     rax, r8
        sub     rax, 2
        js      .L4
        shr     rax
        lea     rcx, [rax+1]
        mov     rdx, rax
        xor     eax, eax
        sal     rcx, 5
        ;; <----------------------- Weird

.L7:
        vmovupd xmm0, XMMWORD PTR [rsi+rax]
        vinsertf128     ymm0, ymm0, XMMWORD PTR [rsi+16+rax], 0x1
        vshufpd ymm1, ymm0, ymm0, 1
        vmulpd  ymm0, ymm4, ymm0
        vmulpd  ymm1, ymm5, ymm1
        vaddpd  ymm0, ymm0, ymm6
        vaddpd  ymm0, ymm0, ymm1
        vmovups XMMWORD PTR [rdi+rax], xmm0
        vextractf128    XMMWORD PTR [rdi+16+rax], ymm0, 0x1
        add     rax, 32
        ;; <----------------------- Doesn't follow (i -= 2) >= 0
        cmp     rcx, rax
        jne     .L7
        ;; <----------------------- Weird
        mov     rax, rdx
        add     rdi, rcx
        add     rsi, rcx
        neg     rax
        lea     rax, [r8-4+rax*2]
        ;; <----------------------- Weird
.L4:
        test    al, 1
        je      .L14
        vmovupd xmm0, XMMWORD PTR [rsi]
        vmulpd  xmm4, xmm4, xmm0
        vshufpd xmm1, xmm0, xmm0, 1
        vaddpd  xmm6, xmm4, xmm6
        vmulpd  xmm5, xmm5, xmm1
        vaddpd  xmm5, xmm6, xmm5
        vmovups XMMWORD PTR [rdi], xmm5
.L14:
        vzeroupper
        ret

Reply via email to