https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80852

            Bug ID: 80852
           Summary: Optimisation missed for loop with condition that is
                    always true
           Product: gcc
           Version: 8.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: drraph at gmail dot com
  Target Milestone: ---

Consider this (slightly odd) code:

int foo(int num) {
    int a = 0;
    for (int x = 0; x < num; x += 2) {
        if (!(x % 2)) {
            a += x;
        }
    }
    return a;
}

Note that the condition !(x % 2) is always true: x starts at 0 and increases in
steps of 2, so it is always even.
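
In other words, once the dead test is folded away, the function just sums the
even numbers below num. Here is a restated version for illustration
(foo_simplified is my name for it, not part of the original test case):

int foo_simplified(int num) {
    int a = 0;
    for (int x = 0; x < num; x += 2)
        a += x;   /* x is always even, so the test can be dropped */
    return a;
}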

With clang at -O3 -march=core-avx2 you get:

foo(int):                             # @foo(int)
        test    edi, edi
        jle     .LBB0_1
        add     edi, -1
        mov     eax, edi
        shr     eax
        lea     ecx, [rax - 1]
        imul    ecx, eax
        and     ecx, -2
        and     edi, -2
        add     edi, ecx
        mov     eax, edi
        ret
.LBB0_1:
        xor     edi, edi
        mov     eax, edi
        ret

This is clever: it avoids looping altogether and computes the result in closed
form.
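
For reference, here is the closed form that clang's output effectively
computes, written back as C. This is my derivation from the assembly, not code
emitted by any tool, and foo_closed_form is a hypothetical name: for num > 0
the loop runs m = (num + 1) / 2 times with x = 0, 2, ..., 2*(m - 1), so
a = 2 * (0 + 1 + ... + (m - 1)) = m * (m - 1).

int foo_closed_form(int num) {
    if (num <= 0)
        return 0;               /* loop body never runs */
    int m = (num + 1) / 2;      /* number of iterations */
    return m * (m - 1);         /* sum of 0 + 2 + ... + 2*(m - 1) */
}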

gcc, however, doesn't know this trick; it vectorizes the loop instead, and you
get:

foo(int):
        test    edi, edi
        jle     .L7
        lea     eax, [rdi-1]
        mov     ecx, eax
        shr     ecx
        add     ecx, 1
        cmp     eax, 17
        jbe     .L8
        mov     edx, ecx
        vmovdqa ymm1, YMMWORD PTR .LC0[rip]
        xor     eax, eax
        vpxor   xmm0, xmm0, xmm0
        vmovdqa ymm2, YMMWORD PTR .LC1[rip]
        shr     edx, 3
.L5:
        add     eax, 1
        vpaddd  ymm0, ymm0, ymm1
        vpaddd  ymm1, ymm1, ymm2
        cmp     eax, edx
        jb      .L5
        vpxor   xmm1, xmm1, xmm1
        mov     esi, ecx
        vperm2i128      ymm2, ymm0, ymm1, 33
        and     esi, -8
        vpaddd  ymm0, ymm0, ymm2
        lea     edx, [rsi+rsi]
        vperm2i128      ymm2, ymm0, ymm1, 33
        vpalignr        ymm2, ymm2, ymm0, 8
        vpaddd  ymm0, ymm0, ymm2
        vperm2i128      ymm1, ymm0, ymm1, 33
        vpalignr        ymm1, ymm1, ymm0, 4
        vpaddd  ymm0, ymm0, ymm1
        vmovd   eax, xmm0
        cmp     ecx, esi
        je      .L12
        vzeroupper
.L3:
        lea     ecx, [rdx+2]
        add     eax, edx
        cmp     edi, ecx
        jle     .L10
        add     eax, ecx
        lea     ecx, [rdx+4]
        cmp     ecx, edi
        jge     .L10
        add     eax, ecx
        lea     ecx, [rdx+6]
        cmp     edi, ecx
        jle     .L10
        add     eax, ecx
        lea     ecx, [rdx+8]
        cmp     edi, ecx
        jle     .L10
        add     eax, ecx
        lea     ecx, [rdx+10]
        cmp     edi, ecx
        jle     .L10
        add     eax, ecx
        lea     ecx, [rdx+12]
        cmp     edi, ecx
        jle     .L10
        add     eax, ecx
        lea     ecx, [rdx+14]
        cmp     edi, ecx
        jle     .L10
        add     eax, ecx
        add     edx, 16
        lea     ecx, [rax+rdx]
        cmp     edi, edx
        cmovg   eax, ecx
        ret
.L7:
        xor     eax, eax
.L10:
        ret
.L12:
        vzeroupper
        ret
.L8:
        xor     edx, edx
        xor     eax, eax
        jmp     .L3
.LC0:
        .long   0
        .long   2
        .long   4
        .long   6
        .long   8
        .long   10
        .long   12
        .long   14
.LC1:
        .long   16
        .long   16
        .long   16
        .long   16
        .long   16
        .long   16
        .long   16
        .long   16
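
For completeness, a quick sanity check (a hypothetical harness, not part of
the original report) showing that the loop and the closed form above agree:

#include <assert.h>
#include <stdio.h>

/* The loop from the report. */
static int foo(int num) {
    int a = 0;
    for (int x = 0; x < num; x += 2)
        if (!(x % 2))
            a += x;
    return a;
}

/* The closed form derived above. */
static int foo_closed_form(int num) {
    if (num <= 0)
        return 0;
    int m = (num + 1) / 2;
    return m * (m - 1);
}

int main(void) {
    for (int n = -10; n <= 10000; n++)
        assert(foo(n) == foo_closed_form(n));
    puts("loop and closed form agree");
    return 0;
}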
