https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78411

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Priority|P3                          |P1

--- Comment #9 from Richard Biener <rguenth at gcc dot gnu.org> ---
The testcase looks quite "stupid" btw, and even with if-conversion applied we
end up with the following in the .optimized dump:

  <bb 3> [85.00%]:
  # s_24 = PHI <s_20(3), 0(2)>
  # i_22 = PHI <i_21(3), 0(2)>
  _3 = MEM[base: products_16(D), index: i_22, step: 8, offset: 0B];
  _36 = -_3;
  iftmp.0_12 = _3 > 0 ? 1 : -1;
  prephitmp_37 = _3 > 0 ? _3 : _36;
  prephitmp_38 = _3 > 0 ? -1 : 1;
  prephitmp_39 = _3 > 0 ? 4294967295 : 1;
  prephitmp_41 = _3 > 0 ? 1 : 4294967295;
  _5 = (long long unsigned int) prephitmp_37;
  val_11 = _5 == i_22 ? iftmp.0_12 : prephitmp_38;
  prephitmp_44 = _5 != i_22 ? prephitmp_39 : prephitmp_41;
  MEM[base: products_16(D), index: i_22, step: 8, offset: 0B] = val_11;
  s.1_7 = (unsigned int) s_24;
  _8 = s.1_7 + prephitmp_44;
  s_20 = (int) _8;
  i_21 = i_22 + 1;
  if (count_15(D) != i_21)
    goto <bb 3>; [85.00%]

where prephitmp_37 is abs (_3).  phi-opt (or match.pd enhanced for PHIs)
could see this from

  if (_3 > 0)
    goto <bb 4>; [64.00%]
  else
    goto <bb 5>; [36.00%]

  <bb 4> [54.40%]:

  <bb 5> [85.00%]:
  # iftmp.2_11 = PHI <1(4), -1(3)>
  _4 = (long long int) iftmp.2_11;
  _5 = _3 * _4;

(with or without the conversion in _4).  Note that there is also the
"threading" opportunity for the constant PHI values (or would you call
that tail duplication?).  split-paths only duplicates the 2nd half, which
isn't really useful.

Note that phiopt3 sees an ABS it could handle:

  <bb 3> [85.00%]:
  # s_40 = PHI <0(9), s_46(10)>
  # i_35 = PHI <0(9), i_47(10)>
  _25 = MEM[base: products_16(D), index: i_35, step: 8, offset: 0B];
  if (_25 > 0)
    goto <bb 5>; [64.00%]
  else
    goto <bb 4>; [36.00%]

  <bb 4> [30.60%]:
  _17 = -_25;

  <bb 5> [85.00%]:
  # iftmp.0_13 = PHI <1(3), -1(4)>
  # prephitmp_10 = PHI <_25(3), _17(4)>
  # prephitmp_9 = PHI <-1(3), 1(4)>
  # prephitmp_33 = PHI <4294967295(3), 1(4)>
  # prephitmp_32 = PHI <1(3), 4294967295(4)>
  _31 = (long long unsigned int) prephitmp_10;

but of course the other PHIs are in the way so it doesn't perform the
replacement (and only PRE "exposes" that form).

Currently on RTL, BB reorder performs the 2nd tail duplication and the code
looks reasonable; not sure if cmovs are really desired here.  -Os produces

summation_helper_2:
.LFB1:
        .cfi_startproc
        xorl    %ecx, %ecx
        xorl    %eax, %eax
.L10:
        cmpq    %rsi, %rcx
        je      .L15
        movq    (%rdi,%rcx,8), %r9
        xorl    %edx, %edx
        testq   %r9, %r9
        setg    %dl
        leal    -1(%rdx,%rdx), %edx
        movslq  %edx, %r8
        imulq   %r9, %r8
        cmpq    %rcx, %r8
        je      .L12
        negl    %edx
.L12:
        movslq  %edx, %r8
        addl    %edx, %eax
        movq    %r8, (%rdi,%rcx,8)
        incq    %rcx
        jmp     .L10
.L15:
        ret

and -O2:

summation_helper_2:
.LFB1:
        .cfi_startproc
        testq   %rsi, %rsi
        je      .L15
        xorl    %edx, %edx
        xorl    %eax, %eax
        movq    $-1, %r10
        .p2align 4,,10
        .p2align 3
.L14:
        movq    (%rdi,%rdx,8), %r8
        movl    $1, %r9d
        movl    $1, %ecx
        testq   %r8, %r8
        jg      .L12
        negq    %r8
        movq    %r10, %r9
        movl    $-1, %ecx
.L12:
        cmpq    %rdx, %r8
        je      .L13
        negl    %ecx
        movslq  %ecx, %r9
.L13:
        movq    %r9, (%rdi,%rdx,8)
        addq    $1, %rdx
        addl    %ecx, %eax
        cmpq    %rdx, %rsi
        jne     .L14
        rep ret

(summation_helper_1 has one cmov with -O2).

Manually changing the testcase to use ABS_EXPR changes code quite a bit
(not sure if for the worse).

With -fast ICC generates (apart from lots of versioning and vectorized
versions...):

        movl      $1, %esi                                      #10.2
..___tag_value_summation_helper_1.8:                            #
                                # LOE rcx rbx rbp rsi rdi r8 r9 r10 r13 r14 r15 eax
..B2.9:                         # Preds ..B2.9 ..B2.8
        movq      (%r9,%rcx,8), %r11                            #12.18
        movq      $-1, %rdx                                     #12.30
        testq     %r11, %r11                                    #12.30
        cmovg     %rsi, %rdx                                    #12.30
        imulq     %rdx, %r11                                    #13.3
        movq      %rdx, %r12                                    #15.11
        negq      %r12                                          #15.11
        cmpq      %r11, %rcx                                    #15.4
        movslq    %eax, %rax                                    #17.3
        cmovne    %r12, %rdx                                    #15.4
        movq      %rdx, (%r9,%rcx,8)                            #16.3
        incq      %rcx                                          #10.2
        addq      %rdx, %rax                                    #17.3
        cmpq      %r8, %rcx                                     #10.2
        jb        ..B2.9        # Prob 82%                      #10.2

while GCC 6 and 5 generated

.L3:
        movq    (%rdi,%rcx,8), %rdx
        testq   %rdx, %rdx
        setg    %r8b
        movzbl  %r8b, %r9d
        movzbl  %r8b, %r8d
        leaq    -1(%r9,%r9), %r9
        leal    -1(%r8,%r8), %r8d
        movq    %r9, %r11
        imulq   %rdx, %r11
        testq   %rdx, %rdx
        setle   %dl
        movzbl  %dl, %r10d
        movzbl  %dl, %edx
        leaq    -1(%r10,%r10), %r10
        leal    -1(%rdx,%rdx), %edx
        cmpq    %rcx, %r11
        cmovne  %r10, %r9
        cmove   %r8d, %edx
        movq    %r9, (%rdi,%rcx,8)
        addq    $1, %rcx
        addl    %edx, %eax
        cmpq    %rcx, %rsi
        jne     .L3

I don't think the situation with trunk is really worse?

Not sure what to do here but certainly adding -ftree-loop-if-convert fixes
the testcase.

Reply via email to