https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78411
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Priority|P3 |P1
--- Comment #9 from Richard Biener <rguenth at gcc dot gnu.org> ---
The testcase looks quite "stupid" btw, and even with if-conversion applied we
end up with the following in the .optimized dump:
<bb 3> [85.00%]:
# s_24 = PHI <s_20(3), 0(2)>
# i_22 = PHI <i_21(3), 0(2)>
_3 = MEM[base: products_16(D), index: i_22, step: 8, offset: 0B];
_36 = -_3;
iftmp.0_12 = _3 > 0 ? 1 : -1;
prephitmp_37 = _3 > 0 ? _3 : _36;
prephitmp_38 = _3 > 0 ? -1 : 1;
prephitmp_39 = _3 > 0 ? 4294967295 : 1;
prephitmp_41 = _3 > 0 ? 1 : 4294967295;
_5 = (long long unsigned int) prephitmp_37;
val_11 = _5 == i_22 ? iftmp.0_12 : prephitmp_38;
prephitmp_44 = _5 != i_22 ? prephitmp_39 : prephitmp_41;
MEM[base: products_16(D), index: i_22, step: 8, offset: 0B] = val_11;
s.1_7 = (unsigned int) s_24;
_8 = s.1_7 + prephitmp_44;
s_20 = (int) _8;
i_21 = i_22 + 1;
if (count_15(D) != i_21)
goto <bb 3>; [85.00%]
where prephitmp_37 is abs (_3). phi-opt (or match.pd enhanced for PHIs) could see
this from
if (_3 > 0)
goto <bb 4>; [64.00%]
else
goto <bb 5>; [36.00%]
<bb 4> [54.40%]:
<bb 5> [85.00%]:
# iftmp.2_11 = PHI <1(4), -1(3)>
_4 = (long long int) iftmp.2_11;
_5 = _3 * _4;
(with or without the conversion in _4). Note that there is also the
"threading" opportunity for the constant PHI values (or would you call
that tail duplication?). split-paths only duplicates the 2nd half, which
isn't really useful.
Note that phiopt3 sees an ABS it could handle:
<bb 3> [85.00%]:
# s_40 = PHI <0(9), s_46(10)>
# i_35 = PHI <0(9), i_47(10)>
_25 = MEM[base: products_16(D), index: i_35, step: 8, offset: 0B];
if (_25 > 0)
goto <bb 5>; [64.00%]
else
goto <bb 4>; [36.00%]
<bb 4> [30.60%]:
_17 = -_25;
<bb 5> [85.00%]:
# iftmp.0_13 = PHI <1(3), -1(4)>
# prephitmp_10 = PHI <_25(3), _17(4)>
# prephitmp_9 = PHI <-1(3), 1(4)>
# prephitmp_33 = PHI <4294967295(3), 1(4)>
# prephitmp_32 = PHI <1(3), 4294967295(4)>
_31 = (long long unsigned int) prephitmp_10;
but of course the other PHIs are in the way so it doesn't perform the
replacement (and only PRE "exposes" that form).
Currently, on RTL, BB reorder performs the 2nd tail duplication and the code
looks reasonable; I'm not sure if cmovs are really desired here. -Os produces
summation_helper_2:
.LFB1:
.cfi_startproc
xorl %ecx, %ecx
xorl %eax, %eax
.L10:
cmpq %rsi, %rcx
je .L15
movq (%rdi,%rcx,8), %r9
xorl %edx, %edx
testq %r9, %r9
setg %dl
leal -1(%rdx,%rdx), %edx
movslq %edx, %r8
imulq %r9, %r8
cmpq %rcx, %r8
je .L12
negl %edx
.L12:
movslq %edx, %r8
addl %edx, %eax
movq %r8, (%rdi,%rcx,8)
incq %rcx
jmp .L10
.L15:
ret
and -O2:
summation_helper_2:
.LFB1:
.cfi_startproc
testq %rsi, %rsi
je .L15
xorl %edx, %edx
xorl %eax, %eax
movq $-1, %r10
.p2align 4,,10
.p2align 3
.L14:
movq (%rdi,%rdx,8), %r8
movl $1, %r9d
movl $1, %ecx
testq %r8, %r8
jg .L12
negq %r8
movq %r10, %r9
movl $-1, %ecx
.L12:
cmpq %rdx, %r8
je .L13
negl %ecx
movslq %ecx, %r9
.L13:
movq %r9, (%rdi,%rdx,8)
addq $1, %rdx
addl %ecx, %eax
cmpq %rdx, %rsi
jne .L14
rep ret
(summation_helper_1 has one cmov with -O2).
Manually changing the testcase to use ABS_EXPR changes the code quite a bit
(not sure whether for the worse).
With -fast ICC generates (apart from lots of versioning and vectorized
versions...):
movl $1, %esi #10.2
..___tag_value_summation_helper_1.8: #
# LOE rcx rbx rbp rsi rdi r8 r9 r10 r13 r14 r15 eax
..B2.9: # Preds ..B2.9 ..B2.8
movq (%r9,%rcx,8), %r11 #12.18
movq $-1, %rdx #12.30
testq %r11, %r11 #12.30
cmovg %rsi, %rdx #12.30
imulq %rdx, %r11 #13.3
movq %rdx, %r12 #15.11
negq %r12 #15.11
cmpq %r11, %rcx #15.4
movslq %eax, %rax #17.3
cmovne %r12, %rdx #15.4
movq %rdx, (%r9,%rcx,8) #16.3
incq %rcx #10.2
addq %rdx, %rax #17.3
cmpq %r8, %rcx #10.2
jb ..B2.9 # Prob 82% #10.2
while GCC 6 and 5 generated
.L3:
movq (%rdi,%rcx,8), %rdx
testq %rdx, %rdx
setg %r8b
movzbl %r8b, %r9d
movzbl %r8b, %r8d
leaq -1(%r9,%r9), %r9
leal -1(%r8,%r8), %r8d
movq %r9, %r11
imulq %rdx, %r11
testq %rdx, %rdx
setle %dl
movzbl %dl, %r10d
movzbl %dl, %edx
leaq -1(%r10,%r10), %r10
leal -1(%rdx,%rdx), %edx
cmpq %rcx, %r11
cmovne %r10, %r9
cmove %r8d, %edx
movq %r9, (%rdi,%rcx,8)
addq $1, %rcx
addl %edx, %eax
cmpq %rcx, %rsi
jne .L3
I don't think the situation with trunk is really worse?
Not sure what to do here but certainly adding -ftree-loop-if-convert fixes
the testcase.