http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53533
Richard Guenther <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Target| |x86_64-*-*
Status|WAITING |NEW
Known to work| |4.6.3
Keywords| |missed-optimization
Component|middle-end |rtl-optimization
CC| |jakub at gcc dot gnu.org,
| |uros at gcc dot gnu.org
Summary|[4.7 regression] loop |[4.7/4.8 regression]
|unrolling as measured by |vectorization causes loop
|Adobe's C++Benchmark is |unrolling test slowdown as
|twice as slow versus |measured by Adobe's
|4.4-4.6 |C++Benchmark
Known to fail| |4.7.1, 4.8.0
Severity|major |normal
--- Comment #6 from Richard Guenther <rguenth at gcc dot gnu.org> 2012-06-12
09:54:02 UTC ---
Ok, it seems to me that this testcase does template-metaprogramming loop
unrolling. With GCC 4.7 we unroll and vectorize all the loops; for example, the
unroll factor 8 case looks like this (a rough C++ sketch of the kernel itself
follows after the dump):
<bb 50>:
# vect_var_.941_3474 = PHI <vect_var_.941_3472(50), {0, 0, 0, 0}(64)>
# vect_var_.941_3473 = PHI <vect_var_.941_3471(50), {0, 0, 0, 0}(64)>
# ivtmp.1325_970 = PHI <ivtmp.1325_812(50), ivtmp.1325_813(64)>
D.9934_819 = (void *) ivtmp.1325_970;
vect_var_.918_323 = MEM[base: D.9934_819, offset: 0B];
vect_var_.919_325 = MEM[base: D.9934_819, offset: 16B];
vect_var_.920_328 = vect_var_.918_323 + { 12345, 12345, 12345, 12345 };
vect_var_.920_330 = vect_var_.919_325 + { 12345, 12345, 12345, 12345 };
vect_var_.923_480 = vect_var_.920_328 * { 914237, 914237, 914237, 914237 };
vect_var_.923_895 = vect_var_.920_330 * { 914237, 914237, 914237, 914237 };
vect_var_.926_231 = vect_var_.923_480 + { 12332, 12332, 12332, 12332 };
vect_var_.926_232 = vect_var_.923_895 + { 12332, 12332, 12332, 12332 };
vect_var_.929_235 = vect_var_.926_231 * { 914237, 914237, 914237, 914237 };
vect_var_.929_236 = vect_var_.926_232 * { 914237, 914237, 914237, 914237 };
vect_var_.932_239 = vect_var_.929_235 + { 12332, 12332, 12332, 12332 };
vect_var_.932_240 = vect_var_.929_236 + { 12332, 12332, 12332, 12332 };
vect_var_.935_113 = vect_var_.932_239 * { 914237, 914237, 914237, 914237 };
vect_var_.935_247 = vect_var_.932_240 * { 914237, 914237, 914237, 914237 };
vect_var_.938_582 = vect_var_.935_113 + { -13, -13, -13, -13 };
vect_var_.938_839 = vect_var_.935_247 + { -13, -13, -13, -13 };
vect_var_.941_3472 = vect_var_.938_582 + vect_var_.941_3474;
vect_var_.941_3471 = vect_var_.938_839 + vect_var_.941_3473;
ivtmp.1325_812 = ivtmp.1325_970 + 32;
if (ivtmp.1325_812 != D.9937_388)
goto <bb 50>;
else
goto <bb 51>;
<bb 51>:
# vect_var_.941_3468 = PHI <vect_var_.941_3472(50)>
# vect_var_.941_3467 = PHI <vect_var_.941_3471(50)>
vect_var_.945_3466 = vect_var_.941_3468 + vect_var_.941_3467;
vect_var_.946_3465 = vect_var_.945_3466 v>> 64;
vect_var_.946_3464 = vect_var_.946_3465 + vect_var_.945_3466;
vect_var_.946_3463 = vect_var_.946_3464 v>> 32;
vect_var_.946_3462 = vect_var_.946_3463 + vect_var_.946_3464;
stmp_var_.944_3461 = BIT_FIELD_REF <vect_var_.946_3462, 32, 0>;
init_value.7_795 = init_value;
D.8606_796 = (int) init_value.7_795;
D.8600_797 = D.8606_796 + 12345;
D.8599_798 = D.8600_797 * 914237;
D.8602_799 = D.8599_798 + 12332;
D.8601_800 = D.8602_799 * 914237;
D.8604_801 = D.8601_800 + 12332;
D.8603_802 = D.8604_801 * 914237;
D.8605_803 = D.8603_802 + -13;
temp_804 = D.8605_803 * 8000;
if (temp_804 != stmp_var_.944_3461)
goto <bb 52>;
else
goto <bb 53>;
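For reference, here is a minimal C++ sketch of the kind of kernel involved,
reconstructed from the constants visible in the GIMPLE above. The names, the
8000-element array and the exact template unroller are my guesses, not the
benchmark's actual code:
#include <cstdint>
/* Hypothetical stand-in for the benchmark kernel; the constants 12345,
   914237, 12332 and -13 are taken from the dump above.  Like the GIMPLE
   above, the intermediate arithmetic overflows int and relies on
   wrap-around in practice.  */
static inline int32_t
mix (int32_t x)
{
  x = (x + 12345) * 914237;
  x = (x + 12332) * 914237;
  x = (x + 12332) * 914237;
  return x - 13;
}
/* Template-metaprogramming unroller: the recursion peels N iterations
   per trip of the containing loop.  */
template <int N>
struct unrolled_sum
{
  static int32_t run (const int32_t *p)
  { return mix (p[N - 1]) + unrolled_sum<N - 1>::run (p); }
};
template <>
struct unrolled_sum<0>
{
  static int32_t run (const int32_t *) { return 0; }
};
int32_t data32[8000];   /* 32000 bytes, matching data32+32000 in the asm.  */
int32_t
sum_unroll_8 (void)
{
  int32_t result = 0;
  for (int i = 0; i < 8000; i += 8)
    result += unrolled_sum<8>::run (&data32[i]);
  /* The benchmark then checks the result against mix (init_value) * 8000,
     which is the comparison visible at the end of bb 51 above.  */
  return result;
}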
With GCC 4.6, OTOH, the above loop is not vectorized; only the (slow)
non-unrolled loop is. The 4.6 unrolled loop looks like:
<bb 49>:
# result_622 = PHI <result_704(49), 0(63)>
# ivtmp.852_1026 = PHI <ivtmp.852_842(49), ivtmp.852_844(63)>
D.9283_3302 = (void *) ivtmp.852_1026;
temp_801 = MEM[base: D.9283_3302, offset: 0B];
D.8366_802 = temp_801 + 12345;
D.8365_803 = D.8366_802 * 914237;
D.8368_804 = D.8365_803 + 12332;
D.8367_805 = D.8368_804 * 914237;
D.8370_806 = D.8367_805 + 12332;
D.8369_807 = D.8370_806 * 914237;
temp_808 = D.8369_807 + -13;
result_810 = temp_808 + result_622;
temp_815 = MEM[base: D.9283_3302, offset: 4B];
D.8381_816 = temp_815 + 12345;
D.8382_817 = D.8381_816 * 914237;
D.8378_818 = D.8382_817 + 12332;
D.8379_819 = D.8378_818 * 914237;
D.8376_820 = D.8379_819 + 12332;
D.8377_821 = D.8376_820 * 914237;
temp_822 = D.8377_821 + -13;
result_824 = result_810 + temp_822;
temp_788 = MEM[base: D.9283_3302, offset: 8B];
D.8351_789 = temp_788 + 12345;
D.8352_790 = D.8351_789 * 914237;
D.8348_791 = D.8352_790 + 12332;
D.8349_792 = D.8348_791 * 914237;
D.8346_793 = D.8349_792 + 12332;
D.8347_794 = D.8346_793 * 914237;
temp_795 = D.8347_794 + -13;
result_797 = temp_795 + result_824;
temp_774 = MEM[base: D.9283_3302, offset: 12B];
D.8333_775 = temp_774 + 12345;
D.8334_776 = D.8333_775 * 914237;
D.8330_777 = D.8334_776 + 12332;
D.8331_778 = D.8330_777 * 914237;
D.8328_779 = D.8331_778 + 12332;
D.8329_780 = D.8328_779 * 914237;
temp_781 = D.8329_780 + -13;
result_783 = temp_781 + result_797;
temp_760 = MEM[base: D.9283_3302, offset: 16B];
D.8315_761 = temp_760 + 12345;
D.8316_762 = D.8315_761 * 914237;
D.8312_763 = D.8316_762 + 12332;
D.8313_764 = D.8312_763 * 914237;
D.8310_765 = D.8313_764 + 12332;
D.8311_766 = D.8310_765 * 914237;
temp_767 = D.8311_766 + -13;
result_769 = temp_767 + result_783;
temp_746 = MEM[base: D.9283_3302, offset: 20B];
D.8297_747 = temp_746 + 12345;
D.8298_748 = D.8297_747 * 914237;
D.8294_749 = D.8298_748 + 12332;
D.8295_750 = D.8294_749 * 914237;
D.8292_751 = D.8295_750 + 12332;
D.8293_752 = D.8292_751 * 914237;
temp_753 = D.8293_752 + -13;
result_755 = temp_753 + result_769;
temp_732 = MEM[base: D.9283_3302, offset: 24B];
D.8279_733 = temp_732 + 12345;
D.8280_734 = D.8279_733 * 914237;
D.8276_735 = D.8280_734 + 12332;
D.8277_736 = D.8276_735 * 914237;
D.8274_737 = D.8277_736 + 12332;
D.8275_738 = D.8274_737 * 914237;
temp_739 = D.8275_738 + -13;
result_741 = temp_739 + result_755;
temp_695 = MEM[base: D.9283_3302, offset: 28B];
D.8246_696 = temp_695 + 12345;
D.8245_697 = D.8246_696 * 914237;
D.8248_698 = D.8245_697 + 12332;
D.8247_699 = D.8248_698 * 914237;
D.8250_700 = D.8247_699 + 12332;
D.8249_701 = D.8250_700 * 914237;
temp_702 = D.8249_701 + -13;
result_704 = temp_702 + result_741;
ivtmp.852_842 = ivtmp.852_1026 + 32;
if (ivtmp.852_842 != D.9292_3369)
goto <bb 49>;
else
goto <bb 50>;
<bb 50>:
# result_3198 = PHI <result_704(49)>
init_value.7_825 = init_value;
D.8393_826 = (int) init_value.7_825;
D.8387_827 = D.8393_826 + 12345;
D.8386_828 = D.8387_827 * 914237;
D.8389_829 = D.8386_828 + 12332;
D.8388_830 = D.8389_829 * 914237;
D.8391_831 = D.8388_830 + 12332;
D.8390_832 = D.8391_831 * 914237;
D.8392_833 = D.8390_832 + -13;
temp_834 = D.8392_833 * 8000;
if (temp_834 != result_3198)
goto <bb 51>;
else
goto <bb 52>;
With -fno-tree-vectorize the performance is the same. It seems that
vectorization is not profitable here for some reason. Same behavior
can be observed with GCC 4.8.
I used the preprocessed source for 4.7 from the ZIP file.
The generated code is odd, at the least; the inner loop looks like:
movdqa .LC6(%rip), %xmm3
xorl %ebx, %ebx
movdqa .LC7(%rip), %xmm0
movdqa .LC8(%rip), %xmm1
movdqa .LC9(%rip), %xmm2
.p2align 4,,10
.p2align 3
.L51:
pxor %xmm6, %xmm6
movl $data32, %eax
movdqa %xmm6, %xmm7
.p2align 4,,10
.p2align 3
.L53:
movdqa (%rax), %xmm4
movdqa %xmm0, %xmm8
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
psrldq $4, %xmm4
psrldq $4, %xmm8
pmuludq %xmm8, %xmm4
pshufd $8, %xmm4, %xmm4
pmuludq %xmm0, %xmm5
pshufd $8, %xmm5, %xmm5
movdqa %xmm0, %xmm8
psrldq $4, %xmm8
punpckldq %xmm4, %xmm5
paddd %xmm1, %xmm5
movdqa %xmm5, %xmm4
psrldq $4, %xmm5
pmuludq %xmm8, %xmm5
pshufd $8, %xmm5, %xmm5
pmuludq %xmm0, %xmm4
pshufd $8, %xmm4, %xmm4
punpckldq %xmm5, %xmm4
movdqa %xmm0, %xmm5
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm8
psrldq $4, %xmm5
psrldq $4, %xmm4
pmuludq %xmm4, %xmm5
pshufd $8, %xmm5, %xmm5
pmuludq %xmm0, %xmm8
pshufd $8, %xmm8, %xmm4
movdqa %xmm0, %xmm8
psrldq $4, %xmm8
punpckldq %xmm5, %xmm4
paddd %xmm2, %xmm4
paddd %xmm4, %xmm7
movdqa 16(%rax), %xmm4
addq $32, %rax
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
psrldq $4, %xmm4
pmuludq %xmm8, %xmm4
pshufd $8, %xmm4, %xmm4
movdqa %xmm0, %xmm8
pmuludq %xmm0, %xmm5
pshufd $8, %xmm5, %xmm5
cmpq $data32+32000, %rax
psrldq $4, %xmm8
punpckldq %xmm4, %xmm5
paddd %xmm1, %xmm5
movdqa %xmm5, %xmm4
psrldq $4, %xmm5
pmuludq %xmm8, %xmm5
pshufd $8, %xmm5, %xmm5
pmuludq %xmm0, %xmm4
pshufd $8, %xmm4, %xmm4
punpckldq %xmm5, %xmm4
movdqa %xmm0, %xmm5
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm8
psrldq $4, %xmm5
psrldq $4, %xmm4
pmuludq %xmm4, %xmm5
pshufd $8, %xmm5, %xmm5
pmuludq %xmm0, %xmm8
pshufd $8, %xmm8, %xmm4
punpckldq %xmm5, %xmm4
paddd %xmm2, %xmm4
paddd %xmm4, %xmm6
jne .L53
which means we expand the multiplications by the constant vectors in an odd way.
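For context, plain SSE2 has no packed 32-bit low-part multiply (pmulld only
exists from SSE4.1 onwards), so every V4SImode multiply by the constant vector
has to be emulated with pmuludq plus shuffles.  A rough intrinsics sketch of
what each pmuludq/pshufd/punpckldq group above computes (only an illustration
of the idea, not the exact expansion the i386 backend emits):
#include <emmintrin.h>  /* SSE2 */
/* SSE2-only emulation of a packed 32-bit low multiply.  */
static __m128i
mullo_epi32_sse2 (__m128i a, __m128i b)
{
  /* 32x32->64 bit multiplies of the even lanes (0 and 2).  */
  __m128i even = _mm_mul_epu32 (a, b);
  /* Shift the odd lanes (1 and 3) down and multiply them as well.  */
  __m128i odd = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));
  /* Keep the low 32 bits of each 64-bit product (the pshufd $8 above)...  */
  even = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
  odd = _mm_shuffle_epi32 (odd, _MM_SHUFFLE (0, 0, 2, 0));
  /* ... and interleave them back into a V4SI result (punpckldq).  */
  return _mm_unpacklo_epi32 (even, odd);
}
/* With -msse4.1 the whole helper would collapse to a single pmulld:
     return _mm_mullo_epi32 (a, b);  */
Note also that the shifted copy of the constant vector (the
movdqa %xmm0, %xmm8 / psrldq $4, %xmm8 pairs) is re-created again and again
inside the loop instead of being computed once, which is part of what makes
the sequence look odd.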