https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829
--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
ivtmp.23_31 = (unsigned long) b_24(D);
ivtmp.24_46 = (unsigned long) pa_26(D);
_50 = ivtmp.23_31 + 400000;
<bb 3> [local count: 1063004408]:
# vsum_35 = PHI <vsum_28(3), { 0, 0 }(2)>
# ivtmp.23_14 = PHI <ivtmp.23_15(3), ivtmp.23_31(2)>
# ivtmp.24_30 = PHI <ivtmp.24_45(3), ivtmp.24_46(2)>
_47 = (void *) ivtmp.23_14;
_4 = MEM[(int *)_47];
_25 = {_4, _4, _4, _4};
_48 = (void *) ivtmp.24_30;
_7 = MEM[(__m128i * {ref-all})_48];
_8 = VIEW_CONVERT_EXPR<__v4si>(_7);
_9 = VIEW_CONVERT_EXPR<__v4si>(vsum_35);
_27 = __builtin_ia32_vpdpbusd_v4si (_9, _8, _25);
vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
ivtmp.23_15 = ivtmp.23_14 + 4;
ivtmp.24_45 = ivtmp.24_30 + 16;
if (ivtmp.23_15 != _50)
goto <bb 3>; [98.99%]
else
goto <bb 4>; [1.01%]
<bb 4> [local count: 10737416]:
*pc_19(D) = vsum_28;
ivtmp.15_34 = (unsigned long) &vsum.0;
_13 = ivtmp.15_34 + 16;
<bb 5> [local count: 42949663]:
# ssum_38 = PHI <ssum_22(5), 0(4)>
# ivtmp.15_33 = PHI <ivtmp.15_32(5), ivtmp.15_34(4)>
I'm curious whether we can "move" the VIEW_CONVERT_EXPR out of the loop, as shown below:
<bb 3> [local count: 1063004408]:
- # vsum_35 = PHI <vsum_28(3), { 0, 0 }(2)>
+ # _9 = PHI <_27(3), { 0, 0, 0, 0}(2)>
# ivtmp.23_14 = PHI <ivtmp.23_15(3), ivtmp.23_31(2)>
# ivtmp.24_30 = PHI <ivtmp.24_45(3), ivtmp.24_46(2)>
_47 = (void *) ivtmp.23_14;
_4 = MEM[(int *)_47];
_25 = {_4, _4, _4, _4};
_48 = (void *) ivtmp.24_30;
_7 = MEM[(__m128i * {ref-all})_48];
_8 = VIEW_CONVERT_EXPR<__v4si>(_7);
- _9 = VIEW_CONVERT_EXPR<__v4si>(vsum_35);
_27 = __builtin_ia32_vpdpbusd_v4si (_9, _8, _25);
- vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
ivtmp.23_15 = ivtmp.23_14 + 4;
ivtmp.24_45 = ivtmp.24_30 + 16;
if (ivtmp.23_15 != _50)
goto <bb 3>; [98.99%]
else
goto <bb 4>; [1.01%]
<bb 4> [local count: 10737416]:
+ vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
*pc_19(D) = vsum_28;
ivtmp.15_34 = (unsigned long) &vsum.0;
_13 = ivtmp.15_34 + 16;
<bb 5> [local count: 42949663]:
# ssum_38 = PHI <ssum_22(5), 0(4)>
# ivtmp.15_33 = PHI <ivtmp.15_32(5), ivtmp.15_34(4)>
It looks like a lazy code motion optimization, but it is currently not handled by
PRE.