https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829

--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
  ivtmp.23_31 = (unsigned long) b_24(D);
  ivtmp.24_46 = (unsigned long) pa_26(D);
  _50 = ivtmp.23_31 + 400000;

  <bb 3> [local count: 1063004408]:
  # vsum_35 = PHI <vsum_28(3), { 0, 0 }(2)>
  # ivtmp.23_14 = PHI <ivtmp.23_15(3), ivtmp.23_31(2)>
  # ivtmp.24_30 = PHI <ivtmp.24_45(3), ivtmp.24_46(2)>
  _47 = (void *) ivtmp.23_14;
  _4 = MEM[(int *)_47];
  _25 = {_4, _4, _4, _4};
  _48 = (void *) ivtmp.24_30;
  _7 = MEM[(__m128i * {ref-all})_48];
  _8 = VIEW_CONVERT_EXPR<__v4si>(_7);
  _9 = VIEW_CONVERT_EXPR<__v4si>(vsum_35);
  _27 = __builtin_ia32_vpdpbusd_v4si (_9, _8, _25);
  vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
  ivtmp.23_15 = ivtmp.23_14 + 4;
  ivtmp.24_45 = ivtmp.24_30 + 16;
  if (ivtmp.23_15 != _50)
    goto <bb 3>; [98.99%]
  else
    goto <bb 4>; [1.01%]

  <bb 4> [local count: 10737416]:
  *pc_19(D) = vsum_28;
  ivtmp.15_34 = (unsigned long) &vsum.0;
  _13 = ivtmp.15_34 + 16;

  <bb 5> [local count: 42949663]:
  # ssum_38 = PHI <ssum_22(5), 0(4)>
  # ivtmp.15_33 = PHI <ivtmp.15_32(5), ivtmp.15_34(4)>

I'm curious if we can "move" the VIEW_CONVERT_EXPR outside of the loop, as below:

  <bb 3> [local count: 1063004408]:
-  # vsum_35 = PHI <vsum_28(3), { 0, 0 }(2)>
+  # _9 = PHI <_27(3), { 0, 0, 0, 0}(2)>
  # ivtmp.23_14 = PHI <ivtmp.23_15(3), ivtmp.23_31(2)>
  # ivtmp.24_30 = PHI <ivtmp.24_45(3), ivtmp.24_46(2)>
  _47 = (void *) ivtmp.23_14;
  _4 = MEM[(int *)_47];
  _25 = {_4, _4, _4, _4};
  _48 = (void *) ivtmp.24_30;
  _7 = MEM[(__m128i * {ref-all})_48];
  _8 = VIEW_CONVERT_EXPR<__v4si>(_7);
-  _9 = VIEW_CONVERT_EXPR<__v4si>(vsum_35);
  _27 = __builtin_ia32_vpdpbusd_v4si (_9, _8, _25);
-  vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
  ivtmp.23_15 = ivtmp.23_14 + 4;
  ivtmp.24_45 = ivtmp.24_30 + 16;
  if (ivtmp.23_15 != _50)
    goto <bb 3>; [98.99%]
  else
    goto <bb 4>; [1.01%]

  <bb 4> [local count: 10737416]:
+  vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
  *pc_19(D) = vsum_28;
  ivtmp.15_34 = (unsigned long) &vsum.0;
  _13 = ivtmp.15_34 + 16;

  <bb 5> [local count: 42949663]:
  # ssum_38 = PHI <ssum_22(5), 0(4)>
  # ivtmp.15_33 = PHI <ivtmp.15_32(5), ivtmp.15_34(4)>


It looks like a lazy code motion optimization, but it is currently not handled
by PRE.

Reply via email to