https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68030
--- Comment #6 from amker at gcc dot gnu.org --- It's not only the vectorizer generating CSE sub-optimal code, pre and lim also do this kind of transform. Compiling the attached example with below command line $ ./gcc -S -Ofast -march=haswell pr68030.c -o pr68030.S -fdump-tree-vect-details -fdump-tree-slp -fdump-tree-ivopts-details -fdump-tree-all -fno-tree-vectorize Gives below dump info before IVOPT: <bb 2>: local_Filter_33 = global_Filters; pretmp_887 = global_Output; pretmp_889 = global_Input; goto <bb 7>; <bb 3>: <bb 4>: # ix_187 = PHI <_202(3), 2(7)> # ivtmp_1065 = PHI <ivtmp_1064(3), 512(7)> _154 = ix_187 + -2; _157 = _154 + _971; _158 = (long unsigned int) _157; _159 = _158 * 4; _160 = pretmp_889 + _159; _161 = *_160; _165 = *local_Filter_33; _166 = _161 * _165; _170 = ix_187 + -1; _173 = _170 + _971; _174 = (long unsigned int) _173; _175 = _174 * 4; _176 = pretmp_889 + _175; _177 = *_176; _181 = MEM[(float *)local_Filter_33 + 4B]; _182 = _177 * _181; _81 = _166 + _182; _189 = ix_187 + _971; _190 = (long unsigned int) _189; _191 = _190 * 4; _192 = pretmp_889 + _191; _193 = *_192; _197 = MEM[(float *)local_Filter_33 + 8B]; _198 = _193 * _197; _202 = ix_187 + 1; _205 = _202 + _971; _206 = (long unsigned int) _205; _207 = _206 * 4; _208 = pretmp_889 + _207; _209 = *_208; _213 = MEM[(float *)local_Filter_33 + 12B]; _214 = _209 * _213; _218 = ix_187 + 2; _221 = _218 + _971; _222 = (long unsigned int) _221; _223 = _222 * 4; _224 = pretmp_889 + _223; _225 = *_224; _229 = MEM[(float *)local_Filter_33 + 16B]; _230 = _225 * _229; _82 = _214 + _230; _67 = _81 + _82; _243 = _154 + _980; _244 = (long unsigned int) _243; _245 = _244 * 4; _246 = pretmp_889 + _245; _247 = *_246; _251 = MEM[(float *)local_Filter_33 + 20B]; _252 = _247 * _251; _259 = _170 + _980; _260 = (long unsigned int) _259; _261 = _260 * 4; _262 = pretmp_889 + _261; _263 = *_262; _267 = MEM[(float *)local_Filter_33 + 24B]; _268 = _263 * _267; _78 = _252 + _268; _275 = ix_187 + _980; _276 = (long 
unsigned int) _275; _277 = _276 * 4; _278 = pretmp_889 + _277; _279 = *_278; _283 = MEM[(float *)local_Filter_33 + 28B]; _284 = _279 * _283; _72 = _198 + _284; _291 = _202 + _980; _292 = (long unsigned int) _291; _293 = _292 * 4; _294 = pretmp_889 + _293; _295 = *_294; _299 = MEM[(float *)local_Filter_33 + 32B]; _300 = _295 * _299; _307 = _218 + _980; _308 = (long unsigned int) _307; _309 = _308 * 4; _310 = pretmp_889 + _309; _311 = *_310; _315 = MEM[(float *)local_Filter_33 + 36B]; _316 = _311 * _315; _79 = _300 + _316; _56 = _78 + _79; _329 = _154 + _985; _330 = (long unsigned int) _329; _331 = _330 * 4; _332 = pretmp_889 + _331; _333 = *_332; _337 = MEM[(float *)local_Filter_33 + 40B]; _338 = _333 * _337; _345 = _170 + _985; _346 = (long unsigned int) _345; _347 = _346 * 4; _348 = pretmp_889 + _347; _349 = *_348; _353 = MEM[(float *)local_Filter_33 + 44B]; _354 = _349 * _353; _75 = _338 + _354; _361 = ix_187 + _985; _362 = (long unsigned int) _361; _363 = _362 * 4; _364 = pretmp_889 + _363; _365 = *_364; _369 = MEM[(float *)local_Filter_33 + 48B]; _370 = _365 * _369; _377 = _202 + _985; _378 = (long unsigned int) _377; _379 = _378 * 4; _380 = pretmp_889 + _379; _381 = *_380; _385 = MEM[(float *)local_Filter_33 + 52B]; _386 = _381 * _385; _393 = _218 + _985; _394 = (long unsigned int) _393; _395 = _394 * 4; _396 = pretmp_889 + _395; _397 = *_396; _401 = MEM[(float *)local_Filter_33 + 56B]; _402 = _397 * _401; _76 = _386 + _402; _495 = _75 + _76; _415 = _154 + _991; _416 = (long unsigned int) _415; _417 = _416 * 4; _418 = pretmp_889 + _417; _419 = *_418; _423 = MEM[(float *)local_Filter_33 + 60B]; _424 = _419 * _423; _431 = _170 + _991; _432 = (long unsigned int) _431; _433 = _432 * 4; _434 = pretmp_889 + _433; _435 = *_434; _439 = MEM[(float *)local_Filter_33 + 64B]; _440 = _435 * _439; _572 = _424 + _440; _447 = ix_187 + _991; _448 = (long unsigned int) _447; _449 = _448 * 4; _450 = pretmp_889 + _449; _451 = *_450; _455 = MEM[(float *)local_Filter_33 + 68B]; 
_456 = _451 * _455; _73 = _370 + _456; _65 = _72 + _73; _55 = _65 + _67; _25 = _55 + _56; _19 = _25 + _495; _463 = _202 + _991; _464 = (long unsigned int) _463; _465 = _464 * 4; _466 = pretmp_889 + _465; _467 = *_466; _471 = MEM[(float *)local_Filter_33 + 72B]; _472 = _467 * _471; _479 = _218 + _991; _480 = (long unsigned int) _479; _481 = _480 * 4; _482 = pretmp_889 + _481; _483 = *_482; _487 = MEM[(float *)local_Filter_33 + 76B]; _488 = _483 * _487; _556 = _472 + _488; _20 = _556 + _572; _429 = _19 + _20; _501 = _154 + _997; _502 = (long unsigned int) _501; _503 = _502 * 4; _504 = pretmp_889 + _503; _505 = *_504; _509 = MEM[(float *)local_Filter_33 + 80B]; _510 = _505 * _509; _517 = _170 + _997; _518 = (long unsigned int) _517; _519 = _518 * 4; _520 = pretmp_889 + _519; _521 = *_520; _525 = MEM[(float *)local_Filter_33 + 84B]; _526 = _521 * _525; _444 = _510 + _526; _533 = ix_187 + _997; _534 = (long unsigned int) _533; _535 = _534 * 4; _536 = pretmp_889 + _535; _537 = *_536; _541 = MEM[(float *)local_Filter_33 + 88B]; _542 = _537 * _541; _549 = _202 + _997; _550 = (long unsigned int) _549; _551 = _550 * 4; _552 = pretmp_889 + _551; _553 = *_552; _557 = MEM[(float *)local_Filter_33 + 92B]; _558 = _553 * _557; _565 = _218 + _997; _566 = (long unsigned int) _565; _567 = _566 * 4; _568 = pretmp_889 + _567; _569 = *_568; _573 = MEM[(float *)local_Filter_33 + 96B]; _574 = _569 * _573; _445 = _558 + _574; _430 = _444 + _445; _257 = _429 + _430; sum_575 = _257 + _542; _21 = pretmp_887 + _363; *_21 = sum_575; ivtmp_1064 = ivtmp_1065 - 1; if (ivtmp_1064 != 0) goto <bb 3>; else goto <bb 5>; <bb 5>: ivtmp_1062 = ivtmp_1063 - 1; if (ivtmp_1062 != 0) goto <bb 6>; else goto <bb 8>; <bb 6>: <bb 7>: # iy_186 = PHI <_990(6), 2(2)> # ivtmp_1063 = PHI <ivtmp_1062(6), 512(2)> _970 = iy_186 + -2; _971 = _970 * 516; _979 = iy_186 + -1; _980 = _979 * 516; _985 = iy_186 * 516; _990 = iy_186 + 1; _991 = _990 * 516; _996 = iy_186 + 2; _997 = _996 * 516; goto <bb 4>; <bb 8>: return; Most 
memory references in <bb 4> access the same memory object, but IVOPT failed to group these IVs because PRE hoists parts of the address computation into <bb 7>. Moreover, PRE/LIM creates code that is more difficult to handle than the vectorizer's, because the CSE opportunities are hidden by re-association. I will first try to fix the vectorizer issue, since the PRE/LIM issue isn't that critical: it is only exposed in loops unrolled by tree cunroll, and in versioned/peeled loops.