https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84106
--- Comment #6 from Daniel Fruzynski <bugzi...@poradnik-webmastera.com> ---
When you revisit your cost model for loops, please also take a look at this code. test2 has one assignment moved into a separate loop nest, and it is about twice as fast as the test1 function (with gcc 4.8.5).

[code]
#include <stdint.h>
#include <string.h>

#define N 9

int a1[N][N];
int a2[N][N];
int a3[N][N];
uint16_t a4[N][N-1];

void test1()
{
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            a2[i][j] = a1[i][j];
            a3[i][j] = 1u << a1[i][j];
            if (i > 0)
                a4[j][i-1] = a3[i][j];
        }
    }
}

void test2()
{
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            a2[i][j] = a1[i][j];
            a3[i][j] = 1u << a1[i][j];
        }
    }
    for (int i = 1; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            a4[j][i-1] = a3[i][j];
        }
    }
}
[/code]