gcc version for reproducer bug ====================== I tried this on the most recent gcc 32bit compiler version at godbolt.org, and it is reporoducing the same assembly language for this benchmark as the gcc version I am using, Raspbian 8.3.0-6+rpi1. The optional performance enhancement I describe at the bottom *definitely* still applies, even if the bugs I'm reporting in this reproducer are fixed.
output: ===== -970094740 14028998 problem: ====== (1) The bench() function below can only return positive values, so the first number printed above is wrong. (2) the second number is supposed to be the first number divided by lit-const 237. The second number is too big for that to be correct. REPRODUCER -- REPRODUCER -- REPRODUCER ====================================== #include <stdio.h> int bench() { int sum = 0; for (int i=1000000000; --i) sum += i / 237; return ((sum < 0) ? -sum : sum); } int main() { int asum = bench(); pritntf("%d %d\n", asum, asum / 237); return 0; } ============================== END REPRODUCER -- END REPRODUCER -- END REPRODUCER OTHER non-bug comment =================== The compiler I wrote runs literally twice as fast as 32bit ARM gcc for this operation. The lit-const division and modulo in gcc 32 bit ARM is crazy bad. Here is how I did it in my compiler: // Following function from Hacker's Delight, 2nd Edition // converts literal-constant integer division into a multiply struct muldiv {int M; int s;}; // Recip mult and shift amount void irecip(int d, struct muldiv *mag) { int p; int ad, anc, delta, q1, r1, q2, r2, t; int two31 = 0x80000000; // 2**31. ad = (d < 0) ? -d : d; t = 0x7fffffff + ((d < 0) ? (1+d) : -d) + 1; anc = 0x7fffffff - t%ad + ((d < 0) ? 1 : 0); p = 31; // Init. p. q1 = -(two31/anc); // Init. q1 = 2**p/|nc|. r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). q2 = two31/ad; // Init. q2 = 2**p/|d|. r2 = -(two31 - q2*ad); // Init. r2 = rem(2**p, |d|). q2 = -q2; do { p = p + 1; q1 = 2*q1; // Update q1 = 2**p/|nc|. r1 = 2*r1; // Update r1 = rem(2**p, |nc|). if (r1 >= anc) { // (Must be an unsigned q1 = q1 + 1; // comparison here). r1 = r1 - anc;} q2 = 2*q2; // Update q2 = 2**p/|d|. r2 = 2*r2; // Update r2 = rem(2**p, |d|). if (r2 >= ad) { // (Must be an unsigned q2 = q2 + 1; // comparison here). r2 = r2 - ad;} delta = ad - r2; } while (q1 < delta || (q1 == delta && r1 == 0)); mag->M = q2 + 1; if (d < 0) mag->M = -mag->M; // Magic number and mag->s = p - 32; // shift amount to return. } void const_div_AST(int *enode) { struct muldiv mag; int *a, divisor = n[2]; irecip(divisor, &mag); n[2] = mag.M; *--n = DivCnst; if (divisor > 0 && mag.M < 0) { *--n = (int) enode; *--n = Add; } else if (divisor < 0 && mag.M > 0) { *--n = (int) enode; *--n = Sub; } if (mag.s > 0) { a = n; *--n = mag.s; *--n = Num; *--n = (int) a; *--n = Shr; } }