https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97459
/*
 * --- Comment #4 from Thomas Koenig <tkoenig at gcc dot gnu.org> ---
 *
 * Here's a complete program for benchmarks on x86_64, using Jakub's
 * functions (so they are indeed correct):
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <x86intrin.h>

/* Number of 128-bit values fed to each candidate function.  */
enum { N = 1000000 };

/* Baseline: n mod 3 computed directly with the 128-bit remainder
   operator.  */
unsigned
r3_128u_v2 (__uint128_t n)
{
  return (unsigned) (n % 3);
}

/* n mod 3 via 44-bit limbs.

   2^44 is congruent to 1 modulo 3, so the sum of the three limbs
   (bits 0-43, 44-87 and 88-127) has the same remainder as n.  The sum
   is below 2^46 and therefore fits a 64-bit accumulator, leaving only
   a single 64-bit remainder operation.  */
unsigned
r3_128u_v3 (__uint128_t n)
{
  const uint64_t limb_mask = (UINT64_C (1) << 44) - 1;
  uint64_t a;

  a = (uint64_t) (n >> 88);
  a += (uint64_t) (n >> 44) & limb_mask;
  a += (uint64_t) n & limb_mask;
  return (unsigned) (a % 3);
}

/* n mod 3 via 32-bit limbs.

   Same idea as r3_128u_v3, using 2^32 congruent to 1 modulo 3: the
   four 32-bit limbs are added (the sum is below 2^34) and reduced with
   one 64-bit remainder operation.  */
unsigned
r3_128u_v4 (__uint128_t n)
{
  const uint64_t limb_mask = UINT64_C (0xffffffff);
  uint64_t a;

  a = (uint64_t) (n >> 96);
  a += (uint64_t) (n >> 64) & limb_mask;
  a += (uint64_t) (n >> 32) & limb_mask;
  a += (uint64_t) n & limb_mask;
  return (unsigned) (a % 3);
}

/* Read exactly LEN bytes from FD into BUF, repeating read() as needed
   because a single call may return fewer bytes than requested.
   Returns 0 on success and -1 on error or premature end of file.  */
static int
read_fully (int fd, void *buf, size_t len)
{
  unsigned char *p = buf;

  while (len > 0)
    {
      ssize_t got = read (fd, p, len);

      if (got <= 0)
        return -1;
      p += got;
      len -= (size_t) got;
    }
  return 0;
}

/* Time N calls of FN over DATA with the time-stamp counter and print
   the checksum and the average cycle count.

   This is a macro rather than a helper taking a function pointer so
   that every candidate is still called directly (and can be inlined
   into its loop), and so that the printed label is generated from the
   function name instead of being typed by hand.  */
#define BENCH(fn, data)                                               \
  do                                                                  \
    {                                                                 \
      unsigned int s = 0;                                             \
      unsigned long long t1 = __rdtsc ();                             \
      for (int i = 0; i < N; i++)                                     \
        s += fn ((data)[i]);                                          \
      unsigned long long t2 = __rdtsc ();                             \
      printf ("s = %u " #fn ": %f cycles per iteration\n", s,         \
              (t2 - t1) / (double) N);                                \
    }                                                                 \
  while (0)

/* Fill an array of N 128-bit values from /dev/random and benchmark
   the three remainder implementations on it.  Returns EXIT_FAILURE if
   the buffer cannot be allocated or completely filled.  */
int
main (void)
{
  __uint128_t *a = malloc (sizeof (*a) * N);
  int fd;

  if (a == NULL)
    {
      fprintf (stderr, "cannot allocate %zu bytes\n", sizeof (*a) * N);
      return EXIT_FAILURE;
    }

  fd = open ("/dev/random", O_RDONLY);
  if (fd < 0)
    {
      perror ("/dev/random");
      free (a);
      return EXIT_FAILURE;
    }

  /* Without a fully filled buffer the benchmark would run on whatever
     the allocation happened to contain.  */
  if (read_fully (fd, a, sizeof (*a) * N) != 0)
    {
      fprintf (stderr, "cannot read %zu bytes from /dev/random\n",
               sizeof (*a) * N);
      close (fd);
      free (a);
      return EXIT_FAILURE;
    }
  close (fd);

  BENCH (r3_128u_v2, a);
  BENCH (r3_128u_v3, a);
  BENCH (r3_128u_v4, a);

  free (a);
  return 0;
}

/*
 * And here are the results on my box using -O3 -march=native:
 *
 * s = 7 r3_128u_v2: 22.204618 cycles per iteration
 * s = 7 r3_128u_v2: 8.143544 cycles per iteration
 * s = 7 r3_128u_v2: 6.110718 cycles per iteration
 *
 * NOTE(review): these figures were produced by the program as
 * originally posted, which labelled every line r3_128u_v2; the
 * functions were timed in the order v2, v3, v4, so the three lines
 * correspond to those functions in that order.  The originally posted
 * program also issued a single unchecked read(); the very small
 * checksum (s = 7 for 1000000 inputs) suggests that only a small part
 * of the buffer was actually filled with random data -- TODO confirm
 * by re-running with the fully filled buffer.
 */