https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84719
/*
 * --- Comment #8 from gpnuma at centaurean dot com ---
 *
 * Just to make sure I commented out bit masking :
 */
#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

/*
 * Benchmark reproducer for a 3-byte __builtin_memcpy inside a hot loop.
 *
 * Fills a 1,000,000,000-byte buffer with (uint8_t)(i >> 7), then walks it in
 * steps of 3 bytes: each step copies 3 bytes into the low bytes of a 64-bit
 * receiver, multiplies by a constant and accumulates the top 16 bits of the
 * product into a running total, which is printed at the end.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
    (void) argc;
    (void) argv;

    const uint64_t size = 1000000000;
    const size_t alloc_mem = size * sizeof(uint8_t);
    uint8_t *mem = malloc(alloc_mem);
    if (mem == NULL) {
        /* A 1 GB request can realistically fail; do not dereference NULL. */
        perror("malloc");
        return EXIT_FAILURE;
    }

    for (uint_fast64_t i = 0; i < size; i++) {
        mem[i] = (uint8_t) (i >> 7);
    }

    uint64_t total = 0x123456789abcdefllu;
    /*
     * Only the 3 bytes written by the copy ever change; the remaining bytes
     * of receiver keep their initial zero value, which is why the mask below
     * can be disabled without changing the result.
     */
    uint64_t receiver = 0;

    printf("%u ...\n", 3u);

    uint_fast64_t counter = 0;
    while (counter < size - 8) {
        __builtin_memcpy(&receiver, &mem[counter], 3);
        /* Bit masking deliberately disabled for this experiment: */
        // receiver &= (0xffffffffffffffffllu >> (64 - ((3) << 3)));
        total += ((receiver * 0x321654987cbafedllu) >> 48);
        counter += 3;
    }

    /* uint64_t is not necessarily unsigned long long; cast to match %llu. */
    printf("=> %llu\n", (unsigned long long) total);

    free(mem);
    return EXIT_SUCCESS;
}

/*
 * Results are exactly the same :
 *
 * gcc
 *   time ./a.out
 *   3 ...
 *   => 81996806116422545
 *   real 0m3.771s
 *   user 0m3.292s
 *   sys  0m0.403s
 *
 * clang
 *   time ./a.out
 *   3 ...
 *   => 81996806116422545
 *   real 0m1.209s
 *   user 0m0.833s
 *   sys  0m0.359s
 *
 * Still 4x faster
 */