https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011
--- Comment #9 from Yuri Rumyantsev <ysrumyan at gmail dot com> ---
This is not the u32 version but the u64 one. The first loop (the u32 version) looks like:
# Unrolled-by-4 popcount loop (x86-64, AT&T operand order: op src, dst).
# Visible register usage in this fragment:
#   %rbx = base address of the 64-bit elements being bit-counted
#   %rdx = index at the top of the iteration (advanced by 4 at the bottom)
#   %r12 = unsigned upper bound the next index is compared against
#   %r14 = running total of all popcounts
#   %rax, %rcx, %r8, %rsi = per-iteration scratch
# The code before and after the loop is not shown, so initial values and
# the final use of %r14 cannot be confirmed from this fragment.
.L23:
# ecx = rdx + 1 (the 32-bit write zero-extends into %rcx)
leal 1(%rdx), %ecx
# NOTE(review): %rax is zeroed and then immediately used as the index of the
# load below, so as quoted this reads the element at offset 0 from %rbx on
# every iteration. Presumably the xor was added to break popcnt's dependency
# on its destination register; confirm against the actual compiler output.
xorq %rax, %rax
popcntq (%rbx,%rax,8), %rax
# r8d = rdx + 2
leal 2(%rdx), %r8d
# NOTE(review): same pattern as above; zeroing %rcx discards the rdx + 1
# index computed at the top of the loop, so this load also reads offset 0.
xorq %rcx, %rcx
popcntq (%rbx,%rcx,8), %rcx
# rcx = sum of the first two counts
addq %rax, %rcx
# esi = rdx + 3
leal 3(%rdx), %esi
# Here the index (%r8) differs from the destination, so zeroing %rax first
# does not affect the address.
xorq %rax, %rax
popcntq (%rbx,%r8,8), %rax
addq %rax, %rcx
# Fourth element, indexed by %rsi; destination %rax is zeroed first again.
xorq %rax, %rax
popcntq (%rbx,%rsi,8), %rax
addq %rax, %rcx
# eax = rdx + 4: index for the next iteration (zero-extended into %rax)
leal 4(%rdx), %eax
# Fold this iteration's four counts into the running total.
addq %rcx, %r14
movq %rax, %rdx
# Unsigned compare: keep looping while %r12 > next index.
cmpq %rax, %r12
ja .L23