https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120629
--- Comment #15 from Jakub Jelinek <jakub at gcc dot gnu.org> --- Ok, managed to reproduce with ../configure 'CFLAGS= -O2 -funwind-tables -fasynchronous-unwind-tables -fstack-clash-protection -Werror=return-type -g' 'CXXFLAGS= -O2 -funwind-tables -fasynchronous-unwind-tables -fstack-clash-protection -Werror=return-type -g' 'XCFLAGS= -O2 -funwind-tables -fasynchronous-unwind-tables -fstack-clash-protection -Werror=return-type -g' 'TCFLAGS= -O2 -funwind-tables -fasynchronous-unwind-tables -fstack-clash-protection -Werror=return-type -g' 'GDCFLAGS= -O2 -funwind-tables -fasynchronous-unwind-tables -fstack-clash-protection -g' --enable-languages=c,c++ --enable-host-shared --enable-checking=release --disable-werror --disable-libstdcxx-pch --with-arch-32=x86-64 --with-tune=generic --with-build-config=bootstrap-lto-lean --enable-link-serialization make -j32 profiledbootstrap The routine in question is roughly struct S { void stream_in (void *); static const int max_clauses = 8; unsigned m_clause[max_clauses + 1]; }; unsigned long streamer_read_uhwi (void *); void S::stream_in (void *ib) { unsigned clause; int k = 0; do { ((void)(__builtin_expect (!(k <= max_clauses), 0) ? __builtin_unreachable (), 0 : 0)); clause = m_clause[k++] = streamer_read_uhwi (ib); } while (clause); /* Zero-initialize the remaining clauses in OUT. */ while (k <= max_clauses) m_clause[k++] = 0; } and the first loop works ok, but somehow when the first loop ends with k == 8, the second loop is incorrect. 
Unfortunately, with -O3 -fprofile-generate without LTO on just the above testcase that doesn't reproduce. In the crashing lto1 binary, I see after the first loop: 0x000000000103a70e <+110>: cmp $0x9,%ebx 0x000000000103a711 <+113>: je 0x103a771 <_ZN13ipa_predicate9stream_inEP15lto_input_block+209> 0x000000000103a713 <+115>: mov $0x8,%esi 0x000000000103a718 <+120>: mov 0x3e608b9(%rip),%r8 # 0x4e9afd8 <__gcov0._ZN13ipa_predicate9stream_inEP15lto_input_block+24> 0x000000000103a71f <+127>: sub %ebx,%esi 0x000000000103a721 <+129>: mov %ebx,%ebx 0x000000000103a723 <+131>: lea 0x4(,%rsi,4),%rax 0x000000000103a72b <+139>: lea 0x0(%rbp,%rbx,4),%rdx 0x000000000103a730 <+144>: mov %eax,%ecx 0x000000000103a732 <+146>: movq $0x0,(%rdx) 0x000000000103a739 <+153>: movq $0x0,-0x8(%rdx,%rcx,1) 0x000000000103a742 <+162>: lea 0x8(%rdx),%rcx 0x000000000103a746 <+166>: and $0xfffffffffffffff8,%rcx 0x000000000103a74a <+170>: sub %rcx,%rdx 0x000000000103a74d <+173>: add %edx,%eax 0x000000000103a74f <+175>: xor %edx,%edx 0x000000000103a751 <+177>: and $0xfffffff8,%eax 0x000000000103a754 <+180>: mov %edx,%edi 0x000000000103a756 <+182>: add $0x8,%edx 0x000000000103a759 <+185>: movq $0x0,(%rcx,%rdi,1) 0x000000000103a761 <+193>: cmp %eax,%edx 0x000000000103a763 <+195>: jb 0x103a754 <_ZN13ipa_predicate9stream_inEP15lto_input_block+180> 0x000000000103a765 <+197>: lea 0x1(%r8,%rsi,1),%rax 0x000000000103a76a <+202>: mov %rax,0x3e60867(%rip) # 0x4e9afd8 <__gcov0._ZN13ipa_predicate9stream_inEP15lto_input_block+24> 0x000000000103a771 <+209>: pop %rbx 0x000000000103a772 <+210>: addq $0x1,0x3e60866(%rip) # 0x4e9afe0 <__gcov0._ZN13ipa_predicate9stream_inEP15lto_input_block+32> 0x000000000103a77a <+218>: pop %rbp 0x000000000103a77b <+219>: pop %r12 0x000000000103a77d <+221>: ret where %ebx at that point is 8 (i.e. k) and %rbp is this (equal to &this->m_clause[0]). 
While the -O3 -fprofile-generate code without LTO looks like: cmpl $9, %ebx je .L6 movl $8, %edx movq __gcov0._ZN1S9stream_inEPv+24(%rip), %rdi subl %ebx, %edx movl %ebx, %ebx leaq 4(,%rdx,4), %rax leaq 0(%rbp,%rbx,4), %rcx cmpl $8, %eax jnb .L7 testb $4, %al jne .L18 testl %eax, %eax je .L8 movb $0, (%rcx) .L8: leaq 1(%rdx,%rdi), %rax movq %rax, __gcov0._ZN1S9stream_inEPv+24(%rip) .L6: popq %rbx addq $1, __gcov0._ZN1S9stream_inEPv+32(%rip) popq %rbp popq %r12 ret The first lea in both cases computes (8 - k) * 4 + 4, the second &this->m_clause[k]. But the (8 - k) * 4 + 4 comparison with 8 and testing if it has 4 set in it is only present in the second case; in the lto1 case it stores 0 to *(long *)&this->m_clause[k] (that is already a 4-byte buffer overflow) and movq $0x0,-0x8(%rdx,%rcx,1) is weird too: that is storing 64-bit 0 to *(long *)(((char *)&this->m_clause[k]) + (8 - k) * 4 + 4 - 8), i.e. always to *(long *)&this->m_clause[7] (that isn't a buffer overflow but will overwrite the last stored value). Now, outside of lto1, I see just 2 get_range_pos_neg calls; in one case the range is [0, 7] regardless of whether stmt is non-NULL or NULL (so the ranger patch didn't change that), in the other case the range is [1, 9] when using global range and [1, 8] otherwise, but it still returns 1 in both cases. So during the LTO compilation it must be something different.