https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120629

--- Comment #15 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Ok, managed to reproduce with
../configure 'CFLAGS= -O2 -funwind-tables -fasynchronous-unwind-tables
-fstack-clash-protection -Werror=return-type -g' 'CXXFLAGS= -O2 -funwind-tables
-fasynchronous-unwind-tables -fstack-clash-protection -Werror=return-type -g'
'XCFLAGS= -O2 -funwind-tables -fasynchronous-unwind-tables
-fstack-clash-protection -Werror=return-type -g' 'TCFLAGS= -O2 -funwind-tables
-fasynchronous-unwind-tables -fstack-clash-protection -Werror=return-type -g'
'GDCFLAGS= -O2 -funwind-tables -fasynchronous-unwind-tables
-fstack-clash-protection -g' --enable-languages=c,c++ --enable-host-shared
--enable-checking=release --disable-werror --disable-libstdcxx-pch
--with-arch-32=x86-64 --with-tune=generic
--with-build-config=bootstrap-lto-lean --enable-link-serialization
make -j32 profiledbootstrap

The routine in question is roughly
/* Reduced stand-in for the class whose stream_in method is discussed here
   (presumably ipa_predicate, going by the mangled names in the
   disassembly -- confirm against the GCC sources).  */
struct S {
  /* Fills m_clause from the input block passed as argument.  */
  void stream_in (void *);
  /* Highest valid index into m_clause.  */
  static const int max_clauses = 8;
  /* max_clauses + 1 == 9 elements.  */
  unsigned m_clause[max_clauses + 1];
};
unsigned long streamer_read_uhwi (void *);

/* Read values from IB into m_clause until a zero value has been stored,
   then zero-fill whatever is left of the array.  */
void
S::stream_in (void *ib)
{
  unsigned clause;
  int k = 0;

  do
    {
      /* Marks the k > max_clauses path as unreachable, so the compiler may
	 assume the index used below is within m_clause (looks like an
	 expanded assertion macro -- confirm).  */
      ((void)(__builtin_expect (!(k <= max_clauses), 0) ? __builtin_unreachable
(), 0 : 0));
      /* Store the value read (converted to unsigned) and advance k; the
	 stored value also decides whether the loop continues.  */
      clause = m_clause[k++] = streamer_read_uhwi (ib);
    }
  while (clause);

  /* Zero-initialize the remaining clauses in OUT.  */
  /* Runs zero times when the first loop already left k at max_clauses + 1.  */
  while (k <= max_clauses)
    m_clause[k++] = 0;
}

and the first loop works ok, but somehow when the first loop ends with k == 8,
the second loop is incorrect.
Unfortunately, it doesn't reproduce with -O3 -fprofile-generate without LTO
on just the above testcase.
In the crashing lto1 binary, I see after the first loop:

   0x000000000103a70e <+110>:   cmp    $0x9,%ebx
   0x000000000103a711 <+113>:   je     0x103a771
<_ZN13ipa_predicate9stream_inEP15lto_input_block+209>
   0x000000000103a713 <+115>:   mov    $0x8,%esi
   0x000000000103a718 <+120>:   mov    0x3e608b9(%rip),%r8        # 0x4e9afd8
<__gcov0._ZN13ipa_predicate9stream_inEP15lto_input_block+24>
   0x000000000103a71f <+127>:   sub    %ebx,%esi
   0x000000000103a721 <+129>:   mov    %ebx,%ebx
   0x000000000103a723 <+131>:   lea    0x4(,%rsi,4),%rax
   0x000000000103a72b <+139>:   lea    0x0(%rbp,%rbx,4),%rdx
   0x000000000103a730 <+144>:   mov    %eax,%ecx
   0x000000000103a732 <+146>:   movq   $0x0,(%rdx)
   0x000000000103a739 <+153>:   movq   $0x0,-0x8(%rdx,%rcx,1)
   0x000000000103a742 <+162>:   lea    0x8(%rdx),%rcx
   0x000000000103a746 <+166>:   and    $0xfffffffffffffff8,%rcx
   0x000000000103a74a <+170>:   sub    %rcx,%rdx
   0x000000000103a74d <+173>:   add    %edx,%eax
   0x000000000103a74f <+175>:   xor    %edx,%edx
   0x000000000103a751 <+177>:   and    $0xfffffff8,%eax
   0x000000000103a754 <+180>:   mov    %edx,%edi
   0x000000000103a756 <+182>:   add    $0x8,%edx
   0x000000000103a759 <+185>:   movq   $0x0,(%rcx,%rdi,1)
   0x000000000103a761 <+193>:   cmp    %eax,%edx
   0x000000000103a763 <+195>:   jb     0x103a754
<_ZN13ipa_predicate9stream_inEP15lto_input_block+180>
   0x000000000103a765 <+197>:   lea    0x1(%r8,%rsi,1),%rax
   0x000000000103a76a <+202>:   mov    %rax,0x3e60867(%rip)        # 0x4e9afd8
<__gcov0._ZN13ipa_predicate9stream_inEP15lto_input_block+24>
   0x000000000103a771 <+209>:   pop    %rbx
   0x000000000103a772 <+210>:   addq   $0x1,0x3e60866(%rip)        # 0x4e9afe0
<__gcov0._ZN13ipa_predicate9stream_inEP15lto_input_block+32>
   0x000000000103a77a <+218>:   pop    %rbp
   0x000000000103a77b <+219>:   pop    %r12
   0x000000000103a77d <+221>:   ret    

where %ebx at that point is 8 (i.e. k) and %rbp is this (equal to
&this->m_clause[0]).
While the -O3 -fprofile-generate code without LTO looks like:
        cmpl    $9, %ebx
        je      .L6
        movl    $8, %edx
        movq    __gcov0._ZN1S9stream_inEPv+24(%rip), %rdi
        subl    %ebx, %edx
        movl    %ebx, %ebx
        leaq    4(,%rdx,4), %rax
        leaq    0(%rbp,%rbx,4), %rcx
        cmpl    $8, %eax
        jnb     .L7
        testb   $4, %al
        jne     .L18
        testl   %eax, %eax
        je      .L8
        movb    $0, (%rcx)
.L8:
        leaq    1(%rdx,%rdi), %rax
        movq    %rax, __gcov0._ZN1S9stream_inEPv+24(%rip)
.L6:
        popq    %rbx
        addq    $1, __gcov0._ZN1S9stream_inEPv+32(%rip)
        popq    %rbp
        popq    %r12
        ret
The first lea in both cases computes (8 - k) * 4 + 4, the second
&this->m_clause[k].
But the (8 - k) * 4 + 4 comparison with 8 and testing if it has 4 set in it is
only present in the second case,
in the lto1 case it stores 64-bit 0 to *(long *)&this->m_clause[k] (with k == 8
that is already a 4 byte buffer overflow) and movq   $0x0,-0x8(%rdx,%rcx,1) is
weird too, that is storing 64-bit 0 to
*(long *)(((char *)&this->m_clause[k]) + (8 - k) * 4 + 4 - 8), i.e. always to
*(long *)&this->m_clause[7] (that isn't a buffer overflow but will overwrite
the last stored value).
Now, outside of lto1, I see just 2 get_range_pos_neg calls, in one case the
range is [0, 7] regardless of whether stmt is non-NULL or NULL (so the ranger
patch didn't change that), in the other case the range is [1, 9] when using
global range and [1, 8] otherwise, but it still returns 1 in both cases.
So during the LTO compilation it must be something different.

Reply via email to