https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115875

            Bug ID: 115875
           Summary: -Oz optimization of "push IMM; pop REG" is used
                    incorrectly for 64-bit constants with bit 31 set
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vda.linux at googlemail dot com
  Target Milestone: ---

/*
 * Reproducer: in-place multi-word subtraction on the four 64-bit words
 * r[0..3], done with an x86-64 subq/sbbq borrow chain in inline asm.
 *
 * Per-word operation (borrow propagates from one word to the next):
 *   r[0] -= 0xffffffffffffffff
 *   r[1] -= 0x00000000ffffffff + borrow      (constant held in %1)
 *   r[2] -= 0 + borrow
 *   r[3]  = r[3] - borrow + 0x00000000ffffffff
 * Adding 0x00000000ffffffff modulo 2^64 is the same as subtracting
 * 0xffffffff00000001, so the four words subtracted look like the NIST
 * P-256 prime in little-endian word order (presumably what the "p256_mod"
 * in the name refers to -- confirm against the original project).
 *
 * The asm is only correct if %1 really contains 0x00000000ffffffff with
 * the upper 32 bits clear.
 */
void sp_256_sub_8_p256_mod(unsigned long *r)
{
        unsigned long reg, ooff;
        asm volatile (
"\n             subq    $0xffffffffffffffff, (%0)"
"\n             sbbq    %1, 1*8(%0)"
"\n             sbbq    $0, 2*8(%0)"
"\n             movq    3*8(%0), %2"
"\n             sbbq    $0, %2"
"\n             addq    %1, %2"
"\n             movq    %2, 3*8(%0)"
                /* Outputs: %0 = r, %1 = ooff, %2 = reg (scratch for r[3]). */
                : "=r" (r), "=r" (ooff), "=r" (reg)
                /* Inputs tied to outputs by number: %0 starts as r, %1 starts
                 * as the constant 0x00000000ffffffff. */
                : "0" (r), "1" (0x00000000ffffffff)
                /* The asm reads and writes memory through %0. */
                : "memory");
}

"gcc -fomit-frame-pointer -Oz -S tls_sp_c32.c" generates this:

        pushq   $-1
        popq    %rax # BUG!!! gcc thinks %rax = 0x00000000ffffffff
                     # but, of course, it loads 0xffffffffffffffff instead!
                subq    $0xffffffffffffffff, (%rdi)
                sbbq    %rax, 1*8(%rdi)
                sbbq    $0, 2*8(%rdi)
                movq    3*8(%rdi), %rdx
                sbbq    $0, %rdx
                addq    %rax, %rdx
                movq    %rdx, 3*8(%rdi)
        ret

Looks like either gcc thinks "pushq $-1" truncates the value to 32 bits (in
reality, it is sign-extended), or it thinks it uses a "pop %eax" insn (no such
insn exists in 64-bit mode; only 64-bit register pops are possible).

Code generated with -Os is correct:

        orl     $-1, %eax  # zero-extended to 64 bits, correct result in %rax
                subq    $0xffffffffffffffff, (%rdi)
                sbbq    %rax, 1*8(%rdi)
                sbbq    $0, 2*8(%rdi)
                movq    3*8(%rdi), %rdx
                sbbq    $0, %rdx
                addq    %rax, %rdx
                movq    %rdx, 3*8(%rdi)
        ret

In fact, in this case "push IMM+pop REG" is 3 bytes and "orl $-1, %eax" is 3
bytes too (8-bit immediate form), so -Oz optimization is not a win here (same
size, slower code).

$ gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-redhat-linux/14/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-redhat-linux
Configured with: ../configure --enable-bootstrap
--enable-languages=c,c++,fortran,objc,obj-c++,ada,go,d,m2,lto --prefix=/usr
--mandir=/usr/share/man --infodir=/usr/share/info
--with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-shared
--enable-threads=posix --enable-checking=release --enable-multilib
--with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions
--enable-gnu-unique-object --enable-linker-build-id
--with-gcc-major-version-only --enable-libstdcxx-backtrace
--with-libstdcxx-zoneinfo=/usr/share/zoneinfo --with-linker-hash-style=gnu
--enable-plugin --enable-initfini-array
--with-isl=/builddir/build/BUILD/gcc-14.0.1-20240328/obj-x86_64-redhat-linux/isl-install
--enable-offload-targets=nvptx-none,amdgcn-amdhsa --enable-offload-defaulted
--without-cuda-driver --enable-gnu-indirect-function --enable-cet
--with-tune=generic --with-arch_32=i686 --build=x86_64-redhat-linux
--with-build-config=bootstrap-lto --enable-link-serialization=1
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 14.0.1 20240328 (Red Hat 14.0.1-0) (GCC)

Reply via email to