https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118086

            Bug ID: 118086
           Summary: Missed Optimization: Redundant Copying of Large Struct
                    Parameter onto Stack
           Product: gcc
           Version: 14.2.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jonathan.gruber.jg at gmail dot com
  Target Milestone: ---

Created attachment 59891
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=59891&action=edit
Minimal test case.

When a function receives a large struct by value as a parameter and passes
that same struct, unmodified, as an argument to another function, GCC
redundantly copies the struct parameter to the stack. I tested this with the
native x86_64 gcc, the aarch64 cross-compiler (aarch64-linux-gnu-gcc), and the
riscv64 cross-compiler (riscv64-linux-gnu-gcc), each at -O2, -O3, -Os, and -Oz.
I have not checked other CPU architectures.

A test case is in the attached file test.c, which I have reproduced below for
your convenience:

struct Small {
        void *x;
};

struct Large {
        void *x, *y, *z, *w;
};

/*
 * Callees used by the test functions. They are only declared here (no body
 * is visible in this file), and each takes its struct argument by value.
 */
extern int extern_func_small(struct Small);

extern int extern_func_large(struct Large);

/*
 * Forwards the by-value parameter x unchanged to extern_func_small and
 * returns that call's result as-is, so the call is in tail position.
 */
int tail_call_small(struct Small x) {
        return extern_func_small(x);
}

/*
 * Forwards the by-value parameter x unchanged to extern_func_small, then
 * returns the bitwise complement of the result. The complement is work done
 * after the call, so the call is not in tail position.
 */
int non_tail_call_small(struct Small x) {
        return ~extern_func_small(x);
}

/*
 * Forwards the by-value parameter x unchanged to extern_func_large and
 * returns that call's result as-is, so the call is in tail position.
 */
int tail_call_large(struct Large x) {
        return extern_func_large(x);
}

/*
 * Forwards the by-value parameter x unchanged to extern_func_large, then
 * returns the bitwise complement of the result. The complement is work done
 * after the call, so the call is not in tail position.
 */
int non_tail_call_large(struct Large x) {
        return ~extern_func_large(x);
}


x86_64 assembly, -O3:

# Annotation: no stack frame is set up and no register is touched; the whole
# function is a single jump to the callee through the PLT (a tail call).
tail_call_small:
        .cfi_startproc
        jmp     extern_func_small@PLT
        .cfi_endproc

# Annotation: ordinary call followed by a bitwise NOT of the 32-bit result.
# The argument is not moved or copied anywhere before the call.
non_tail_call_small:
        .cfi_startproc
        # 8 bytes of padding: together with the 8-byte return address this
        # keeps %rsp 16-byte aligned at the call (CFA offset becomes 16).
        subq    $8, %rsp
        .cfi_def_cfa_offset 16
        call    extern_func_small@PLT
        addq    $8, %rsp
        .cfi_def_cfa_offset 8
        # eax = ~eax, i.e. the '~' applied to the callee's int result.
        notl    %eax
        ret
        .cfi_endproc

# Annotation: no stack adjustment and no copy of the struct; the whole
# function is a single jump to the callee through the PLT (a tail call).
tail_call_large:
        .cfi_startproc
        jmp     extern_func_large@PLT
        .cfi_endproc

# Annotation: copies 32 bytes from the incoming stack argument area to the
# bottom of a freshly allocated frame, calls the callee, then complements
# the 32-bit result.
non_tail_call_large:
        .cfi_startproc
        # Allocate 40 bytes; only the lowest 32 (0..31 from %rsp) are written
        # below. With the 8-byte return address the CFA offset becomes 48.
        subq    $40, %rsp
        .cfi_def_cfa_offset 48
        # 48(%rsp) is the CFA, i.e. the first byte above this function's
        # return address. Copy bytes 0..15 from there to (%rsp).
        movdqu  48(%rsp), %xmm0
        movups  %xmm0, (%rsp)
        # Copy bytes 16..31 the same way.
        movdqu  64(%rsp), %xmm0
        movups  %xmm0, 16(%rsp)
        # At the call the 32-byte copy sits at the top of the stack.
        call    extern_func_large@PLT
        addq    $40, %rsp
        .cfi_def_cfa_offset 8
        # eax = ~eax, i.e. the '~' applied to the callee's int result.
        notl    %eax
        ret
        .cfi_endproc


aarch64 assembly, -O3:

// Annotation: no frame is created and no register is touched; the whole
// function is a plain branch (not branch-with-link) to the callee.
tail_call_small:
        .cfi_startproc
        b       extern_func_small
        .cfi_endproc

// Annotation: ordinary call followed by a bitwise NOT of the 32-bit result.
// The argument is not moved or copied anywhere before the call.
non_tail_call_small:
        .cfi_startproc
        // Push a 16-byte frame record holding x29 and x30; bl overwrites x30.
        stp     x29, x30, [sp, -16]!
        .cfi_def_cfa_offset 16
        .cfi_offset 29, -16
        .cfi_offset 30, -8
        mov     x29, sp
        bl      extern_func_small
        // Pop the frame record, restoring x29 and x30.
        ldp     x29, x30, [sp], 16
        .cfi_restore 30
        .cfi_restore 29
        .cfi_def_cfa_offset 0
        // w0 = ~w0, i.e. the '~' applied to the callee's int result.
        mvn     w0, w0
        ret
        .cfi_endproc

// Annotation: x0 is used as an address on entry. The 32 bytes it points to
// are copied into this function's own frame, and the address of that copy is
// passed to the callee in x0. The callee is reached with bl (a real call),
// not with a plain branch.
tail_call_large:
        .cfi_startproc
        // Keep the incoming address in x2; x0 is overwritten below.
        mov     x2, x0
        // 48-byte frame: x29/x30 at sp+0..15, the struct copy at sp+16..47.
        stp     x29, x30, [sp, -48]!
        .cfi_def_cfa_offset 48
        .cfi_offset 29, -48
        .cfi_offset 30, -40
        mov     x29, sp
        // x1 = address of the local copy.
        add     x1, sp, 16
        // Load all 32 bytes from the incoming address.
        ldp     q30, q31, [x2]
        // Outgoing argument: address of the local copy.
        mov     x0, x1
        // Store bytes 0..15 at sp+16 and bytes 16..31 at x1+16 (= sp+32).
        str     q30, [sp, 16]
        str     q31, [x1, 16]
        bl      extern_func_large
        // Pop the frame, restoring x29 and x30; w0 is returned unchanged.
        ldp     x29, x30, [sp], 48
        .cfi_restore 30
        .cfi_restore 29
        .cfi_def_cfa_offset 0
        ret
        .cfi_endproc

// Annotation: x0 is used as an address on entry. The 32 bytes it points to
// are copied into this function's own frame, the address of that copy is
// passed to the callee in x0, and the 32-bit result is then complemented.
non_tail_call_large:
        .cfi_startproc
        // Keep the incoming address in x2; x0 is overwritten below.
        mov     x2, x0
        // 48-byte frame: x29/x30 at sp+0..15, the struct copy at sp+16..47.
        stp     x29, x30, [sp, -48]!
        .cfi_def_cfa_offset 48
        .cfi_offset 29, -48
        .cfi_offset 30, -40
        mov     x29, sp
        // x1 = address of the local copy.
        add     x1, sp, 16
        // Load all 32 bytes from the incoming address.
        ldp     q30, q31, [x2]
        // Outgoing argument: address of the local copy.
        mov     x0, x1
        // Store bytes 0..15 at sp+16 and bytes 16..31 at x1+16 (= sp+32).
        str     q30, [sp, 16]
        str     q31, [x1, 16]
        bl      extern_func_large
        // w0 = ~w0, i.e. the '~' applied to the callee's int result.
        mvn     w0, w0
        // Pop the frame, restoring x29 and x30.
        ldp     x29, x30, [sp], 48
        .cfi_restore 30
        .cfi_restore 29
        .cfi_def_cfa_offset 0
        ret
        .cfi_endproc


riscv64 assembly, -O3:

# Annotation: no frame is created and no register is touched; the whole
# function is a single 'tail' (tail-call jump) to the callee through the PLT.
tail_call_small:
        .cfi_startproc
        tail    extern_func_small@plt
        .cfi_endproc

# Annotation: ordinary call followed by a bitwise NOT of the result.
# The argument is not moved or copied anywhere before the call.
non_tail_call_small:
        .cfi_startproc
        # 16-byte frame; ra is saved at sp+8 because 'call' overwrites it.
        addi    sp,sp,-16
        .cfi_def_cfa_offset 16
        sd      ra,8(sp)
        .cfi_offset 1, -8
        call    extern_func_small@plt
        ld      ra,8(sp)
        .cfi_restore 1
        # a0 = ~a0, then sign-extend the low 32 bits (the int result) to 64.
        not     a0,a0
        sext.w  a0,a0
        addi    sp,sp,16
        .cfi_def_cfa_offset 0
        jr      ra
        .cfi_endproc

# Annotation: a0 is used as an address on entry. The four doublewords it
# points to are copied into this function's own frame, and the address of that
# copy is passed to the callee in a0. The callee is reached with 'call' (a
# real call), not with 'tail'.
tail_call_large:
        .cfi_startproc
        # Load the 32 bytes at a0 into a2..a5 before a0 is overwritten.
        ld      a2,0(a0)
        ld      a3,8(a0)
        ld      a4,16(a0)
        ld      a5,24(a0)
        # 48-byte frame: struct copy at sp+0..31, saved ra at sp+40.
        addi    sp,sp,-48
        .cfi_def_cfa_offset 48
        # Outgoing argument: address of the local copy.
        mv      a0,sp
        sd      ra,40(sp)
        .cfi_offset 1, -8
        # Write the copy.
        sd      a2,0(sp)
        sd      a3,8(sp)
        sd      a4,16(sp)
        sd      a5,24(sp)
        call    extern_func_large@plt
        # Restore ra and release the frame; a0 is returned unchanged.
        ld      ra,40(sp)
        .cfi_restore 1
        addi    sp,sp,48
        .cfi_def_cfa_offset 0
        jr      ra
        .cfi_endproc

# Annotation: a0 is used as an address on entry. The four doublewords it
# points to are copied into this function's own frame, the address of that
# copy is passed to the callee in a0, and the result is then complemented.
non_tail_call_large:
        .cfi_startproc
        # Load the 32 bytes at a0 into a2..a5 before a0 is overwritten.
        ld      a2,0(a0)
        ld      a3,8(a0)
        ld      a4,16(a0)
        ld      a5,24(a0)
        # 48-byte frame: struct copy at sp+0..31, saved ra at sp+40.
        addi    sp,sp,-48
        .cfi_def_cfa_offset 48
        # Outgoing argument: address of the local copy.
        mv      a0,sp
        sd      ra,40(sp)
        .cfi_offset 1, -8
        # Write the copy.
        sd      a2,0(sp)
        sd      a3,8(sp)
        sd      a4,16(sp)
        sd      a5,24(sp)
        call    extern_func_large@plt
        ld      ra,40(sp)
        .cfi_restore 1
        # a0 = ~a0, then sign-extend the low 32 bits (the int result) to 64.
        not     a0,a0
        sext.w  a0,a0
        addi    sp,sp,48
        .cfi_def_cfa_offset 0
        jr      ra
        .cfi_endproc


For a tail call with the "small" (pointer-sized) struct, each of the
architectures seems to correctly reduce the tail call to an unconditional
branch instruction. I have only a passing understanding of the calling
conventions for each of the architectures, so I do not know whether the
non-tail call with the small struct is optimally compiled.

For a tail call with the "large" struct, only x86_64 correctly reduces the tail
call to an unconditional branch instruction. The other two architectures
redundantly copy the struct parameter from where it was passed onto the stack
again and pass the copy of the struct, rather than the original, as the
argument to extern_func_large. For the non-tail call with the "large" struct,
all three architectures likewise mistakenly copy the struct parameter from
where it was passed onto the stack again and pass the copy of the struct,
rather than the original, to extern_func_large. (I am aware that a non-tail
call cannot be reduced to an unconditional branch instruction, but copying the
struct to the stack and passing the copy to extern_func_large instead is still
unnecessary.)

Again, I have only a passing understanding of the calling conventions for each
of the architectures in question, so I do not know whether or not, disregarding
the redundant struct copying, the amount that they subtract from the stack
pointer is optimal.


I'm not sure if I filed this bug under the correct component
(rtl-optimization), so feel free to reassign it to the correct component after
the fact.

Host system type: Arch Linux, x86_64

gcc information:
Version: 14.2.1 20240910 (GCC)
Configured with: /build/gcc/src/gcc/configure
--enable-languages=ada,c,c++,d,fortran,go,lto,m2,objc,obj-c++,rust
--enable-bootstrap --prefix=/usr --libdir=/usr/lib --libexecdir=/usr/lib
--mandir=/usr/share/man --infodir=/usr/share/info
--with-bugurl=https://gitlab.archlinux.org/archlinux/packaging/packages/gcc/-/issues
--with-build-config=bootstrap-lto --with-linker-hash-style=gnu
--with-system-zlib --enable-__cxa_atexit --enable-cet=auto
--enable-checking=release --enable-clocale=gnu --enable-default-pie
--enable-default-ssp --enable-gnu-indirect-function --enable-gnu-unique-object
--enable-libstdcxx-backtrace --enable-link-serialization=1
--enable-linker-build-id --enable-lto --enable-multilib --enable-plugin
--enable-shared --enable-threads=posix --disable-libssp --disable-libstdcxx-pch
--disable-werror

aarch64-linux-gnu-gcc information:
Version: 14.2.0
Configured with: /build/aarch64-linux-gnu-gcc/src/gcc-14.2.0/configure
--prefix=/usr --program-prefix=aarch64-linux-gnu-
--with-local-prefix=/usr/aarch64-linux-gnu
--with-sysroot=/usr/aarch64-linux-gnu
--with-build-sysroot=/usr/aarch64-linux-gnu
--with-native-system-header-dir=/include --libdir=/usr/lib
--libexecdir=/usr/lib --target=aarch64-linux-gnu --host=x86_64-pc-linux-gnu
--build=x86_64-pc-linux-gnu --disable-nls --enable-default-pie
--enable-languages=c,c++,fortran --enable-shared --enable-threads=posix
--with-system-zlib --with-isl --enable-__cxa_atexit
--disable-libunwind-exceptions --enable-clocale=gnu --disable-libstdcxx-pch
--disable-libssp --enable-gnu-unique-object --enable-linker-build-id
--enable-lto --enable-plugin --enable-install-libiberty
--with-linker-hash-style=gnu --enable-gnu-indirect-function --disable-multilib
--disable-werror --enable-checking=release

riscv64-linux-gnu-gcc information:
Version: 14.2.0
Configured with: /build/riscv64-linux-gnu-gcc/src/gcc-14.2.0/configure
--prefix=/usr --program-prefix=riscv64-linux-gnu-
--with-local-prefix=/usr/riscv64-linux-gnu
--with-sysroot=/usr/riscv64-linux-gnu
--with-build-sysroot=/usr/riscv64-linux-gnu --libdir=/usr/lib
--libexecdir=/usr/lib --target=riscv64-linux-gnu --host=x86_64-pc-linux-gnu
--build=x86_64-pc-linux-gnu --with-system-zlib --with-isl
--with-linker-hash-style=gnu --disable-nls --disable-libunwind-exceptions
--disable-libstdcxx-pch --disable-libssp --disable-multilib --disable-werror
--enable-languages=c,c++ --enable-shared --enable-threads=posix
--enable-__cxa_atexit --enable-clocale=gnu --enable-gnu-unique-object
--enable-linker-build-id --enable-lto --enable-plugin
--enable-install-libiberty --enable-gnu-indirect-function --enable-default-pie
--enable-checking=release
  • [Bug rtl-optimization/118... jonathan.gruber.jg at gmail dot com via Gcc-bugs

Reply via email to