https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65421

            Bug ID: 65421
           Summary: inefficient code returning float aggregates on
                    powerpc64le
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: msebor at gcc dot gnu.org

When returning a homogeneous floating-point aggregate, GCC first loads the
aggregate into GPRs, then stores the GPRs to the stack, and finally reloads
the aggregate from the stack into the floating-point registers used to return
it to the caller.  Clang emits much more efficient code, loading the aggregate
directly into the floating-point registers (see below).

$ cat x.c && ./gcc/xgcc -B./gcc -O3 -o/dev/tty -S x.c
typedef struct { double a[4]; } A;

A foo (const A *a) { return *a; }

    .file    "x.c"
    .machine power8
    .abiversion 2
    .section    ".toc","aw"
    .section    ".text"
    .align 2
    .p2align 4,,15
    .globl foo
    .type    foo, @function
foo:
    ld 7,0(3)
    ld 8,8(3)
    ld 10,16(3)
    ld 9,24(3)
    std 7,-64(1)
    std 8,-56(1)
    std 10,-48(1)
    std 9,-40(1)
    lfd 2,-56(1)
    lfd 1,-64(1)
    ori 2,2,0
    lfd 3,-48(1)
    lfd 4,-40(1)
    blr
    .long 0
    .byte 0,0,0,0,0,0,0,0
    .size    foo,.-foo
    .ident    "GCC: (GNU) 5.0.0 20150310 (experimental)"
    .section    .note.GNU-stack,"",@progbits


Clang emits:
    .text
    .abiversion 2
    .file    "-"
    .globl    foo
    .align    2
    .type    foo,@function
foo:
    lfd 1, 0(3)
    lfd 2, 8(3)
    lfd 3, 16(3)
    lfd 4, 24(3)
    blr
    .long    0
    .quad    0
.Ltmp0:
    .size    foo, .Ltmp0-foo


    .ident    "clang version 3.6.0 (trunk 215935)"

Reply via email to