http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55258
Bug #: 55258
Summary: SSE register isn't used for 16byte copy
Classification: Unclassified
Product: gcc
Version: 4.8.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
AssignedTo: [email protected]
ReportedBy: [email protected]
CC: [email protected]
[hjl@gnu-tools-1 pr55247]$ cat x.i
/* Local 32-bit unsigned aliases; the testcase is preprocessed (.i), so it
   declares these itself instead of including a header.  */
typedef unsigned int uint32_t;
typedef uint32_t Elf32_Word;
typedef uint32_t Elf32_Addr;
/* Symbol record copied by value in _dl_profile_fixup.  Three 32-bit words
   plus one byte; the generated code scales the symbol index by 16
   (`sall $4`), so the padded size on this target is 16 bytes -- this is
   the "16byte copy" the report is about.  */
typedef struct {
Elf32_Word st_name;
Elf32_Addr st_value;
Elf32_Word st_size;
unsigned char st_other;
} Elf32_Sym;
/* Relocation record reduced to the single field the testcase reads.
   r_info >> 8 is used as the symbol table index.  */
typedef struct {
Elf32_Word r_info;
}
Elf32_Rela;
/* Dynamic-section entry reduced to its address payload: d_un.d_ptr holds
   a 32-bit address that the testcase casts to a pointer.  */
typedef struct {
union {
Elf32_Addr d_ptr;
}
d_un;
} Elf32_Dyn;
/* Holds an array of 34 Elf32_Dyn pointers; the testcase reads slots 6
   and 23.  */
struct link_map {
Elf32_Dyn *l_info[34];
};
/* Defined elsewhere; taking the address of the local copy for this call
   forces the copy to live in memory (on the stack).  */
extern void symbind32 (Elf32_Sym *);
/* Copy one symbol table entry onto the stack and pass its address to
   symbind32.

   L supplies the tables: l_info[6]->d_un.d_ptr is used as the symbol
   table base and l_info[23]->d_un.d_ptr as the relocation table base.
   (6 and 23 match DT_SYMTAB and DT_JMPREL in the ELF specification --
   presumably what the unreduced code used; not visible here.)
   RELOC_ARG is the index of the relocation record to look at.  */
void
_dl_profile_fixup (struct link_map *l, Elf32_Word reloc_arg)
{
const Elf32_Sym *const symtab = (const void *) l->l_info[6]->d_un.d_ptr;
/* Address arithmetic is done on the 32-bit integer d_ptr before the cast,
   so reloc_arg is scaled explicitly by sizeof (Elf32_Rela).  */
const Elf32_Rela *const reloc = (const void *) (l->l_info[23]->d_un.d_ptr +
reloc_arg * sizeof (Elf32_Rela));
/* Struct assignment: this is the 16-byte copy whose code generation the
   report compares between the two address modes.  */
Elf32_Sym sym = symtab[(reloc->r_info) >> 8];
symbind32 (&sym);
}
[hjl@gnu-tools-1 pr55247]$ /export/build/gnu/gcc/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/gcc/build-x86_64-linux/gcc/ -O -Wall -mx32
-maddress-mode=short -S x.i -o short.asm
[hjl@gnu-tools-1 pr55247]$ cat short.asm
# Output for -mx32 -maddress-mode=short: addresses use 32-bit registers.
# The 16-byte struct copy is done with two 64-bit integer register moves
# in each direction; no SSE register is used.
.file "x.i"
.text
.globl _dl_profile_fixup
.type _dl_profile_fixup, @function
_dl_profile_fixup:
.LFB0:
.cfi_startproc
# Reserve 24 bytes of stack; sym lives at 0(%esp).
subl $24, %esp
.cfi_def_cfa_offset 32
# %edi = l, %esi = reloc_arg.
# edx = l->l_info[6] (offset 6*4), eax = l->l_info[23] (offset 23*4).
movl 24(%edi), %edx
movl 92(%edi), %eax
# eax = l_info[23]->d_un.d_ptr, then eax = reloc->r_info
# (reloc_arg scaled by 4 = sizeof (Elf32_Rela)).
movl (%eax), %eax
movl (%eax,%esi,4), %eax
# Symbol index = r_info >> 8, scaled by 16 = sizeof (Elf32_Sym).
shrl $8, %eax
sall $4, %eax
# Add the symtab base l_info[6]->d_un.d_ptr: eax = &symtab[index].
addl (%edx), %eax
# The 16-byte copy: two 8-byte loads followed by two 8-byte stores.
movq 8(%eax), %rdx
movq (%eax), %rax
movq %rax, (%esp)
movq %rdx, 8(%esp)
# Pass &sym as the argument.
movl %esp, %edi
call symbind32
addl $24, %esp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size _dl_profile_fixup, .-_dl_profile_fixup
.ident "GCC: (GNU) 4.8.0 20121110 (experimental)"
.section .note.GNU-stack,"",@progbits
[hjl@gnu-tools-1 pr55247]$ /export/build/gnu/gcc/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/gcc/build-x86_64-linux/gcc/ -O -Wall -mx32
-maddress-mode=long -S x.i -o long.asm
[hjl@gnu-tools-1 pr55247]$ cat long.asm
# Output for -mx32 -maddress-mode=long: most addresses use 64-bit
# registers. The struct is loaded with one SSE move, but it is then copied
# a second time through integer registers from a stack temporary into sym.
.file "x.i"
.text
.globl _dl_profile_fixup
.type _dl_profile_fixup, @function
_dl_profile_fixup:
.LFB0:
.cfi_startproc
# Reserve 40 bytes of stack: temporary at 0(%rsp), sym at 16(%rsp).
subq $40, %rsp
.cfi_def_cfa_offset 48
# %rdi = l, %esi = reloc_arg.
# edx = l->l_info[6] (offset 6*4), eax = l->l_info[23] (offset 23*4).
movl 24(%rdi), %edx
movl 92(%rdi), %eax
# eax = l_info[23]->d_un.d_ptr, then eax = reloc->r_info
# (reloc_arg scaled by 4 = sizeof (Elf32_Rela)).
movl (%rax), %eax
movl (%eax,%esi,4), %eax
# Symbol index = r_info >> 8, scaled by 16 = sizeof (Elf32_Sym).
shrl $8, %eax
sall $4, %eax
# Add the symtab base l_info[6]->d_un.d_ptr: eax = &symtab[index].
addl (%rdx), %eax
# Unaligned 16-byte SSE load of the symbol entry, aligned store to the
# temporary at 0(%rsp).
movdqu (%eax), %xmm0
movdqa %xmm0, (%rsp)
# Second copy, temporary -> sym at 16(%rsp), via two 8-byte integer moves.
movq (%rsp), %rax
movq 8(%rsp), %rdx
movq %rax, 16(%rsp)
movq %rdx, 24(%rsp)
# Pass &sym as the argument.
leaq 16(%rsp), %rdi
call symbind32
addq $40, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size _dl_profile_fixup, .-_dl_profile_fixup
.ident "GCC: (GNU) 4.8.0 20121110 (experimental)"
.section .note.GNU-stack,"",@progbits
[hjl@gnu-tools-1 pr55247]$
When TARGET_SSE_UNALIGNED_LOAD_OPTIMAL/TARGET_SSE_UNALIGNED_STORE_OPTIMAL is
enabled, we should always use an SSE register for the 16-byte copy, i.e. generate
movdqu (%eax), %xmm0
movdqa %xmm0, (%rsp)