This testcase:

--cut here--
typedef long long __m128i __attribute__ ((__vector_size__ (16),
__may_alias__));
typedef int __v4si __attribute__ ((__vector_size__ (16)));

__m128i
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}
--cut here--

compiles using -O2 -m64 -msse2 to:

        movq    %rsi, -8(%rsp)  # 28    *movdi_1_rex64/4        [length = 5]
        movq    -8(%rsp), %xmm1 # 29    *movdi_1_rex64/17       [length = 6]
        movq    %rdi, -8(%rsp)  # 30    *movdi_1_rex64/4        [length = 5]
        movq    -8(%rsp), %xmm0 # 31    *movdi_1_rex64/17       [length = 6]
        movq    %rcx, -8(%rsp)  # 32    *movdi_1_rex64/4        [length = 5]
        punpckldq       %xmm0, %xmm1    # 9     *vec_concatv2si_sse2/1  [length = 3]
        movq    -8(%rsp), %xmm0 # 33    *movdi_1_rex64/17       [length = 6]
        movq    %rdx, -8(%rsp)  # 34    *movdi_1_rex64/4        [length = 5]
        movq    -8(%rsp), %xmm2 # 35    *movdi_1_rex64/17       [length = 6]
        punpckldq       %xmm2, %xmm0    # 10    *vec_concatv2si_sse2/1
        punpcklqdq      %xmm1, %xmm0    # 11    *vec_concatv4si_1/1
        ret     # 38    return_internal [length = 1]

When -march=core2 is added to the compile flags, so that
TARGET_INTER_UNIT_MOVES is enabled, the following code is produced:

        movd    %edi, %xmm0     # 29    *movsi_1/11     [length = 3]
        movd    %esi, %xmm1     # 28    *movsi_1/11     [length = 3]
        movd    %edx, %xmm2     # 31    *movsi_1/11     [length = 3]
        punpckldq       %xmm0, %xmm1    # 9     *vec_concatv2si_sse2/1
        movd    %ecx, %xmm0     # 30    *movsi_1/11     [length = 3]
        punpckldq       %xmm2, %xmm0    # 10    *vec_concatv2si_sse2/1
        punpcklqdq      %xmm1, %xmm0    # 11    *vec_concatv4si_1/1
        ret     # 34    return_internal [length = 1]

Also, when compiled with -m32 -O2 -msse2 -mregparm=3, the result is:

        subl    $4, %esp        # 37    pro_epilogue_adjust_stack_1/1
        movl    %edx, (%esp)    # 28    *movsi_1/2      [length = 3]
        movd    8(%esp), %xmm3  # 33    *movsi_1/12     [length = 5]
        movd    (%esp), %xmm0   # 29    *movsi_1/12     [length = 4]
        movl    %eax, (%esp)    # 31    *movsi_1/2      [length = 3]
        movd    (%esp), %xmm2   # 32    *movsi_1/12     [length = 4]
        movl    %ecx, (%esp)    # 35    *movsi_1/2      [length = 3]
        punpckldq       %xmm2, %xmm0    # 9     *vec_concatv2si_sse2/1
        movd    (%esp), %xmm2   # 36    *movsi_1/12     [length = 4]
        movq    %xmm0, %xmm1    # 30    *movv2si_internal/7     [length = 4]
        punpckldq       %xmm2, %xmm3    # 10    *vec_concatv2si_sse2/1
        addl    $4, %esp        # 40    pro_epilogue_adjust_stack_1/1
        movq    %xmm3, %xmm0    # 34    *movv2si_internal/7     [length = 4]
        punpcklqdq      %xmm1, %xmm0    # 11    *vec_concatv4si_1/1
        ret     # 41    return_internal [length = 1]

The problem is that gcc generates 64bit reg->mem->xmmreg moves (see the first
asm code dump) for 32bit values when direct reg->xmmreg moves are disabled.
This happens only for 64bit targets; the code for 32bit targets is as expected.

For the first asm code dump, we have the following RTX for (insn 9) in _lreg:

(insn:HI 2 7 3 2 uuu.c:7 (set (reg/v:SI 59 [ __q3 ])
        (reg:SI 5 di [ __q3 ])) 47 {*movsi_1}

(insn:HI 3 2 4 2 uuu.c:7 (set (reg/v:SI 60 [ __q2 ])
        (reg:SI 4 si [ __q2 ])) 47 {*movsi_1}

(insn:HI 9 6 10 2 uuu.c:7 (set (reg:V2SI 65)
        (vec_concat:V2SI (reg/v:SI 60 [ __q2 ])
            (reg/v:SI 59 [ __q3 ]))) 1338 {*vec_concatv2si_sse2}

Reload says:

Reloads for insn # 9
Reload 0: reload_in (SI) = (reg/v:SI 4 si [orig:60 __q2 ] [60])
        reload_out (V2SI) = (reg:V2SI 22 xmm1 [65])
        SSE_REGS, RELOAD_OTHER (opnum = 0)
        reload_in_reg: (reg/v:SI 4 si [orig:60 __q2 ] [60])
        reload_out_reg: (reg:V2SI 22 xmm1 [65])
        reload_reg_rtx: (reg:V2SI 22 xmm1 [65])
Reload 1: reload_in (SI) = (reg/v:SI 5 di [orig:59 __q3 ] [59])
        SSE_REGS, RELOAD_FOR_INPUT (opnum = 2)
        reload_in_reg: (reg/v:SI 5 di [orig:59 __q3 ] [59])
        reload_reg_rtx: (reg:SI 21 xmm0)

And this results in:

(insn 28 6 29 2 uuu.c:7 (set (mem/c:DI (plus:DI (reg/f:DI 7 sp)
                (const_int -8 [0xfffffffffffffff8])) [0 S8 A8])
        (reg:DI 4 si)) 89 {*movdi_1_rex64} (nil))

(insn 29 28 30 2 uuu.c:7 (set (reg:DI 22 xmm1)
        (mem/c:DI (plus:DI (reg/f:DI 7 sp)
                (const_int -8 [0xfffffffffffffff8])) [0 S8 A8])) 89
{*movdi_1_rex64} (nil))

(insn 30 29 31 2 uuu.c:7 (set (mem/c:DI (plus:DI (reg/f:DI 7 sp)
                (const_int -8 [0xfffffffffffffff8])) [0 S8 A8])
        (reg:DI 5 di)) 89 {*movdi_1_rex64} (nil))

(insn 31 30 9 2 uuu.c:7 (set (reg:DI 21 xmm0)
        (mem/c:DI (plus:DI (reg/f:DI 7 sp)
                (const_int -8 [0xfffffffffffffff8])) [0 S8 A8])) 89
{*movdi_1_rex64} (nil))

(insn:HI 9 31 32 2 uuu.c:7 (set (reg:V2SI 22 xmm1 [65])
        (vec_concat:V2SI (reg:SI 22 xmm1)
            (reg:SI 21 xmm0))) 1338 {*vec_concatv2si_sse2} (nil))

Do we really need 64bit moves here?


-- 
           Summary: Reload chooses too wide mode for reg->mem->reg reload
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: ubizjak at gmail dot com
 GCC build triplet: x86_64-pc-linux-gnu
  GCC host triplet: x86_64-pc-linux-gnu
GCC target triplet: x86_64-pc-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36246

Reply via email to