Hello!
The attached patch avoids a deficiency in reload, where reload gives up
on handling subregs of pseudos (please see the PR [1] for Ulrich's
explanation). The patch simply avoids generating V4SF moves with V4SF
subregs of V2DF values unless really necessary (i.e. when moving SSE2
modes without SSE2 enabled, which shouldn't happen anyway). With the
patched gcc, the expand pass emits (unaligned) moves in their original
mode, and this mode is kept until the asm is generated. The asm
instruction is chosen according to the mode of the insn pattern, and the
mode is now calculated in the pattern's "mode" attribute from the
relevant tuning flags (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL,
TARGET_SSE_TYPELESS_STORES) and optimize_function_for_size_p.
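
To illustrate the intended behavior, here is a minimal, hypothetical
example (not the testcase from the PR) of a misaligned V2DF move that
goes through ix86_expand_vector_move_misalign:

typedef double v2df __attribute__ ((vector_size (16)));
/* On a typedef, the "aligned" attribute may also reduce alignment,
   so *p below is a misaligned 16-byte access.  */
typedef v2df v2df_u __attribute__ ((aligned (8)));

v2df
load_v2df (const v2df_u *p)
{
  return *p;
}

With the patched compiler this should expand to a single sse2_movupd in
V2DF mode, so reload never sees a V4SF subreg of the V2DF pseudo;
whether the final asm is movupd/vmovupd or movups is then decided only
at output time, from the "mode" attribute.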
2012-05-09  Uros Bizjak  <[email protected]>

	PR target/44141
	* config/i386/i386.c (ix86_expand_vector_move_misalign): Do not handle
	128 bit vectors specially for TARGET_AVX.  Emit sse2_movupd and
	sse_movups RTXes for TARGET_AVX, TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
	or when optimizing for size.
	* config/i386/sse.md (*mov<mode>_internal): Remove
	TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling from asm output code.
	Calculate "mode" attribute according to optimize_function_for_size_p
	and TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL flag.
	(*<sse>_movu<ssemodesuffix><avxsizesuffix>): Choose asm template
	depending on the mode of the instruction.  Calculate "mode" attribute
	according to optimize_function_for_size_p, TARGET_SSE_TYPELESS_STORES
	and TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL flags.
	(*<sse2>_movdqu<avxsizesuffix>): Ditto.
The patch was bootstrapped and regression tested on
x86_64-pc-linux-gnu {,-m32}. It also fixes the testcase from the PR.

The patch will be committed to mainline SVN.
[1] http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44141#c16
Uros.
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md (revision 187286)
+++ config/i386/sse.md (working copy)
@@ -449,8 +449,6 @@
&& (misaligned_operand (operands[0], <MODE>mode)
|| misaligned_operand (operands[1], <MODE>mode)))
return "vmovupd\t{%1, %0|%0, %1}";
- else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
- return "%vmovaps\t{%1, %0|%0, %1}";
else
return "%vmovapd\t{%1, %0|%0, %1}";
@@ -460,8 +458,6 @@
&& (misaligned_operand (operands[0], <MODE>mode)
|| misaligned_operand (operands[1], <MODE>mode)))
return "vmovdqu\t{%1, %0|%0, %1}";
- else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
- return "%vmovaps\t{%1, %0|%0, %1}";
else
return "%vmovdqa\t{%1, %0|%0, %1}";
@@ -475,19 +471,21 @@
[(set_attr "type" "sselog1,ssemov,ssemov")
(set_attr "prefix" "maybe_vex")
(set (attr "mode")
- (cond [(match_test "TARGET_AVX")
+ (cond [(and (eq_attr "alternative" "1,2")
+ (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+ (if_then_else
+ (match_test "GET_MODE_SIZE (<MODE>mode) > 16")
+ (const_string "V8SF")
+ (const_string "V4SF"))
+ (match_test "TARGET_AVX")
(const_string "<sseinsnmode>")
- (ior (ior (match_test "optimize_function_for_size_p (cfun)")
- (not (match_test "TARGET_SSE2")))
+ (ior (and (eq_attr "alternative" "1,2")
+ (match_test "optimize_function_for_size_p (cfun)"))
(and (eq_attr "alternative" "2")
(match_test "TARGET_SSE_TYPELESS_STORES")))
(const_string "V4SF")
- (eq (const_string "<MODE>mode") (const_string "V4SFmode"))
- (const_string "V4SF")
- (eq (const_string "<MODE>mode") (const_string "V2DFmode"))
- (const_string "V2DF")
]
- (const_string "TI")))])
+ (const_string "<sseinsnmode>")))])
(define_insn "sse2_movq128"
[(set (match_operand:V2DI 0 "register_operand" "=x")
@@ -597,11 +595,33 @@
[(match_operand:VF 1 "nonimmediate_operand" "xm,x")]
UNSPEC_MOVU))]
"TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
- "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}"
+{
+ switch (get_attr_mode (insn))
+ {
+ case MODE_V8SF:
+ case MODE_V4SF:
+ return "%vmovups\t{%1, %0|%0, %1}";
+ default:
+ return "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}";
+ }
+}
[(set_attr "type" "ssemov")
(set_attr "movu" "1")
(set_attr "prefix" "maybe_vex")
- (set_attr "mode" "<MODE>")])
+ (set (attr "mode")
+ (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+ (if_then_else
+ (match_test "GET_MODE_SIZE (<MODE>mode) > 16")
+ (const_string "V8SF")
+ (const_string "V4SF"))
+ (match_test "TARGET_AVX")
+ (const_string "<MODE>")
+ (ior (match_test "optimize_function_for_size_p (cfun)")
+ (and (eq_attr "alternative" "1")
+ (match_test "TARGET_SSE_TYPELESS_STORES")))
+ (const_string "V4SF")
+ ]
+ (const_string "<MODE>")))])
(define_expand "<sse2>_movdqu<avxsizesuffix>"
[(set (match_operand:VI1 0 "nonimmediate_operand")
@@ -618,7 +638,16 @@
(unspec:VI1 [(match_operand:VI1 1 "nonimmediate_operand" "xm,x")]
UNSPEC_MOVU))]
"TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
- "%vmovdqu\t{%1, %0|%0, %1}"
+{
+ switch (get_attr_mode (insn))
+ {
+ case MODE_V8SF:
+ case MODE_V4SF:
+ return "%vmovups\t{%1, %0|%0, %1}";
+ default:
+ return "%vmovdqu\t{%1, %0|%0, %1}";
+ }
+}
[(set_attr "type" "ssemov")
(set_attr "movu" "1")
(set (attr "prefix_data16")
@@ -627,7 +656,20 @@
(const_string "*")
(const_string "1")))
(set_attr "prefix" "maybe_vex")
- (set_attr "mode" "<sseinsnmode>")])
+ (set (attr "mode")
+ (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+ (if_then_else
+ (match_test "GET_MODE_SIZE (<MODE>mode) > 16")
+ (const_string "V8SF")
+ (const_string "V4SF"))
+ (match_test "TARGET_AVX")
+ (const_string "<sseinsnmode>")
+ (ior (match_test "optimize_function_for_size_p (cfun)")
+ (and (eq_attr "alternative" "1")
+ (match_test "TARGET_SSE_TYPELESS_STORES")))
+ (const_string "V4SF")
+ ]
+ (const_string "<sseinsnmode>")))])
(define_insn "<sse3>_lddqu<avxsizesuffix>"
[(set (match_operand:VI1 0 "register_operand" "=x")
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c (revision 187289)
+++ config/i386/i386.c (working copy)
@@ -15907,60 +15907,19 @@ ix86_expand_vector_move_misalign (enum machine_mod
op0 = operands[0];
op1 = operands[1];
- if (TARGET_AVX)
+ if (TARGET_AVX
+ && GET_MODE_SIZE (mode) == 32)
{
switch (GET_MODE_CLASS (mode))
{
case MODE_VECTOR_INT:
case MODE_INT:
- switch (GET_MODE_SIZE (mode))
- {
- case 16:
- if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
- {
- op0 = gen_lowpart (V4SFmode, op0);
- op1 = gen_lowpart (V4SFmode, op1);
- emit_insn (gen_sse_movups (op0, op1));
- }
- else
- {
- op0 = gen_lowpart (V16QImode, op0);
- op1 = gen_lowpart (V16QImode, op1);
- emit_insn (gen_sse2_movdqu (op0, op1));
- }
- break;
- case 32:
- op0 = gen_lowpart (V32QImode, op0);
- op1 = gen_lowpart (V32QImode, op1);
- ix86_avx256_split_vector_move_misalign (op0, op1);
- break;
- default:
- gcc_unreachable ();
- }
- break;
+ op0 = gen_lowpart (V32QImode, op0);
+ op1 = gen_lowpart (V32QImode, op1);
+ /* FALLTHRU */
+
case MODE_VECTOR_FLOAT:
- switch (mode)
- {
- case V4SFmode:
- emit_insn (gen_sse_movups (op0, op1));
- break;
- case V2DFmode:
- if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
- {
- op0 = gen_lowpart (V4SFmode, op0);
- op1 = gen_lowpart (V4SFmode, op1);
- emit_insn (gen_sse_movups (op0, op1));
- }
- else
- emit_insn (gen_sse2_movupd (op0, op1));
- break;
- case V8SFmode:
- case V4DFmode:
- ix86_avx256_split_vector_move_misalign (op0, op1);
- break;
- default:
- gcc_unreachable ();
- }
+ ix86_avx256_split_vector_move_misalign (op0, op1);
break;
default:
@@ -15972,16 +15931,6 @@ ix86_expand_vector_move_misalign (enum machine_mod
if (MEM_P (op1))
{
- /* If we're optimizing for size, movups is the smallest. */
- if (optimize_insn_for_size_p ()
- || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
- {
- op0 = gen_lowpart (V4SFmode, op0);
- op1 = gen_lowpart (V4SFmode, op1);
- emit_insn (gen_sse_movups (op0, op1));
- return;
- }
-
/* ??? If we have typed data, then it would appear that using
movdqu is the only way to get unaligned data loaded with
integer type. */
@@ -15989,16 +15938,19 @@ ix86_expand_vector_move_misalign (enum machine_mod
{
op0 = gen_lowpart (V16QImode, op0);
op1 = gen_lowpart (V16QImode, op1);
+ /* We will eventually emit movups based on insn attributes. */
emit_insn (gen_sse2_movdqu (op0, op1));
- return;
}
-
- if (TARGET_SSE2 && mode == V2DFmode)
+ else if (TARGET_SSE2 && mode == V2DFmode)
{
rtx zero;
- if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
+ if (TARGET_AVX
+ || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+ || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+ || optimize_function_for_size_p (cfun))
{
+ /* We will eventually emit movups based on insn attributes. */
emit_insn (gen_sse2_movupd (op0, op1));
return;
}
@@ -16030,7 +15982,10 @@ ix86_expand_vector_move_misalign (enum machine_mod
}
else
{
- if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
+ if (TARGET_AVX
+ || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+ || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+ || optimize_function_for_size_p (cfun))
{
op0 = gen_lowpart (V4SFmode, op0);
op1 = gen_lowpart (V4SFmode, op1);
@@ -16045,6 +16000,7 @@ ix86_expand_vector_move_misalign (enum machine_mod
if (mode != V4SFmode)
op0 = gen_lowpart (V4SFmode, op0);
+
m = adjust_address (op1, V2SFmode, 0);
emit_insn (gen_sse_loadlps (op0, op0, m));
m = adjust_address (op1, V2SFmode, 8);
@@ -16053,30 +16009,20 @@ ix86_expand_vector_move_misalign (enum machine_mod
}
else if (MEM_P (op0))
{
- /* If we're optimizing for size, movups is the smallest. */
- if (optimize_insn_for_size_p ()
- || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
- {
- op0 = gen_lowpart (V4SFmode, op0);
- op1 = gen_lowpart (V4SFmode, op1);
- emit_insn (gen_sse_movups (op0, op1));
- return;
- }
-
- /* ??? Similar to above, only less clear
- because of typeless stores. */
- if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
- && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
op0 = gen_lowpart (V16QImode, op0);
op1 = gen_lowpart (V16QImode, op1);
+ /* We will eventually emit movups based on insn attributes. */
emit_insn (gen_sse2_movdqu (op0, op1));
- return;
}
-
- if (TARGET_SSE2 && mode == V2DFmode)
+ else if (TARGET_SSE2 && mode == V2DFmode)
{
- if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
+ if (TARGET_AVX
+ || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+ || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+ || optimize_function_for_size_p (cfun))
+ /* We will eventually emit movups based on insn attributes. */
emit_insn (gen_sse2_movupd (op0, op1));
else
{
@@ -16091,7 +16037,10 @@ ix86_expand_vector_move_misalign (enum machine_mod
if (mode != V4SFmode)
op1 = gen_lowpart (V4SFmode, op1);
- if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
+ if (TARGET_AVX
+ || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+ || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+ || optimize_function_for_size_p (cfun))
{
op0 = gen_lowpart (V4SFmode, op0);
emit_insn (gen_sse_movups (op0, op1));