Hello!

The attached patch (in fact a variant of HJ's patch) implements
clearing of the SSE target register before cvt* instructions (the same
approach ICC takes).

While there, I also removed the now-unnecessary checks for SUBREGs in
the post-reload splitters.

2013-07-29  Uros Bizjak  <ubiz...@gmail.com>

    * config/i386/i386.md (float post-reload splitters): Do not check
    for subregs of SSE registers.

2013-07-29  Uros Bizjak  <ubiz...@gmail.com>
        H.J. Lu  <hongjiu...@intel.com>

    PR target/57954
    PR target/57988
    * config/i386/i386.md (post-reload splitter
    to avoid partial SSE reg dependency stalls): New.

The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu
{,-m32} and committed to mainline.

Uros.
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 201298)
+++ config/i386/i386.md (working copy)
@@ -4596,10 +4596,7 @@
    (clobber (match_operand:SWI48 2 "memory_operand"))]
   "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_MIX_SSE_I387
    && TARGET_INTER_UNIT_CONVERSIONS
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-          && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(set (match_dup 0) (float:MODEF (match_dup 1)))])
 
 (define_split
@@ -4608,10 +4605,7 @@
    (clobber (match_operand:SWI48 2 "memory_operand"))]
   "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_MIX_SSE_I387
    && !(TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun))
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-          && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (float:MODEF (match_dup 2)))])
 
@@ -4697,10 +4691,7 @@
    (clobber (match_operand:SI 2 "memory_operand"))]
   "TARGET_SSE2 && TARGET_SSE_MATH
    && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-          && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(const_int 0)]
 {
   rtx op1 = operands[1];
@@ -4740,10 +4731,7 @@
    (clobber (match_operand:SI 2 "memory_operand"))]
   "TARGET_SSE2 && TARGET_SSE_MATH
    && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-          && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(const_int 0)]
 {
   operands[3] = simplify_gen_subreg (<ssevecmode>mode, operands[0],
@@ -4764,10 +4752,7 @@
        (float:MODEF (match_operand:SI 1 "register_operand")))]
   "TARGET_SSE2 && TARGET_SSE_MATH
    && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-          && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(const_int 0)]
 {
   rtx op1 = operands[1];
@@ -4810,10 +4795,7 @@
        (float:MODEF (match_operand:SI 1 "memory_operand")))]
   "TARGET_SSE2 && TARGET_SSE_MATH
    && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-          && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(const_int 0)]
 {
   operands[3] = simplify_gen_subreg (<ssevecmode>mode, operands[0],
@@ -4872,10 +4854,7 @@
    (clobber (match_operand:SWI48 2 "memory_operand"))]
   "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH
    && (TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun))
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-          && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(set (match_dup 0) (float:MODEF (match_dup 1)))])
 
 (define_insn "*float<SWI48:mode><MODEF:mode>2_sse_nointerunit"
@@ -4905,10 +4884,7 @@
    (clobber (match_operand:SWI48 2 "memory_operand"))]
   "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH
    && !(TARGET_INTER_UNIT_CONVERSIONS || optimize_function_for_size_p (cfun))
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-          && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (float:MODEF (match_dup 2)))])
 
@@ -4917,10 +4893,7 @@
        (float:MODEF (match_operand:SWI48 1 "memory_operand")))
    (clobber (match_operand:SWI48 2 "memory_operand"))]
   "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH
-   && reload_completed
-   && (SSE_REG_P (operands[0])
-       || (GET_CODE (operands[0]) == SUBREG
-          && SSE_REG_P (SUBREG_REG (operands[0]))))"
+   && reload_completed && SSE_REG_P (operands[0])"
   [(set (match_dup 0) (float:MODEF (match_dup 1)))])
 
 (define_insn "*float<SWI48x:mode><X87MODEF:mode>2_i387_with_temp"
@@ -4968,6 +4941,46 @@
    && reload_completed"
   [(set (match_dup 0) (float:X87MODEF (match_dup 1)))])
 
+;; Avoid partial SSE register dependency stalls
+
+(define_split
+  [(set (match_operand:MODEF 0 "register_operand")
+       (float:MODEF (match_operand:SI 1 "nonimmediate_operand")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+   && optimize_function_for_speed_p (cfun)
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+       (vec_merge:<ssevecmode>
+         (vec_duplicate:<ssevecmode>
+           (float:MODEF (match_dup 1)))
+         (match_dup 0)
+         (const_int 1)))]
+{
+  operands[0] = simplify_gen_subreg (<ssevecmode>mode, operands[0],
+                                    <MODE>mode, 0);
+  emit_move_insn (operands[0], CONST0_RTX (<ssevecmode>mode));
+})
+
+(define_split
+  [(set (match_operand:MODEF 0 "register_operand")
+       (float:MODEF (match_operand:DI 1 "nonimmediate_operand")))]
+  "TARGET_64BIT && TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+   && optimize_function_for_speed_p (cfun)
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+       (vec_merge:<ssevecmode>
+         (vec_duplicate:<ssevecmode>
+           (float:MODEF (match_dup 1)))
+         (match_dup 0)
+         (const_int 1)))]
+{
+  operands[0] = simplify_gen_subreg (<ssevecmode>mode, operands[0],
+                                    <MODE>mode, 0);
+  emit_move_insn (operands[0], CONST0_RTX (<ssevecmode>mode));
+})
+
 ;; Avoid store forwarding (partial memory) stall penalty
 ;; by passing DImode value through XMM registers.  */
 

Reply via email to