Hi!

As mentioned in an earlier posting, with -msse4.1 and above we
generate inefficient code for the
gcc.target/i386/{sse2,avx}-extract-1.c testcases when extracting
an SFmode element from a V4SFmode vector, unless it is element 0.
The problem is that the sse4_1_extractps pattern matches, but
doesn't have a "=x" alternative, so we end up doing a
{,v}extractps into memory and then loading the SFmode value back
from memory.
The following patch adds the alternatives and splitters, so that
in that case the element is shuffled into element 0 of the
destination SSE register instead of being spilled to memory.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2011-09-19  Jakub Jelinek  <ja...@redhat.com>

        * config/i386/sse.md (*sse4_1_extractps): Change into
        define_insn_and_split, add =x 0 n and =x x n alternatives
        and split them after reload.

--- gcc/config/i386/sse.md.jj   2011-09-18 21:20:04.000000000 +0200
+++ gcc/config/i386/sse.md      2011-09-19 10:49:25.000000000 +0200
@@ -4000,19 +4000,45 @@ (define_insn "vec_extract_hi_v32qi"
    (const_string "OI")
    (const_string "V8SF")))])
 
-(define_insn "*sse4_1_extractps"
-  [(set (match_operand:SF 0 "nonimmediate_operand" "=rm")
+(define_insn_and_split "*sse4_1_extractps"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=rm,x,x")
        (vec_select:SF
-         (match_operand:V4SF 1 "register_operand" "x")
-         (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))]
+         (match_operand:V4SF 1 "register_operand" "x,0,x")
+         (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n,n,n")])))]
   "TARGET_SSE4_1"
-  "%vextractps\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sselog")
-   (set_attr "prefix_data16" "1")
-   (set_attr "prefix_extra" "1")
-   (set_attr "length_immediate" "1")
-   (set_attr "prefix" "maybe_vex")
-   (set_attr "mode" "V4SF")])
+  "@
+   %vextractps\t{%2, %1, %0|%0, %1, %2}
+   #
+   #"
+  "&& reload_completed && SSE_REG_P (operands[0])"
+  [(const_int 0)]
+{
+  rtx dest = gen_rtx_REG (V4SFmode, REGNO (operands[0]));
+  switch (INTVAL (operands[2]))
+    {
+    case 1:
+    case 3:
+      emit_insn (gen_sse_shufps_v4sf (dest, operands[1], operands[1],
+                                     operands[2], operands[2],
+                                     GEN_INT (INTVAL (operands[2]) + 4),
+                                     GEN_INT (INTVAL (operands[2]) + 4)));
+      break;
+    case 2:
+      emit_insn (gen_vec_interleave_highv4sf (dest, operands[1], operands[1]));
+      break;
+    default:
+      /* 0 should be handled by the *vec_extractv4sf_0 pattern above.  */
+      gcc_unreachable ();
+    }
+  DONE;
+}
+  [(set_attr "type" "sselog,*,*")
+   (set_attr "isa" "base,noavx,avx")
+   (set_attr "prefix_data16" "1,*,*")
+   (set_attr "prefix_extra" "1,*,*")
+   (set_attr "length_immediate" "1,*,*")
+   (set_attr "prefix" "maybe_vex,*,*")
+   (set_attr "mode" "V4SF,*,*")])
 
 (define_insn_and_split "*vec_extract_v4sf_mem"
   [(set (match_operand:SF 0 "register_operand" "=x*rf")

        Jakub

Reply via email to