Hi! As mentioned in an earlier posting, with -msse4.1 and above we generate inefficient code for the gcc.target/i386/{sse2,avx}-extract-1.c testcases when extracting an SFmode element from a V4SFmode vector, unless it is element 0. The problem is that the *sse4_1_extractps pattern matches but has no "=x" alternative, so we end up doing a {,v}extractps into memory and then loading the SFmode value back from memory. The following patch adds the missing alternatives and splitters, so that in that case the element is shuffled into place within SSE registers instead of being spilled through memory.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2011-09-19 Jakub Jelinek <ja...@redhat.com> * config/i386/sse.md (*sse4_1_extractps): Change into define_insn_and_split, add =x 0 n and =x x n alternatives and split them after reload. --- gcc/config/i386/sse.md.jj 2011-09-18 21:20:04.000000000 +0200 +++ gcc/config/i386/sse.md 2011-09-19 10:49:25.000000000 +0200 @@ -4000,19 +4000,45 @@ (define_insn "vec_extract_hi_v32qi" (const_string "OI") (const_string "V8SF")))]) -(define_insn "*sse4_1_extractps" - [(set (match_operand:SF 0 "nonimmediate_operand" "=rm") +(define_insn_and_split "*sse4_1_extractps" + [(set (match_operand:SF 0 "nonimmediate_operand" "=rm,x,x") (vec_select:SF - (match_operand:V4SF 1 "register_operand" "x") - (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))] + (match_operand:V4SF 1 "register_operand" "x,0,x") + (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n,n,n")])))] "TARGET_SSE4_1" - "%vextractps\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sselog") - (set_attr "prefix_data16" "1") - (set_attr "prefix_extra" "1") - (set_attr "length_immediate" "1") - (set_attr "prefix" "maybe_vex") - (set_attr "mode" "V4SF")]) + "@ + %vextractps\t{%2, %1, %0|%0, %1, %2} + # + #" + "&& reload_completed && SSE_REG_P (operands[0])" + [(const_int 0)] +{ + rtx dest = gen_rtx_REG (V4SFmode, REGNO (operands[0])); + switch (INTVAL (operands[2])) + { + case 1: + case 3: + emit_insn (gen_sse_shufps_v4sf (dest, operands[1], operands[1], + operands[2], operands[2], + GEN_INT (INTVAL (operands[2]) + 4), + GEN_INT (INTVAL (operands[2]) + 4))); + break; + case 2: + emit_insn (gen_vec_interleave_highv4sf (dest, operands[1], operands[1])); + break; + default: + /* 0 should be handled by the *vec_extractv4sf_0 pattern above. 
*/ + gcc_unreachable (); + } + DONE; +} + [(set_attr "type" "sselog,*,*") + (set_attr "isa" "base,noavx,avx") + (set_attr "prefix_data16" "1,*,*") + (set_attr "prefix_extra" "1,*,*") + (set_attr "length_immediate" "1,*,*") + (set_attr "prefix" "maybe_vex,*,*") + (set_attr "mode" "V4SF,*,*")]) (define_insn_and_split "*vec_extract_v4sf_mem" [(set (match_operand:SF 0 "register_operand" "=x*rf") Jakub