Hello! 2011-04-16 Uros Bizjak <ubiz...@gmail.com>
* config/i386/sse.md (sseunpackmode): New mode attribute. (ssepackmode): Ditto. (vec_pack_trunc_<mode>): Macroize expander from vec_pack_trunc_{v8hi,v4si,v2di} using VI248_128 mode iterator. (vec_unpacks_lo_<mode>): Macroize expander from vec_unpacks_lo_{v16qi,v8hi,v4si} using VI124_128 mode iterator. (vec_unpacks_hi_<mode>): Macroize expander from vec_unpacks_hi_{v16qi,v8hi,v4si} using VI124_128 mode iterator. (vec_unpacku_lo_<mode>): Macroize expander from vec_unpacku_lo_{v16qi,v8hi,v4si} using VI124_128 mode iterator. (vec_unpacku_hi_<mode>): Macroize expander from vec_unpacku_hi_{v16qi,v8hi,v4si} using VI124_128 mode iterator. * config/i386/i386.c (ix86_expand_sse_unpack): Merge with ix86_expand_sse4_unpack. * config/i386/i386-protos.h (ix86_expand_sse4_unpack): Remove. Bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}. Committed to SVN mainline. Uros.
Index: sse.md =================================================================== --- sse.md (revision 172580) +++ sse.md (working copy) @@ -70,7 +70,32 @@ (define_mode_iterator VI24_128 [V8HI V4SI]) (define_mode_iterator VI248_128 [V8HI V4SI V2DI]) +;; Mapping from float mode to required SSE level +(define_mode_attr sse + [(SF "sse") (DF "sse2") + (V4SF "sse") (V2DF "sse2") + (V8SF "avx") (V4DF "avx")]) +(define_mode_attr sse2 + [(V16QI "sse2") (V32QI "avx") + (V2DI "sse2") (V4DI "avx")]) + +(define_mode_attr sse3 + [(V16QI "sse3") (V32QI "avx")]) + +(define_mode_attr sse4_1 + [(V4SF "sse4_1") (V2DF "sse4_1") + (V8SF "avx") (V4DF "avx")]) + +;; Pack/unpack vector modes +(define_mode_attr sseunpackmode + [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI")]) + +(define_mode_attr ssepackmode + [(V8HI "V16QI") (V4SI "V8HI") (V2DI "V4SI")]) + + + ;; Instruction suffix for sign and zero extensions. (define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")]) @@ -126,23 +151,6 @@ (V2DF "TARGET_SSE") (V4SF "TARGET_SSE") (V4DF "TARGET_AVX") (V8SF "TARGET_AVX")]) -;; Mapping from float mode to required SSE level -(define_mode_attr sse - [(SF "sse") (DF "sse2") - (V4SF "sse") (V2DF "sse2") - (V8SF "avx") (V4DF "avx")]) - -(define_mode_attr sse2 - [(V16QI "sse2") (V32QI "avx") - (V2DI "sse2") (V4DI "avx")]) - -(define_mode_attr sse3 - [(V16QI "sse3") (V32QI "avx")]) - -(define_mode_attr sse4_1 - [(V4SF "sse4_1") (V2DF "sse4_1") - (V8SF "avx") (V4DF "avx")]) - ;; Mapping from integer vector mode to mnemonic suffix (define_mode_attr ssevecsize [(V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")]) @@ -5856,42 +5864,18 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(define_expand "vec_pack_trunc_v8hi" - [(match_operand:V16QI 0 "register_operand" "") - (match_operand:V8HI 1 "register_operand" "") - (match_operand:V8HI 2 "register_operand" "")] +(define_expand "vec_pack_trunc_<mode>" + [(match_operand:<ssepackmode> 0 "register_operand" "") + 
(match_operand:VI248_128 1 "register_operand" "") + (match_operand:VI248_128 2 "register_operand" "")] "TARGET_SSE2" { - rtx op1 = gen_lowpart (V16QImode, operands[1]); - rtx op2 = gen_lowpart (V16QImode, operands[2]); + rtx op1 = gen_lowpart (<ssepackmode>mode, operands[1]); + rtx op2 = gen_lowpart (<ssepackmode>mode, operands[2]); ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0); DONE; }) -(define_expand "vec_pack_trunc_v4si" - [(match_operand:V8HI 0 "register_operand" "") - (match_operand:V4SI 1 "register_operand" "") - (match_operand:V4SI 2 "register_operand" "")] - "TARGET_SSE2" -{ - rtx op1 = gen_lowpart (V8HImode, operands[1]); - rtx op2 = gen_lowpart (V8HImode, operands[2]); - ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0); - DONE; -}) - -(define_expand "vec_pack_trunc_v2di" - [(match_operand:V4SI 0 "register_operand" "") - (match_operand:V2DI 1 "register_operand" "") - (match_operand:V2DI 2 "register_operand" "")] - "TARGET_SSE2" -{ - rtx op1 = gen_lowpart (V4SImode, operands[1]); - rtx op2 = gen_lowpart (V4SImode, operands[2]); - ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0); - DONE; -}) - (define_insn "sse2_packsswb" [(set (match_operand:V16QI 0 "register_operand" "=x,x") (vec_concat:V16QI @@ -6767,150 +6751,30 @@ (set_attr "prefix" "maybe_vex,orig,orig,vex,orig,orig,vex") (set_attr "mode" "TI,TI,TI,TI,V4SF,V2SF,V2SF")]) -(define_expand "vec_unpacku_hi_v16qi" - [(match_operand:V8HI 0 "register_operand" "") - (match_operand:V16QI 1 "register_operand" "")] +(define_expand "vec_unpacku_hi_<mode>" + [(match_operand:<sseunpackmode> 0 "register_operand" "") + (match_operand:VI124_128 1 "register_operand" "")] "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, true, true); - else - ix86_expand_sse_unpack (operands, true, true); - DONE; -}) + "ix86_expand_sse_unpack (operands, true, true); DONE;") -(define_expand "vec_unpacks_hi_v16qi" - [(match_operand:V8HI 0 "register_operand" "") - (match_operand:V16QI 1 
"register_operand" "")] +(define_expand "vec_unpacks_hi_<mode>" + [(match_operand:<sseunpackmode> 0 "register_operand" "") + (match_operand:VI124_128 1 "register_operand" "")] "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, false, true); - else - ix86_expand_sse_unpack (operands, false, true); - DONE; -}) + "ix86_expand_sse_unpack (operands, false, true); DONE;") -(define_expand "vec_unpacku_lo_v16qi" - [(match_operand:V8HI 0 "register_operand" "") - (match_operand:V16QI 1 "register_operand" "")] +(define_expand "vec_unpacku_lo_<mode>" + [(match_operand:<sseunpackmode> 0 "register_operand" "") + (match_operand:VI124_128 1 "register_operand" "")] "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, true, false); - else - ix86_expand_sse_unpack (operands, true, false); - DONE; -}) + "ix86_expand_sse_unpack (operands, true, false); DONE;") -(define_expand "vec_unpacks_lo_v16qi" - [(match_operand:V8HI 0 "register_operand" "") - (match_operand:V16QI 1 "register_operand" "")] +(define_expand "vec_unpacks_lo_<mode>" + [(match_operand:<sseunpackmode> 0 "register_operand" "") + (match_operand:VI124_128 1 "register_operand" "")] "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, false, false); - else - ix86_expand_sse_unpack (operands, false, false); - DONE; -}) + "ix86_expand_sse_unpack (operands, false, false); DONE;") -(define_expand "vec_unpacku_hi_v8hi" - [(match_operand:V4SI 0 "register_operand" "") - (match_operand:V8HI 1 "register_operand" "")] - "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, true, true); - else - ix86_expand_sse_unpack (operands, true, true); - DONE; -}) - -(define_expand "vec_unpacks_hi_v8hi" - [(match_operand:V4SI 0 "register_operand" "") - (match_operand:V8HI 1 "register_operand" "")] - "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, false, true); - else - ix86_expand_sse_unpack (operands, false, true); - DONE; -}) - 
-(define_expand "vec_unpacku_lo_v8hi" - [(match_operand:V4SI 0 "register_operand" "") - (match_operand:V8HI 1 "register_operand" "")] - "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, true, false); - else - ix86_expand_sse_unpack (operands, true, false); - DONE; -}) - -(define_expand "vec_unpacks_lo_v8hi" - [(match_operand:V4SI 0 "register_operand" "") - (match_operand:V8HI 1 "register_operand" "")] - "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, false, false); - else - ix86_expand_sse_unpack (operands, false, false); - DONE; -}) - -(define_expand "vec_unpacku_hi_v4si" - [(match_operand:V2DI 0 "register_operand" "") - (match_operand:V4SI 1 "register_operand" "")] - "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, true, true); - else - ix86_expand_sse_unpack (operands, true, true); - DONE; -}) - -(define_expand "vec_unpacks_hi_v4si" - [(match_operand:V2DI 0 "register_operand" "") - (match_operand:V4SI 1 "register_operand" "")] - "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, false, true); - else - ix86_expand_sse_unpack (operands, false, true); - DONE; -}) - -(define_expand "vec_unpacku_lo_v4si" - [(match_operand:V2DI 0 "register_operand" "") - (match_operand:V4SI 1 "register_operand" "")] - "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, true, false); - else - ix86_expand_sse_unpack (operands, true, false); - DONE; -}) - -(define_expand "vec_unpacks_lo_v4si" - [(match_operand:V2DI 0 "register_operand" "") - (match_operand:V4SI 1 "register_operand" "")] - "TARGET_SSE2" -{ - if (TARGET_SSE4_1) - ix86_expand_sse4_unpack (operands, false, false); - else - ix86_expand_sse_unpack (operands, false, false); - DONE; -}) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Miscellaneous @@ -10062,7 +9926,7 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_insn_and_split "vec_dup<mode>" +(define_insn 
"vec_dup<mode>" [(set (match_operand:AVX256MODE24P 0 "register_operand" "=x,x") (vec_duplicate:AVX256MODE24P (match_operand:<avxscalarmode> 1 "nonimmediate_operand" "m,?x")))] @@ -10070,15 +9934,20 @@ "@ vbroadcast<ssescalarmodesuffix>\t{%1, %0|%0, %1} #" - "&& reload_completed && REG_P (operands[1])" - [(set (match_dup 2) (vec_duplicate:<avxhalfvecmode> (match_dup 1))) - (set (match_dup 0) (vec_concat:AVX256MODE24P (match_dup 2) (match_dup 2)))] - "operands[2] = gen_rtx_REG (<avxhalfvecmode>mode, REGNO (operands[0]));" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex") (set_attr "mode" "V8SF")]) +(define_split + [(set (match_operand:AVX256MODE24P 0 "register_operand" "") + (vec_duplicate:AVX256MODE24P + (match_operand:<avxscalarmode> 1 "register_operand" "")))] + "TARGET_AVX && reload_completed" + [(set (match_dup 2) (vec_duplicate:<avxhalfvecmode> (match_dup 1))) + (set (match_dup 0) (vec_concat:AVX256MODE24P (match_dup 2) (match_dup 2)))] + "operands[2] = gen_rtx_REG (<avxhalfvecmode>mode, REGNO (operands[0]));") + (define_insn "avx_vbroadcastf128_<mode>" [(set (match_operand:AVX256MODE 0 "register_operand" "=x,x,x") (vec_concat:AVX256MODE Index: i386-protos.h =================================================================== --- i386-protos.h (revision 172580) +++ i386-protos.h (working copy) @@ -114,7 +114,6 @@ extern bool ix86_expand_fp_vcond (rtx[]); extern bool ix86_expand_int_vcond (rtx[]); extern void ix86_expand_sse_unpack (rtx[], bool, bool); -extern void ix86_expand_sse4_unpack (rtx[], bool, bool); extern bool ix86_expand_int_addcc (rtx[]); extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, int); extern void ix86_split_call_vzeroupper (rtx, rtx); Index: i386.c =================================================================== --- i386.c (revision 172580) +++ i386.c (working copy) @@ -19100,91 +19100,87 @@ ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p) { enum machine_mode imode = GET_MODE 
(operands[1]); - rtx (*unpack)(rtx, rtx, rtx); - rtx se, dest; + rtx tmp, dest; - switch (imode) + if (TARGET_SSE4_1) { - case V16QImode: + rtx (*unpack)(rtx, rtx); + + switch (imode) + { + case V16QImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv8qiv8hi2; + else + unpack = gen_sse4_1_sign_extendv8qiv8hi2; + break; + case V8HImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv4hiv4si2; + else + unpack = gen_sse4_1_sign_extendv4hiv4si2; + break; + case V4SImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv2siv2di2; + else + unpack = gen_sse4_1_sign_extendv2siv2di2; + break; + default: + gcc_unreachable (); + } + if (high_p) - unpack = gen_vec_interleave_highv16qi; + { + /* Shift higher 8 bytes to lower 8 bytes. */ + tmp = gen_reg_rtx (imode); + emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp), + gen_lowpart (V1TImode, operands[1]), + GEN_INT (64))); + } else - unpack = gen_vec_interleave_lowv16qi; - break; - case V8HImode: - if (high_p) - unpack = gen_vec_interleave_highv8hi; - else - unpack = gen_vec_interleave_lowv8hi; - break; - case V4SImode: - if (high_p) - unpack = gen_vec_interleave_highv4si; - else - unpack = gen_vec_interleave_lowv4si; - break; - default: - gcc_unreachable (); - } + tmp = operands[1]; - dest = gen_lowpart (imode, operands[0]); - - if (unsigned_p) - se = force_reg (imode, CONST0_RTX (imode)); + emit_insn (unpack (operands[0], tmp)); + } else - se = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), - operands[1], pc_rtx, pc_rtx); + { + rtx (*unpack)(rtx, rtx, rtx); - emit_insn (unpack (dest, operands[1], se)); -} + switch (imode) + { + case V16QImode: + if (high_p) + unpack = gen_vec_interleave_highv16qi; + else + unpack = gen_vec_interleave_lowv16qi; + break; + case V8HImode: + if (high_p) + unpack = gen_vec_interleave_highv8hi; + else + unpack = gen_vec_interleave_lowv8hi; + break; + case V4SImode: + if (high_p) + unpack = gen_vec_interleave_highv4si; + else + unpack = 
gen_vec_interleave_lowv4si; + break; + default: + gcc_unreachable (); + } -/* This function performs the same task as ix86_expand_sse_unpack, - but with SSE4.1 instructions. */ + dest = gen_lowpart (imode, operands[0]); -void -ix86_expand_sse4_unpack (rtx operands[2], bool unsigned_p, bool high_p) -{ - enum machine_mode imode = GET_MODE (operands[1]); - rtx (*unpack)(rtx, rtx); - rtx src, dest; - - switch (imode) - { - case V16QImode: if (unsigned_p) - unpack = gen_sse4_1_zero_extendv8qiv8hi2; + tmp = force_reg (imode, CONST0_RTX (imode)); else - unpack = gen_sse4_1_sign_extendv8qiv8hi2; - break; - case V8HImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv4hiv4si2; - else - unpack = gen_sse4_1_sign_extendv4hiv4si2; - break; - case V4SImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv2siv2di2; - else - unpack = gen_sse4_1_sign_extendv2siv2di2; - break; - default: - gcc_unreachable (); - } + tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), + operands[1], pc_rtx, pc_rtx); - dest = operands[0]; - if (high_p) - { - /* Shift higher 8 bytes to lower 8 bytes. */ - src = gen_reg_rtx (imode); - emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, src), - gen_lowpart (V1TImode, operands[1]), - GEN_INT (64))); + emit_insn (unpack (dest, operands[1], tmp)); } - else - src = operands[1]; - - emit_insn (unpack (dest, src)); } /* Expand conditional increment or decrement using adb/sbb instructions.