On 2/10/19, Uros Bizjak <ubiz...@gmail.com> wrote: > On 2/10/19, H.J. Lu <hjl.to...@gmail.com> wrote: >> Emulate MMX packsswb/packssdw/packuswb with SSE >> packsswb/packssdw/packuswb >> plus moving bits 64:95 to bits 32:63 in SSE register. Only SSE register >> source operand is allowed. >> >> 2019-02-08 H.J. Lu <hongjiu...@intel.com> >> Uros Bizjak <ubiz...@gmail.com> >> >> PR target/89021 >> * config/i386/i386-protos.h (ix86_move_vector_high_sse_to_mmx): >> New prototype. >> (ix86_split_mmx_pack): Likewise. >> * config/i386/i386.c (ix86_move_vector_high_sse_to_mmx): New >> function. >> (ix86_split_mmx_pack): Likewise. >> * config/i386/i386.md (mmx_isa): New. >> (enabled): Also check mmx_isa. >> * config/i386/mmx.md (any_s_truncate): New code iterator. >> (s_trunsuffix): New code attr. >> (mmx_packsswb): Removed. >> (mmx_packssdw): Likewise. >> (mmx_packuswb): Likewise. >> (mmx_pack<s_trunsuffix>swb): New define_insn_and_split to emulate >> MMX packsswb/packuswb with SSE2. >> (mmx_packssdw): Likewise. > > LGTM, with a couple of nits below.
Oh, you also need DONE; at the end of preparation statements, otherwise splitters will inject (const_int 0) into the insn stream. Uros. >> --- >> gcc/config/i386/i386-protos.h | 3 ++ >> gcc/config/i386/i386.c | 54 ++++++++++++++++++++++++++++ >> gcc/config/i386/i386.md | 12 +++++++ >> gcc/config/i386/mmx.md | 67 +++++++++++++++++++---------------- >> 4 files changed, 106 insertions(+), 30 deletions(-) >> >> diff --git a/gcc/config/i386/i386-protos.h >> b/gcc/config/i386/i386-protos.h >> index 2d600173917..bb96a420a85 100644 >> --- a/gcc/config/i386/i386-protos.h >> +++ b/gcc/config/i386/i386-protos.h >> @@ -200,6 +200,9 @@ extern void ix86_expand_vecop_qihi (enum rtx_code, >> rtx, >> rtx, rtx); >> >> extern rtx ix86_split_stack_guard (void); >> >> +extern void ix86_move_vector_high_sse_to_mmx (rtx); >> +extern void ix86_split_mmx_pack (rtx[], enum rtx_code); >> + >> #ifdef TREE_CODE >> extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, >> int); >> #endif /* TREE_CODE */ >> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c >> index ba02c26c8b2..2af7f891350 100644 >> --- a/gcc/config/i386/i386.c >> +++ b/gcc/config/i386/i386.c >> @@ -19955,6 +19955,60 @@ ix86_expand_vector_move_misalign (machine_mode >> mode, rtx operands[]) >> gcc_unreachable (); >> } >> >> +/* Move bits 64:95 to bits 32:63. */ >> + >> +void >> +ix86_move_vector_high_sse_to_mmx (rtx op) >> +{ >> + rtx mask = gen_rtx_PARALLEL (VOIDmode, >> + gen_rtvec (4, GEN_INT (0), GEN_INT (2), >> + GEN_INT (0), GEN_INT (0))); >> + rtx dest = gen_rtx_REG (V4SImode, REGNO (op)); >> + op = gen_rtx_VEC_SELECT (V4SImode, dest, mask); >> + rtx insn = gen_rtx_SET (dest, op); >> + emit_insn (insn); >> +} >> + >> +/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. 
*/ >> + >> +void >> +ix86_split_mmx_pack (rtx operands[], enum rtx_code code) >> +{ >> + rtx op0 = operands[0]; >> + rtx op1 = operands[1]; >> + rtx op2 = operands[2]; >> + >> + machine_mode dmode = GET_MODE (op0); >> + machine_mode smode = GET_MODE (op1); >> + machine_mode inner_dmode = GET_MODE_INNER (dmode); >> + machine_mode inner_smode = GET_MODE_INNER (smode); >> + >> + /* Get the corresponding SSE mode for destination. */ >> + int nunits = 16 / GET_MODE_SIZE (inner_dmode); >> + machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode), >> + nunits).require (); >> + machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode), >> + nunits / 2).require (); >> + >> + /* Get the corresponding SSE mode for source. */ >> + nunits = 16 / GET_MODE_SIZE (inner_smode); >> + machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode), >> + nunits).require (); >> + >> + /* Generate SSE pack with signed/unsigned saturation. */ >> + rtx dest = gen_rtx_REG (sse_dmode, REGNO (op0)); >> + op1 = gen_rtx_REG (sse_smode, REGNO (op1)); >> + op2 = gen_rtx_REG (sse_smode, REGNO (op2)); >> + >> + op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1); >> + op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2); >> + rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode, >> + op1, op2)); >> + emit_insn (insn); >> + >> + ix86_move_vector_high_sse_to_mmx (op0); >> +} >> + >> /* Helper function of ix86_fixup_binary_operands to canonicalize >> operand order. Returns true if the operands should be swapped. 
*/ >> >> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md >> index 4a32144a71a..72685107fc0 100644 >> --- a/gcc/config/i386/i386.md >> +++ b/gcc/config/i386/i386.md >> @@ -792,6 +792,9 @@ >> avx512vl,noavx512vl,x64_avx512dq,x64_avx512bw" >> (const_string "base")) >> >> +;; Define instruction set of MMX instructions >> +(define_attr "mmx_isa" "base,native,x64,x64_noavx,x64_avx" (const_string >> "base")) >> + >> (define_attr "enabled" "" >> (cond [(eq_attr "isa" "x64") (symbol_ref "TARGET_64BIT") >> (eq_attr "isa" "x64_sse2") >> @@ -830,6 +833,15 @@ >> (eq_attr "isa" "noavx512dq") (symbol_ref "!TARGET_AVX512DQ") >> (eq_attr "isa" "avx512vl") (symbol_ref "TARGET_AVX512VL") >> (eq_attr "isa" "noavx512vl") (symbol_ref "!TARGET_AVX512VL") >> + >> + (eq_attr "mmx_isa" "native") >> + (symbol_ref "!TARGET_MMX_WITH_SSE") >> + (eq_attr "mmx_isa" "x64") >> + (symbol_ref "TARGET_MMX_WITH_SSE") >> + (eq_attr "mmx_isa" "x64_avx") >> + (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX") >> + (eq_attr "mmx_isa" "x64_noavx") >> + (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX") >> ] >> (const_int 1))) >> >> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md >> index c1e0f2c411e..10096f7cab7 100644 >> --- a/gcc/config/i386/mmx.md >> +++ b/gcc/config/i386/mmx.md >> @@ -58,6 +58,11 @@ >> ;; Mapping from integer vector mode to mnemonic suffix >> (define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI >> "q")]) >> >> +;; Used in signed and unsigned truncations with saturation. >> +(define_code_iterator any_s_truncate [ss_truncate us_truncate]) >> +;; Instruction suffix for truncations with saturation. >> +(define_code_attr s_trunsuffix [(ss_truncate "s") (us_truncate "u")]) > > Please move definitions that have a single use next to their usage site.
> >> + >> ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; >> ;; >> ;; Move patterns >> @@ -1046,41 +1051,43 @@ >> ;; >> ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; >> >> -(define_insn "mmx_packsswb" >> - [(set (match_operand:V8QI 0 "register_operand" "=y") >> +(define_insn_and_split "mmx_pack<s_trunsuffix>swb" >> + [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv") >> (vec_concat:V8QI >> - (ss_truncate:V4QI >> - (match_operand:V4HI 1 "register_operand" "0")) >> - (ss_truncate:V4QI >> - (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))] >> - "TARGET_MMX" >> - "packsswb\t{%2, %0|%0, %2}" >> - [(set_attr "type" "mmxshft") >> - (set_attr "mode" "DI")]) >> + (any_s_truncate:V4QI >> + (match_operand:V4HI 1 "register_operand" "0,0,Yv")) >> + (any_s_truncate:V4QI >> + (match_operand:V4HI 2 "nonimmediate_operand" "ym,x,Yv"))))] >> + "TARGET_MMX || TARGET_MMX_WITH_SSE" >> + "@ >> + pack<s_trunsuffix>swb\t{%2, %0|%0, %2} >> + # >> + #" >> + "&& reload_completed && TARGET_MMX_WITH_SSE" > > The above should be without first &&, with "reload_completed" last. In > effect, the condition of the separate split pattern would read as: > > "TARGET_MMX_WITH_SSE && reload_completed". > >> + [(const_int 0)] >> + "ix86_split_mmx_pack (operands, <any_s_truncate:CODE>);" Missing DONE; above. 
>> + [(set_attr "mmx_isa" "native,x64_noavx,x64_avx") >> + (set_attr "type" "mmxshft,sselog,sselog") >> + (set_attr "mode" "DI,TI,TI")]) >> >> -(define_insn "mmx_packssdw" >> - [(set (match_operand:V4HI 0 "register_operand" "=y") >> +(define_insn_and_split "mmx_packssdw" >> + [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv") >> (vec_concat:V4HI >> (ss_truncate:V2HI >> - (match_operand:V2SI 1 "register_operand" "0")) >> + (match_operand:V2SI 1 "register_operand" "0,0,Yv")) >> (ss_truncate:V2HI >> - (match_operand:V2SI 2 "nonimmediate_operand" "ym"))))] >> - "TARGET_MMX" >> - "packssdw\t{%2, %0|%0, %2}" >> - [(set_attr "type" "mmxshft") >> - (set_attr "mode" "DI")]) >> - >> -(define_insn "mmx_packuswb" >> - [(set (match_operand:V8QI 0 "register_operand" "=y") >> - (vec_concat:V8QI >> - (us_truncate:V4QI >> - (match_operand:V4HI 1 "register_operand" "0")) >> - (us_truncate:V4QI >> - (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))] >> - "TARGET_MMX" >> - "packuswb\t{%2, %0|%0, %2}" >> - [(set_attr "type" "mmxshft") >> - (set_attr "mode" "DI")]) >> + (match_operand:V2SI 2 "nonimmediate_operand" "ym,x,Yv"))))] >> + "TARGET_MMX || TARGET_MMX_WITH_SSE" >> + "@ >> + packssdw\t{%2, %0|%0, %2} >> + # >> + #" >> + "&& reload_completed && TARGET_MMX_WITH_SSE" > > Also here. > >> + [(const_int 0)] >> + "ix86_split_mmx_pack (operands, SS_TRUNCATE);" And here. >> + [(set_attr "mmx_isa" "native,x64_noavx,x64_avx") >> + (set_attr "type" "mmxshft,sselog,sselog") >> + (set_attr "mode" "DI,TI,TI")]) >> >> (define_insn "mmx_punpckhbw" >> [(set (match_operand:V8QI 0 "register_operand" "=y") >> -- >> 2.20.1 >> >> >