On 2/10/19, H.J. Lu <hjl.to...@gmail.com> wrote: > Emulate MMX packsswb/packssdw/packuswb with SSE packsswb/packssdw/packuswb > plus moving bits 64:95 to bits 32:63 in SSE register. Only SSE register > source operand is allowed. > > 2019-02-08 H.J. Lu <hongjiu...@intel.com> > Uros Bizjak <ubiz...@gmail.com> > > PR target/89021 > * config/i386/i386-protos.h (ix86_move_vector_high_sse_to_mmx): > New prototype. > (ix86_split_mmx_pack): Likewise. > * config/i386/i386.c (ix86_move_vector_high_sse_to_mmx): New > function. > (ix86_split_mmx_pack): Likewise. > * config/i386/i386.md (mmx_isa): New. > (enabled): Also check mmx_isa. > * config/i386/mmx.md (any_s_truncate): New code iterator. > (s_trunsuffix): New code attr. > (mmx_packsswb): Removed. > (mmx_packssdw): Likewise. > (mmx_packuswb): Likewise. > (mmx_pack<s_trunsuffix>swb): New define_insn_and_split to emulate > MMX packsswb/packuswb with SSE2. > (mmx_packssdw): Likewise.
LGTM, with a couple of nits below. > --- > gcc/config/i386/i386-protos.h | 3 ++ > gcc/config/i386/i386.c | 54 ++++++++++++++++++++++++++++ > gcc/config/i386/i386.md | 12 +++++++ > gcc/config/i386/mmx.md | 67 +++++++++++++++++++---------------- > 4 files changed, 106 insertions(+), 30 deletions(-) > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index 2d600173917..bb96a420a85 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -200,6 +200,9 @@ extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, > rtx, rtx); > > extern rtx ix86_split_stack_guard (void); > > +extern void ix86_move_vector_high_sse_to_mmx (rtx); > +extern void ix86_split_mmx_pack (rtx[], enum rtx_code); > + > #ifdef TREE_CODE > extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, > int); > #endif /* TREE_CODE */ > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index ba02c26c8b2..2af7f891350 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -19955,6 +19955,60 @@ ix86_expand_vector_move_misalign (machine_mode > mode, rtx operands[]) > gcc_unreachable (); > } > > +/* Move bits 64:95 to bits 32:63. */ > + > +void > +ix86_move_vector_high_sse_to_mmx (rtx op) > +{ > + rtx mask = gen_rtx_PARALLEL (VOIDmode, > + gen_rtvec (4, GEN_INT (0), GEN_INT (2), > + GEN_INT (0), GEN_INT (0))); > + rtx dest = gen_rtx_REG (V4SImode, REGNO (op)); > + op = gen_rtx_VEC_SELECT (V4SImode, dest, mask); > + rtx insn = gen_rtx_SET (dest, op); > + emit_insn (insn); > +} > + > +/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. 
*/ > + > +void > +ix86_split_mmx_pack (rtx operands[], enum rtx_code code) > +{ > + rtx op0 = operands[0]; > + rtx op1 = operands[1]; > + rtx op2 = operands[2]; > + > + machine_mode dmode = GET_MODE (op0); > + machine_mode smode = GET_MODE (op1); > + machine_mode inner_dmode = GET_MODE_INNER (dmode); > + machine_mode inner_smode = GET_MODE_INNER (smode); > + > + /* Get the corresponding SSE mode for destination. */ > + int nunits = 16 / GET_MODE_SIZE (inner_dmode); > + machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode), > + nunits).require (); > + machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode), > + nunits / 2).require (); > + > + /* Get the corresponding SSE mode for source. */ > + nunits = 16 / GET_MODE_SIZE (inner_smode); > + machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode), > + nunits).require (); > + > + /* Generate SSE pack with signed/unsigned saturation. */ > + rtx dest = gen_rtx_REG (sse_dmode, REGNO (op0)); > + op1 = gen_rtx_REG (sse_smode, REGNO (op1)); > + op2 = gen_rtx_REG (sse_smode, REGNO (op2)); > + > + op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1); > + op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2); > + rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode, > + op1, op2)); > + emit_insn (insn); > + > + ix86_move_vector_high_sse_to_mmx (op0); > +} > + > /* Helper function of ix86_fixup_binary_operands to canonicalize > operand order. Returns true if the operands should be swapped. 
*/ > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 4a32144a71a..72685107fc0 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -792,6 +792,9 @@ > avx512vl,noavx512vl,x64_avx512dq,x64_avx512bw" > (const_string "base")) > > +;; Define instruction set of MMX instructions > +(define_attr "mmx_isa" "base,native,x64,x64_noavx,x64_avx" (const_string > "base")) > + > (define_attr "enabled" "" > (cond [(eq_attr "isa" "x64") (symbol_ref "TARGET_64BIT") > (eq_attr "isa" "x64_sse2") > @@ -830,6 +833,15 @@ > (eq_attr "isa" "noavx512dq") (symbol_ref "!TARGET_AVX512DQ") > (eq_attr "isa" "avx512vl") (symbol_ref "TARGET_AVX512VL") > (eq_attr "isa" "noavx512vl") (symbol_ref "!TARGET_AVX512VL") > + > + (eq_attr "mmx_isa" "native") > + (symbol_ref "!TARGET_MMX_WITH_SSE") > + (eq_attr "mmx_isa" "x64") > + (symbol_ref "TARGET_MMX_WITH_SSE") > + (eq_attr "mmx_isa" "x64_avx") > + (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX") > + (eq_attr "mmx_isa" "x64_noavx") > + (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX") > ] > (const_int 1))) > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index c1e0f2c411e..10096f7cab7 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -58,6 +58,11 @@ > ;; Mapping from integer vector mode to mnemonic suffix > (define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI > "q")]) > > +;; Used in signed and unsigned truncations with saturation. > +(define_code_iterator any_s_truncate [ss_truncate us_truncate]) > +;; Instruction suffix for truncations with saturation. > +(define_code_attr s_trunsuffix [(ss_truncate "s") (us_truncate "u")]) Please move definitions that have a single use near their usage site.
> + > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > ;; > ;; Move patterns > @@ -1046,41 +1051,43 @@ > ;; > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > -(define_insn "mmx_packsswb" > - [(set (match_operand:V8QI 0 "register_operand" "=y") > +(define_insn_and_split "mmx_pack<s_trunsuffix>swb" > + [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv") > (vec_concat:V8QI > - (ss_truncate:V4QI > - (match_operand:V4HI 1 "register_operand" "0")) > - (ss_truncate:V4QI > - (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))] > - "TARGET_MMX" > - "packsswb\t{%2, %0|%0, %2}" > - [(set_attr "type" "mmxshft") > - (set_attr "mode" "DI")]) > + (any_s_truncate:V4QI > + (match_operand:V4HI 1 "register_operand" "0,0,Yv")) > + (any_s_truncate:V4QI > + (match_operand:V4HI 2 "nonimmediate_operand" "ym,x,Yv"))))] > + "TARGET_MMX || TARGET_MMX_WITH_SSE" > + "@ > + pack<s_trunsuffix>swb\t{%2, %0|%0, %2} > + # > + #" > + "&& reload_completed && TARGET_MMX_WITH_SSE" The above should be without first &&, with "reload_completed" last. In effect, the condition of the separate split pattern would read as: "TARGET_MMX_WITH_SSE && reload_completed". 
> + [(const_int 0)] > + "ix86_split_mmx_pack (operands, <any_s_truncate:CODE>);" > + [(set_attr "mmx_isa" "native,x64_noavx,x64_avx") > + (set_attr "type" "mmxshft,sselog,sselog") > + (set_attr "mode" "DI,TI,TI")]) > > -(define_insn "mmx_packssdw" > - [(set (match_operand:V4HI 0 "register_operand" "=y") > +(define_insn_and_split "mmx_packssdw" > + [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv") > (vec_concat:V4HI > (ss_truncate:V2HI > - (match_operand:V2SI 1 "register_operand" "0")) > + (match_operand:V2SI 1 "register_operand" "0,0,Yv")) > (ss_truncate:V2HI > - (match_operand:V2SI 2 "nonimmediate_operand" "ym"))))] > - "TARGET_MMX" > - "packssdw\t{%2, %0|%0, %2}" > - [(set_attr "type" "mmxshft") > - (set_attr "mode" "DI")]) > - > -(define_insn "mmx_packuswb" > - [(set (match_operand:V8QI 0 "register_operand" "=y") > - (vec_concat:V8QI > - (us_truncate:V4QI > - (match_operand:V4HI 1 "register_operand" "0")) > - (us_truncate:V4QI > - (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))] > - "TARGET_MMX" > - "packuswb\t{%2, %0|%0, %2}" > - [(set_attr "type" "mmxshft") > - (set_attr "mode" "DI")]) > + (match_operand:V2SI 2 "nonimmediate_operand" "ym,x,Yv"))))] > + "TARGET_MMX || TARGET_MMX_WITH_SSE" > + "@ > + packssdw\t{%2, %0|%0, %2} > + # > + #" > + "&& reload_completed && TARGET_MMX_WITH_SSE" Also here. > + [(const_int 0)] > + "ix86_split_mmx_pack (operands, SS_TRUNCATE);" > + [(set_attr "mmx_isa" "native,x64_noavx,x64_avx") > + (set_attr "type" "mmxshft,sselog,sselog") > + (set_attr "mode" "DI,TI,TI")]) > > (define_insn "mmx_punpckhbw" > [(set (match_operand:V8QI 0 "register_operand" "=y") > -- > 2.20.1 > >