On 2/10/19, H.J. Lu <hjl.to...@gmail.com> wrote: > Emulate MMX punpcklXX/punpckhXX with SSE punpcklXX. For MMX punpckhXX, > move bits 64:127 to bits 0:63 in SSE register. Only SSE register source > operand is allowed. > > PR target/89021 > * config/i386/i386-protos.h (ix86_split_mmx_punpck): New > prototype. > * config/i386/i386.c (ix86_split_mmx_punpck): New function. > * config/i386/mmx.md (mmx_punpckhbw): Changed to > define_insn_and_split to support SSE emulation. > (mmx_punpcklbw): Likewise. > (mmx_punpckhwd): Likewise. > (mmx_punpcklwd): Likewise. > (mmx_punpckhdq): Likewise. > (mmx_punpckldq): Likewise.
Please fix split condition (as in the previous patch) and add missing DONEs. Uros. > --- > gcc/config/i386/i386-protos.h | 1 + > gcc/config/i386/i386.c | 77 +++++++++++++++++++ > gcc/config/i386/mmx.md | 138 ++++++++++++++++++++++------------ > 3 files changed, 168 insertions(+), 48 deletions(-) > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index bb96a420a85..dc7fc38d8e4 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -202,6 +202,7 @@ extern rtx ix86_split_stack_guard (void); > > extern void ix86_move_vector_high_sse_to_mmx (rtx); > extern void ix86_split_mmx_pack (rtx[], enum rtx_code); > +extern void ix86_split_mmx_punpck (rtx[], bool); > > #ifdef TREE_CODE > extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, > int); > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 2af7f891350..cf7a71bcc02 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -20009,6 +20009,83 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code > code) > ix86_move_vector_high_sse_to_mmx (op0); > } > > +/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */ > + > +void > +ix86_split_mmx_punpck (rtx operands[], bool high_p) > +{ > + rtx op0 = operands[0]; > + rtx op1 = operands[1]; > + rtx op2 = operands[2]; > + machine_mode mode = GET_MODE (op0); > + rtx mask; > + /* The corresponding SSE mode. 
*/ > + machine_mode sse_mode, double_sse_mode; > + > + switch (mode) > + { > + case E_V8QImode: > + sse_mode = V16QImode; > + double_sse_mode = V32QImode; > + mask = gen_rtx_PARALLEL (VOIDmode, > + gen_rtvec (16, > + GEN_INT (0), GEN_INT (16), > + GEN_INT (1), GEN_INT (17), > + GEN_INT (2), GEN_INT (18), > + GEN_INT (3), GEN_INT (19), > + GEN_INT (4), GEN_INT (20), > + GEN_INT (5), GEN_INT (21), > + GEN_INT (6), GEN_INT (22), > + GEN_INT (7), GEN_INT (23))); > + break; > + > + case E_V4HImode: > + sse_mode = V8HImode; > + double_sse_mode = V16HImode; > + mask = gen_rtx_PARALLEL (VOIDmode, > + gen_rtvec (8, > + GEN_INT (0), GEN_INT (8), > + GEN_INT (1), GEN_INT (9), > + GEN_INT (2), GEN_INT (10), > + GEN_INT (3), GEN_INT (11))); > + break; > + > + case E_V2SImode: > + sse_mode = V4SImode; > + double_sse_mode = V8SImode; > + mask = gen_rtx_PARALLEL (VOIDmode, > + gen_rtvec (4, > + GEN_INT (0), GEN_INT (4), > + GEN_INT (1), GEN_INT (5))); > + break; > + > + default: > + gcc_unreachable (); > + } > + > + /* Generate SSE punpcklXX. */ > + rtx dest = gen_rtx_REG (sse_mode, REGNO (op0)); > + op1 = gen_rtx_REG (sse_mode, REGNO (op1)); > + op2 = gen_rtx_REG (sse_mode, REGNO (op2)); > + > + op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2); > + op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask); > + rtx insn = gen_rtx_SET (dest, op2); > + emit_insn (insn); > + > + if (high_p) > + { > + /* Move bits 64:127 to bits 0:63. */ > + mask = gen_rtx_PARALLEL (VOIDmode, > + gen_rtvec (4, GEN_INT (2), GEN_INT (3), > + GEN_INT (0), GEN_INT (0))); > + dest = gen_rtx_REG (V4SImode, REGNO (dest)); > + op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); > + insn = gen_rtx_SET (dest, op1); > + emit_insn (insn); > + } > +} > + > /* Helper function of ix86_fixup_binary_operands to canonicalize > operand order. Returns true if the operands should be swapped. 
*/ > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index 10096f7cab7..ff9c5dc8507 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -1089,87 +1089,129 @@ > (set_attr "type" "mmxshft,sselog,sselog") > (set_attr "mode" "DI,TI,TI")]) > > -(define_insn "mmx_punpckhbw" > - [(set (match_operand:V8QI 0 "register_operand" "=y") > +(define_insn_and_split "mmx_punpckhbw" > + [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv") > (vec_select:V8QI > (vec_concat:V16QI > - (match_operand:V8QI 1 "register_operand" "0") > - (match_operand:V8QI 2 "nonimmediate_operand" "ym")) > + (match_operand:V8QI 1 "register_operand" "0,0,Yv") > + (match_operand:V8QI 2 "nonimmediate_operand" "ym,x,Yv")) > (parallel [(const_int 4) (const_int 12) > (const_int 5) (const_int 13) > (const_int 6) (const_int 14) > (const_int 7) (const_int 15)])))] > - "TARGET_MMX" > - "punpckhbw\t{%2, %0|%0, %2}" > - [(set_attr "type" "mmxcvt") > - (set_attr "mode" "DI")]) > + "TARGET_MMX || TARGET_MMX_WITH_SSE" > + "@ > + punpckhbw\t{%2, %0|%0, %2} > + # > + #" > + "&& reload_completed && TARGET_MMX_WITH_SSE" > + [(const_int 0)] > + "ix86_split_mmx_punpck (operands, true);" > + [(set_attr "mmx_isa" "native,x64_noavx,x64_avx") > + (set_attr "type" "mmxcvt,sselog,sselog") > + (set_attr "mode" "DI,TI,TI")]) > > -(define_insn "mmx_punpcklbw" > - [(set (match_operand:V8QI 0 "register_operand" "=y") > +(define_insn_and_split "mmx_punpcklbw" > + [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv") > (vec_select:V8QI > (vec_concat:V16QI > - (match_operand:V8QI 1 "register_operand" "0") > - (match_operand:V8QI 2 "nonimmediate_operand" "ym")) > + (match_operand:V8QI 1 "register_operand" "0,0,Yv") > + (match_operand:V8QI 2 "nonimmediate_operand" "ym,x,Yv")) > (parallel [(const_int 0) (const_int 8) > (const_int 1) (const_int 9) > (const_int 2) (const_int 10) > (const_int 3) (const_int 11)])))] > - "TARGET_MMX" > - "punpcklbw\t{%2, %0|%0, %k2}" > - [(set_attr "type" "mmxcvt") > 
- (set_attr "mode" "DI")]) > + "TARGET_MMX || TARGET_MMX_WITH_SSE" > + "@ > + punpcklbw\t{%2, %0|%0, %k2} > + # > + #" > + "&& reload_completed && TARGET_MMX_WITH_SSE" > + [(const_int 0)] > + "ix86_split_mmx_punpck (operands, false);" > + [(set_attr "mmx_isa" "native,x64_noavx,x64_avx") > + (set_attr "type" "mmxcvt,sselog,sselog") > + (set_attr "mode" "DI,TI,TI")]) > > -(define_insn "mmx_punpckhwd" > - [(set (match_operand:V4HI 0 "register_operand" "=y") > +(define_insn_and_split "mmx_punpckhwd" > + [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv") > (vec_select:V4HI > (vec_concat:V8HI > - (match_operand:V4HI 1 "register_operand" "0") > - (match_operand:V4HI 2 "nonimmediate_operand" "ym")) > + (match_operand:V4HI 1 "register_operand" "0,0,Yv") > + (match_operand:V4HI 2 "nonimmediate_operand" "ym,x,Yv")) > (parallel [(const_int 2) (const_int 6) > (const_int 3) (const_int 7)])))] > - "TARGET_MMX" > - "punpckhwd\t{%2, %0|%0, %2}" > - [(set_attr "type" "mmxcvt") > - (set_attr "mode" "DI")]) > + "TARGET_MMX || TARGET_MMX_WITH_SSE" > + "@ > + punpckhwd\t{%2, %0|%0, %2} > + # > + #" > + "&& reload_completed && TARGET_MMX_WITH_SSE" > + [(const_int 0)] > + "ix86_split_mmx_punpck (operands, true);" > + [(set_attr "mmx_isa" "native,x64_noavx,x64_avx") > + (set_attr "type" "mmxcvt,sselog,sselog") > + (set_attr "mode" "DI,TI,TI")]) > > -(define_insn "mmx_punpcklwd" > - [(set (match_operand:V4HI 0 "register_operand" "=y") > +(define_insn_and_split "mmx_punpcklwd" > + [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv") > (vec_select:V4HI > (vec_concat:V8HI > - (match_operand:V4HI 1 "register_operand" "0") > - (match_operand:V4HI 2 "nonimmediate_operand" "ym")) > + (match_operand:V4HI 1 "register_operand" "0,0,Yv") > + (match_operand:V4HI 2 "nonimmediate_operand" "ym,x,Yv")) > (parallel [(const_int 0) (const_int 4) > (const_int 1) (const_int 5)])))] > - "TARGET_MMX" > - "punpcklwd\t{%2, %0|%0, %k2}" > - [(set_attr "type" "mmxcvt") > - (set_attr "mode" "DI")]) > + 
"TARGET_MMX || TARGET_MMX_WITH_SSE" > + "@ > + punpcklwd\t{%2, %0|%0, %k2} > + # > + #" > + "&& reload_completed && TARGET_MMX_WITH_SSE" > + [(const_int 0)] > + "ix86_split_mmx_punpck (operands, false);" > + [(set_attr "mmx_isa" "native,x64_noavx,x64_avx") > + (set_attr "type" "mmxcvt,sselog,sselog") > + (set_attr "mode" "DI,TI,TI")]) > > -(define_insn "mmx_punpckhdq" > - [(set (match_operand:V2SI 0 "register_operand" "=y") > +(define_insn_and_split "mmx_punpckhdq" > + [(set (match_operand:V2SI 0 "register_operand" "=y,x,Yv") > (vec_select:V2SI > (vec_concat:V4SI > - (match_operand:V2SI 1 "register_operand" "0") > - (match_operand:V2SI 2 "nonimmediate_operand" "ym")) > + (match_operand:V2SI 1 "register_operand" "0,0,Yv") > + (match_operand:V2SI 2 "nonimmediate_operand" "ym,x,Yv")) > (parallel [(const_int 1) > (const_int 3)])))] > - "TARGET_MMX" > - "punpckhdq\t{%2, %0|%0, %2}" > - [(set_attr "type" "mmxcvt") > - (set_attr "mode" "DI")]) > + "TARGET_MMX || TARGET_MMX_WITH_SSE" > + "@ > + punpckhdq\t{%2, %0|%0, %2} > + # > + #" > + "&& reload_completed && TARGET_MMX_WITH_SSE" > + [(const_int 0)] > + "ix86_split_mmx_punpck (operands, true);" > + [(set_attr "mmx_isa" "native,x64_noavx,x64_avx") > + (set_attr "type" "mmxcvt,sselog,sselog") > + (set_attr "mode" "DI,TI,TI")]) > > -(define_insn "mmx_punpckldq" > - [(set (match_operand:V2SI 0 "register_operand" "=y") > +(define_insn_and_split "mmx_punpckldq" > + [(set (match_operand:V2SI 0 "register_operand" "=y,x,Yv") > (vec_select:V2SI > (vec_concat:V4SI > - (match_operand:V2SI 1 "register_operand" "0") > - (match_operand:V2SI 2 "nonimmediate_operand" "ym")) > + (match_operand:V2SI 1 "register_operand" "0,0,Yv") > + (match_operand:V2SI 2 "nonimmediate_operand" "ym,x,Yv")) > (parallel [(const_int 0) > (const_int 2)])))] > - "TARGET_MMX" > - "punpckldq\t{%2, %0|%0, %k2}" > - [(set_attr "type" "mmxcvt") > - (set_attr "mode" "DI")]) > + "TARGET_MMX || TARGET_MMX_WITH_SSE" > + "@ > + punpckldq\t{%2, %0|%0, %k2} > + # > + #" > 
+ "&& reload_completed && TARGET_MMX_WITH_SSE" > + [(const_int 0)] > + "ix86_split_mmx_punpck (operands, false);" > + [(set_attr "mmx_isa" "native,x64_noavx,x64_avx") > + (set_attr "type" "mmxcvt,sselog,sselog") > + (set_attr "mode" "DI,TI,TI")]) > > (define_expand "mmx_pinsrw" > [(set (match_operand:V4HI 0 "register_operand") > -- > 2.20.1 > >