Hello! Please note that you will probably hit PR33329; this is the reason we expand multiplications after reload. Please see [1] for further explanation. There is a gcc.target/i386/pr33329.c test to cover this issue, but it is not effective anymore, since the simplification now happens at the tree level.
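Just for illustration, the kind of source the vectorizer turns into a V4SImode multiply by a constant vector (the case the CONST_VECTOR path below has to handle) looks something like this; a minimal sketch only, not the actual pr33329.c testcase, and assuming the tree level does not simplify the multiplication away first:

  int x[256], y[256];

  void
  scale (void)
  {
    int i;

    /* With -O2 -msse2 the loop vectorizer wants mulv4si3 here, with
       operand 2 a constant vector of 12345s.  */
    for (i = 0; i < 256; i++)
      x[i] = y[i] * 12345;
  }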
[1] http://gcc.gnu.org/ml/gcc-patches/2007-09/msg00668.html

Uros.

On Fri, Jun 15, 2012 at 10:57 PM, Richard Henderson <r...@redhat.com> wrote:
> ---
>  gcc/config/i386/i386-protos.h |    1 +
>  gcc/config/i386/i386.c        |   76 +++++++++++++++++++++++++++++++++++++++++
>  gcc/config/i386/predicates.md |    7 ++++
>  gcc/config/i386/sse.md        |   72 +++++++-------------------------------
>  4 files changed, 97 insertions(+), 59 deletions(-)
>
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index f300a56..431db6c 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -222,6 +222,7 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
>
>  extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
>  extern bool ix86_expand_pinsr (rtx *);
> +extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
>
>  /* In i386-c.c */
>  extern void ix86_target_macros (void);
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 578a756..0dc08f3 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -38438,6 +38438,82 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
>    expand_vec_perm_even_odd_1 (&d, odd);
>  }
>
> +void
> +ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
> +{
> +  rtx op1_m1, op1_m2;
> +  rtx op2_m1, op2_m2;
> +  rtx res_1, res_2;
> +
> +  /* Shift both input vectors down one element, so that elements 3
> +     and 1 are now in the slots for elements 2 and 0.  For K8, at
> +     least, this is faster than using a shuffle.  */
> +  op1_m1 = op1 = force_reg (V4SImode, op1);
> +  op1_m2 = gen_reg_rtx (V4SImode);
> +  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
> +                                 gen_lowpart (V1TImode, op1),
> +                                 GEN_INT (32)));
> +
> +  if (GET_CODE (op2) == CONST_VECTOR)
> +    {
> +      rtvec v;
> +
> +      /* Constant propagate the vector shift, leaving the dont-care
> +         vector elements as zero.  */
> +      v = rtvec_alloc (4);
> +      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
> +      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
> +      RTVEC_ELT (v, 1) = const0_rtx;
> +      RTVEC_ELT (v, 3) = const0_rtx;
> +      op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
> +      op2_m1 = force_reg (V4SImode, op2_m1);
> +
> +      v = rtvec_alloc (4);
> +      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
> +      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
> +      RTVEC_ELT (v, 1) = const0_rtx;
> +      RTVEC_ELT (v, 3) = const0_rtx;
> +      op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
> +      op2_m2 = force_reg (V4SImode, op2_m2);
> +    }
> +  else
> +    {
> +      op2_m1 = op2 = force_reg (V4SImode, op2);
> +      op2_m2 = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
> +                                     gen_lowpart (V1TImode, op2),
> +                                     GEN_INT (32)));
> +    }
> +
> +  /* Widening multiply of elements 0+2, and 1+3.  */
> +  res_1 = gen_reg_rtx (V4SImode);
> +  res_2 = gen_reg_rtx (V4SImode);
> +  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
> +                                     op1_m1, op2_m1));
> +  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
> +                                     op1_m2, op2_m2));
> +
> +  /* Move the results in element 2 down to element 1; we don't care
> +     what goes in elements 2 and 3.  Then we can merge the parts
> +     back together with an interleave.
> +
> +     Note that two other sequences were tried:
> +     (1) Use interleaves at the start instead of psrldq, which allows
> +         us to use a single shufps to merge things back at the end.
> +     (2) Use shufps here to combine the two vectors, then pshufd to
> +         put the elements in the correct order.
> +     In both cases the cost of the reformatting stall was too high
> +     and the overall sequence slower.  */
> +
> +  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
> +                                const0_rtx, const0_rtx));
> +  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
> +                                const0_rtx, const0_rtx));
> +  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
> +
> +  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
> +}
> +
>  /* Expand an insert into a vector register through pinsr insn.
>     Return true if successful.  */
>
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index 92db809..f23e932 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -816,6 +816,13 @@
>    return false;
>  })
>
> +;; Return true when OP is a nonimmediate or a vector constant.  Note
> +;; that most vector constants are not legitimate operands, so we need
> +;; to special-case this.
> +(define_predicate "nonimmediate_or_const_vector_operand"
> +  (ior (match_code "const_vector")
> +       (match_operand 0 "nonimmediate_operand")))
> +
>  ;; Return true if OP is a register or a zero.
>  (define_predicate "reg_or_0_operand"
>    (ior (match_operand 0 "register_operand")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 6a8206a..1f6fdb4 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -5610,12 +5610,22 @@
>
>  (define_expand "mul<mode>3"
>    [(set (match_operand:VI4_AVX2 0 "register_operand")
> -        (mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand")
> -                       (match_operand:VI4_AVX2 2 "register_operand")))]
> +        (mult:VI4_AVX2
> +          (match_operand:VI4_AVX2 1 "nonimmediate_operand")
> +          (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
>    "TARGET_SSE2"
>  {
>    if (TARGET_SSE4_1 || TARGET_AVX)
> -    ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
> +    {
> +      if (CONSTANT_P (operands[2]))
> +        operands[2] = force_const_mem (<MODE>mode, operands[2]);
> +      ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
> +    }
> +  else
> +    {
> +      ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
> +      DONE;
> +    }
>  })
>
>  (define_insn "*<sse4_1_avx2>_mul<mode>3"
> @@ -5633,62 +5643,6 @@
>    (set_attr "prefix" "orig,vex")
>    (set_attr "mode" "<sseinsnmode>")])
>
> -(define_insn_and_split "*sse2_mulv4si3"
> -  [(set (match_operand:V4SI 0 "register_operand")
> -        (mult:V4SI (match_operand:V4SI 1 "register_operand")
> -                   (match_operand:V4SI 2 "register_operand")))]
> -  "TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX
> -   && can_create_pseudo_p ()"
> -  "#"
> -  "&& 1"
> -  [(const_int 0)]
> -{
> -  rtx t1, t2, t3, t4, t5, t6, thirtytwo;
> -  rtx op0, op1, op2;
> -
> -  op0 = operands[0];
> -  op1 = operands[1];
> -  op2 = operands[2];
> -  t1 = gen_reg_rtx (V4SImode);
> -  t2 = gen_reg_rtx (V4SImode);
> -  t3 = gen_reg_rtx (V4SImode);
> -  t4 = gen_reg_rtx (V4SImode);
> -  t5 = gen_reg_rtx (V4SImode);
> -  t6 = gen_reg_rtx (V4SImode);
> -  thirtytwo = GEN_INT (32);
> -
> -  /* Multiply elements 2 and 0.  */
> -  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
> -                                     op1, op2));
> -
> -  /* Shift both input vectors down one element, so that elements 3
> -     and 1 are now in the slots for elements 2 and 0.  For K8, at
> -     least, this is faster than using a shuffle.  */
> -  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2),
> -                                 gen_lowpart (V1TImode, op1),
> -                                 thirtytwo));
> -  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3),
> -                                 gen_lowpart (V1TImode, op2),
> -                                 thirtytwo));
> -  /* Multiply elements 3 and 1.  */
> -  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
> -                                     t2, t3));
> -
> -  /* Move the results in element 2 down to element 1; we don't care
> -     what goes in elements 2 and 3.  */
> -  emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
> -                                const0_rtx, const0_rtx));
> -  emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
> -                                const0_rtx, const0_rtx));
> -
> -  /* Merge the parts back together.  */
> -  emit_insn (gen_vec_interleave_lowv4si (op0, t5, t6));
> -
> -  set_unique_reg_note (get_last_insn (), REG_EQUAL,
> -                       gen_rtx_MULT (V4SImode, operands[1], operands[2]));
> -  DONE;
> -})
> -
>  (define_insn_and_split "mul<mode>3"
>    [(set (match_operand:VI8_AVX2 0 "register_operand")
>          (mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
> --
> 1.7.7.6
>
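FWIW, in intrinsics terms the sequence the new ix86_expand_sse2_mulv4si3 emits for the register-operand case is roughly the following; an illustrative sketch only, not part of the patch, and the helper name is made up:

  #include <emmintrin.h>

  /* V4SI multiply built from pmuludq/psrldq/pshufd/punpckldq, for
     SSE2 targets without pmulld.  */
  static __m128i
  mulv4si_sse2 (__m128i a, __m128i b)
  {
    /* 64-bit products of elements 0 and 2.  */
    __m128i even = _mm_mul_epu32 (a, b);
    /* Shift both inputs down one element so that elements 1 and 3 move
       into slots 0 and 2, then form their 64-bit products.  */
    __m128i odd = _mm_mul_epu32 (_mm_srli_si128 (a, 4),
                                 _mm_srli_si128 (b, 4));
    /* Keep the low 32 bits of each product, packed into lanes 0 and 1.  */
    even = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
    odd = _mm_shuffle_epi32 (odd, _MM_SHUFFLE (0, 0, 2, 0));
    /* Interleave to restore the original element order.  */
    return _mm_unpacklo_epi32 (even, odd);
  }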