Hello!

Please note that you will probably hit PR33329; this is the reason
we expand multiplications after reload. Please see [1] for further
explanation. There is a gcc.target/i386/pr33329.c test to cover this
issue, but it is not effective anymore since the simplification now
happens at the tree level.

[1] http://gcc.gnu.org/ml/gcc-patches/2007-09/msg00668.html

Uros.
On Fri, Jun 15, 2012 at 10:57 PM, Richard Henderson <r...@redhat.com> wrote:
> ---
>  gcc/config/i386/i386-protos.h |    1 +
>  gcc/config/i386/i386.c        |   76 +++++++++++++++++++++++++++++++++++++++++
>  gcc/config/i386/predicates.md |    7 ++++
>  gcc/config/i386/sse.md        |   72 +++++++-------------------------------
>  4 files changed, 97 insertions(+), 59 deletions(-)
>
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index f300a56..431db6c 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -222,6 +222,7 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
>
>  extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
>  extern bool ix86_expand_pinsr (rtx *);
> +extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
>
>  /* In i386-c.c  */
>  extern void ix86_target_macros (void);
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 578a756..0dc08f3 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -38438,6 +38438,82 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
>   expand_vec_perm_even_odd_1 (&d, odd);
>  }
>
> +void
> +ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
> +{
> +  rtx op1_m1, op1_m2;
> +  rtx op2_m1, op2_m2;
> +  rtx res_1, res_2;
> +
> +  /* Shift both input vectors down one element, so that elements 3
> +     and 1 are now in the slots for elements 2 and 0.  For K8, at
> +     least, this is faster than using a shuffle.  */
> +  op1_m1 = op1 = force_reg (V4SImode, op1);
> +  op1_m2 = gen_reg_rtx (V4SImode);
> +  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
> +                                gen_lowpart (V1TImode, op1),
> +                                GEN_INT (32)));
> +
> +  if (GET_CODE (op2) == CONST_VECTOR)
> +    {
> +      rtvec v;
> +
> +      /* Constant propagate the vector shift, leaving the dont-care
> +        vector elements as zero.  */
> +      v = rtvec_alloc (4);
> +      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
> +      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
> +      RTVEC_ELT (v, 1) = const0_rtx;
> +      RTVEC_ELT (v, 3) = const0_rtx;
> +      op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
> +      op2_m1 = force_reg (V4SImode, op2_m1);
> +
> +      v = rtvec_alloc (4);
> +      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
> +      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
> +      RTVEC_ELT (v, 1) = const0_rtx;
> +      RTVEC_ELT (v, 3) = const0_rtx;
> +      op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
> +      op2_m2 = force_reg (V4SImode, op2_m2);
> +    }
> +  else
> +    {
> +      op2_m1 = op2 = force_reg (V4SImode, op2);
> +      op2_m2 = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
> +                                    gen_lowpart (V1TImode, op2),
> +                                    GEN_INT (32)));
> +    }
> +
> +  /* Widening multiply of elements 0+2, and 1+3.  */
> +  res_1 = gen_reg_rtx (V4SImode);
> +  res_2 = gen_reg_rtx (V4SImode);
> +  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
> +                                    op1_m1, op2_m1));
> +  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
> +                                    op1_m2, op2_m2));
> +
> +  /* Move the results in element 2 down to element 1; we don't care
> +     what goes in elements 2 and 3.  Then we can merge the parts
> +     back together with an interleave.
> +
> +     Note that two other sequences were tried:
> +     (1) Use interleaves at the start instead of psrldq, which allows
> +     us to use a single shufps to merge things back at the end.
> +     (2) Use shufps here to combine the two vectors, then pshufd to
> +     put the elements in the correct order.
> +     In both cases the cost of the reformatting stall was too high
> +     and the overall sequence slower.  */
> +
> +  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
> +                               const0_rtx, const0_rtx));
> +  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
> +                               const0_rtx, const0_rtx));
> +  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
> +
> +  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
> +}
> +
>  /* Expand an insert into a vector register through pinsr insn.
>    Return true if successful.  */
>
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index 92db809..f23e932 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -816,6 +816,13 @@
>   return false;
>  })
>
> +;; Return true when OP is a nonimmediate or a vector constant.  Note
> +;; that most vector constants are not legitimate operands, so we need
> +;; to special-case this.
> +(define_predicate "nonimmediate_or_const_vector_operand"
> +  (ior (match_code "const_vector")
> +       (match_operand 0 "nonimmediate_operand")))
> +
>  ;; Return true if OP is a register or a zero.
>  (define_predicate "reg_or_0_operand"
>   (ior (match_operand 0 "register_operand")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 6a8206a..1f6fdb4 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -5610,12 +5610,22 @@
>
>  (define_expand "mul<mode>3"
>   [(set (match_operand:VI4_AVX2 0 "register_operand")
> -       (mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand")
> -                      (match_operand:VI4_AVX2 2 "register_operand")))]
> +       (mult:VI4_AVX2
> +         (match_operand:VI4_AVX2 1 "nonimmediate_operand")
> +         (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
>   "TARGET_SSE2"
>  {
>   if (TARGET_SSE4_1 || TARGET_AVX)
> -    ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
> +    {
> +      if (CONSTANT_P (operands[2]))
> +       operands[2] = force_const_mem (<MODE>mode, operands[2]);
> +      ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
> +    }
> +  else
> +    {
> +      ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
> +      DONE;
> +    }
>  })
>
>  (define_insn "*<sse4_1_avx2>_mul<mode>3"
> @@ -5633,62 +5643,6 @@
>    (set_attr "prefix" "orig,vex")
>    (set_attr "mode" "<sseinsnmode>")])
>
> -(define_insn_and_split "*sse2_mulv4si3"
> -  [(set (match_operand:V4SI 0 "register_operand")
> -       (mult:V4SI (match_operand:V4SI 1 "register_operand")
> -                  (match_operand:V4SI 2 "register_operand")))]
> -  "TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX
> -   && can_create_pseudo_p ()"
> -  "#"
> -  "&& 1"
> -  [(const_int 0)]
> -{
> -  rtx t1, t2, t3, t4, t5, t6, thirtytwo;
> -  rtx op0, op1, op2;
> -
> -  op0 = operands[0];
> -  op1 = operands[1];
> -  op2 = operands[2];
> -  t1 = gen_reg_rtx (V4SImode);
> -  t2 = gen_reg_rtx (V4SImode);
> -  t3 = gen_reg_rtx (V4SImode);
> -  t4 = gen_reg_rtx (V4SImode);
> -  t5 = gen_reg_rtx (V4SImode);
> -  t6 = gen_reg_rtx (V4SImode);
> -  thirtytwo = GEN_INT (32);
> -
> -  /* Multiply elements 2 and 0.  */
> -  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
> -                                    op1, op2));
> -
> -  /* Shift both input vectors down one element, so that elements 3
> -     and 1 are now in the slots for elements 2 and 0.  For K8, at
> -     least, this is faster than using a shuffle.  */
> -  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2),
> -                                gen_lowpart (V1TImode, op1),
> -                                thirtytwo));
> -  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3),
> -                                gen_lowpart (V1TImode, op2),
> -                                thirtytwo));
> -  /* Multiply elements 3 and 1.  */
> -  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
> -                                    t2, t3));
> -
> -  /* Move the results in element 2 down to element 1; we don't care
> -     what goes in elements 2 and 3.  */
> -  emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
> -                               const0_rtx, const0_rtx));
> -  emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
> -                               const0_rtx, const0_rtx));
> -
> -  /* Merge the parts back together.  */
> -  emit_insn (gen_vec_interleave_lowv4si (op0, t5, t6));
> -
> -  set_unique_reg_note (get_last_insn (), REG_EQUAL,
> -                      gen_rtx_MULT (V4SImode, operands[1], operands[2]));
> -  DONE;
> -})
> -
>  (define_insn_and_split "mul<mode>3"
>   [(set (match_operand:VI8_AVX2 0 "register_operand")
>        (mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
> --
> 1.7.7.6
>
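
FWIW, the new SSE2 path is the usual pmuludq/psrldq/pshufd/punpckldq
dance. Here is a rough intrinsics sketch of the same sequence (just an
illustration, assuming an <emmintrin.h> environment; the helper name is
made up, and this is not literally what the expander emits):

#include <emmintrin.h>

/* Same idea as ix86_expand_sse2_mulv4si3: pmuludq handles elements
   0 and 2; shifting both inputs right by one 32-bit element and
   multiplying again handles elements 1 and 3; pshufd moves the
   element-2 results down to element 1; punpckldq interleaves the
   two halves back together.  */
static __m128i
mulv4si_sse2 (__m128i a, __m128i b)
{
  __m128i even = _mm_mul_epu32 (a, b);                  /* a0*b0, a2*b2 */
  __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4),
                                _mm_srli_si128 (b, 4)); /* a1*b1, a3*b3 */

  even = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
  odd  = _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0));
  return _mm_unpacklo_epi32 (even, odd);
}

The CONST_VECTOR case in the expander does the shift at compile time
instead: for op2 = {1, 2, 3, 4} it builds {1, 0, 3, 0} and {2, 0, 4, 0},
with the zeros placed in the slots that pmuludq ignores anyway.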
