If we don't implement this pattern, the vectorizer is happy to
unpack the v4si and use the full mulv2di3.  This results in
more element shuffling than is required.
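
For concreteness, a loop of roughly the shape below (an illustration
only, not part of the patch) is the kind of input affected: with
vec_widen_smult_even_v4si available, the even and odd 32x32->64
products can be formed directly from the V4SImode vectors instead of
unpacking to V2DImode first.

  void
  widen_mul (long long *restrict d, const int *restrict a,
             const int *restrict b, int n)
  {
    for (int i = 0; i < n; ++i)
      d[i] = (long long) a[i] * b[i];
  }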

        * config/i386/i386.c (bdesc_args): Update.  Change
        IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI to OPTION_MASK_ISA_SSE2.
        (IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI): New.
        (ix86_builtin_mul_widen_even): Use it.
        (ix86_builtin_mul_widen_odd): Relax SMUL_ODD from sse4 to sse2.
        (ix86_expand_mul_widen_evenodd): Handle signed for sse2.
        * config/i386/sse.md (vec_widen_<s>mult_hi_<VI124_AVX2>): Allow
        for all SSE2.
        (vec_widen_<s>mult_lo_<VI124_AVX2>): Likewise.
        (vec_widen_<s>mult_odd_<VI4_AVX2>): Likewise.  Relax from VI124_AVX2.
        (vec_widen_smult_even_v4si): New.
---
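A note for review (not part of the change itself): the new plain-SSE2
signed path in ix86_expand_mul_widen_evenodd forms each even-lane
product from three PMULUDQs plus a sign fixup.  The scalar model below
is only my illustration of that identity; the function names are made
up and nothing like it appears in the patch.

  #include <stdint.h>
  #include <assert.h>

  /* Signed 32x32->64 multiply built from unsigned 32x32->64 multiplies,
     mirroring the vector sequence: t0 = pmuludq (op1, op2),
     t1 = pmuludq (sign mask of op1, op2), t2 = pmuludq (sign mask of op2,
     op1), result = t0 + ((t1 + t2) << 32).  */
  static uint64_t
  widen_smul_model (int32_t a, int32_t b)
  {
    uint64_t sa = a < 0 ? 0xffffffffu : 0;  /* highpart of sign-extended a */
    uint64_t sb = b < 0 ? 0xffffffffu : 0;  /* highpart of sign-extended b */
    uint64_t t1 = sa * (uint32_t) b;
    uint64_t t2 = sb * (uint32_t) a;
    uint64_t t0 = (uint64_t) (uint32_t) a * (uint32_t) b;
    return t0 + ((t1 + t2) << 32);
  }

  int
  main (void)
  {
    assert (widen_smul_model (-3, 7) == (uint64_t) ((int64_t) -3 * 7));
    assert (widen_smul_model (-70000, -80000)
            == (uint64_t) ((int64_t) -70000 * -80000));
    return 0;
  }
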
 gcc/ChangeLog          |   14 +++++++++
 gcc/config/i386/i386.c |   77 +++++++++++++++++++++++++++++------------------
 gcc/config/i386/sse.md |   29 +++++++++++-------
 3 files changed, 79 insertions(+), 41 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5cf230f..b96fc6e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -25758,6 +25758,7 @@ enum ix86_builtins
   IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI,
+  IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI,
 
@@ -26620,7 +26621,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_vw_umul_even_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_smult_even_v4si, "__builtin_ia32_vw_smul_even_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_odd_v4si, "__builtin_ia32_vw_umul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
 
@@ -26747,7 +26750,6 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
-  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
 
   /* SSE4.1 */
@@ -31067,18 +31069,10 @@ ix86_builtin_mul_widen_even (tree type)
   switch (TYPE_MODE (type))
     {
     case V4SImode:
-      if (uns_p)
-       {
-         if (!TARGET_SSE2)
-           return NULL;
-         code = IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI;
-       }
-      else
-       {
-         if (!TARGET_SSE4_1)
-           return NULL;
-         code = IX86_BUILTIN_PMULDQ128;
-       }
+      if (!TARGET_SSE2)
+       return NULL;
+      code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI
+             : IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI);
       break;
 
     case V8SImode:
@@ -31103,18 +31097,10 @@ ix86_builtin_mul_widen_odd (tree type)
   switch (TYPE_MODE (type))
     {
     case V4SImode:
-      if (uns_p)
-       {
-         if (!TARGET_SSE2)
-           return NULL;
-         code = IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI;
-       }
-      else
-       {
-         if (!TARGET_SSE4_1)
-           return NULL;
-         code = IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI;
-       }
+      if (!TARGET_SSE2)
+       return NULL;
+      code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI
+             : IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI);
       break;
 
     case V8SImode:
@@ -38774,12 +38760,12 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
          emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
          return;
        }
+
+      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
       op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
-                         GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
-                         1, OPTAB_DIRECT);
+                         x, NULL, 1, OPTAB_DIRECT);
       op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
-                         GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
-                         1, OPTAB_DIRECT);
+                         x, NULL, 1, OPTAB_DIRECT);
       op1 = gen_lowpart (mode, op1);
       op2 = gen_lowpart (mode, op2);
     }
@@ -38801,7 +38787,38 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
       x = gen_xop_pmacsdql (dest, op1, op2, x);
     }
   else
-    gcc_unreachable ();
+    {
+      rtx s1, s2, t0, t1, t2;
+
+      /* The easiest way to implement this without PMULDQ is to go through
+        the motions as if we are performing a full 64-bit multiply.  With
+        the exception that we need to do less shuffling of the elements.  */
+
+      /* Compute the sign-extension, aka highparts, of the two operands.  */
+      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+                               op1, pc_rtx, pc_rtx);
+      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+                               op2, pc_rtx, pc_rtx);
+
+      /* Multiply LO(A) * HI(B), and vice-versa.  */
+      t1 = gen_reg_rtx (wmode);
+      t2 = gen_reg_rtx (wmode);
+      emit_insn (gen_sse2_umulv2siv2di3 (t1, s1, op2));
+      emit_insn (gen_sse2_umulv2siv2di3 (t2, s2, op1));
+
+      /* Multiply LO(A) * LO(B).  */
+      t0 = gen_reg_rtx (wmode);
+      emit_insn (gen_sse2_umulv2siv2di3 (t0, op1, op2));
+
+      /* Combine and shift the highparts into place.  */
+      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
+      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
+                        1, OPTAB_DIRECT);
+
+      /* Combine high and low parts.  */
+      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
+      return;
+    }
   emit_insn (x);
 }
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 81e7dc0..754b8b4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5607,9 +5607,7 @@
    (any_extend:<sseunpackmode>
      (match_operand:VI124_AVX2 1 "register_operand"))
    (match_operand:VI124_AVX2 2 "register_operand")]
-  ; Note that SSE2 does not have signed SI multiply
-  "TARGET_XOP || TARGET_SSE4_1
-   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+  "TARGET_SSE2"
 {
   ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
                              <u_bool>, true);
@@ -5621,23 +5619,32 @@
    (any_extend:<sseunpackmode>
      (match_operand:VI124_AVX2 1 "register_operand"))
    (match_operand:VI124_AVX2 2 "register_operand")]
-  ; Note that SSE2 does not have signed SI multiply
-  "TARGET_XOP || TARGET_SSE4_1
-   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+  "TARGET_SSE2"
 {
   ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
                              <u_bool>, false);
   DONE;
 })
 
+;; Most widen_<s>mult_even_<mode> can be handled directly from other
+;; named patterns, but signed V4SI needs special help for plain SSE2.
+(define_expand "vec_widen_smult_even_v4si"
+  [(match_operand:V2DI 0 "register_operand")
+   (match_operand:V4SI 1 "register_operand")
+   (match_operand:V4SI 2 "register_operand")]
+  "TARGET_SSE2"
+{
+  ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
+                                false, false);
+  DONE;
+})
+
 (define_expand "vec_widen_<s>mult_odd_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
    (any_extend:<sseunpackmode>
-     (match_operand:VI124_AVX2 1 "register_operand"))
-   (match_operand:VI124_AVX2 2 "register_operand")]
-  ; Note that SSE2 does not have signed SI multiply
-  "TARGET_AVX || TARGET_XOP || TARGET_SSE4_1
-   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+     (match_operand:VI4_AVX2 1 "register_operand"))
+   (match_operand:VI4_AVX2 2 "register_operand")]
+  "TARGET_SSE2"
 {
   ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
                                 <u_bool>, true);
-- 
1.7.7.6
