[PATCH, i386]: Fix PR 52908 - xop-mul-1:f9 miscompiled on bulldozer (-mxop)

Uros Bizjak Wed, 09 May 2012 13:39:29 -0700

Hello!

The attached patch fixes PR 52908. There is no need to generate "fake"
multiply instructions after reload; we can expand directly to MAC
(multiply/add) instructions. This approach even produced better assembly
for a couple of testcases in the gcc.target/i386 testsuite.


2012-05-09  Uros Bizjak  <ubiz...@gmail.com>

        PR target/52908
        * config/i386/sse.md (vec_widen_smult_hi_v4si): Expand using
        xop_pmacsdqh insn pattern instead of xop_mulv2div2di3_high.
        (vec_widen_smult_lo_v4si): Expand using xop_pmacsdql insn pattern
        instead of xop_mulv2div2di3_low.
        (xop_p<macs>dql): Fix vec_select selector.
        (xop_p<macs>dqh): Ditto.
        (xop_mulv2div2di3_low): Remove insn_and_split pattern.
        (xop_mulv2div2di3_high): Ditto.

testsuite/ChangeLog:

        PR target/52908
        * gcc.target/i386/xop-imul32widen-vector.c: Update scan-assembler
        directive to scan for vpmuldq, not vpmacsdql.

Bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}, and
tested on an XOP target by Venkataramanan.

The patch was committed to mainline SVN. The version attached to the PR
should be backported to other release branches, but another volunteer
should do the backport, since I don't have access to an XOP target.

Also, please note that the XOP horizontal add/subtract instructions (and
possibly others) have their vec_select parallel RTX in the wrong endianness.

Uros.

Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md  (revision 187347)
+++ config/i386/sse.md  (working copy)
@@ -5748,11 +5748,15 @@
 
   if (TARGET_XOP)
     {
+      rtx t3 = gen_reg_rtx (V2DImode);
+
       emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
                                    GEN_INT (1), GEN_INT (3)));
       emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
                                    GEN_INT (1), GEN_INT (3)));
-      emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+      emit_move_insn (t3, CONST0_RTX (V2DImode));
+
+      emit_insn (gen_xop_pmacsdqh (operands[0], t1, t2, t3));
       DONE;
     }
 
@@ -5777,11 +5781,15 @@
 
   if (TARGET_XOP)
     {
+      rtx t3 = gen_reg_rtx (V2DImode);
+
       emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
                                    GEN_INT (1), GEN_INT (3)));
       emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
                                    GEN_INT (1), GEN_INT (3)));
-      emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+      emit_move_insn (t3, CONST0_RTX (V2DImode));
+
+      emit_insn (gen_xop_pmacsdql (operands[0], t1, t2, t3));
       DONE;
     }
 
@@ -9792,11 +9800,11 @@
          (sign_extend:V2DI
           (vec_select:V2SI
            (match_operand:V4SI 1 "nonimmediate_operand" "%x")
-           (parallel [(const_int 1) (const_int 3)])))
+           (parallel [(const_int 0) (const_int 2)])))
          (sign_extend:V2DI
           (vec_select:V2SI
            (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-           (parallel [(const_int 1) (const_int 3)]))))
+           (parallel [(const_int 0) (const_int 2)]))))
         (match_operand:V2DI 3 "nonimmediate_operand" "x")))]
   "TARGET_XOP"
   "vp<macs>dql\t{%3, %2, %1, %0|%0, %1, %2, %3}"
@@ -9810,93 +9818,17 @@
          (sign_extend:V2DI
           (vec_select:V2SI
            (match_operand:V4SI 1 "nonimmediate_operand" "%x")
-           (parallel [(const_int 0) (const_int 2)])))
+           (parallel [(const_int 1) (const_int 3)])))
          (sign_extend:V2DI
           (vec_select:V2SI
            (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-           (parallel [(const_int 0) (const_int 2)]))))
+           (parallel [(const_int 1) (const_int 3)]))))
         (match_operand:V2DI 3 "nonimmediate_operand" "x")))]
   "TARGET_XOP"
   "vp<macs>dqh\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ssemuladd")
    (set_attr "mode" "TI")])
 
-;; We don't have a straight 32-bit parallel multiply and extend on XOP, so
-;; fake it with a multiply/add.  In general, we expect the define_split to
-;; occur before register allocation, so we have to handle the corner case where
-;; the target is the same as operands 1/2
-(define_insn_and_split "xop_mulv2div2di3_low"
-  [(set (match_operand:V2DI 0 "register_operand" "=&x")
-       (mult:V2DI
-         (sign_extend:V2DI
-           (vec_select:V2SI
-             (match_operand:V4SI 1 "register_operand" "%x")
-             (parallel [(const_int 1) (const_int 3)])))
-         (sign_extend:V2DI
-           (vec_select:V2SI
-             (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-             (parallel [(const_int 1) (const_int 3)])))))]
-  "TARGET_XOP"
-  "#"
-  "&& reload_completed"
-  [(set (match_dup 0)
-       (match_dup 3))
-   (set (match_dup 0)
-       (plus:V2DI
-        (mult:V2DI
-         (sign_extend:V2DI
-          (vec_select:V2SI
-           (match_dup 1)
-           (parallel [(const_int 1) (const_int 3)])))
-         (sign_extend:V2DI
-          (vec_select:V2SI
-           (match_dup 2)
-           (parallel [(const_int 1) (const_int 3)]))))
-        (match_dup 0)))]
-{
-  operands[3] = CONST0_RTX (V2DImode);
-}
-  [(set_attr "type" "ssemul")
-   (set_attr "mode" "TI")])
-
-;; We don't have a straight 32-bit parallel multiply and extend on XOP, so
-;; fake it with a multiply/add.  In general, we expect the define_split to
-;; occur before register allocation, so we have to handle the corner case where
-;; the target is the same as either operands[1] or operands[2]
-(define_insn_and_split "xop_mulv2div2di3_high"
-  [(set (match_operand:V2DI 0 "register_operand" "=&x")
-       (mult:V2DI
-         (sign_extend:V2DI
-           (vec_select:V2SI
-             (match_operand:V4SI 1 "register_operand" "%x")
-             (parallel [(const_int 0) (const_int 2)])))
-         (sign_extend:V2DI
-           (vec_select:V2SI
-             (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-             (parallel [(const_int 0) (const_int 2)])))))]
-  "TARGET_XOP"
-  "#"
-  "&& reload_completed"
-  [(set (match_dup 0)
-       (match_dup 3))
-   (set (match_dup 0)
-       (plus:V2DI
-        (mult:V2DI
-         (sign_extend:V2DI
-          (vec_select:V2SI
-           (match_dup 1)
-           (parallel [(const_int 0) (const_int 2)])))
-         (sign_extend:V2DI
-          (vec_select:V2SI
-           (match_dup 2)
-           (parallel [(const_int 0) (const_int 2)]))))
-        (match_dup 0)))]
-{
-  operands[3] = CONST0_RTX (V2DImode);
-}
-  [(set_attr "type" "ssemul")
-   (set_attr "mode" "TI")])
-
 ;; XOP parallel integer multiply/add instructions for the intrinisics
 (define_insn "xop_p<macs>wd"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
Index: testsuite/gcc.target/i386/xop-imul32widen-vector.c
===================================================================
--- testsuite/gcc.target/i386/xop-imul32widen-vector.c  (revision 187347)
+++ testsuite/gcc.target/i386/xop-imul32widen-vector.c  (working copy)
@@ -32,5 +32,5 @@
   exit (0);
 }
 
-/* { dg-final { scan-assembler "vpmacsdql" } } */
+/* { dg-final { scan-assembler "vpmuldq" } } */
 /* { dg-final { scan-assembler "vpmacsdqh" } } */

[PATCH, i386]: Fix PR 52908 - xop-mul-1:f9 miscompiled on bulldozer (-mxop)

Reply via email to