Hello! The attached patch fixes PR 52908. There is no need to generate "fake" multiply instructions after reload; we can expand directly to MAC instructions. This approach even produced better assembly for a couple of testcases in the gcc.target/i386 testsuite.
2012-05-09 Uros Bizjak <ubiz...@gmail.com> PR target/52908 * config/i386/sse.md (vec_widen_smult_hi_v4si): Expand using xop_pmacsdqh insn pattern instead of xop_mulv2div2di3_high. (vec_widen_smult_lo_v4si): Expand using xop_pmacsdql insn pattern instead of xop_mulv2div2di3_low. (xop_p<macs>dql): Fix vec_select selector. (xop_p<macs>dqh): Ditto. (xop_mulv2div2di3_low): Remove insn_and_split pattern. (xop_mulv2div2di3_high): Ditto. testsuite/ChangeLog: PR target/52908 * gcc.target/i386/xop-imul32widen-vector.c: Update scan-assembler directive to scan for vpmuldq, not vpmacsdql. Bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}, and tested on an XOP target by Venkataramanan. The patch was committed to mainline SVN. The version attached to the PR should be backported to the other release branches, but another volunteer should do the backport, since I don't have access to an XOP target. Also, please note that the XOP horizontal add/subtract instructions (and possibly others) have their vec_select parallel RTX in the wrong endianness. Uros.
Index: config/i386/sse.md =================================================================== --- config/i386/sse.md (revision 187347) +++ config/i386/sse.md (working copy) @@ -5748,11 +5748,15 @@ if (TARGET_XOP) { + rtx t3 = gen_reg_rtx (V2DImode); + emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2), GEN_INT (1), GEN_INT (3))); emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2), GEN_INT (1), GEN_INT (3))); - emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2)); + emit_move_insn (t3, CONST0_RTX (V2DImode)); + + emit_insn (gen_xop_pmacsdqh (operands[0], t1, t2, t3)); DONE; } @@ -5777,11 +5781,15 @@ if (TARGET_XOP) { + rtx t3 = gen_reg_rtx (V2DImode); + emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2), GEN_INT (1), GEN_INT (3))); emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2), GEN_INT (1), GEN_INT (3))); - emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2)); + emit_move_insn (t3, CONST0_RTX (V2DImode)); + + emit_insn (gen_xop_pmacsdql (operands[0], t1, t2, t3)); DONE; } @@ -9792,11 +9800,11 @@ (sign_extend:V2DI (vec_select:V2SI (match_operand:V4SI 1 "nonimmediate_operand" "%x") - (parallel [(const_int 1) (const_int 3)]))) + (parallel [(const_int 0) (const_int 2)]))) (sign_extend:V2DI (vec_select:V2SI (match_operand:V4SI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 1) (const_int 3)])))) + (parallel [(const_int 0) (const_int 2)])))) (match_operand:V2DI 3 "nonimmediate_operand" "x")))] "TARGET_XOP" "vp<macs>dql\t{%3, %2, %1, %0|%0, %1, %2, %3}" @@ -9810,93 +9818,17 @@ (sign_extend:V2DI (vec_select:V2SI (match_operand:V4SI 1 "nonimmediate_operand" "%x") - (parallel [(const_int 0) (const_int 2)]))) + (parallel [(const_int 1) (const_int 3)]))) (sign_extend:V2DI (vec_select:V2SI (match_operand:V4SI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0) (const_int 2)])))) + (parallel [(const_int 1) (const_int 3)])))) (match_operand:V2DI 3 "nonimmediate_operand" "x")))] "TARGET_XOP" 
"vp<macs>dqh\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "ssemuladd") (set_attr "mode" "TI")]) -;; We don't have a straight 32-bit parallel multiply and extend on XOP, so -;; fake it with a multiply/add. In general, we expect the define_split to -;; occur before register allocation, so we have to handle the corner case where -;; the target is the same as operands 1/2 -(define_insn_and_split "xop_mulv2div2di3_low" - [(set (match_operand:V2DI 0 "register_operand" "=&x") - (mult:V2DI - (sign_extend:V2DI - (vec_select:V2SI - (match_operand:V4SI 1 "register_operand" "%x") - (parallel [(const_int 1) (const_int 3)]))) - (sign_extend:V2DI - (vec_select:V2SI - (match_operand:V4SI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 1) (const_int 3)])))))] - "TARGET_XOP" - "#" - "&& reload_completed" - [(set (match_dup 0) - (match_dup 3)) - (set (match_dup 0) - (plus:V2DI - (mult:V2DI - (sign_extend:V2DI - (vec_select:V2SI - (match_dup 1) - (parallel [(const_int 1) (const_int 3)]))) - (sign_extend:V2DI - (vec_select:V2SI - (match_dup 2) - (parallel [(const_int 1) (const_int 3)])))) - (match_dup 0)))] -{ - operands[3] = CONST0_RTX (V2DImode); -} - [(set_attr "type" "ssemul") - (set_attr "mode" "TI")]) - -;; We don't have a straight 32-bit parallel multiply and extend on XOP, so -;; fake it with a multiply/add. 
In general, we expect the define_split to -;; occur before register allocation, so we have to handle the corner case where -;; the target is the same as either operands[1] or operands[2] -(define_insn_and_split "xop_mulv2div2di3_high" - [(set (match_operand:V2DI 0 "register_operand" "=&x") - (mult:V2DI - (sign_extend:V2DI - (vec_select:V2SI - (match_operand:V4SI 1 "register_operand" "%x") - (parallel [(const_int 0) (const_int 2)]))) - (sign_extend:V2DI - (vec_select:V2SI - (match_operand:V4SI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0) (const_int 2)])))))] - "TARGET_XOP" - "#" - "&& reload_completed" - [(set (match_dup 0) - (match_dup 3)) - (set (match_dup 0) - (plus:V2DI - (mult:V2DI - (sign_extend:V2DI - (vec_select:V2SI - (match_dup 1) - (parallel [(const_int 0) (const_int 2)]))) - (sign_extend:V2DI - (vec_select:V2SI - (match_dup 2) - (parallel [(const_int 0) (const_int 2)])))) - (match_dup 0)))] -{ - operands[3] = CONST0_RTX (V2DImode); -} - [(set_attr "type" "ssemul") - (set_attr "mode" "TI")]) - ;; XOP parallel integer multiply/add instructions for the intrinisics (define_insn "xop_p<macs>wd" [(set (match_operand:V4SI 0 "register_operand" "=x") Index: testsuite/gcc.target/i386/xop-imul32widen-vector.c =================================================================== --- testsuite/gcc.target/i386/xop-imul32widen-vector.c (revision 187347) +++ testsuite/gcc.target/i386/xop-imul32widen-vector.c (working copy) @@ -32,5 +32,5 @@ exit (0); } -/* { dg-final { scan-assembler "vpmacsdql" } } */ +/* { dg-final { scan-assembler "vpmuldq" } } */ /* { dg-final { scan-assembler "vpmacsdqh" } } */