Now that we support the mult_even/odd hooks, the vectorizer can generate exactly the same code for the plain SSE dot_prod by itself, and can do the same for reductions other than plus. --- gcc/ChangeLog | 6 +++++ gcc/config/i386/sse.md | 62 +++++++----------------------------------------- 2 files changed, 14 insertions(+), 54 deletions(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f63f523..4dc93af 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,11 @@ 2012-06-25 Richard Henderson <r...@redhat.com> + * config/i386/sse.md (sse2_sse4_1): Remove code attr. + (<s>dot_prodv4si, <s>dot_prodv8si): Remove + (sdot_prodv4si): New; handle only XOP. + +2012-06-25 Richard Henderson <r...@redhat.com> + * config/i386/i386-builtin-types.def (V4UDI, V8USI): New. (V2UDI_FUNC_V4USI_V4USI): New. (V4UDI_FUNC_V8USI_V8USI): New. diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 45d3a9c..4b51415 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -5737,64 +5737,18 @@ DONE; }) -(define_code_attr sse2_sse4_1 - [(zero_extend "sse2") (sign_extend "sse4_1")]) - -(define_expand "<s>dot_prodv4si" +;; Normally we use widen_mul_even/odd, but combine can't quite get it all +;; back together when madd is available. +(define_expand "sdot_prodv4si" [(match_operand:V2DI 0 "register_operand") - (any_extend:V2DI (match_operand:V4SI 1 "register_operand")) + (match_operand:V4SI 1 "register_operand") (match_operand:V4SI 2 "register_operand") (match_operand:V2DI 3 "register_operand")] - "<CODE> == ZERO_EXTEND ? 
TARGET_SSE2 : TARGET_SSE4_1" -{ - rtx t1, t2, t3, t4; - - t1 = gen_reg_rtx (V2DImode); - emit_insn (gen_<sse2_sse4_1>_<u>mulv2siv2di3 (t1, operands[1], operands[2])); - emit_insn (gen_addv2di3 (t1, t1, operands[3])); - - t2 = gen_reg_rtx (V4SImode); - t3 = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2), - gen_lowpart (V1TImode, operands[1]), - GEN_INT (32))); - emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3), - gen_lowpart (V1TImode, operands[2]), - GEN_INT (32))); - - t4 = gen_reg_rtx (V2DImode); - emit_insn (gen_<sse2_sse4_1>_<u>mulv2siv2di3 (t4, t2, t3)); - - emit_insn (gen_addv2di3 (operands[0], t1, t4)); - DONE; -}) - -(define_expand "<s>dot_prodv8si" - [(match_operand:V4DI 0 "register_operand") - (any_extend:V4DI (match_operand:V8SI 1 "register_operand")) - (match_operand:V8SI 2 "register_operand") - (match_operand:V4DI 3 "register_operand")] - "TARGET_AVX2" + "TARGET_XOP" { - rtx t1, t2, t3, t4; - - t1 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_<u>mulv4siv4di3 (t1, operands[1], operands[2])); - emit_insn (gen_addv4di3 (t1, t1, operands[3])); - - t2 = gen_reg_rtx (V8SImode); - t3 = gen_reg_rtx (V8SImode); - emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t2), - gen_lowpart (V2TImode, operands[1]), - GEN_INT (32))); - emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t3), - gen_lowpart (V2TImode, operands[2]), - GEN_INT (32))); - - t4 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_<u>mulv4siv4di3 (t4, t2, t3)); - - emit_insn (gen_addv4di3 (operands[0], t1, t4)); + rtx t = gen_reg_rtx (V2DImode); + emit_insn (gen_xop_pmacsdqh (t, operands[1], operands[2], operands[3])); + emit_insn (gen_xop_pmacsdql (operands[0], operands[1], operands[2], t)); DONE; }) -- 1.7.10.2