https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116979
--- Comment #12 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
With the following patch instead, it isn't vectorized anymore and uses scalar
code (unless -fvect-cost-model=unlimited is given, in which case it still uses
vfmaddsub132ps):

--- gcc/config/i386/i386.md.jj	2024-12-07 11:40:03.604875310 +0100
+++ gcc/config/i386/i386.md	2024-12-12 19:10:26.999013426 +0100
@@ -132,6 +132,9 @@ (define_c_enum "unspec" [
   UNSPEC_RSQRT
   UNSPEC_PSADBW
 
+  ;; For FMA4 support
+  UNSPEC_FMADDSUB
+
   ;; Different from generic us_truncate RTX
   ;; as it does unsigned saturation of signed source.
   UNSPEC_US_TRUNCATE
--- gcc/config/i386/sse.md.jj	2024-11-25 09:32:32.482140219 +0100
+++ gcc/config/i386/sse.md	2024-12-12 19:09:47.693565938 +0100
@@ -49,7 +49,6 @@ (define_c_enum "unspec" [
   UNSPEC_PCMPISTR
 
   ;; For FMA4 support
-  UNSPEC_FMADDSUB
   UNSPEC_XOP_UNSIGNED_CMP
   UNSPEC_XOP_TRUEFALSE
   UNSPEC_FRCZ
--- gcc/config/i386/mmx.md.jj	2024-12-07 11:35:48.930447349 +0100
+++ gcc/config/i386/mmx.md	2024-12-12 19:17:26.972109772 +0100
@@ -1132,6 +1132,59 @@ (define_expand "vec_addsubv2sf3"
   DONE;
 })
 
+(define_expand "vec_fmaddsubv2sf4"
+  [(set (match_operand:V2SF 0 "register_operand")
+	(unspec:V2SF
+	  [(match_operand:V2SF 1 "nonimmediate_operand")
+	   (match_operand:V2SF 2 "nonimmediate_operand")
+	   (match_operand:V2SF 3 "nonimmediate_operand")]
+	  UNSPEC_FMADDSUB))]
+  "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+   && TARGET_MMX_WITH_SSE
+   && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V4SFmode);
+  rtx op2 = gen_reg_rtx (V4SFmode);
+  rtx op1 = gen_reg_rtx (V4SFmode);
+  rtx op0 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (gen_movq_v2sf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v2sf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v2sf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_vec_fmaddsubv4sf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode));
+  DONE;
+})
+
+(define_expand "vec_fmsubaddv2sf4"
+  [(set (match_operand:V2SF 0 "register_operand")
+	(unspec:V2SF
+	  [(match_operand:V2SF 1 "nonimmediate_operand")
+	   (match_operand:V2SF 2 "nonimmediate_operand")
+	   (neg:V2SF
+	     (match_operand:V2SF 3 "nonimmediate_operand"))]
+	  UNSPEC_FMADDSUB))]
+  "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+   && TARGET_MMX_WITH_SSE
+   && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V4SFmode);
+  rtx op2 = gen_reg_rtx (V4SFmode);
+  rtx op1 = gen_reg_rtx (V4SFmode);
+  rtx op0 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (gen_movq_v2sf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v2sf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v2sf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_vec_fmsubaddv4sf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode));
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel single-precision floating point comparisons

I haven't actually tried to compute the costs, so maybe we'd need to adjust
the costing too.
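For reference (not part of the patch, and not the PR's actual testcase): a
minimal C sketch of the operation the new vec_fmaddsubv2sf4 expander exposes.
As I understand the fmaddsub lane convention, even elements compute a*b - c
and odd elements compute a*b + c, which is what the V4SF pattern (and
vfmaddsub132ps) does on the low two lanes after the movq_v2sf_to_sse widening.
The function name below is illustrative only:

/* Illustrative two-lane kernel matching the fmaddsub lane convention
   (even lane multiply-subtract, odd lane multiply-add).  Whether this
   gets vectorized into an fmaddsub or stays scalar depends on the
   cost model, as discussed above.  */
void
fmaddsub2 (float *restrict r, const float *a, const float *b, const float *c)
{
  r[0] = a[0] * b[0] - c[0];  /* even lane: a*b - c */
  r[1] = a[1] * b[1] + c[1];  /* odd lane:  a*b + c */
}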