https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116979

--- Comment #12 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
With the following patch instead, it isn't vectorized anymore and uses scalar
code:
--- gcc/config/i386/i386.md.jj  2024-12-07 11:40:03.604875310 +0100
+++ gcc/config/i386/i386.md     2024-12-12 19:10:26.999013426 +0100
@@ -132,6 +132,9 @@ (define_c_enum "unspec" [
   UNSPEC_RSQRT
   UNSPEC_PSADBW

+  ;; For FMA4 support
+  UNSPEC_FMADDSUB
+
   ;; Different from generic us_truncate RTX
   ;; as it does unsigned saturation of signed source.
   UNSPEC_US_TRUNCATE
--- gcc/config/i386/sse.md.jj   2024-11-25 09:32:32.482140219 +0100
+++ gcc/config/i386/sse.md      2024-12-12 19:09:47.693565938 +0100
@@ -49,7 +49,6 @@ (define_c_enum "unspec" [
   UNSPEC_PCMPISTR

   ;; For FMA4 support
-  UNSPEC_FMADDSUB
   UNSPEC_XOP_UNSIGNED_CMP
   UNSPEC_XOP_TRUEFALSE
   UNSPEC_FRCZ
--- gcc/config/i386/mmx.md.jj   2024-12-07 11:35:48.930447349 +0100
+++ gcc/config/i386/mmx.md      2024-12-12 19:17:26.972109772 +0100
@@ -1132,6 +1132,59 @@ (define_expand "vec_addsubv2sf3"
   DONE;
 })

+(define_expand "vec_fmaddsubv2sf4"
+  [(set (match_operand:V2SF 0 "register_operand")
+        (unspec:V2SF
+          [(match_operand:V2SF 1 "nonimmediate_operand")
+           (match_operand:V2SF 2 "nonimmediate_operand")
+           (match_operand:V2SF 3 "nonimmediate_operand")]
+          UNSPEC_FMADDSUB))]
+  "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+   && TARGET_MMX_WITH_SSE
+   && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V4SFmode);
+  rtx op2 = gen_reg_rtx (V4SFmode);
+  rtx op1 = gen_reg_rtx (V4SFmode);
+  rtx op0 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (gen_movq_v2sf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v2sf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v2sf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_vec_fmaddsubv4sf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode));
+  DONE;
+})
+
+(define_expand "vec_fmsubaddv2sf4"
+  [(set (match_operand:V2SF 0 "register_operand")
+        (unspec:V2SF
+          [(match_operand:V2SF 1 "nonimmediate_operand")
+           (match_operand:V2SF 2 "nonimmediate_operand")
+           (neg:V2SF
+            (match_operand:V2SF 3 "nonimmediate_operand"))]
+          UNSPEC_FMADDSUB))]
+  "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+   && TARGET_MMX_WITH_SSE
+   && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V4SFmode);
+  rtx op2 = gen_reg_rtx (V4SFmode);
+  rtx op1 = gen_reg_rtx (V4SFmode);
+  rtx op0 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (gen_movq_v2sf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v2sf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v2sf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_vec_fmsubaddv4sf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode));
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel single-precision floating point comparisons
unless -fvect-cost-model=unlimited is used, in which case it uses vfmaddsub132ps.
I don't know; I haven't actually tried to compute the costs. Maybe we'd need to
adjust the costing too.

Reply via email to