work222-float)] Add vectorized 16-bit floating point binary operation support.

Michael Meissner via Gcc-cvs Sat, 18 Oct 2025 11:03:09 -0700

https://gcc.gnu.org/g:d69a55919efa63983c6e833dd7b4e7591d977363


commit d69a55919efa63983c6e833dd7b4e7591d977363
Author: Michael Meissner <[email protected]>
Date:   Thu Oct 16 18:34:27 2025 -0400

    Add vectorized 16-bit floating point binary operation support.
    
    2025-10-16  Michael Meissner  <[email protected]>
    
    gcc/
    
            * config/rs6000/float16.cc (fp16_vectorization): New function.
            * config/rs6000/float16.md (FP16_BINARY_OP): New code iterator.
            (fp16_names): New code attribute.
            (<fp16_names><mode>3): New insns.
            * config/rs6000/rs6000-protos.h (fp16_vectorization): New 
declaration.

Diff:
---
 gcc/config/rs6000/float16.cc      | 136 ++++++++++++++++++++++++++++++++++++++
 gcc/config/rs6000/float16.md      |  30 +++++++++
 gcc/config/rs6000/rs6000-protos.h |   2 +
 3 files changed, 168 insertions(+)

diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc
index 7539125a61fc..8ad3a5ea75d1 100644
--- a/gcc/config/rs6000/float16.cc
+++ b/gcc/config/rs6000/float16.cc
@@ -191,3 +191,139 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
   else
     gcc_unreachable ();
 }
+
+
+/* Expand a 16-bit vector operation:
+
+   ICODE:   Operation to perform.
+   RESULT:  Result of the operation.
+   OP1:     Input operand1.
+   OP2:     Input operand2.
+   OP3:     Input operand3 or NULL_RTX.
+   SUBTYPE: Describe the operation.  */
+       
+void
+fp16_vectorization (enum rtx_code icode,
+                   rtx result,
+                   rtx op1,
+                   rtx op2,
+                   rtx op3,
+                   enum fp16_operation subtype)
+{
+  gcc_assert (can_create_pseudo_p ());
+
+  machine_mode result_mode = GET_MODE (result);
+  rtx op_orig[3] = { op1, op2, op3 };
+  rtx op_hi[3];
+  rtx op_lo[3];
+  rtx result_hi;
+  rtx result_lo;
+  size_t n_opts;
+
+  switch (subtype)
+    {
+    case FP16_BINARY:
+      n_opts = 2;
+      break;
+
+    case FP16_FMA:
+    case FP16_FMS:
+    case FP16_NFMA:
+    case FP16_NFMS:
+      n_opts = 3;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Allocate 2 temporaries for the results and the input operands.  */
+  result_hi = gen_reg_rtx (V4SFmode);
+  result_lo = gen_reg_rtx (V4SFmode);
+
+  for (size_t i = 0; i < n_opts; i++)
+    {
+      gcc_assert (op_orig[i] != NULL_RTX);
+      op_hi[i] = gen_reg_rtx (V4SFmode);       /* high register.  */
+      op_lo[i] = gen_reg_rtx (V4SFmode);       /* low register.  */
+
+      if (result_mode == V8HFmode)
+       {
+         emit_insn (gen_vec_unpacks_hi_v8hf (op_hi[i], op_orig[i]));
+         emit_insn (gen_vec_unpacks_lo_v8hf (op_lo[i], op_orig[i]));
+       }
+
+      else if (result_mode == V8BFmode)
+       {
+         emit_insn (gen_vec_unpacks_hi_v8hf (op_hi[i], op_orig[i]));
+         emit_insn (gen_vec_unpacks_lo_v8hf (op_lo[i], op_orig[i]));
+       }
+
+      else
+       gcc_unreachable ();
+    }
+
+  /* Do 2 sets of V4SFmode operations.  */
+  switch (subtype)
+    {
+    case FP16_BINARY:
+      emit_insn (gen_rtx_SET (result_hi,
+                             gen_rtx_fmt_ee (icode, V4SFmode,
+                                             op_hi[0],
+                                             op_hi[1])));
+
+      emit_insn (gen_rtx_SET (result_lo,
+                             gen_rtx_fmt_ee (icode, V4SFmode,
+                                             op_lo[0],
+                                             op_lo[1])));
+      break;
+
+    case FP16_FMA:
+    case FP16_FMS:
+    case FP16_NFMA:
+    case FP16_NFMS:
+      {
+       rtx op1_hi = op_hi[0];
+       rtx op2_hi = op_hi[1];
+       rtx op3_hi = op_hi[2];
+
+       rtx op1_lo = op_lo[0];
+       rtx op2_lo = op_lo[1];
+       rtx op3_lo = op_lo[2];
+
+       if (subtype == FP16_FMS || subtype == FP16_NFMS)
+         {
+           op3_hi = gen_rtx_NEG (V4SFmode, op3_hi);
+           op3_lo = gen_rtx_NEG (V4SFmode, op3_lo);
+         }
+
+       rtx op_fma_hi = gen_rtx_FMA (V4SFmode, op1_hi, op2_hi, op3_hi);
+       rtx op_fma_lo = gen_rtx_FMA (V4SFmode, op1_lo, op2_lo, op3_lo);
+
+       if (subtype == FP16_NFMA || subtype == FP16_NFMS)
+         {
+           op_fma_hi = gen_rtx_NEG (V4SFmode, op_fma_hi);
+           op_fma_lo = gen_rtx_NEG (V4SFmode, op_fma_lo);
+         }
+
+       emit_insn (gen_rtx_SET (result_hi, op_fma_hi));
+       emit_insn (gen_rtx_SET (result_lo, op_fma_lo));
+      }
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector.  */
+  if (result_mode == V8HFmode)
+    emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
+
+  else if (result_mode == V8BFmode)
+    emit_insn (gen_vec_pack_trunc_v4sf (result, result_hi, result_lo));
+
+  else
+    gcc_unreachable ();
+
+  return;
+}
diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md
index 50205db44c2e..264d050429f4 100644
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@@ -59,6 +59,19 @@
                                (V8BF "V4BF")
                                (V8HF "V4HF")])
 
+;; Binary operators for bfloat16/float16 vectorization.
+(define_code_iterator FP16_BINARY_OP [plus minus mult smax smin])
+
+;; Standard names for the unary/binary/ternary operators
+(define_code_attr fp16_names [(abs   "abs")
+                             (fma   "fma")
+                             (plus  "add")
+                             (minus "sub")
+                             (mult  "mul")
+                             (neg   "neg")
+                             (smax  "smax")
+                             (smin  "smin")])
+
 ;; UNSPEC constants
 (define_c_enum "unspec"
   [UNSPEC_FP16_SHIFT_LEFT_32BIT
@@ -691,6 +704,23 @@
                              operands[3], FP16_NFMS);
   DONE;
 })
+
+;; Add vectorization support for 16-bit floating point.
+;; Binary operators being vectorized.
+(define_insn_and_split "<fp16_names><mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (FP16_BINARY_OP:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand")
+        (match_operand:VFP16_HW 2 "vsx_register_operand")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2], NULL_RTX,
+                     FP16_BINARY);
+  DONE;
+})
 
 
 ;; If we do multiple __bfloat16 operations, between the first and
diff --git a/gcc/config/rs6000/rs6000-protos.h 
b/gcc/config/rs6000/rs6000-protos.h
index be76cdb507ba..3c4d21299e1a 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -271,6 +271,8 @@ enum fp16_operation {
 
 extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx,
                                        enum fp16_operation);
+extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx,
+                               enum fp16_operation);
 #endif /* RTX_CODE */
 
 #ifdef TREE_CODE

[gcc(refs/users/meissner/heads/work222-float)] Add vectorized 16-bit floating point binary operation support.

Reply via email to