[PATCH, 8/11] Add 16-bit floating point vectorization.

Michael Meissner Fri, 14 Nov 2025 14:08:40 -0800

This patch adds automatic vectorization of _Float16 values on ISA 3.0
(power9) systems, and automatic vectorization of __bfloat16 values on
ISA 3.1 (power10 and power11) systems.


This is done by converting each vector of 16-bit floating point values to
two vectors of 32-bit floating point values, doing the operation on each,
and then converting the resulting pair of float vectors back to a vector
of 16-bit values.

All 11 patches have been tested on little endian and big endian PowerPC
servers with no regressions.  Can I check in these patches?

2025-11-14  Michael Meissner  <[email protected]>

gcc/

        * config.gcc (powerpc*-*-*): Add float16.o.
        * config/rs6000/float16.cc: New file to add 16-bit floating point
        vectorization.
        * config/rs6000/float16.md (FP16_BINARY_OP): New code iterator.
        (fp16_names): New code attribute.
        (UNSPEC_XVCVSPHP_V8HF): New unspec.
        (UNSPEC_XVCVSPBF16_V8BF): Likewise.
        (<fp16_names><mode>3): New insns to support vectorization of 16-bit
        floating point.
        (fma<mode>4): Likewise.
        (*fms<mode>4): Likewise.
        (*nfma<mode>4): Likewise.
        (*nfms<mode>4): Likewise.
        (vec_pack_trunc_v4sf_v8hf): Likewise.
        (vec_pack_trunc_v4sf_v8bf): Likewise.
        (vec_pack_trunc_v4sf): Likewise.
        (xvcvsphp_v8hf): Likewise.
        (xvcvspbf16_v8bf): Likewise.
        (vec_unpacks_hi_v8hf): Likewise.
        (vec_unpacks_lo_v8hf): Likewise.
        (xvcvhpsp_v8hf): Likewise.
        (vec_unpacks_hi_v8bf): Likewise.
        (vec_unpacks_lo_v8bf): Likewise.
        (xvcvbf16spn_v8bf): Likewise.
        * config/rs6000/rs6000-protos.h (enum fp16_operation): New enumeration
        for vectorizing 16-bit floating point.
        (fp16_vectorization): New declaration.
        * config/rs6000/t-rs6000 (float16.o): Add build rules.
---
 gcc/config.gcc                    |   1 +
 gcc/config/rs6000/float16.cc      | 185 ++++++++++++++++++++++
 gcc/config/rs6000/float16.md      | 245 +++++++++++++++++++++++++++++-
 gcc/config/rs6000/rs6000-protos.h |  13 ++
 gcc/config/rs6000/t-rs6000        |   4 +
 5 files changed, 447 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/rs6000/float16.cc

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 07d0409b509..b752bb6201b 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -533,6 +533,7 @@ powerpc*-*-*)
        extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
        extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
        extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+       extra_objs="${extra_objs} float16.o"
        extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
        extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
        extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc
new file mode 100644
index 00000000000..5274a0df962
--- /dev/null
+++ b/gcc/config/rs6000/float16.cc
@@ -0,0 +1,185 @@
+/* 16-bit floating point vectorization support for the PowerPC architecture.
+   Copyright (C) 2002-2025 Free Software Foundation, Inc.
+
+   Contributed by Zack Weinberg <[email protected]>
+   and Paolo Bonzini <[email protected]>
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* 16-bit floating point support.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "memmodel.h"
+#include "tm_p.h"
+#include "stringpool.h"
+#include "expmed.h"
+#include "optabs.h"
+#include "regs.h"
+#include "insn-attr.h"
+#include "flags.h"
+#include "attribs.h"
+#include "explow.h"
+#include "expr.h"
+#include "common/common-target.h"
+#include "rs6000-internal.h"
+
+/* Expand a V8HFmode/V8BFmode operation as two V4SFmode operations:
+
+   ICODE:   RTL code of the operation (only used for FP16_BINARY).
+   RESULT:  Destination; its mode (V8HFmode or V8BFmode) picks conversions.
+   OP1:     Input operand1.
+   OP2:     Input operand2.
+   OP3:     Input operand3, or NULL_RTX for FP16_BINARY.
+   SUBTYPE: Binary operation or which FMA variant to generate.  */
+       
+void
+fp16_vectorization (enum rtx_code icode,
+                   rtx result,
+                   rtx op1,
+                   rtx op2,
+                   rtx op3,
+                   enum fp16_operation subtype)
+{
+  gcc_assert (can_create_pseudo_p ());
+
+  machine_mode result_mode = GET_MODE (result);
+  rtx op_orig[3] = { op1, op2, op3 };
+  rtx op_hi[3];
+  rtx op_lo[3];
+  rtx result_hi;
+  rtx result_lo;
+  size_t n_opts;
+
+  switch (subtype)
+    {
+    case FP16_BINARY:
+      n_opts = 2;
+      break;
+
+    case FP16_FMA:
+    case FP16_FMS:
+    case FP16_NFMA:
+    case FP16_NFMS:
+      n_opts = 3;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Allocate V4SFmode temporaries for the two halves of the result.  */
+  result_hi = gen_reg_rtx (V4SFmode);
+  result_lo = gen_reg_rtx (V4SFmode);
+
+  for (size_t i = 0; i < n_opts; i++)
+    {
+      gcc_assert (op_orig[i] != NULL_RTX);
+      op_hi[i] = gen_reg_rtx (V4SFmode);       /* hi half as float.  */
+      op_lo[i] = gen_reg_rtx (V4SFmode);       /* lo half as float.  */
+
+      rtx interleave_hi = gen_reg_rtx (result_mode);
+      rtx interleave_lo = gen_reg_rtx (result_mode);
+      rtx orig = op_orig[i];
+
+      rs6000_expand_interleave (interleave_hi, orig, orig, !BYTES_BIG_ENDIAN);
+      rs6000_expand_interleave (interleave_lo, orig, orig,  BYTES_BIG_ENDIAN);
+
+      if (result_mode == V8HFmode)
+       {
+         emit_insn (gen_xvcvhpsp_v8hf (op_hi[i], interleave_hi));
+         emit_insn (gen_xvcvhpsp_v8hf (op_lo[i], interleave_lo));
+       }
+
+      else if (result_mode == V8BFmode)
+       {
+         emit_insn (gen_xvcvbf16spn_v8bf (op_hi[i], interleave_hi));
+         emit_insn (gen_xvcvbf16spn_v8bf (op_lo[i], interleave_lo));
+       }
+
+      else
+       gcc_unreachable ();
+    }
+
+  /* Do the operation in V4SFmode, once on each half.  */
+  switch (subtype)
+    {
+    case FP16_BINARY:
+      emit_insn (gen_rtx_SET (result_hi,
+                             gen_rtx_fmt_ee (icode, V4SFmode,
+                                             op_hi[0],
+                                             op_hi[1])));
+
+      emit_insn (gen_rtx_SET (result_lo,
+                             gen_rtx_fmt_ee (icode, V4SFmode,
+                                             op_lo[0],
+                                             op_lo[1])));
+      break;
+
+    case FP16_FMA:
+    case FP16_FMS:
+    case FP16_NFMA:
+    case FP16_NFMS:
+      {
+       rtx op1_hi = op_hi[0];
+       rtx op2_hi = op_hi[1];
+       rtx op3_hi = op_hi[2];
+
+       rtx op1_lo = op_lo[0];
+       rtx op2_lo = op_lo[1];
+       rtx op3_lo = op_lo[2];
+
+       if (subtype == FP16_FMS || subtype == FP16_NFMS)
+         {
+           op3_hi = gen_rtx_NEG (V4SFmode, op3_hi);
+           op3_lo = gen_rtx_NEG (V4SFmode, op3_lo);
+         }
+
+       rtx op_fma_hi = gen_rtx_FMA (V4SFmode, op1_hi, op2_hi, op3_hi);
+       rtx op_fma_lo = gen_rtx_FMA (V4SFmode, op1_lo, op2_lo, op3_lo);
+
+       if (subtype == FP16_NFMA || subtype == FP16_NFMS)
+         {
+           op_fma_hi = gen_rtx_NEG (V4SFmode, op_fma_hi);
+           op_fma_lo = gen_rtx_NEG (V4SFmode, op_fma_lo);
+         }
+
+       emit_insn (gen_rtx_SET (result_hi, op_fma_hi));
+       emit_insn (gen_rtx_SET (result_lo, op_fma_lo));
+      }
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Convert both V4SFmode results and pack them into RESULT.  */
+  if (result_mode == V8HFmode)
+    emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
+
+  else if (result_mode == V8BFmode)
+    emit_insn (gen_vec_pack_trunc_v4sf_v8bf (result, result_hi, result_lo));
+
+  else
+    gcc_unreachable ();
+
+  return;
+}
diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md
index ce1a79f50ab..690b8c2d661 100644
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@@ -60,12 +60,27 @@ (define_mode_attr FP16_VECTOR4 [(BF   "V4BF")
                                (V8BF "V4BF")
                                (V8HF "V4HF")])
 
+;; RTL codes of the binary operators expanded for vector bfloat16/float16.
+(define_code_iterator FP16_BINARY_OP [plus minus mult smax smin])
+
+;; Stem of the standard pattern name for each RTL code (e.g. plus -> "add").
+(define_code_attr fp16_names [(abs   "abs")
+                             (fma   "fma")
+                             (plus  "add")
+                             (minus "sub")
+                             (mult  "mul")
+                             (neg   "neg")
+                             (smax  "smax")
+                             (smin  "smin")])
+
 ;; UNSPEC constants
 (define_c_enum "unspec"
   [UNSPEC_FP16_SHIFT_LEFT_32BIT
    UNSPEC_CVT_FP16_TO_V4SF
    UNSPEC_XXSPLTW_FP16
-   UNSPEC_XVCVSPBF16_BF])
+   UNSPEC_XVCVSPBF16_BF
+   UNSPEC_XVCVSPHP_V8HF
+   UNSPEC_XVCVSPBF16_V8BF])
 
 ;; _Float16 and __bfloat16 moves
 (define_expand "mov<mode>"
@@ -720,3 +735,231 @@ (define_insn "*boolcc<mode>3"
    xxl%q3 %x0,%x1,%x2
    %q3 %0,%1,%2"
   [(set_attr "type" "veclogical,logical")])
+
+;; Add vectorization support for 16-bit floating point.
+
+;; Binary operators, split into V4SFmode operations by fp16_vectorization.
+(define_insn_and_split "<fp16_names><mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (FP16_BINARY_OP:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand")
+        (match_operand:VFP16_HW 2 "vsx_register_operand")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2], NULL_RTX,
+                     FP16_BINARY);
+  DONE;
+})
+
+;; FMA and its negated forms, split into V4SFmode by fp16_vectorization.
+(define_insn_and_split "fma<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (fma:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand")
+        (match_operand:VFP16_HW 2 "vsx_register_operand")
+        (match_operand:VFP16_HW 3 "vsx_register_operand")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_FMA);
+  DONE;
+})
+
+(define_insn_and_split "*fms<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (fma:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand")
+        (match_operand:VFP16_HW 2 "vsx_register_operand")
+        (neg:VFP16_HW
+         (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_FMS);
+  DONE;
+})
+
+(define_insn_and_split "*nfma<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (neg:VFP16_HW
+        (fma:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand")
+         (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_NFMA);
+  DONE;
+})
+
+(define_insn_and_split "*nfms<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (neg:VFP16_HW
+        (fma:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand")
+         (neg:VFP16_HW
+          (match_operand:VFP16_HW 3 "vsx_register_operand")))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_NFMS);
+  DONE;
+})
+
+;; Vector pack: convert both V4SF inputs, then extract the even elements.
+
+(define_expand "vec_pack_trunc_v4sf_v8hf"
+  [(match_operand:V8HF 0 "vfloat_operand")
+   (match_operand:V4SF 1 "vfloat_operand")
+   (match_operand:V4SF 2 "vfloat_operand")]
+  "TARGET_FLOAT16_HW"
+{
+  rtx r1 = gen_reg_rtx (V8HFmode);
+  rtx r2 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_xvcvsphp_v8hf (r1, operands[1]));
+  emit_insn (gen_xvcvsphp_v8hf (r2, operands[2]));
+  rs6000_expand_extract_even (operands[0], r1, r2);
+  DONE;
+})
+
+(define_expand "vec_pack_trunc_v4sf_v8bf"
+  [(match_operand:V8BF 0 "vfloat_operand")
+   (match_operand:V4SF 1 "vfloat_operand")
+   (match_operand:V4SF 2 "vfloat_operand")]
+  "TARGET_BFLOAT16_HW"
+{
+  rtx r1 = gen_reg_rtx (V8BFmode);
+  rtx r2 = gen_reg_rtx (V8BFmode);
+
+  emit_insn (gen_xvcvspbf16_v8bf (r1, operands[1]));
+  emit_insn (gen_xvcvspbf16_v8bf (r2, operands[2]));
+  rs6000_expand_extract_even (operands[0], r1, r2);
+  DONE;
+})
+
+;; Unfortunately the machine independent code assumes there is only one
+;; 16-bit floating point type.  This means we have to choose whether to
+;; support packing _Float16 or __bfloat16.  It looks like __bfloat16 is
+;; more popular, so this is the same expansion as vec_pack_trunc_v4sf_v8bf.
+
+(define_expand "vec_pack_trunc_v4sf"
+  [(match_operand:V8BF 0 "vfloat_operand")
+   (match_operand:V4SF 1 "vfloat_operand")
+   (match_operand:V4SF 2 "vfloat_operand")]
+  "TARGET_BFLOAT16_HW"
+{
+  rtx r1 = gen_reg_rtx (V8BFmode);
+  rtx r2 = gen_reg_rtx (V8BFmode);
+
+  emit_insn (gen_xvcvspbf16_v8bf (r1, operands[1]));
+  emit_insn (gen_xvcvspbf16_v8bf (r2, operands[2]));
+  rs6000_expand_extract_even (operands[0], r1, r2);
+  DONE;
+})
+
+;; Convert V4SF to _Float16; used by vec_pack_trunc_v4sf_v8hf.
+(define_insn "xvcvsphp_v8hf"
+  [(set (match_operand:V8HF 0 "vsx_register_operand" "=wa")
+       (unspec:V8HF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_XVCVSPHP_V8HF))]
+  "TARGET_FLOAT16_HW"
+  "xvcvsphp %x0,%x1"
+  [(set_attr "type" "vecfloat")])
+
+;; Convert V4SF to __bfloat16; used by the vec_pack_trunc expanders.
+(define_insn "xvcvspbf16_v8bf"
+  [(set (match_operand:V8BF 0 "vsx_register_operand" "=wa")
+       (unspec:V8BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_XVCVSPBF16_V8BF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvspbf16 %x0,%x1"
+  [(set_attr "type" "vecfloat")])
+
+;; Vector unpack support.  Given the name is for the type being
+;; unpacked, we can unpack both __bfloat16 and _Float16.
+
+;; Unpack vector _Float16: interleave the input with itself, then widen.
+(define_expand "vec_unpacks_hi_v8hf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8HF 1 "vfloat_operand")]
+  "TARGET_FLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8HFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvhpsp_v8hf (operands[0], reg));
+  DONE;
+})
+
+(define_expand "vec_unpacks_lo_v8hf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8HF 1 "vfloat_operand")]
+  "TARGET_FLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8HFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvhpsp_v8hf (operands[0], reg));
+  DONE;
+})
+
+;; Convert _Float16 to V4SF; used by the unpack expanders and float16.cc.
+(define_insn "xvcvhpsp_v8hf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:V8HF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_CVT_FP16_TO_V4SF))]
+  "TARGET_FLOAT16_HW"
+  "xvcvhpsp %x0,%x1"
+  [(set_attr "type" "vecperm")])
+
+;; Unpack vector __bfloat16: interleave the input with itself, then widen.
+(define_expand "vec_unpacks_hi_v8bf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8BF 1 "vfloat_operand")]
+  "TARGET_BFLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8BFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvbf16spn_v8bf (operands[0], reg));
+  DONE;
+})
+
+(define_expand "vec_unpacks_lo_v8bf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8BF 1 "vfloat_operand")]
+  "TARGET_BFLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8BFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvbf16spn_v8bf (operands[0], reg));
+  DONE;
+})
+
+;; Convert __bfloat16 to V4SF; used by the unpack expanders and float16.cc.
+(define_insn "xvcvbf16spn_v8bf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:V8BF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_CVT_FP16_TO_V4SF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvbf16spn %x0,%x1"
+  [(set_attr "type" "vecperm")])
diff --git a/gcc/config/rs6000/rs6000-protos.h 
b/gcc/config/rs6000/rs6000-protos.h
index d29081837b3..dd5fcd69e83 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -258,6 +258,19 @@ extern bool vec_const_128bit_to_bytes (rtx, machine_mode,
 extern unsigned constant_generates_lxvkq (vec_const_128bit_type *);
 extern unsigned constant_generates_xxspltiw (vec_const_128bit_type *);
 extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *);
+
+/* From float16.cc.  */
+/* Kind of 16-bit floating point operation fp16_vectorization expands.  */
+enum fp16_operation {
+  FP16_BINARY,                         /* Bfloat16/float16 binary op.  */
+  FP16_FMA,                            /* (a * b) + c.  */
+  FP16_FMS,                            /* (a * b) - c.  */
+  FP16_NFMA,                           /* - ((a * b) + c).  */
+  FP16_NFMS                            /* - ((a * b) - c).  */
+};
+
+extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx,
+                               enum fp16_operation);
 #endif /* RTX_CODE */
 
 #ifdef TREE_CODE
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index a5d1c27424f..c8f19865311 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -87,6 +87,10 @@ rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.cc 
rs6000-builtins.h
        $(COMPILE) $<
        $(POSTCOMPILE)
 
+float16.o: $(srcdir)/config/rs6000/float16.cc
+       $(COMPILE) $<
+       $(POSTCOMPILE)
+
 #$(srcdir)/config/rs6000/fusion.md: $(srcdir)/config/rs6000/genfusion.pl
 #      $(srcdir)/config/rs6000/genfusion.pl > $(srcdir)/config/rs6000/fusion.md
 
-- 
2.51.1


-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: [email protected]

[PATCH, 8/11] Add 16-bit floating point vectorization.

Reply via email to