This patch adds automatic vectorization of _Float16 values on ISA 3.0 (power9) systems, and automatic vectorization of __bfloat16 values on ISA 3.1 (power10 and power11) systems.
This is done by converting each vector of 16-bit floating point values to two vectors of 32-bit floating point values, doing the operation, and then converting the resulting pair of float vectors back to a vector of 16-bit values. All 11 patches have been tested on little endian and big endian PowerPC servers with no regressions. Can I check in these patches? 2025-11-14 Michael Meissner <[email protected]> gcc/ * config.gcc (powerpc*-*-*): Add float16.o. * config/rs6000/float16.cc: New file to add 16-bit floating point vectorization. * config/rs6000/float16.md (FP16_BINARY_OP): New code iterator. (fp16_names): New code attribute. (UNSPEC_XVCVSPHP_V8HF): New unspec. (UNSPEC_XVCVSPBF16_V8BF): Likewise. (<fp16_names><mode>3): New insns to support vectorization of 16-bit floating point. (fma<mode>4): Likewise. (*fms<mode>4): Likewise. (*nfma<mode>4): Likewise. (*nfms<mode>4): Likewise. (vec_pack_trunc_v4sf_v8hf): Likewise. (vec_pack_trunc_v4sf_v8bf): Likewise. (vec_pack_trunc_v4sf): Likewise. (xvcvsphp_v8hf): Likewise. (xvcvspbf16_v8bf): Likewise. (vec_unpacks_hi_v8hf): Likewise. (vec_unpacks_lo_v8hf): Likewise. (xvcvhpsp_v8hf): Likewise. (vec_unpacks_hi_v8bf): Likewise. (vec_unpacks_lo_v8bf): Likewise. (xvcvbf16spn_v8bf): Likewise. * config/rs6000/rs6000-protos.h (enum fp16_operation): New enumeration for vectorizing 16-bit floating point. (fp16_vectorization): New declaration. * config/rs6000/t-rs6000 (float16.o): Add build rules. 
--- gcc/config.gcc | 1 + gcc/config/rs6000/float16.cc | 185 ++++++++++++++++++++++ gcc/config/rs6000/float16.md | 245 +++++++++++++++++++++++++++++- gcc/config/rs6000/rs6000-protos.h | 13 ++ gcc/config/rs6000/t-rs6000 | 4 + 5 files changed, 447 insertions(+), 1 deletion(-) create mode 100644 gcc/config/rs6000/float16.cc diff --git a/gcc/config.gcc b/gcc/config.gcc index 07d0409b509..b752bb6201b 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -533,6 +533,7 @@ powerpc*-*-*) extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o" + extra_objs="${extra_objs} float16.o" extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h" extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h" extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h" diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc new file mode 100644 index 00000000000..5274a0df962 --- /dev/null +++ b/gcc/config/rs6000/float16.cc @@ -0,0 +1,185 @@ +/* 16-bit floating point vectorization support for PowerPC. + Copyright (C) 2025 Free Software Foundation, Inc. + + Contributed by Zack Weinberg <[email protected]> + and Paolo Bonzini <[email protected]> + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +/* 16-bit floating point support. 
*/ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "tm_p.h" +#include "stringpool.h" +#include "expmed.h" +#include "optabs.h" +#include "regs.h" +#include "insn-attr.h" +#include "flags.h" +#include "attribs.h" +#include "explow.h" +#include "expr.h" +#include "common/common-target.h" +#include "rs6000-internal.h" + +/* Expand a V8HFmode/V8BFmode operation via two V4SFmode operations: + + ICODE: RTL code of the operation (only used for FP16_BINARY). + RESULT: Destination; its mode selects the conversion insns. + OP1: Input operand1. + OP2: Input operand2. + OP3: Input operand3 (FMA variants) or NULL_RTX. + SUBTYPE: FP16_BINARY, or which FMA variant to emit. */ + +void +fp16_vectorization (enum rtx_code icode, + rtx result, + rtx op1, + rtx op2, + rtx op3, + enum fp16_operation subtype) +{ + gcc_assert (can_create_pseudo_p ()); + + machine_mode result_mode = GET_MODE (result); + rtx op_orig[3] = { op1, op2, op3 }; + rtx op_hi[3]; + rtx op_lo[3]; + rtx result_hi; + rtx result_lo; + size_t n_opts; + + switch (subtype) + { + case FP16_BINARY: + n_opts = 2; + break; + + case FP16_FMA: + case FP16_FMS: + case FP16_NFMA: + case FP16_NFMS: + n_opts = 3; + break; + + default: + gcc_unreachable (); + } + + /* Allocate the 2 V4SFmode result temporaries; operands are done below. */ + result_hi = gen_reg_rtx (V4SFmode); + result_lo = gen_reg_rtx (V4SFmode); + + for (size_t i = 0; i < n_opts; i++) + { + gcc_assert (op_orig[i] != NULL_RTX); + op_hi[i] = gen_reg_rtx (V4SFmode); /* high register. */ + op_lo[i] = gen_reg_rtx (V4SFmode); /* low register. 
*/ + + rtx interleave_hi = gen_reg_rtx (result_mode); + rtx interleave_lo = gen_reg_rtx (result_mode); + rtx orig = op_orig[i]; + + rs6000_expand_interleave (interleave_hi, orig, orig, !BYTES_BIG_ENDIAN); + rs6000_expand_interleave (interleave_lo, orig, orig, BYTES_BIG_ENDIAN); + + if (result_mode == V8HFmode) + { + emit_insn (gen_xvcvhpsp_v8hf (op_hi[i], interleave_hi)); + emit_insn (gen_xvcvhpsp_v8hf (op_lo[i], interleave_lo)); + } + + else if (result_mode == V8BFmode) + { + emit_insn (gen_xvcvbf16spn_v8bf (op_hi[i], interleave_hi)); + emit_insn (gen_xvcvbf16spn_v8bf (op_lo[i], interleave_lo)); + } + + else + gcc_unreachable (); + } + + /* Do the operation separately on the hi and lo V4SFmode halves. */ + switch (subtype) + { + case FP16_BINARY: + emit_insn (gen_rtx_SET (result_hi, + gen_rtx_fmt_ee (icode, V4SFmode, + op_hi[0], + op_hi[1]))); + + emit_insn (gen_rtx_SET (result_lo, + gen_rtx_fmt_ee (icode, V4SFmode, + op_lo[0], + op_lo[1]))); + break; + + case FP16_FMA: + case FP16_FMS: + case FP16_NFMA: + case FP16_NFMS: + { + rtx op1_hi = op_hi[0]; + rtx op2_hi = op_hi[1]; + rtx op3_hi = op_hi[2]; + + rtx op1_lo = op_lo[0]; + rtx op2_lo = op_lo[1]; + rtx op3_lo = op_lo[2]; + + if (subtype == FP16_FMS || subtype == FP16_NFMS) + { + op3_hi = gen_rtx_NEG (V4SFmode, op3_hi); + op3_lo = gen_rtx_NEG (V4SFmode, op3_lo); + } + + rtx op_fma_hi = gen_rtx_FMA (V4SFmode, op1_hi, op2_hi, op3_hi); + rtx op_fma_lo = gen_rtx_FMA (V4SFmode, op1_lo, op2_lo, op3_lo); + + if (subtype == FP16_NFMA || subtype == FP16_NFMS) + { + op_fma_hi = gen_rtx_NEG (V4SFmode, op_fma_hi); + op_fma_lo = gen_rtx_NEG (V4SFmode, op_fma_lo); + } + + emit_insn (gen_rtx_SET (result_hi, op_fma_hi)); + emit_insn (gen_rtx_SET (result_lo, op_fma_lo)); + } + break; + + default: + gcc_unreachable (); + } + + /* Pack the 2 V4SFmode results into one V8HFmode/V8BFmode vector. 
*/ + if (result_mode == V8HFmode) + emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo)); + + else if (result_mode == V8BFmode) + emit_insn (gen_vec_pack_trunc_v4sf_v8bf (result, result_hi, result_lo)); + + else + gcc_unreachable (); + + return; +} diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md index ce1a79f50ab..690b8c2d661 100644 --- a/gcc/config/rs6000/float16.md +++ b/gcc/config/rs6000/float16.md @@ -60,12 +60,27 @@ (define_mode_attr FP16_VECTOR4 [(BF "V4BF") (V8BF "V4BF") (V8HF "V4HF")]) +;; Binary operators for bfloat16/float16 vectorization. +(define_code_iterator FP16_BINARY_OP [plus minus mult smax smin]) + +;; Standard names for the unary/binary/ternary operators +(define_code_attr fp16_names [(abs "abs") + (fma "fma") + (plus "add") + (minus "sub") + (mult "mul") + (neg "neg") + (smax "smax") + (smin "smin")]) + ;; UNSPEC constants (define_c_enum "unspec" [UNSPEC_FP16_SHIFT_LEFT_32BIT UNSPEC_CVT_FP16_TO_V4SF UNSPEC_XXSPLTW_FP16 - UNSPEC_XVCVSPBF16_BF]) + UNSPEC_XVCVSPBF16_BF + UNSPEC_XVCVSPHP_V8HF + UNSPEC_XVCVSPBF16_V8BF]) ;; _Float16 and __bfloat16 moves (define_expand "mov<mode>" @@ -720,3 +735,231 @@ (define_insn "*boolcc<mode>3" xxl%q3 %x0,%x1,%x2 %q3 %0,%1,%2" [(set_attr "type" "veclogical,logical")]) + +;; Add vectorization support for 16-bit floating point. + +;; Binary operators being vectorized. +(define_insn_and_split "<fp16_names><mode>3" + [(set (match_operand:VFP16_HW 0 "vsx_register_operand") + (FP16_BINARY_OP:VFP16_HW + (match_operand:VFP16_HW 1 "vsx_register_operand") + (match_operand:VFP16_HW 2 "vsx_register_operand")))] + "can_create_pseudo_p ()" + "#" + "&& 1" + [(pc)] +{ + fp16_vectorization (<CODE>, operands[0], operands[1], operands[2], NULL_RTX, + FP16_BINARY); + DONE; +}) + +;; FMA operations being vectorized. 
+(define_insn_and_split "fma<mode>4" + [(set (match_operand:VFP16_HW 0 "vsx_register_operand") + (fma:VFP16_HW + (match_operand:VFP16_HW 1 "vsx_register_operand") + (match_operand:VFP16_HW 2 "vsx_register_operand") + (match_operand:VFP16_HW 3 "vsx_register_operand")))] + "can_create_pseudo_p ()" + "#" + "&& 1" + [(pc)] +{ + fp16_vectorization (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMA); + DONE; +}) + +(define_insn_and_split "*fms<mode>4" + [(set (match_operand:VFP16_HW 0 "vsx_register_operand") + (fma:VFP16_HW + (match_operand:VFP16_HW 1 "vsx_register_operand") + (match_operand:VFP16_HW 2 "vsx_register_operand") + (neg:VFP16_HW + (match_operand:VFP16_HW 3 "vsx_register_operand"))))] + "can_create_pseudo_p ()" + "#" + "&& 1" + [(pc)] +{ + fp16_vectorization (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMS); + DONE; +}) + +(define_insn_and_split "*nfma<mode>4" + [(set (match_operand:VFP16_HW 0 "vsx_register_operand") + (neg:VFP16_HW + (fma:VFP16_HW + (match_operand:VFP16_HW 1 "vsx_register_operand") + (match_operand:VFP16_HW 2 "vsx_register_operand") + (match_operand:VFP16_HW 3 "vsx_register_operand"))))] + "can_create_pseudo_p ()" + "#" + "&& 1" + [(pc)] +{ + fp16_vectorization (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*nfms<mode>4" + [(set (match_operand:VFP16_HW 0 "vsx_register_operand") + (neg:VFP16_HW + (fma:VFP16_HW + (match_operand:VFP16_HW 1 "vsx_register_operand") + (match_operand:VFP16_HW 2 "vsx_register_operand") + (neg:VFP16_HW + (match_operand:VFP16_HW 3 "vsx_register_operand")))))] + "can_create_pseudo_p ()" + "#" + "&& 1" + [(pc)] +{ + fp16_vectorization (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) + +;; Vector Pack support. 
+ +(define_expand "vec_pack_trunc_v4sf_v8hf" + [(match_operand:V8HF 0 "vfloat_operand") + (match_operand:V4SF 1 "vfloat_operand") + (match_operand:V4SF 2 "vfloat_operand")] + "TARGET_FLOAT16_HW" +{ + rtx r1 = gen_reg_rtx (V8HFmode); + rtx r2 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_xvcvsphp_v8hf (r1, operands[1])); + emit_insn (gen_xvcvsphp_v8hf (r2, operands[2])); + rs6000_expand_extract_even (operands[0], r1, r2); + DONE; +}) + +(define_expand "vec_pack_trunc_v4sf_v8bf" + [(match_operand:V8BF 0 "vfloat_operand") + (match_operand:V4SF 1 "vfloat_operand") + (match_operand:V4SF 2 "vfloat_operand")] + "TARGET_BFLOAT16_HW" +{ + rtx r1 = gen_reg_rtx (V8BFmode); + rtx r2 = gen_reg_rtx (V8BFmode); + + emit_insn (gen_xvcvspbf16_v8bf (r1, operands[1])); + emit_insn (gen_xvcvspbf16_v8bf (r2, operands[2])); + rs6000_expand_extract_even (operands[0], r1, r2); + DONE; +}) + +;; Unfortunately the machine independent code assumes there is only one +;; 16-bit floating point type. This means we have to choose whether to +;; support packing _Float16 or __bfloat16. It looks like __bfloat16 is +;; more popular, so we choose __bfloat16 to be the default. 
+ +(define_expand "vec_pack_trunc_v4sf" + [(match_operand:V8BF 0 "vfloat_operand") + (match_operand:V4SF 1 "vfloat_operand") + (match_operand:V4SF 2 "vfloat_operand")] + "TARGET_BFLOAT16_HW" +{ + rtx r1 = gen_reg_rtx (V8BFmode); + rtx r2 = gen_reg_rtx (V8BFmode); + + emit_insn (gen_xvcvspbf16_v8bf (r1, operands[1])); + emit_insn (gen_xvcvspbf16_v8bf (r2, operands[2])); + rs6000_expand_extract_even (operands[0], r1, r2); + DONE; +}) + +;; Used for vector conversion to _Float16 +(define_insn "xvcvsphp_v8hf" + [(set (match_operand:V8HF 0 "vsx_register_operand" "=wa") + (unspec:V8HF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_XVCVSPHP_V8HF))] + "TARGET_FLOAT16_HW" + "xvcvsphp %x0,%x1" +[(set_attr "type" "vecfloat")]) + +;; Used for vector conversion to __bfloat16 +(define_insn "xvcvspbf16_v8bf" + [(set (match_operand:V8BF 0 "vsx_register_operand" "=wa") + (unspec:V8BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_XVCVSPBF16_V8BF))] + "TARGET_BFLOAT16_HW" + "xvcvspbf16 %x0,%x1" + [(set_attr "type" "vecfloat")]) + +;; Vector unpack support. Given the name is for the type being +;; unpacked, we can unpack both __bfloat16 and _Float16. 
+ +;; Unpack vector _Float16 +(define_expand "vec_unpacks_hi_v8hf" + [(match_operand:V4SF 0 "vfloat_operand") + (match_operand:V8HF 1 "vfloat_operand")] + "TARGET_FLOAT16_HW" +{ + rtx reg = gen_reg_rtx (V8HFmode); + + rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN); + emit_insn (gen_xvcvhpsp_v8hf (operands[0], reg)); + DONE; +}) + +(define_expand "vec_unpacks_lo_v8hf" + [(match_operand:V4SF 0 "vfloat_operand") + (match_operand:V8HF 1 "vfloat_operand")] + "TARGET_FLOAT16_HW" +{ + rtx reg = gen_reg_rtx (V8HFmode); + + rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN); + emit_insn (gen_xvcvhpsp_v8hf (operands[0], reg)); + DONE; +}) + +;; Used for vector conversion from _Float16 +(define_insn "xvcvhpsp_v8hf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:V8HF 1 "vsx_register_operand" "wa")] + UNSPEC_CVT_FP16_TO_V4SF))] + "TARGET_FLOAT16_HW" + "xvcvhpsp %x0,%x1" + [(set_attr "type" "vecperm")]) + +;; Unpack vector __bfloat16 +(define_expand "vec_unpacks_hi_v8bf" + [(match_operand:V4SF 0 "vfloat_operand") + (match_operand:V8BF 1 "vfloat_operand")] + "TARGET_BFLOAT16_HW" +{ + rtx reg = gen_reg_rtx (V8BFmode); + + rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN); + emit_insn (gen_xvcvbf16spn_v8bf (operands[0], reg)); + DONE; +}) + +(define_expand "vec_unpacks_lo_v8bf" + [(match_operand:V4SF 0 "vfloat_operand") + (match_operand:V8BF 1 "vfloat_operand")] + "TARGET_BFLOAT16_HW" +{ + rtx reg = gen_reg_rtx (V8BFmode); + + rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN); + emit_insn (gen_xvcvbf16spn_v8bf (operands[0], reg)); + DONE; +}) + +;; Used for vector conversion from __bfloat16 +(define_insn "xvcvbf16spn_v8bf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:V8BF 1 "vsx_register_operand" "wa")] + UNSPEC_CVT_FP16_TO_V4SF))] + "TARGET_BFLOAT16_HW" + "xvcvbf16spn %x0,%x1" + [(set_attr 
"type" "vecperm")]) diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index d29081837b3..dd5fcd69e83 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -258,6 +258,19 @@ extern bool vec_const_128bit_to_bytes (rtx, machine_mode, extern unsigned constant_generates_lxvkq (vec_const_128bit_type *); extern unsigned constant_generates_xxspltiw (vec_const_128bit_type *); extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *); + +/* From float16.cc. */ +/* Optimize bfloat16 and float16 operations. */ +enum fp16_operation { + FP16_BINARY, /* Bfloat16/float16 binary op. */ + FP16_FMA, /* (a * b) + c. */ + FP16_FMS, /* (a * b) - c. */ + FP16_NFMA, /* - ((a * b) + c). */ + FP16_NFMS /* - ((a * b) - c). */ +}; + +extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx, + enum fp16_operation); #endif /* RTX_CODE */ #ifdef TREE_CODE diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index a5d1c27424f..c8f19865311 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -87,6 +87,10 @@ rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.cc rs6000-builtins.h $(COMPILE) $< $(POSTCOMPILE) +float16.o: $(srcdir)/config/rs6000/float16.cc + $(COMPILE) $< + $(POSTCOMPILE) + #$(srcdir)/config/rs6000/fusion.md: $(srcdir)/config/rs6000/genfusion.pl # $(srcdir)/config/rs6000/genfusion.pl > $(srcdir)/config/rs6000/fusion.md -- 2.51.1 -- Michael Meissner, IBM PO Box 98, Ayer, Massachusetts, USA, 01432 email: [email protected]
