https://gcc.gnu.org/g:d69a55919efa63983c6e833dd7b4e7591d977363
commit d69a55919efa63983c6e833dd7b4e7591d977363
Author: Michael Meissner <[email protected]>
Date:   Thu Oct 16 18:34:27 2025 -0400

    Add vectorized 16-bit floating point binary operation support.

    2025-10-16  Michael Meissner  <[email protected]>

    gcc/

            * config/rs6000/float16.cc (fp16_vectorization): New function.
            * config/rs6000/float16.md (FP16_BINARY_OP): New code iterator.
            (fp16_names): New code attribute.
            (<fp16_names><mode>3): New insns.
            * config/rs6000/rs6000-protos.h (fp16_vectorization): New declaration.

Diff:
---
 gcc/config/rs6000/float16.cc      | 136 ++++++++++++++++++++++++++++++++++++++
 gcc/config/rs6000/float16.md      |  30 +++++++++
 gcc/config/rs6000/rs6000-protos.h |   2 +
 3 files changed, 168 insertions(+)

diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc
index 7539125a61fc..8ad3a5ea75d1 100644
--- a/gcc/config/rs6000/float16.cc
+++ b/gcc/config/rs6000/float16.cc
@@ -191,3 +191,139 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
   else
     gcc_unreachable ();
 }
+
+
+/* Expand a V8HFmode/V8BFmode vector operation by unpacking each input into
+   high/low V4SFmode halves, operating on the halves, and packing RESULT:
+   ICODE:   Operation to perform (only used for FP16_BINARY).
+   RESULT:  Result of the operation (its mode selects V8HF vs. V8BF).
+   OP1:     Input operand1.
+   OP2:     Input operand2.
+   OP3:     Input operand3 (required for the FMA forms) or NULL_RTX.
+   SUBTYPE: Describe the operation (FP16_BINARY or FP16_{,N}FM{A,S}).  */
+
+void
+fp16_vectorization (enum rtx_code icode,
+                    rtx result,
+                    rtx op1,
+                    rtx op2,
+                    rtx op3,
+                    enum fp16_operation subtype)
+{
+  gcc_assert (can_create_pseudo_p ());
+
+  machine_mode result_mode = GET_MODE (result);
+  rtx op_orig[3] = { op1, op2, op3 };
+  rtx op_hi[3];
+  rtx op_lo[3];
+  rtx result_hi;
+  rtx result_lo;
+  size_t n_opts;
+  /* Determine how many of the input operands have to be unpacked.  */
+  switch (subtype)
+    {
+    case FP16_BINARY:
+      n_opts = 2;
+      break;
+
+    case FP16_FMA:
+    case FP16_FMS:
+    case FP16_NFMA:
+    case FP16_NFMS:
+      n_opts = 3;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Allocate V4SFmode high/low temporaries for the result and each input.  */
+  result_hi = gen_reg_rtx (V4SFmode);
+  result_lo = gen_reg_rtx (V4SFmode);
+
+  for (size_t i = 0; i < n_opts; i++)
+    {
+      gcc_assert (op_orig[i] != NULL_RTX);
+      op_hi[i] = gen_reg_rtx (V4SFmode);        /* high register.  */
+      op_lo[i] = gen_reg_rtx (V4SFmode);        /* low register.  */
+      /* The unpack expander must match the 16-bit format of the input.  */
+      if (result_mode == V8HFmode)
+        {
+          emit_insn (gen_vec_unpacks_hi_v8hf (op_hi[i], op_orig[i]));
+          emit_insn (gen_vec_unpacks_lo_v8hf (op_lo[i], op_orig[i]));
+        }
+
+      else if (result_mode == V8BFmode)
+        {
+          emit_insn (gen_vec_unpacks_hi_v8bf (op_hi[i], op_orig[i]));
+          emit_insn (gen_vec_unpacks_lo_v8bf (op_lo[i], op_orig[i]));
+        }
+
+      else
+        gcc_unreachable ();
+    }
+
+  /* Do 2 sets of V4SFmode operations, one per half.  */
+  switch (subtype)
+    {
+    case FP16_BINARY:
+      emit_insn (gen_rtx_SET (result_hi,
+                              gen_rtx_fmt_ee (icode, V4SFmode,
+                                              op_hi[0],
+                                              op_hi[1])));
+
+      emit_insn (gen_rtx_SET (result_lo,
+                              gen_rtx_fmt_ee (icode, V4SFmode,
+                                              op_lo[0],
+                                              op_lo[1])));
+      break;
+
+    case FP16_FMA:
+    case FP16_FMS:
+    case FP16_NFMA:
+    case FP16_NFMS:
+      {
+        rtx op1_hi = op_hi[0];
+        rtx op2_hi = op_hi[1];
+        rtx op3_hi = op_hi[2];
+
+        rtx op1_lo = op_lo[0];
+        rtx op2_lo = op_lo[1];
+        rtx op3_lo = op_lo[2];
+        /* FMS and NFMS wrap operand 3 in a NEG before building the FMA.  */
+        if (subtype == FP16_FMS || subtype == FP16_NFMS)
+          {
+            op3_hi = gen_rtx_NEG (V4SFmode, op3_hi);
+            op3_lo = gen_rtx_NEG (V4SFmode, op3_lo);
+          }
+
+        rtx op_fma_hi = gen_rtx_FMA (V4SFmode, op1_hi, op2_hi, op3_hi);
+        rtx op_fma_lo = gen_rtx_FMA (V4SFmode, op1_lo, op2_lo, op3_lo);
+        /* NFMA and NFMS wrap the whole FMA expression in a NEG.  */
+        if (subtype == FP16_NFMA || subtype == FP16_NFMS)
+          {
+            op_fma_hi = gen_rtx_NEG (V4SFmode, op_fma_hi);
+            op_fma_lo = gen_rtx_NEG (V4SFmode, op_fma_lo);
+          }
+
+        emit_insn (gen_rtx_SET (result_hi, op_fma_hi));
+        emit_insn (gen_rtx_SET (result_lo, op_fma_lo));
+      }
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector.  */
+  if (result_mode == V8HFmode)
+    emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
+
+  else if (result_mode == V8BFmode)
+    emit_insn (gen_vec_pack_trunc_v4sf (result, result_hi, result_lo));
+
+  else
+    gcc_unreachable ();
+
+  return;
+}
diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md
index 50205db44c2e..264d050429f4 100644
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@@ -59,6 +59,19 @@
                                  (V8BF  "V4BF")
                                  (V8HF  "V4HF")])
 
+;; Binary operators for bfloat16/float16 vectorization.
+(define_code_iterator FP16_BINARY_OP [plus minus mult smax smin])
+
+;; Standard names for the unary/binary/ternary operators
+(define_code_attr fp16_names [(abs   "abs")
+                              (fma   "fma")
+                              (plus  "add")
+                              (minus "sub")
+                              (mult  "mul")
+                              (neg   "neg")
+                              (smax  "smax")
+                              (smin  "smin")])
+
 ;; UNSPEC constants
 (define_c_enum "unspec"
   [UNSPEC_FP16_SHIFT_LEFT_32BIT
@@ -691,6 +704,23 @@
                       operands[3], FP16_NFMS);
   DONE;
 })
+
+;; Add vectorization support for 16-bit floating point.
+;; Binary operators being vectorized.
+(define_insn_and_split "<fp16_names><mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+        (FP16_BINARY_OP:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2], NULL_RTX,
+                      FP16_BINARY);
+  DONE;
+})
 
 
 ;; If we do multiple __bfloat16 operations, between the first and
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index be76cdb507ba..3c4d21299e1a 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -271,6 +271,8 @@ enum fp16_operation {
 
 extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx,
                                         enum fp16_operation);
+extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx,
+                                enum fp16_operation);
 #endif /* RTX_CODE */
 
 #ifdef TREE_CODE
