https://gcc.gnu.org/g:b09338d1e4411cd85819e86025811ba8a87d1ea3
commit b09338d1e4411cd85819e86025811ba8a87d1ea3 Author: Surya Kumari Jangala <jskum...@linux.ibm.com> Date: Mon Jul 7 03:14:48 2025 -0500 MMA+: Add int8 and bfloat16 ger builtins Add builtins __builtin_mma_dmxvbf16gerx2, __builtin_mma_dmxvbf16gerx2nn, __builtin_mma_dmxvbf16gerx2np, __builtin_mma_dmxvbf16gerx2pn, __builtin_mma_dmxvbf16gerx2pp, __builtin_mma_pmdmxvbf16gerx2, __builtin_mma_pmdmxvbf16gerx2nn, __builtin_mma_pmdmxvbf16gerx2np, __builtin_mma_pmdmxvbf16gerx2pn, __builtin_mma_pmdmxvbf16gerx2pp, __builtin_mma_dmxvi8gerx4spp, __builtin_mma_pmdmxvi8gerx4spp Diff: --- gcc/config/rs6000/mma.md | 94 ++++++++++- gcc/config/rs6000/rs6000-builtins.def | 85 ++++++++++ gcc/testsuite/gcc.target/powerpc/dmf-builtin-1.c | 202 +++++++++++++++++++++++ 3 files changed, 372 insertions(+), 9 deletions(-) diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md index fd3a0e592d88..14f33724d69c 100644 --- a/gcc/config/rs6000/mma.md +++ b/gcc/config/rs6000/mma.md @@ -100,8 +100,20 @@ UNSPEC_DMF_DMXOR UNSPEC_DMF_DMXVI8GERX4 UNSPEC_DMF_DMXVI8GERX4PP + UNSPEC_DMF_DMXVI8GERX4SPP UNSPEC_DMF_PMDMXVI8GERX4 UNSPEC_DMF_PMDMXVI8GERX4PP + UNSPEC_DMF_PMDMXVI8GERX4SPP + UNSPEC_DMF_DMXVBF16GERX2 + UNSPEC_DMF_DMXVBF16GERX2PP + UNSPEC_DMF_DMXVBF16GERX2PN + UNSPEC_DMF_DMXVBF16GERX2NP + UNSPEC_DMF_DMXVBF16GERX2NN + UNSPEC_DMF_PMDMXVBF16GERX2 + UNSPEC_DMF_PMDMXVBF16GERX2PP + UNSPEC_DMF_PMDMXVBF16GERX2PN + UNSPEC_DMF_PMDMXVBF16GERX2NP + UNSPEC_DMF_PMDMXVBF16GERX2NN ]) (define_c_enum "unspecv" @@ -145,7 +157,8 @@ (define_int_iterator MMA_PV [UNSPEC_MMA_XVF64GER]) ;; DMF instructions with 1 vector pair and 1 vector arguments -(define_int_iterator DMF_PV [UNSPEC_DMF_DMXVI8GERX4]) +(define_int_iterator DMF_PV [UNSPEC_DMF_DMXVI8GERX4 + UNSPEC_DMF_DMXVBF16GERX2]) ;; MMA instructions with 1 accumulator, 1 vector pair and 1 vector arguments (define_int_iterator MMA_APV [UNSPEC_MMA_XVF64GERPP @@ -154,7 +167,12 @@ UNSPEC_MMA_XVF64GERNN]) ;; DMF instructions with 1 dmr, 1 vector pair and 1 vector 
arguments -(define_int_iterator DMF_DPV [UNSPEC_DMF_DMXVI8GERX4PP]) +(define_int_iterator DMF_DPV [UNSPEC_DMF_DMXVI8GERX4PP + UNSPEC_DMF_DMXVI8GERX4SPP + UNSPEC_DMF_DMXVBF16GERX2PP + UNSPEC_DMF_DMXVBF16GERX2PN + UNSPEC_DMF_DMXVBF16GERX2NP + UNSPEC_DMF_DMXVBF16GERX2NN]) ;; MMA instructions with 2 vector, 2 4-bit and 1 8-bit arguments (define_int_iterator MMA_VVI4I4I8 [UNSPEC_MMA_PMXVI4GER8]) @@ -211,7 +229,19 @@ ;; DMF instructions with 1dmr, 1 vector pair, 1 vector and 1 8-bit and ;; 2 4-bit arguments -(define_int_iterator DMF_DPVI8I4I4 [UNSPEC_DMF_PMDMXVI8GERX4PP]) +(define_int_iterator DMF_DPVI8I4I4 [UNSPEC_DMF_PMDMXVI8GERX4PP + UNSPEC_DMF_PMDMXVI8GERX4SPP]) + +;; DMF instructions with 1 vector pair, 1 vector, 1 8-bit, 1 4-bit +;; and 1 2-bit arguments +(define_int_iterator DMF_PVI8I4I2 [UNSPEC_DMF_PMDMXVBF16GERX2]) + +;; DMF instructions with 1dmr, 1 vector pair, 1 vector, 1 8-bit, +;; 1 4-bit and 1 2-bit arguments +(define_int_iterator DMF_DPVI8I4I2 [UNSPEC_DMF_PMDMXVBF16GERX2PP + UNSPEC_DMF_PMDMXVBF16GERX2PN + UNSPEC_DMF_PMDMXVBF16GERX2NP + UNSPEC_DMF_PMDMXVBF16GERX2NN]) (define_int_attr acc [(UNSPEC_MMA_XXMFACC "xxmfacc") (UNSPEC_MMA_XXMTACC "xxmtacc")]) @@ -243,13 +273,20 @@ (UNSPEC_MMA_XVF32GERNN "xvf32gernn")]) (define_int_attr pv [(UNSPEC_MMA_XVF64GER "xvf64ger") - (UNSPEC_DMF_DMXVI8GERX4 "dmxvi8gerx4")]) + (UNSPEC_DMF_DMXVI8GERX4 "dmxvi8gerx4") + (UNSPEC_DMF_DMXVBF16GERX2 "dmxvbf16gerx2")]) (define_int_attr apv [(UNSPEC_MMA_XVF64GERPP "xvf64gerpp") (UNSPEC_MMA_XVF64GERPN "xvf64gerpn") (UNSPEC_MMA_XVF64GERNP "xvf64gernp") - (UNSPEC_MMA_XVF64GERNN "xvf64gernn") - (UNSPEC_DMF_DMXVI8GERX4PP "dmxvi8gerx4pp")]) + (UNSPEC_MMA_XVF64GERNN "xvf64gernn")]) + +(define_int_attr dpv [(UNSPEC_DMF_DMXVI8GERX4PP "dmxvi8gerx4pp") + (UNSPEC_DMF_DMXVI8GERX4SPP "dmxvi8gerx4spp") + (UNSPEC_DMF_DMXVBF16GERX2PP "dmxvbf16gerx2pp") + (UNSPEC_DMF_DMXVBF16GERX2PN "dmxvbf16gerx2pn") + (UNSPEC_DMF_DMXVBF16GERX2NP "dmxvbf16gerx2np") + (UNSPEC_DMF_DMXVBF16GERX2NN "dmxvbf16gerx2nn")]) 
;; The "pm" prefix is not in these expansions, so that we can generate ;; pmdmxvi4ger8 on systems with dense math registers and xvi4ger8 on systems @@ -295,7 +332,15 @@ (define_int_attr pvi8i4i4 [(UNSPEC_DMF_PMDMXVI8GERX4 "pmdmxvi8gerx4")]) -(define_int_attr dpvi8i4i4 [(UNSPEC_DMF_PMDMXVI8GERX4PP "pmdmxvi8gerx4pp")]) +(define_int_attr dpvi8i4i4 [(UNSPEC_DMF_PMDMXVI8GERX4PP "pmdmxvi8gerx4pp") + (UNSPEC_DMF_PMDMXVI8GERX4SPP "pmdmxvi8gerx4spp")]) + +(define_int_attr pvi8i4i2 [(UNSPEC_DMF_PMDMXVBF16GERX2 "pmdmxvbf16gerx2")]) + +(define_int_attr dpvi8i4i2 [(UNSPEC_DMF_PMDMXVBF16GERX2PP "pmdmxvbf16gerx2pp") + (UNSPEC_DMF_PMDMXVBF16GERX2PN "pmdmxvbf16gerx2pn") + (UNSPEC_DMF_PMDMXVBF16GERX2NP "pmdmxvbf16gerx2np") + (UNSPEC_DMF_PMDMXVBF16GERX2NN "pmdmxvbf16gerx2nn")]) ;; Vector pair support. OOmode can only live in VSRs. (define_expand "movoo" @@ -981,7 +1026,7 @@ } [(set_attr "type" "dmf")]) -(define_insn "dmf_<apv>" +(define_insn "dmf_<dpv>" [(set (match_operand:TDO 0 "accumulator_operand" "=wD") (unspec:TDO [(match_operand:TDO 1 "accumulator_operand" "0") (match_operand:OO 2 "vsx_register_operand" "wa") @@ -989,7 +1034,7 @@ DMF_DPV))] "TARGET_DENSE_MATH" { - return "<apv> %0,%x2,%x3"; + return "<dpv> %0,%x2,%x3"; } [(set_attr "type" "dmf")]) @@ -1023,3 +1068,34 @@ } [(set_attr "type" "dmf") (set_attr "prefixed" "yes")]) + +(define_insn "dmf_<pvi8i4i2>" + [(set (match_operand:TDO 0 "accumulator_operand" "=wD") + (unspec:TDO [(match_operand:OO 1 "vsx_register_operand" "wa") + (match_operand:V16QI 2 "vsx_register_operand" "wa") + (match_operand:SI 3 "u8bit_cint_operand" "n") + (match_operand:SI 4 "const_0_to_15_operand" "n") + (match_operand:SI 5 "const_0_to_3_operand" "n")] + DMF_PVI8I4I2))] + "TARGET_DENSE_MATH" +{ + return "<pvi8i4i2> %0,%x1,%x2,%3,%4,%5"; +} + [(set_attr "type" "dmf") + (set_attr "prefixed" "yes")]) + +(define_insn "dmf_<dpvi8i4i2>" + [(set (match_operand:TDO 0 "accumulator_operand" "=wD") + (unspec:TDO [(match_operand:TDO 1 "accumulator_operand" "0") + 
(match_operand:OO 2 "vsx_register_operand" "wa") + (match_operand:V16QI 3 "vsx_register_operand" "wa") + (match_operand:SI 4 "u8bit_cint_operand" "n") + (match_operand:SI 5 "const_0_to_15_operand" "n") + (match_operand:SI 6 "const_0_to_3_operand" "n")] + DMF_DPVI8I4I2))] + "TARGET_DENSE_MATH" +{ + return "<dpvi8i4i2> %0,%x2,%x3,%4,%5,%6"; +} + [(set_attr "type" "dmf") + (set_attr "prefixed" "yes")]) diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index 3630c6a6136c..07ead4b9ffee 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -3965,6 +3965,12 @@ dm1024 __builtin_mma_dmxvi8gerx4pp_internal (dm1024, v256, vuc); DMXVI8GERX4PP_INTERNAL dmf_dmxvi8gerx4pp {dm} + void __builtin_mma_dmxvi8gerx4spp (dm1024 *, v256, vuc); + DMXVI8GERX4SPP nothing {dm,dmint,dmr} + + dm1024 __builtin_mma_dmxvi8gerx4spp_internal (dm1024, v256, vuc); + DMXVI8GERX4SPP_INTERNAL dmf_dmxvi8gerx4spp {dm} + void __builtin_mma_pmdmxvi8gerx4 (dm1024 *, v256, vuc, const int<8>, \ const int<4>, const int<4>); PMDMXVI8GERX4 nothing {dm,pair,dmint} @@ -3982,6 +3988,85 @@ const int<4>); PMDMXVI8GERX4PP_INTERNAL dmf_pmdmxvi8gerx4pp {dm,pair} + void __builtin_mma_pmdmxvi8gerx4spp (dm1024 *, v256, vuc, const int<8>, \ + const int<4>, const int<4>); + PMDMXVI8GERX4SPP nothing {dm,pair,dmint,dmr} + + dm1024 __builtin_mma_pmdmxvi8gerx4spp_internal (dm1024, v256, vuc, \ + const int<8>, const int<4>, \ + const int<4>); + PMDMXVI8GERX4SPP_INTERNAL dmf_pmdmxvi8gerx4spp {dm,pair} + + void __builtin_mma_dmxvbf16gerx2 (dm1024 *, v256, vuc); + DMXVBF16GERX2 nothing {dm,dmint} + + dm1024 __builtin_mma_dmxvbf16gerx2_internal (v256, vuc); + DMXVBF16GERX2_INTERNAL dmf_dmxvbf16gerx2 {dm} + + void __builtin_mma_dmxvbf16gerx2pp (dm1024 *, v256, vuc); + DMXVBF16GERX2PP nothing {dm,dmint,dmr} + + dm1024 __builtin_mma_dmxvbf16gerx2pp_internal (dm1024, v256, vuc); + DMXVBF16GERX2PP_INTERNAL dmf_dmxvbf16gerx2pp {dm} + + void 
__builtin_mma_dmxvbf16gerx2pn (dm1024 *, v256, vuc); + DMXVBF16GERX2PN nothing {dm,dmint,dmr} + + dm1024 __builtin_mma_dmxvbf16gerx2pn_internal (dm1024, v256, vuc); + DMXVBF16GERX2PN_INTERNAL dmf_dmxvbf16gerx2pn {dm} + + void __builtin_mma_dmxvbf16gerx2np (dm1024 *, v256, vuc); + DMXVBF16GERX2NP nothing {dm,dmint,dmr} + + dm1024 __builtin_mma_dmxvbf16gerx2np_internal (dm1024, v256, vuc); + DMXVBF16GERX2NP_INTERNAL dmf_dmxvbf16gerx2np {dm} + + void __builtin_mma_dmxvbf16gerx2nn (dm1024 *, v256, vuc); + DMXVBF16GERX2NN nothing {dm,dmint,dmr} + + dm1024 __builtin_mma_dmxvbf16gerx2nn_internal (dm1024, v256, vuc); + DMXVBF16GERX2NN_INTERNAL dmf_dmxvbf16gerx2nn {dm} + + void __builtin_mma_pmdmxvbf16gerx2 (dm1024 *, v256, vuc, const int<8>, \ + const int<4>, const int<2>); + PMDMXVBF16GERX2 nothing {dm,pair,dmint} + + dm1024 __builtin_mma_pmdmxvbf16gerx2_internal (v256, vuc, const int<8>, \ + const int<4>, const int<2>); + PMDMXVBF16GERX2_INTERNAL dmf_pmdmxvbf16gerx2 {dm,pair} + + void __builtin_mma_pmdmxvbf16gerx2pp (dm1024 *, v256, vuc, const int<8>, \ + const int<4>, const int<2>); + PMDMXVBF16GERX2PP nothing {dm,pair,dmint,dmr} + + dm1024 __builtin_mma_pmdmxvbf16gerx2pp_internal (dm1024, v256, vuc, const int<8>, \ + const int<4>, const int<2>); + PMDMXVBF16GERX2PP_INTERNAL dmf_pmdmxvbf16gerx2pp {dm,pair} + + void __builtin_mma_pmdmxvbf16gerx2pn (dm1024 *, v256, vuc, const int<8>, \ + const int<4>, const int<2>); + PMDMXVBF16GERX2PN nothing {dm,pair,dmint,dmr} + + dm1024 __builtin_mma_pmdmxvbf16gerx2pn_internal (dm1024, v256, vuc, const int<8>, \ + const int<4>, const int<2>); + PMDMXVBF16GERX2PN_INTERNAL dmf_pmdmxvbf16gerx2pn {dm,pair} + + void __builtin_mma_pmdmxvbf16gerx2np (dm1024 *, v256, vuc, const int<8>, \ + const int<4>, const int<2>); + PMDMXVBF16GERX2NP nothing {dm,pair,dmint,dmr} + + dm1024 __builtin_mma_pmdmxvbf16gerx2np_internal (dm1024, v256, vuc, const int<8>, \ + const int<4>, const int<2>); + PMDMXVBF16GERX2NP_INTERNAL dmf_pmdmxvbf16gerx2np {dm,pair} 
+ + void __builtin_mma_pmdmxvbf16gerx2nn (dm1024 *, v256, vuc, const int<8>, \ + const int<4>, const int<2>); + PMDMXVBF16GERX2NN nothing {dm,pair,dmint,dmr} + + dm1024 __builtin_mma_pmdmxvbf16gerx2nn_internal (dm1024, v256, vuc, const int<8>, \ + const int<4>, const int<2>); + PMDMXVBF16GERX2NN_INTERNAL dmf_pmdmxvbf16gerx2nn {dm,pair} + [future] const signed int __builtin_saturate_subtract32 (signed int, signed int); SAT_SUBSI sat_subsi3 {} diff --git a/gcc/testsuite/gcc.target/powerpc/dmf-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/dmf-builtin-1.c new file mode 100644 index 000000000000..d45a19b1a684 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/dmf-builtin-1.c @@ -0,0 +1,202 @@ +/* { dg-do compile } */ +/* { dg-options "-mdejagnu-cpu=future -O2" } */ + +typedef unsigned char vec_t __attribute__((vector_size(16))); + +void +foo (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmsetdmrz (&dmr); + __builtin_mma_dmxvbf16gerx2 (&dmr, vp, vec); + *dst = dmr; +} + +void +bar (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr = dst[0];; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmxvbf16gerx2 (&dmr, vp, vec); + dst[1] = dmr; +} + +/* { dg-final { scan-assembler-times {\mdmxvbf16gerx2\M} 2 } } */ + +void +foo_1 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmsetdmrz (&dmr); + __builtin_mma_dmxvbf16gerx2nn (&dmr, vp, vec); + *dst = dmr; +} + +void +bar_1 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr = dst[0];; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmxvbf16gerx2nn (&dmr, vp, vec); + dst[1] = dmr; +} + +/* { dg-final { scan-assembler-times {\mdmxvbf16gerx2nn\M} 2 } } */ + +void +foo_2 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmsetdmrz (&dmr); + 
__builtin_mma_dmxvbf16gerx2np (&dmr, vp, vec); + *dst = dmr; +} + +void +bar_2 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr = dst[0];; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmxvbf16gerx2np (&dmr, vp, vec); + dst[1] = dmr; +} + +/* { dg-final { scan-assembler-times {\mdmxvbf16gerx2np\M} 2 } } */ + +void +foo_3 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmsetdmrz (&dmr); + __builtin_mma_dmxvbf16gerx2pn (&dmr, vp, vec); + *dst = dmr; +} + +void +bar_3 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr = dst[0];; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmxvbf16gerx2pn (&dmr, vp, vec); + dst[1] = dmr; +} + +/* { dg-final { scan-assembler-times {\mdmxvbf16gerx2pn\M} 2 } } */ + +void +foo_4 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmsetdmrz (&dmr); + __builtin_mma_dmxvbf16gerx2pp (&dmr, vp, vec); + *dst = dmr; +} + +void +bar_4 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr = dst[0];; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmxvbf16gerx2pp (&dmr, vp, vec); + dst[1] = dmr; +} + +/* { dg-final { scan-assembler-times {\mdmxvbf16gerx2pp\M} 2 } } */ + +void +foo_5 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_pmdmxvbf16gerx2 (dst, vp, vec, 255, 15, 2); +} + +/* { dg-final { scan-assembler-times {\mpmdmxvbf16gerx2\M} 1 } } */ + +void +foo_6 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_pmdmxvbf16gerx2nn (dst, vp, vec, 255, 15, 2); +} + +/* { dg-final { scan-assembler-times {\mpmdmxvbf16gerx2nn\M} 1 } } */ + +void +foo_7 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_pmdmxvbf16gerx2np (dst, vp, vec, 255, 15, 2); +} + +/* 
{ dg-final { scan-assembler-times {\mpmdmxvbf16gerx2np\M} 1 } } */ + +void +foo_8 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_pmdmxvbf16gerx2pn (dst, vp, vec, 255, 15, 2); +} + +/* { dg-final { scan-assembler-times {\mpmdmxvbf16gerx2pn\M} 1 } } */ + +void +foo_9 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_pmdmxvbf16gerx2pp (dst, vp, vec, 255, 15, 2); +} + +/* { dg-final { scan-assembler-times {\mpmdmxvbf16gerx2pp\M} 1 } } */ + +void +foo_10 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_pmdmxvi8gerx4spp (dst, vp, vec, 255, 15, 15); +} + +/* { dg-final { scan-assembler-times {\mpmdmxvi8gerx4spp\M} 1 } } */ + +void +foo_11 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmsetdmrz (&dmr); + __builtin_mma_dmxvi8gerx4spp (&dmr, vp, vec); + *dst = dmr; +} + +void +bar_11 (__dmr *dst, __vector_pair *vpp, vec_t *src) +{ + __dmr dmr = dst[0];; + __vector_pair vp = *vpp; + vec_t vec = *src; + __builtin_mma_dmxvi8gerx4spp (&dmr, vp, vec); + dst[1] = dmr; +} + +/* { dg-final { scan-assembler-times {\mdmxvi8gerx4spp\M} 2 } } */