https://gcc.gnu.org/g:b75aa06732e45bc81d48858c4b69a04f8396cb6f
commit b75aa06732e45bc81d48858c4b69a04f8396cb6f Author: Michael Meissner <meiss...@linux.ibm.com> Date: Fri Sep 27 00:08:55 2024 -0400 Initial vector-pair.h support 2024-09-26 Michael Meissner <meiss...@linux.ibm.com> gcc/ * config.gcc (powerpc*-*-*): Add vector-pair.h to extra headers. * config/rs6000/vector-pair.h: New file. * doc/extend.texi (PowerPC Vector Pair Support): Document the vector pair support functions. Diff: --- gcc/config.gcc | 2 +- gcc/config/rs6000/vector-pair.h | 563 ++++++++++++++++++++++++++++++++++++++++ gcc/doc/extend.texi | 98 +++++++ 3 files changed, 662 insertions(+), 1 deletion(-) diff --git a/gcc/config.gcc b/gcc/config.gcc index 0b794e977f6a..3627bed8b863 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -537,7 +537,7 @@ powerpc*-*-*) extra_headers="${extra_headers} pmmintrin.h tmmintrin.h smmintrin.h" extra_headers="${extra_headers} nmmintrin.h immintrin.h x86gprintrin.h" extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h" - extra_headers="${extra_headers} amo.h" + extra_headers="${extra_headers} amo.h vector-pair.h" case x$with_cpu in xpowerpc64|xdefault64|x6[23]0|x970|xG5|xpower[3456789]|xpower1[01]|xpower6x|xrs64a|xcell|xa2|xe500mc64|xe5500|xe6500|xfuture) cpu_is_64bit=yes diff --git a/gcc/config/rs6000/vector-pair.h b/gcc/config/rs6000/vector-pair.h new file mode 100644 index 000000000000..e0023842f331 --- /dev/null +++ b/gcc/config/rs6000/vector-pair.h @@ -0,0 +1,563 @@ +/* PowerPC vector pair include file. + Copyright (C) 2024 Free Software Foundation, Inc. + Contributed by Aldy Hernandez (al...@redhat.com). + Rewritten by Paolo Bonzini (bonz...@gnu.org). + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version.
+ + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* Provide support for vector pairs, even on systems that do not have native + support for loading and storing pairs of vectors. */ + +#ifndef _VECTOR_PAIR_H +#define _VECTOR_PAIR_H 1 + +/* During testing, allow vector-pair.h to be included multiple times. */ +#undef vector_pair_t +#undef vector_pair_f64_t +#undef vector_pair_f32_t + +#undef vpair_f64_abs +#undef vpair_f64_add +#undef vpair_f64_div +#undef vpair_f64_fma +#undef vpair_f64_fms +#undef vpair_f64_max +#undef vpair_f64_min +#undef vpair_f64_mul +#undef vpair_f64_nabs +#undef vpair_f64_neg +#undef vpair_f64_nfma +#undef vpair_f64_nfms +#undef vpair_f64_splat +#undef vpair_f64_sqrt +#undef vpair_f64_sub + +#undef vpair_f32_abs +#undef vpair_f32_add +#undef vpair_f32_div +#undef vpair_f32_fma +#undef vpair_f32_fms +#undef vpair_f32_max +#undef vpair_f32_min +#undef vpair_f32_mul +#undef vpair_f32_nabs +#undef vpair_f32_neg +#undef vpair_f32_nfma +#undef vpair_f32_nfms +#undef vpair_f32_splat +#undef vpair_f32_sqrt +#undef vpair_f32_sub + +/* Union of the various vector pair types. For testing, allow vector-pair.h to + be included multiple times, so protect the union from re-declaration.
*/ +#ifndef __VECTOR_PAIR_UNION__ +#define __VECTOR_PAIR_UNION__ 1 + +union __vpair_union { + +#ifdef __MMA__ + __vector_pair __vpair; +#endif + + vector double __vp_f64[2]; + vector float __vp_f32[2]; + vector unsigned char __vp_uc[2]; +}; + +typedef union __vpair_union vector_pair_t; +typedef union __vpair_union vector_pair_f64_t; +typedef union __vpair_union vector_pair_f32_t; +typedef union __vpair_union *__vpair_ptr_t; + +#endif /* __VECTOR_PAIR_UNION__. */ + +#if !__VPAIR_ASM__ && !__VPAIR_NOP10__ +#if __MMA__ +#define __VPAIR_ASM__ 1 + +#else +#define __VPAIR_NOP10__ 1 +#endif +#endif + +/* ISA 3.1 (power10/power11) support with explicit vector pair type. The asm helper macros below have no trailing semicolon (the caller supplies it), so each vpair_* call is a single statement that is safe in an unbraced if/else. */ + +#if __VPAIR_ASM__ && __MMA__ + +#undef __VPAIR_FP_UNARY_ASM +#define __VPAIR_FP_UNARY_ASM(OPCODE, R, A) \ + __asm__ (OPCODE " %x0,%x1\n\t" OPCODE " %x0+1,%x1+1" \ + : "=wa" (((__vpair_ptr_t)(R))->__vpair) \ + : "wa" (((__vpair_ptr_t)(A))->__vpair)) + +#undef __VPAIR_FP_BINARY_ASM +#define __VPAIR_FP_BINARY_ASM(OPCODE, R, A, B) \ + __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1" \ + : "=wa" (((__vpair_ptr_t)(R))->__vpair) \ + : "wa" (((__vpair_ptr_t)(A))->__vpair), \ + "wa" (((__vpair_ptr_t)(B))->__vpair)) + + /* Note the 'a' version of the FMA instruction must be used, because C is tied to the result register by the "0" constraint.
*/ +#undef __VPAIR_FP_FMA_ASM +#define __VPAIR_FP_FMA_ASM(OPCODE, R, A, B, C) \ + __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1" \ + : "=wa" (((__vpair_ptr_t)(R))->__vpair) \ + : "wa" (((__vpair_ptr_t)(A))->__vpair), \ + "wa" (((__vpair_ptr_t)(B))->__vpair), \ + "0" (((__vpair_ptr_t)(C))->__vpair)) + +#define vpair_f64_splat(R, A) \ + __asm__ ("xxlor %x0+1,%x1,%x1" \ + : "=wa" (((__vpair_ptr_t)(R))->__vpair) \ + : "0" (__builtin_vec_splats ((double) (A)))) + +#define vpair_f64_abs(R,A) __VPAIR_FP_UNARY_ASM ("xvabsdp", R, A) +#define vpair_f64_nabs(R,A) __VPAIR_FP_UNARY_ASM ("xvnabsdp", R, A) +#define vpair_f64_neg(R,A) __VPAIR_FP_UNARY_ASM ("xvnegdp", R, A) +#define vpair_f64_sqrt(R,A) __VPAIR_FP_UNARY_ASM ("xvsqrtdp", R, A) + +#define vpair_f64_add(R,A,B) __VPAIR_FP_BINARY_ASM ("xvadddp", R, A, B) +#define vpair_f64_div(R,A,B) __VPAIR_FP_BINARY_ASM ("xvdivdp", R, A, B) +#define vpair_f64_max(R,A,B) __VPAIR_FP_BINARY_ASM ("xvmaxdp", R, A, B) +#define vpair_f64_min(R,A,B) __VPAIR_FP_BINARY_ASM ("xvmindp", R, A, B) +#define vpair_f64_mul(R,A,B) __VPAIR_FP_BINARY_ASM ("xvmuldp", R, A, B) +#define vpair_f64_sub(R,A,B) __VPAIR_FP_BINARY_ASM ("xvsubdp", R, A, B) + +#define vpair_f64_fma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmaddadp", R, A, B, C) +#define vpair_f64_fms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmsubadp", R, A, B, C) +#define vpair_f64_nfma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmaddadp", R, A, B, C) +#define vpair_f64_nfms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmsubadp", R, A, B, C) + +#define vpair_f32_splat(R, A) \ + __asm__ ("xxlor %x0+1,%x1,%x1" \ + : "=wa" (((__vpair_ptr_t)(R))->__vpair) \ + : "0" (__builtin_vec_splats ((float) (A)))) + +#define vpair_f32_abs(R,A) __VPAIR_FP_UNARY_ASM ("xvabssp", R, A) +#define vpair_f32_nabs(R,A) __VPAIR_FP_UNARY_ASM ("xvnabssp", R, A) +#define vpair_f32_neg(R,A) __VPAIR_FP_UNARY_ASM ("xvnegsp", R, A) +#define vpair_f32_sqrt(R,A) __VPAIR_FP_UNARY_ASM ("xvsqrtsp", R, A) + +#define vpair_f32_add(R,A,B) __VPAIR_FP_BINARY_ASM
("xvaddsp", R, A, B) +#define vpair_f32_div(R,A,B) __VPAIR_FP_BINARY_ASM ("xvdivsp", R, A, B) +#define vpair_f32_max(R,A,B) __VPAIR_FP_BINARY_ASM ("xvmaxsp", R, A, B) +#define vpair_f32_min(R,A,B) __VPAIR_FP_BINARY_ASM ("xvminsp", R, A, B) +#define vpair_f32_mul(R,A,B) __VPAIR_FP_BINARY_ASM ("xvmulsp", R, A, B) +#define vpair_f32_sub(R,A,B) __VPAIR_FP_BINARY_ASM ("xvsubsp", R, A, B) + +#define vpair_f32_fma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmaddasp", R, A, B, C) +#define vpair_f32_fms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmsubasp", R, A, B, C) +#define vpair_f32_nfma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmaddasp", R, A, B, C) +#define vpair_f32_nfms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmsubasp", R, A, B, C) + + +#else /* ISA 2.07/3.0 support for machines without vector pair support. */ + +/* vector pair double operations on power8/power9. */ + +#define vpair_f64_splat(R, A) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vr->__vp_f64[0] = __vr->__vp_f64[1] \ + = __builtin_vec_splats ((double)(A)); \ + } \ + while (0) + +#define vpair_f64_abs(R, A) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vr->__vp_f64[0] = __builtin_vsx_xvabsdp (__va->__vp_f64[0]); \ + __vr->__vp_f64[1] = __builtin_vsx_xvabsdp (__va->__vp_f64[1]); \ + } \ + while (0) + +#define vpair_f64_nabs(R, A) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vr->__vp_f64[0] = __builtin_vsx_xvnabsdp (__va->__vp_f64[0]); \ + __vr->__vp_f64[1] = __builtin_vsx_xvnabsdp (__va->__vp_f64[1]); \ + } \ + while (0) + +#define vpair_f64_neg(R, A) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vr->__vp_f64[0] = - __va->__vp_f64[0]; \ + __vr->__vp_f64[1] = - __va->__vp_f64[1]; \ + } \ + while (0) + +#define vpair_f64_sqrt(R, A) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ +
__vr->__vp_f64[0] = __builtin_vsx_xvsqrtdp (__va->__vp_f64[0]); \ + __vr->__vp_f64[1] = __builtin_vsx_xvsqrtdp (__va->__vp_f64[1]); \ + } \ + while (0) + +#define vpair_f64_add(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f64[0] = __va->__vp_f64[0] + __vb->__vp_f64[0]; \ + __vr->__vp_f64[1] = __va->__vp_f64[1] + __vb->__vp_f64[1]; \ + } \ + while (0) + +#define vpair_f64_div(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f64[0] = __va->__vp_f64[0] / __vb->__vp_f64[0]; \ + __vr->__vp_f64[1] = __va->__vp_f64[1] / __vb->__vp_f64[1]; \ + } \ + while (0) + +#define vpair_f64_max(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f64[0] \ + = __builtin_vsx_xvmaxdp (__va->__vp_f64[0], __vb->__vp_f64[0]); \ + __vr->__vp_f64[1] \ + = __builtin_vsx_xvmaxdp (__va->__vp_f64[1], __vb->__vp_f64[1]); \ + } \ + while (0) + +#define vpair_f64_min(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f64[0] \ + = __builtin_vsx_xvmindp (__va->__vp_f64[0], __vb->__vp_f64[0]); \ + __vr->__vp_f64[1] \ + = __builtin_vsx_xvmindp (__va->__vp_f64[1], __vb->__vp_f64[1]); \ + } \ + while (0) + +#define vpair_f64_mul(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f64[0] = __va->__vp_f64[0] * __vb->__vp_f64[0]; \ + __vr->__vp_f64[1] = __va->__vp_f64[1] * __vb->__vp_f64[1]; \ + } \ + while (0) + +#define vpair_f64_sub(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ +
__vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f64[0] = __va->__vp_f64[0] - __vb->__vp_f64[0]; \ + __vr->__vp_f64[1] = __va->__vp_f64[1] - __vb->__vp_f64[1]; \ + } \ + while (0) + +#define vpair_f64_fma(R, A, B, C) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vpair_ptr_t __vc = (__vpair_ptr_t)(C); \ + __vr->__vp_f64[0] \ + = __builtin_vsx_xvmadddp (__va->__vp_f64[0], \ + __vb->__vp_f64[0], \ + __vc->__vp_f64[0]); \ + __vr->__vp_f64[1] \ + = __builtin_vsx_xvmadddp (__va->__vp_f64[1], \ + __vb->__vp_f64[1], \ + __vc->__vp_f64[1]); \ + } \ + while (0) + +#define vpair_f64_fms(R, A, B, C) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vpair_ptr_t __vc = (__vpair_ptr_t)(C); \ + __vr->__vp_f64[0] \ + = __builtin_vsx_xvmsubdp (__va->__vp_f64[0], \ + __vb->__vp_f64[0], \ + __vc->__vp_f64[0]); \ + __vr->__vp_f64[1] \ + = __builtin_vsx_xvmsubdp (__va->__vp_f64[1], \ + __vb->__vp_f64[1], \ + __vc->__vp_f64[1]); \ + } \ + while (0) + +#define vpair_f64_nfma(R, A, B, C) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vpair_ptr_t __vc = (__vpair_ptr_t)(C); \ + __vr->__vp_f64[0] \ + = __builtin_vsx_xvnmadddp (__va->__vp_f64[0], \ + __vb->__vp_f64[0], \ + __vc->__vp_f64[0]); \ + __vr->__vp_f64[1] \ + = __builtin_vsx_xvnmadddp (__va->__vp_f64[1], \ + __vb->__vp_f64[1], \ + __vc->__vp_f64[1]); \ + } \ + while (0) + +#define vpair_f64_nfms(R, A, B, C) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vpair_ptr_t __vc = (__vpair_ptr_t)(C); \ + __vr->__vp_f64[0] \ + = __builtin_vsx_xvnmsubdp (__va->__vp_f64[0], \ +
__vb->__vp_f64[0], \ + __vc->__vp_f64[0]); \ + __vr->__vp_f64[1] \ + = __builtin_vsx_xvnmsubdp (__va->__vp_f64[1], \ + __vb->__vp_f64[1], \ + __vc->__vp_f64[1]); \ + } \ + while (0) + +/* vector pair float operations on power8/power9. */ + +#define vpair_f32_splat(R, A) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vr->__vp_f32[0] = __vr->__vp_f32[1] \ + = __builtin_vec_splats ((float)(A)); \ + } \ + while (0) + +#define vpair_f32_abs(R, A) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vr->__vp_f32[0] = __builtin_vsx_xvabssp (__va->__vp_f32[0]); \ + __vr->__vp_f32[1] = __builtin_vsx_xvabssp (__va->__vp_f32[1]); \ + } \ + while (0) + +#define vpair_f32_nabs(R, A) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vr->__vp_f32[0] = __builtin_vsx_xvnabssp (__va->__vp_f32[0]); \ + __vr->__vp_f32[1] = __builtin_vsx_xvnabssp (__va->__vp_f32[1]); \ + } \ + while (0) + +#define vpair_f32_neg(R, A) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vr->__vp_f32[0] = - __va->__vp_f32[0]; \ + __vr->__vp_f32[1] = - __va->__vp_f32[1]; \ + } \ + while (0) + +#define vpair_f32_sqrt(R, A) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vr->__vp_f32[0] = __builtin_vsx_xvsqrtsp (__va->__vp_f32[0]); \ + __vr->__vp_f32[1] = __builtin_vsx_xvsqrtsp (__va->__vp_f32[1]); \ + } \ + while (0) + +#define vpair_f32_add(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f32[0] = __va->__vp_f32[0] + __vb->__vp_f32[0]; \ + __vr->__vp_f32[1] = __va->__vp_f32[1] + __vb->__vp_f32[1]; \ + } \ + while (0) + +#define vpair_f32_div(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va =
(__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f32[0] = __va->__vp_f32[0] / __vb->__vp_f32[0]; \ + __vr->__vp_f32[1] = __va->__vp_f32[1] / __vb->__vp_f32[1]; \ + } \ + while (0) + +#define vpair_f32_max(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f32[0] \ + = __builtin_vsx_xvmaxsp (__va->__vp_f32[0], __vb->__vp_f32[0]); \ + __vr->__vp_f32[1] \ + = __builtin_vsx_xvmaxsp (__va->__vp_f32[1], __vb->__vp_f32[1]); \ + } \ + while (0) + +#define vpair_f32_min(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f32[0] \ + = __builtin_vsx_xvminsp (__va->__vp_f32[0], __vb->__vp_f32[0]); \ + __vr->__vp_f32[1] \ + = __builtin_vsx_xvminsp (__va->__vp_f32[1], __vb->__vp_f32[1]); \ + } \ + while (0) + +#define vpair_f32_mul(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f32[0] = __va->__vp_f32[0] * __vb->__vp_f32[0]; \ + __vr->__vp_f32[1] = __va->__vp_f32[1] * __vb->__vp_f32[1]; \ + } \ + while (0) + +#define vpair_f32_sub(R, A, B) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vr->__vp_f32[0] = __va->__vp_f32[0] - __vb->__vp_f32[0]; \ + __vr->__vp_f32[1] = __va->__vp_f32[1] - __vb->__vp_f32[1]; \ + } \ + while (0) + +#define vpair_f32_fma(R, A, B, C) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vpair_ptr_t __vc = (__vpair_ptr_t)(C); \ + __vr->__vp_f32[0] \ + = __builtin_vsx_xvmaddsp (__va->__vp_f32[0], \ + __vb->__vp_f32[0], \ + __vc->__vp_f32[0]); \ +
__vr->__vp_f32[1] \ + = __builtin_vsx_xvmaddsp (__va->__vp_f32[1], \ + __vb->__vp_f32[1], \ + __vc->__vp_f32[1]); \ + } \ + while (0) + +#define vpair_f32_fms(R, A, B, C) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vpair_ptr_t __vc = (__vpair_ptr_t)(C); \ + __vr->__vp_f32[0] \ + = __builtin_vsx_xvmsubsp (__va->__vp_f32[0], \ + __vb->__vp_f32[0], \ + __vc->__vp_f32[0]); \ + __vr->__vp_f32[1] \ + = __builtin_vsx_xvmsubsp (__va->__vp_f32[1], \ + __vb->__vp_f32[1], \ + __vc->__vp_f32[1]); \ + } \ + while (0) + +#define vpair_f32_nfma(R, A, B, C) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vpair_ptr_t __vc = (__vpair_ptr_t)(C); \ + __vr->__vp_f32[0] \ + = __builtin_vsx_xvnmaddsp (__va->__vp_f32[0], \ + __vb->__vp_f32[0], \ + __vc->__vp_f32[0]); \ + __vr->__vp_f32[1] \ + = __builtin_vsx_xvnmaddsp (__va->__vp_f32[1], \ + __vb->__vp_f32[1], \ + __vc->__vp_f32[1]); \ + } \ + while (0) + +#define vpair_f32_nfms(R, A, B, C) \ + do \ + { \ + __vpair_ptr_t __vr = (__vpair_ptr_t)(R); \ + __vpair_ptr_t __va = (__vpair_ptr_t)(A); \ + __vpair_ptr_t __vb = (__vpair_ptr_t)(B); \ + __vpair_ptr_t __vc = (__vpair_ptr_t)(C); \ + __vr->__vp_f32[0] \ + = __builtin_vsx_xvnmsubsp (__va->__vp_f32[0], \ + __vb->__vp_f32[0], \ + __vc->__vp_f32[0]); \ + __vr->__vp_f32[1] \ + = __builtin_vsx_xvnmsubsp (__va->__vp_f32[1], \ + __vb->__vp_f32[1], \ + __vc->__vp_f32[1]); \ + } \ + while (0) + +#endif /* Vector pair support for power8/power9 systems. */ + +#endif /* _VECTOR_PAIR_H. */ diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index c95df8456344..2ee1fa654ee6 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -16128,6 +16128,7 @@ instructions, but allow the compiler to schedule those calls.
* PowerPC Hardware Transactional Memory Built-in Functions:: * PowerPC Atomic Memory Operation Functions:: * PowerPC Matrix-Multiply Assist Built-in Functions:: +* PowerPC Vector Pair Support:: * PRU Built-in Functions:: * RISC-V Built-in Functions:: * RISC-V Vector Intrinsics:: @@ -24644,6 +24645,103 @@ __vector_pair __builtin_vsx_lxvp (size_t, __vector_pair *); void __builtin_vsx_stxvp (__vector_pair, size_t, __vector_pair *); @end smallexample +@node PowerPC Vector Pair Support +@subsection PowerPC Vector Pair Support +ISA 3.1 (power10) added instructions to load and store pairs of +vectors with a single instruction. + +GCC now provides an include file (@file{vector-pair.h}) on PowerPC +systems that allows users to write 32-bit and +64-bit floating point code that processes data in 256-bit chunks +rather than 128-bit chunks. + +If the code is compiled on an ISA 3.1 system with MMA enabled, the +vector pair functions will use the @code{__vector_pair} type to have +values in adjacent vectors and do the operation as a pair of +operations. + +If the code is compiled on a VSX system, but not one with MMA enabled, the vector +pair functions will use 2 separate vectors to do the operation. + +Two types are provided: @code{vector_pair_f64_t} is for vector pairs +that will operate on units of 4 64-bit floating point values, and +@code{vector_pair_f32_t} for operating on units of 8 32-bit floating +point values. + +@node PowerPC Vector Pair Support for 64-bit floating point +@subsection PowerPC Vector Pair Support for 64-bit floating point
+ +The following functions are provided for operating on vector pairs +that consist of 4 64-bit floating point values: + +@smallexample +void vpair_f64_splat (vector_pair_f64_t *, double); + +void vpair_f64_abs (vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_nabs (vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_neg (vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_sqrt (vector_pair_f64_t *, vector_pair_f64_t *); + +void vpair_f64_add (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_div (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_max (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_min (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_mul (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); +void vpair_f64_sub (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *); + +void vpair_f64_fma (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_fms (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_nfma (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +void vpair_f64_nfms (vector_pair_f64_t *, vector_pair_f64_t *, + vector_pair_f64_t *, vector_pair_f64_t *); +@end smallexample + +@node PowerPC Vector Pair Support for 32-bit floating point +@subsection PowerPC Vector Pair Support for 32-bit floating point
+ +The following functions are provided for operating on vector pairs +that consist of 8 32-bit floating point values: + +@smallexample +void vpair_f32_splat (vector_pair_f32_t *, float); + +void vpair_f32_abs (vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_nabs (vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_neg (vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_sqrt (vector_pair_f32_t *, vector_pair_f32_t *); + +void vpair_f32_add (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_div (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_max (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_min (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_mul (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); +void vpair_f32_sub (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *); + +void vpair_f32_fma (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_fms (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_nfma (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +void vpair_f32_nfms (vector_pair_f32_t *, vector_pair_f32_t *, + vector_pair_f32_t *, vector_pair_f32_t *); +@end smallexample + @node PRU Built-in Functions @subsection PRU Built-in Functions