https://gcc.gnu.org/g:b75aa06732e45bc81d48858c4b69a04f8396cb6f

commit b75aa06732e45bc81d48858c4b69a04f8396cb6f
Author: Michael Meissner <meiss...@linux.ibm.com>
Date:   Fri Sep 27 00:08:55 2024 -0400

    Initial vector-pair.h support
    
    2024-09-26  Michael Meissner  <meiss...@linux.ibm.com>
    
    gcc/
    
            * config.gcc (powerpc*-*-*): Add vector-pair.h to extra headers.
            * config/rs6000/vector-pair.h: New file.
            * doc/extend.texi (PowerPC Vector Pair Support): Document the vector
            pair support functions.

Diff:
---
 gcc/config.gcc                  |   2 +-
 gcc/config/rs6000/vector-pair.h | 563 ++++++++++++++++++++++++++++++++++++++++
 gcc/doc/extend.texi             |  98 +++++++
 3 files changed, 662 insertions(+), 1 deletion(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 0b794e977f6a..3627bed8b863 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -537,7 +537,7 @@ powerpc*-*-*)
        extra_headers="${extra_headers} pmmintrin.h tmmintrin.h smmintrin.h"
        extra_headers="${extra_headers} nmmintrin.h immintrin.h x86gprintrin.h"
        extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
-       extra_headers="${extra_headers} amo.h"
+       extra_headers="${extra_headers} amo.h vector-pair.h"
        case x$with_cpu in
            xpowerpc64|xdefault64|x6[23]0|x970|xG5|xpower[3456789]|xpower1[01]|xpower6x|xrs64a|xcell|xa2|xe500mc64|xe5500|xe6500|xfuture)
                cpu_is_64bit=yes
diff --git a/gcc/config/rs6000/vector-pair.h b/gcc/config/rs6000/vector-pair.h
new file mode 100644
index 000000000000..e0023842f331
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.h
@@ -0,0 +1,563 @@
+/* PowerPC vector pair include file.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   Contributed by Aldy Hernandez (al...@redhat.com).
+   Rewritten by Paolo Bonzini (bonz...@gnu.org).
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Provide support for vector pairs, even on systems that do not have native
+   support for loading and storing pairs of vectors.  */
+
+#ifndef _VECTOR_PAIR_H
+#define _VECTOR_PAIR_H 1
+
+/* During testing, allow vector-pair.h to be included multiple times.  */
+#undef  vector_pair_t
+#undef  vector_pair_f64_t
+#undef  vector_pair_f32_t
+
+#undef  vpair_f64_abs
+#undef  vpair_f64_add
+#undef  vpair_f64_div
+#undef  vpair_f64_fma
+#undef  vpair_f64_fms
+#undef  vpair_f64_max
+#undef  vpair_f64_min
+#undef  vpair_f64_mul
+#undef  vpair_f64_nabs
+#undef  vpair_f64_neg
+#undef  vpair_f64_nfma
+#undef  vpair_f64_nfms
+#undef  vpair_f64_splat
+#undef  vpair_f64_sqrt
+#undef  vpair_f64_sub
+
+#undef  vpair_f32_abs
+#undef  vpair_f32_add
+#undef  vpair_f32_div
+#undef  vpair_f32_fma
+#undef  vpair_f32_fms
+#undef  vpair_f32_max
+#undef  vpair_f32_min
+#undef  vpair_f32_mul
+#undef  vpair_f32_nabs
+#undef  vpair_f32_neg
+#undef  vpair_f32_nfma
+#undef  vpair_f32_nfms
+#undef  vpair_f32_splat
+#undef  vpair_f32_sqrt
+#undef  vpair_f32_sub
+
+/* Union of the various vector pair types.  For testing, allow vector-pair.h to
+   be included multiple times, so protect the union from re-declaration.  */
+#ifndef __VECTOR_PAIR_UNION__
+#define __VECTOR_PAIR_UNION__  1
+
+union __vpair_union {
+
+#ifdef __MMA__
+  __vector_pair                __vpair;
+#endif
+
+  vector double                __vp_f64[2];
+  vector float         __vp_f32[2];
+  vector unsigned char __vp_uc[2];
+};
+
+typedef union __vpair_union    vector_pair_t;
+typedef union __vpair_union    vector_pair_f64_t;
+typedef union __vpair_union    vector_pair_f32_t;
+typedef union __vpair_union    *__vpair_ptr_t;
+
+#endif /* __VECTOR_PAIR_UNION__.  */
+
+#if !__VPAIR_ASM__ && !__VPAIR_NOP10__
+#if __MMA__
+#define __VPAIR_ASM__          1
+
+#else
+#define __VPAIR_NOP10__                1
+#endif
+#endif
+
+/* ISA 3.1 (power10/power11) support with explicit vector pair type.  */
+
+#if __VPAIR_ASM__ && __MMA__
+
+#undef  __VPAIR_FP_UNARY_ASM
+#define __VPAIR_FP_UNARY_ASM(OPCODE, R, A)                             \
+  __asm__ (OPCODE " %x0,%x1\n\t" OPCODE " %x0+1,%x1+1"                 \
+           : "=wa" (((__vpair_ptr_t)(R))->__vpair)                     \
+           : "wa" (((__vpair_ptr_t)(A))->__vpair))
+
+#undef  __VPAIR_FP_BINARY_ASM
+#define __VPAIR_FP_BINARY_ASM(OPCODE, R, A, B)                         \
+  __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1"       \
+           : "=wa" (((__vpair_ptr_t)(R))->__vpair)                     \
+           : "wa" (((__vpair_ptr_t)(A))->__vpair),                     \
+             "wa" (((__vpair_ptr_t)(B))->__vpair))
+
+/* C is tied to the output ("0"), so the 'a' form of the FMA must be used.  */
+#undef  __VPAIR_FP_FMA_ASM
+#define __VPAIR_FP_FMA_ASM(OPCODE, R, A, B, C)                         \
+  __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1"       \
+           : "=wa" (((__vpair_ptr_t)(R))->__vpair)                     \
+           : "wa" (((__vpair_ptr_t)(A))->__vpair),                     \
+             "wa" (((__vpair_ptr_t)(B))->__vpair),                     \
+             "0"  (((__vpair_ptr_t)(C))->__vpair))
+
+#define vpair_f64_splat(R, A)                                          \
+  __asm__ ("xxlor %x0+1,%x1,%x1"                                       \
+          : "=wa" (((__vpair_ptr_t)(R))->__vpair)                      \
+          : "0" (__builtin_vec_splats ((double) (A))))
+
+#define vpair_f64_abs(R,A)     __VPAIR_FP_UNARY_ASM ("xvabsdp",  R, A)
+#define vpair_f64_nabs(R,A)    __VPAIR_FP_UNARY_ASM ("xvnabsdp", R, A)
+#define vpair_f64_neg(R,A)     __VPAIR_FP_UNARY_ASM ("xvnegdp",  R, A)
+#define vpair_f64_sqrt(R,A)    __VPAIR_FP_UNARY_ASM ("xvsqrtdp", R, A)
+
+#define vpair_f64_add(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvadddp", R, A, B)
+#define vpair_f64_div(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvdivdp", R, A, B)
+#define vpair_f64_max(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvmaxdp", R, A, B)
+#define vpair_f64_min(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvmindp", R, A, B)
+#define vpair_f64_mul(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvmuldp", R, A, B)
+#define vpair_f64_sub(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvsubdp", R, A, B)
+
+#define vpair_f64_fma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmaddadp",  R, A, B, C)
+#define vpair_f64_fms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmsubadp",  R, A, B, C)
+#define vpair_f64_nfma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmaddadp", R, A, B, C)
+#define vpair_f64_nfms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmsubadp", R, A, B, C)
+
+#define vpair_f32_splat(R, A)                                          \
+  __asm__ ("xxlor %x0+1,%x1,%x1"                                       \
+          : "=wa" (((__vpair_ptr_t)(R))->__vpair)                      \
+          : "0" (__builtin_vec_splats ((float) (A))))
+
+#define vpair_f32_abs(R,A)     __VPAIR_FP_UNARY_ASM ("xvabssp",  R, A)
+#define vpair_f32_nabs(R,A)    __VPAIR_FP_UNARY_ASM ("xvnabssp", R, A)
+#define vpair_f32_neg(R,A)     __VPAIR_FP_UNARY_ASM ("xvnegsp",  R, A)
+#define vpair_f32_sqrt(R,A)    __VPAIR_FP_UNARY_ASM ("xvsqrtsp", R, A)
+
+#define vpair_f32_add(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvaddsp", R, A, B)
+#define vpair_f32_div(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvdivsp", R, A, B)
+#define vpair_f32_max(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvmaxsp", R, A, B)
+#define vpair_f32_min(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvminsp", R, A, B)
+#define vpair_f32_mul(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvmulsp", R, A, B)
+#define vpair_f32_sub(R,A,B)   __VPAIR_FP_BINARY_ASM ("xvsubsp", R, A, B)
+
+#define vpair_f32_fma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmaddasp",  R, A, B, C)
+#define vpair_f32_fms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmsubasp",  R, A, B, C)
+#define vpair_f32_nfma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmaddasp", R, A, B, C)
+#define vpair_f32_nfms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmsubasp", R, A, B, C)
+
+
+#else  /* ISA 2.07/3.0 support for machines without vector pair support.  */
+
+/* vector pair double operations on power8/power9.  */
+
+#define vpair_f64_splat(R, A)                                          \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vr->__vp_f64[0] = __vr->__vp_f64[1]                            \
+       = __builtin_vec_splats ((double)(A));                           \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_abs(R, A)                                            \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vr->__vp_f64[0] = __builtin_vsx_xvabsdp (__va->__vp_f64[0]);   \
+      __vr->__vp_f64[1] = __builtin_vsx_xvabsdp (__va->__vp_f64[1]);   \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_nabs(R, A)                                           \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vr->__vp_f64[0] = __builtin_vsx_xvnabsdp (__va->__vp_f64[0]);  \
+      __vr->__vp_f64[1] = __builtin_vsx_xvnabsdp (__va->__vp_f64[1]);  \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_neg(R, A)                                            \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vr->__vp_f64[0] = - __va->__vp_f64[0];                         \
+      __vr->__vp_f64[1] = - __va->__vp_f64[1];                         \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_sqrt(R, A)                                           \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vr->__vp_f64[0] = __builtin_vsx_xvsqrtdp (__va->__vp_f64[0]);  \
+      __vr->__vp_f64[1] = __builtin_vsx_xvsqrtdp (__va->__vp_f64[1]);  \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_add(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f64[0] = __va->__vp_f64[0] + __vb->__vp_f64[0];       \
+      __vr->__vp_f64[1] = __va->__vp_f64[1] + __vb->__vp_f64[1];       \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_div(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f64[0] = __va->__vp_f64[0] / __vb->__vp_f64[0];       \
+      __vr->__vp_f64[1] = __va->__vp_f64[1] / __vb->__vp_f64[1];       \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_max(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f64[0]                                                \
+       = __builtin_vsx_xvmaxdp (__va->__vp_f64[0], __vb->__vp_f64[0]); \
+      __vr->__vp_f64[1]                                                \
+       = __builtin_vsx_xvmaxdp (__va->__vp_f64[1], __vb->__vp_f64[1]); \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_min(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f64[0]                                                \
+       = __builtin_vsx_xvmindp (__va->__vp_f64[0], __vb->__vp_f64[0]); \
+      __vr->__vp_f64[1]                                                \
+       = __builtin_vsx_xvmindp (__va->__vp_f64[1], __vb->__vp_f64[1]); \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_mul(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f64[0] = __va->__vp_f64[0] * __vb->__vp_f64[0];       \
+      __vr->__vp_f64[1] = __va->__vp_f64[1] * __vb->__vp_f64[1];       \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_sub(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f64[0] = __va->__vp_f64[0] - __vb->__vp_f64[0];       \
+      __vr->__vp_f64[1] = __va->__vp_f64[1] - __vb->__vp_f64[1];       \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_fma(R, A, B, C)                                      \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vpair_ptr_t __vc = (__vpair_ptr_t)(C);                         \
+      __vr->__vp_f64[0]                                                \
+       = __builtin_vsx_xvmadddp (__va->__vp_f64[0],                    \
+                                 __vb->__vp_f64[0],                    \
+                                 __vc->__vp_f64[0]);                   \
+      __vr->__vp_f64[1]                                                \
+       = __builtin_vsx_xvmadddp (__va->__vp_f64[1],                    \
+                                 __vb->__vp_f64[1],                    \
+                                 __vc->__vp_f64[1]);                   \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_fms(R, A, B, C)                                      \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vpair_ptr_t __vc = (__vpair_ptr_t)(C);                         \
+      __vr->__vp_f64[0]                                                \
+       = __builtin_vsx_xvmsubdp (__va->__vp_f64[0],                    \
+                                 __vb->__vp_f64[0],                    \
+                                 __vc->__vp_f64[0]);                   \
+      __vr->__vp_f64[1]                                                \
+       = __builtin_vsx_xvmsubdp (__va->__vp_f64[1],                    \
+                                 __vb->__vp_f64[1],                    \
+                                 __vc->__vp_f64[1]);                   \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_nfma(R, A, B, C)                                     \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vpair_ptr_t __vc = (__vpair_ptr_t)(C);                         \
+      __vr->__vp_f64[0]                                                \
+       = __builtin_vsx_xvnmadddp (__va->__vp_f64[0],                   \
+                                  __vb->__vp_f64[0],                   \
+                                  __vc->__vp_f64[0]);                  \
+      __vr->__vp_f64[1]                                                \
+       = __builtin_vsx_xvnmadddp (__va->__vp_f64[1],                   \
+                                  __vb->__vp_f64[1],                   \
+                                  __vc->__vp_f64[1]);                  \
+    }                                                                  \
+  while (0)
+
+#define vpair_f64_nfms(R, A, B, C)                                     \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vpair_ptr_t __vc = (__vpair_ptr_t)(C);                         \
+      __vr->__vp_f64[0]                                                \
+       = __builtin_vsx_xvnmsubdp (__va->__vp_f64[0],                   \
+                                  __vb->__vp_f64[0],                   \
+                                  __vc->__vp_f64[0]);                  \
+      __vr->__vp_f64[1]                                                \
+       = __builtin_vsx_xvnmsubdp (__va->__vp_f64[1],                   \
+                                  __vb->__vp_f64[1],                   \
+                                  __vc->__vp_f64[1]);                  \
+    }                                                                  \
+  while (0)
+
+/* vector pair float operations on power8/power9.  */
+
+#define vpair_f32_splat(R, A)                                          \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vr->__vp_f32[0] = __vr->__vp_f32[1]                            \
+       = __builtin_vec_splats ((float)(A));                            \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_abs(R, A)                                            \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vr->__vp_f32[0] = __builtin_vsx_xvabssp (__va->__vp_f32[0]);   \
+      __vr->__vp_f32[1] = __builtin_vsx_xvabssp (__va->__vp_f32[1]);   \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_nabs(R, A)                                           \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vr->__vp_f32[0] = __builtin_vsx_xvnabssp (__va->__vp_f32[0]);  \
+      __vr->__vp_f32[1] = __builtin_vsx_xvnabssp (__va->__vp_f32[1]);  \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_neg(R, A)                                            \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vr->__vp_f32[0] = - __va->__vp_f32[0];                         \
+      __vr->__vp_f32[1] = - __va->__vp_f32[1];                         \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_sqrt(R, A)                                           \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vr->__vp_f32[0] = __builtin_vsx_xvsqrtsp (__va->__vp_f32[0]);  \
+      __vr->__vp_f32[1] = __builtin_vsx_xvsqrtsp (__va->__vp_f32[1]);  \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_add(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f32[0] = __va->__vp_f32[0] + __vb->__vp_f32[0];       \
+      __vr->__vp_f32[1] = __va->__vp_f32[1] + __vb->__vp_f32[1];       \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_div(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f32[0] = __va->__vp_f32[0] / __vb->__vp_f32[0];       \
+      __vr->__vp_f32[1] = __va->__vp_f32[1] / __vb->__vp_f32[1];       \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_max(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f32[0]                                                \
+       = __builtin_vsx_xvmaxsp (__va->__vp_f32[0], __vb->__vp_f32[0]); \
+      __vr->__vp_f32[1]                                                \
+       = __builtin_vsx_xvmaxsp (__va->__vp_f32[1], __vb->__vp_f32[1]); \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_min(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f32[0]                                                \
+       = __builtin_vsx_xvminsp (__va->__vp_f32[0], __vb->__vp_f32[0]); \
+      __vr->__vp_f32[1]                                                \
+       = __builtin_vsx_xvminsp (__va->__vp_f32[1], __vb->__vp_f32[1]); \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_mul(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f32[0] = __va->__vp_f32[0] * __vb->__vp_f32[0];       \
+      __vr->__vp_f32[1] = __va->__vp_f32[1] * __vb->__vp_f32[1];       \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_sub(R, A, B)                                         \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vr->__vp_f32[0] = __va->__vp_f32[0] - __vb->__vp_f32[0];       \
+      __vr->__vp_f32[1] = __va->__vp_f32[1] - __vb->__vp_f32[1];       \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_fma(R, A, B, C)                                      \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vpair_ptr_t __vc = (__vpair_ptr_t)(C);                         \
+      __vr->__vp_f32[0]                                                \
+       = __builtin_vsx_xvmaddsp (__va->__vp_f32[0],                    \
+                                 __vb->__vp_f32[0],                    \
+                                 __vc->__vp_f32[0]);                   \
+      __vr->__vp_f32[1]                                                \
+       = __builtin_vsx_xvmaddsp (__va->__vp_f32[1],                    \
+                                 __vb->__vp_f32[1],                    \
+                                 __vc->__vp_f32[1]);                   \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_fms(R, A, B, C)                                      \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vpair_ptr_t __vc = (__vpair_ptr_t)(C);                         \
+      __vr->__vp_f32[0]                                                \
+       = __builtin_vsx_xvmsubsp (__va->__vp_f32[0],                    \
+                                 __vb->__vp_f32[0],                    \
+                                 __vc->__vp_f32[0]);                   \
+      __vr->__vp_f32[1]                                                \
+       = __builtin_vsx_xvmsubsp (__va->__vp_f32[1],                    \
+                                 __vb->__vp_f32[1],                    \
+                                 __vc->__vp_f32[1]);                   \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_nfma(R, A, B, C)                                     \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vpair_ptr_t __vc = (__vpair_ptr_t)(C);                         \
+      __vr->__vp_f32[0]                                                \
+       = __builtin_vsx_xvnmaddsp (__va->__vp_f32[0],                   \
+                                  __vb->__vp_f32[0],                   \
+                                  __vc->__vp_f32[0]);                  \
+      __vr->__vp_f32[1]                                                \
+       = __builtin_vsx_xvnmaddsp (__va->__vp_f32[1],                   \
+                                  __vb->__vp_f32[1],                   \
+                                  __vc->__vp_f32[1]);                  \
+    }                                                                  \
+  while (0)
+
+#define vpair_f32_nfms(R, A, B, C)                                     \
+  do                                                                   \
+    {                                                                  \
+      __vpair_ptr_t __vr = (__vpair_ptr_t)(R);                         \
+      __vpair_ptr_t __va = (__vpair_ptr_t)(A);                         \
+      __vpair_ptr_t __vb = (__vpair_ptr_t)(B);                         \
+      __vpair_ptr_t __vc = (__vpair_ptr_t)(C);                         \
+      __vr->__vp_f32[0]                                                \
+       = __builtin_vsx_xvnmsubsp (__va->__vp_f32[0],                   \
+                                  __vb->__vp_f32[0],                   \
+                                  __vc->__vp_f32[0]);                  \
+      __vr->__vp_f32[1]                                                \
+       = __builtin_vsx_xvnmsubsp (__va->__vp_f32[1],                   \
+                                  __vb->__vp_f32[1],                   \
+                                  __vc->__vp_f32[1]);                  \
+    }                                                                  \
+  while (0)
+
+#endif /* Vector pair support for power8/power9 systems.  */
+
+#endif /* _VECTOR_PAIR_H.  */
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index c95df8456344..2ee1fa654ee6 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -16128,6 +16128,7 @@ instructions, but allow the compiler to schedule those calls.
 * PowerPC Hardware Transactional Memory Built-in Functions::
 * PowerPC Atomic Memory Operation Functions::
 * PowerPC Matrix-Multiply Assist Built-in Functions::
+* PowerPC Vector Pair Support::
 * PRU Built-in Functions::
 * RISC-V Built-in Functions::
 * RISC-V Vector Intrinsics::
@@ -24644,6 +24645,103 @@ __vector_pair __builtin_vsx_lxvp (size_t, __vector_pair *);
 void __builtin_vsx_stxvp (__vector_pair, size_t, __vector_pair *);
 @end smallexample
 
+@node PowerPC Vector Pair Support
+@subsection PowerPC Vector Pair Support
+ISA 3.1 (power10) added instructions to load and store pairs of
+vectors with a single instruction.
+
+GCC now provides an include file (@file{vector-pair.h}) on PowerPC
+systems that allows users to write 32-bit and 64-bit floating point
+code that processes data in 256-bit chunks rather than 128-bit
+chunks.
+
+If the code is compiled on an ISA 3.1 system with MMA enabled, the
+vector pair functions will use the @code{__vector_pair} type to have
+values in adjacent vectors and do the operation as a pair of
+operations.
+
+If the code is compiled on a VSX system, but not one with MMA enabled, the vector
+pair functions will use 2 separate vectors to do the operation.
+
+Two types are provided: @code{vector_pair_f64_t} is for vector pairs
+that will operate on units of 4 64-bit floating point values, and
+@code{vector_pair_f32_t} for operating on units of 8 32-bit floating
+point values.
+
+@node PowerPC Vector Pair Support for 64-bit floating point
+@subsection PowerPC Vector Pair Support for 64-bit floating point
+
+The following functions are provided for operating on vector pairs
+that consist of 4 64-bit floating point values:
+
+@smallexample
+void vpair_f64_splat (vector_pair_f64_t *, double);
+
+void vpair_f64_abs (vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_nabs (vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_neg (vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_sqrt (vector_pair_f64_t *, vector_pair_f64_t *);
+
+void vpair_f64_add (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+void vpair_f64_div (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+void vpair_f64_max (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+void vpair_f64_min (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+void vpair_f64_mul (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+void vpair_f64_sub (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+
+void vpair_f64_fma (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_fms (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_nfma (vector_pair_f64_t *, vector_pair_f64_t *,
+                     vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_nfms (vector_pair_f64_t *, vector_pair_f64_t *,
+                     vector_pair_f64_t *, vector_pair_f64_t *);
+@end smallexample
+
+@node PowerPC Vector Pair Support for 32-bit floating point
+@subsection PowerPC Vector Pair Support for 32-bit floating point
+
+The following functions are provided for operating on vector pairs
+that consist of 8 32-bit floating point values:
+
+@smallexample
+void vpair_f32_splat (vector_pair_f32_t *, float);
+
+void vpair_f32_abs (vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_nabs (vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_neg (vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_sqrt (vector_pair_f32_t *, vector_pair_f32_t *);
+
+void vpair_f32_add (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+void vpair_f32_div (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+void vpair_f32_max (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+void vpair_f32_min (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+void vpair_f32_mul (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+void vpair_f32_sub (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+
+void vpair_f32_fma (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_fms (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_nfma (vector_pair_f32_t *, vector_pair_f32_t *,
+                     vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_nfms (vector_pair_f32_t *, vector_pair_f32_t *,
+                     vector_pair_f32_t *, vector_pair_f32_t *);
+@end smallexample
+
 @node PRU Built-in Functions
 @subsection PRU Built-in Functions

Reply via email to