https://gcc.gnu.org/g:e0d91e0cb833167e43f2bc54fd4b3668a2fe4532
commit e0d91e0cb833167e43f2bc54fd4b3668a2fe4532 Author: Michael Meissner <meiss...@linux.ibm.com> Date: Thu Sep 5 16:29:35 2024 -0400 Rewrite vector-pair.h 2024-09-04 Michael Meissner <meiss...@linux.ibm.com> * config/rs6000/rs6000-c.cc (rs6000_target_modify_macros): Signal that we have the vector pair built-in functions. * config/rs6000/vector-pair.h: Rewrite. Diff: --- gcc/config/rs6000/rs6000-c.cc | 8 +- gcc/config/rs6000/vector-pair.h | 716 ++++++++++++++++++++++++++++------------ 2 files changed, 519 insertions(+), 205 deletions(-) diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc index 82826f96a8e..2d674f9b236 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ b/gcc/config/rs6000/rs6000-c.cc @@ -590,9 +590,13 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT flags, if (rs6000_cpu == PROCESSOR_CELL) rs6000_define_or_undefine_macro (define_p, "__PPU__"); - /* Tell the user if we support the MMA instructions. */ + /* Tell the user if we support the MMA instructions. Also say that we + support the vector pair built-in functions. */ if ((flags & OPTION_MASK_MMA) != 0) - rs6000_define_or_undefine_macro (define_p, "__MMA__"); + { + rs6000_define_or_undefine_macro (define_p, "__MMA__"); + rs6000_define_or_undefine_macro (define_p, "__VPAIR__"); + } /* Whether pc-relative code is being generated. */ if ((flags & OPTION_MASK_PCREL) != 0) rs6000_define_or_undefine_macro (define_p, "__PCREL__"); diff --git a/gcc/config/rs6000/vector-pair.h b/gcc/config/rs6000/vector-pair.h index 3c03e44f3f4..ebd1df9a532 100644 --- a/gcc/config/rs6000/vector-pair.h +++ b/gcc/config/rs6000/vector-pair.h @@ -30,19 +30,55 @@ #ifndef _VECTOR_PAIR_H #define _VECTOR_PAIR_H 1 -/* If we have MMA support, use power10 support. */ -#if __MMA__ -typedef __vector_pair vector_pair_t; - -#define VPAIR_FP_CONSTRAINT "wa" /* Allow all VSX registers. */ -#define VPAIR_FP_SECOND "S" /* Access 2nd VSX register. 
*/ +/* During testing, allow vector-pair.h to be included multiple times. */ +#undef vector_pair_t +#undef vector_pair_d64_t +#undef vector_pair_d32_t + +#undef vpair_f64_abs +#undef vpair_f64_add +#undef vpair_f64_div +#undef vpair_f64_fma +#undef vpair_f64_fms +#undef vpair_f64_max +#undef vpair_f64_min +#undef vpair_f64_mul +#undef vpair_f64_nabs +#undef vpair_f64_neg +#undef vpair_f64_nfma +#undef vpair_f64_nfms +#undef vpair_f64_splat +#undef vpair_f64_sqrt +#undef vpair_f64_sub + +#undef vpair_f32_abs +#undef vpair_f32_add +#undef vpair_f32_div +#undef vpair_f32_fma +#undef vpair_f32_fms +#undef vpair_f32_max +#undef vpair_f32_min +#undef vpair_f32_mul +#undef vpair_f32_nabs +#undef vpair_f32_neg +#undef vpair_f32_nfma +#undef vpair_f32_nfms +#undef vpair_f32_splat +#undef vpair_f32_sqrt +#undef vpair_f32_sub + +/* Do we have MMA support and the vector pair built-in function? */ +#if __MMA__ && __VPAIR__ && !__NO_VPAIR_BUILTIN__ +#define vector_pair_t __vector_pair +#define vector_pair_d64_t __vector_pair +#define vector_pair_d32_t __vector_pair /* vector pair double operations on power10. */ #define vpair_f64_splat(R, A) (*R) = __builtin_vpair_f64_splat (A) -#define vpair_f64_neg(R,A) (*R) = __builtin_vpair_f64_neg (*A) #define vpair_f64_abs(R,A) (*R) = __builtin_vpair_f64_abs (*A) #define vpair_f64_nabs(R,A) (*R) = __builtin_vpair_f64_nabs (*A) +#define vpair_f64_neg(R,A) (*R) = __builtin_vpair_f64_neg (*A) #define vpair_f64_sqrt(R,A) (*R) = __builtin_vpair_f64_sqrt (*A) #define vpair_f64_add(R,A,B) (*R) = __builtin_vpair_f64_add (*A, *B) @@ -57,13 +93,12 @@ typedef __vector_pair vector_pair_t; #define vpair_f64_nfma(R,A,B,C) (*R) = __builtin_vpair_f64_nfma (*A, *B, *C) #define vpair_f64_nfms(R,A,B,C) (*R) = __builtin_vpair_f64_nfms (*A, *B, *C) - /* vector pair float operations on power10.
*/ #define vpair_f32_splat(R, A) (*R) = __builtin_vpair_f32_splat (A) -#define vpair_f32_neg(R,A) (*R) = __builtin_vpair_f32_neg (*A) #define vpair_f32_abs(R,A) (*R) = __builtin_vpair_f32_abs (*A) #define vpair_f32_nabs(R,A) (*R) = __builtin_vpair_f32_nabs (*A) +#define vpair_f32_neg(R,A) (*R) = __builtin_vpair_f32_neg (*A) #define vpair_f32_sqrt(R,A) (*R) = __builtin_vpair_f32_sqrt (*A) #define vpair_f32_add(R,A,B) (*R) = __builtin_vpair_f32_add (*A, *B) @@ -78,212 +113,487 @@ typedef __vector_pair vector_pair_t; #define vpair_f32_nfma(R,A,B,C) (*R) = __builtin_vpair_f32_nfma (*A, *B, *C) #define vpair_f32_nfms(R,A,B,C) (*R) = __builtin_vpair_f32_nfms (*A, *B, *C) + +/* Do we have the __vector_pair type available, but we don't have the built-in + functions? */ + +#elif __MMA__ && !__NO_VPAIR_ASM__ +#define vector_pair_t __vector_pair +#define vector_pair_d64_t __vector_pair +#define vector_pair_d32_t __vector_pair + +#undef __VPAIR_FP_UNARY_ASM +#define __VPAIR_FP_UNARY_ASM(OPCODE, R, A) \ + __asm__ (OPCODE " %x0,%x1\n\t" OPCODE " %x0+1,%x1+1" \ + : "=wa" (*(__vector_pair *)(R)) \ + : "wa" (*(__vector_pair *)(A))) + +#undef __VPAIR_FP_BINARY_ASM +#define __VPAIR_FP_BINARY_ASM(OPCODE, R, A, B) \ + __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1" \ + : "=wa" (*(__vector_pair *)(R)) \ + : "wa" (*(__vector_pair *)(A)), \ + "wa" (*(__vector_pair *)(B))) + + /* Note the 'a' version of the FMA instruction must be used, since operand C is tied to the output register by the "0" constraint.
*/ +#undef __VPAIR_FP_FMA_ASM +#define __VPAIR_FP_FMA_ASM(OPCODE, R, A, B, C) \ + __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1" \ + : "=wa" (*(__vector_pair *)(R)) \ + : "wa" (*(__vector_pair *)(A)), \ + "wa" (*(__vector_pair *)(B)), \ + "0" (*(__vector_pair *)(C))) + +#define vpair_f64_splat(R, A) \ + __asm__ ("xxlor %x0+1,%x1,%x1" \ + : "=wa" (*(__vector_pair *)(R)) \ + : "0" (__builtin_vec_splats ((double) (A)))) + +#define vpair_f64_abs(R,A) __VPAIR_FP_UNARY_ASM ("xvabsdp", R, A) +#define vpair_f64_nabs(R,A) __VPAIR_FP_UNARY_ASM ("xvnabsdp", R, A) +#define vpair_f64_neg(R,A) __VPAIR_FP_UNARY_ASM ("xvnegdp", R, A) +#define vpair_f64_sqrt(R,A) __VPAIR_FP_UNARY_ASM ("xvsqrtdp", R, A) + +#define vpair_f64_add(R,A,B) __VPAIR_FP_BINARY_ASM ("xvadddp", R, A, B) +#define vpair_f64_div(R,A,B) __VPAIR_FP_BINARY_ASM ("xvdivdp", R, A, B) +#define vpair_f64_max(R,A,B) __VPAIR_FP_BINARY_ASM ("xvmaxdp", R, A, B) +#define vpair_f64_min(R,A,B) __VPAIR_FP_BINARY_ASM ("xvmindp", R, A, B) +#define vpair_f64_mul(R,A,B) __VPAIR_FP_BINARY_ASM ("xvmuldp", R, A, B) +#define vpair_f64_sub(R,A,B) __VPAIR_FP_BINARY_ASM ("xvsubdp", R, A, B) + +#define vpair_f64_fma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmaddadp", R, A, B, C) +#define vpair_f64_fms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmsubadp", R, A, B, C) +#define vpair_f64_nfma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmaddadp", R, A, B, C) +#define vpair_f64_nfms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmsubadp", R, A, B, C) + +#define vpair_f32_splat(R, A) \ + __asm__ ("xxlor %x0+1,%x1,%x1" \ + : "=wa" (*(__vector_pair *)(R)) \ + : "0" (__builtin_vec_splats ((float) (A)))) + +#define vpair_f32_abs(R,A) __VPAIR_FP_UNARY_ASM ("xvabssp", R, A) +#define vpair_f32_nabs(R,A) __VPAIR_FP_UNARY_ASM ("xvnabssp", R, A) +#define vpair_f32_neg(R,A) __VPAIR_FP_UNARY_ASM ("xvnegsp", R, A) +#define vpair_f32_sqrt(R,A) __VPAIR_FP_UNARY_ASM ("xvsqrtsp", R, A) + +#define vpair_f32_add(R,A,B) __VPAIR_FP_BINARY_ASM ("xvaddsp", R, A, B) +#define vpair_f32_div(R,A,B)
__VPAIR_FP_BINARY_ASM ("xvdivsp", R, A, B) +#define vpair_f32_max(R,A,B) __VPAIR_FP_BINARY_ASM ("xvmaxsp", R, A, B) +#define vpair_f32_min(R,A,B) __VPAIR_FP_BINARY_ASM ("xvminsp", R, A, B) +#define vpair_f32_mul(R,A,B) __VPAIR_FP_BINARY_ASM ("xvmulsp", R, A, B) +#define vpair_f32_sub(R,A,B) __VPAIR_FP_BINARY_ASM ("xvsubsp", R, A, B) + +#define vpair_f32_fma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmaddasp", R, A, B, C) +#define vpair_f32_fms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvmsubasp", R, A, B, C) +#define vpair_f32_nfma(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmaddasp", R, A, B, C) +#define vpair_f32_nfms(R,A,B,C) __VPAIR_FP_FMA_ASM ("xvnmsubasp", R, A, B, C) + #else /* !__MMA__. */ -typedef union { + +#ifndef __VECTOR_PAIR_UNION__ +#define __VECTOR_PAIR_UNION__ 1 + +union vpair_union { /* Double vector pairs. */ - double __attribute__((__vector_size__(32))) __vpair_vp_f64; - vector double __vpair_vec_f64[2]; - double __vpair_scalar_f64[4]; + vector double __vdbl[2]; /* Float vector pairs. */ - float __attribute__((__vector_size__(32))) __vpair_vp_f32; - vector float __vpair_vec_f32[2]; - float __vpair_scalar_f32[8]; + vector float __vflt[2]; -} vector_pair_t; +}; +#endif /* __VECTOR_PAIR_UNION__. */ -#define VPAIR_FP_CONSTRAINT "d" /* Only use FPR registers. */ -#define VPAIR_FP_SECOND "L" /* Access 2nd FPR register. */ +#define vector_pair_t union vpair_union +#define vector_pair_d64_t union vpair_union +#define vector_pair_d32_t union vpair_union /* vector pair double operations on power8/power9. 
*/ -#define vpair_f64_splat(R,A) \ - ((R)->__vpair_vec_f64[0] = (R)->__vpair_vec_f64[1] \ - = __builtin_vec_splats ((double) (A))) - -#define vpair_f64_neg(R,A) \ - ((R)->__vpair_vp_f64 = - (A)->__vpair_vp_f64) - -#define vpair_f64_abs(R,A) \ - (((R)->__vpair_vec_f64[0] \ - = __builtin_vsx_xvabsdp ((A)->__vpair_vec_f64[0])), \ - ((R)->__vpair_vec_f64[1] \ - = __builtin_vsx_xvabsdp ((A)->__vpair_vec_f64[1]))) - -#define vpair_f64_nabs(R,A) \ - (((R)->__vpair_vec_f64[0] \ - = __builtin_vsx_xvnabsdp ((A)->__vpair_vec_f64[0])), \ - ((R)->__vpair_vec_f64[1] \ - = __builtin_vsx_xvnabsdp ((A)->__vpair_vec_f64[1]))) - -#define vpair_f64_sqrt(R,A) \ - (((R)->__vpair_vec_f64[0] \ - = __builtin_vsx_xvsqrtdp ((A)->__vpair_vec_f64[0])), \ - ((R)->__vpair_vec_f64[1] \ - = __builtin_vsx_xvsqrtdp ((A)->__vpair_vec_f64[1]))) - -#define vpair_f64_add(R,A,B) \ - ((R)->__vpair_vp_f64 = (A)->__vpair_vp_f64 + (B)->__vpair_vp_f64) - -#define vpair_f64_div(R,A,B) \ - ((R)->__vpair_vp_f64 = (A)->__vpair_vp_f64 / (B)->__vpair_vp_f64) - -#define vpair_f64_max(R,A,B) \ - (((R)->__vpair_vec_f64[0] \ - = __builtin_vsx_xvmaxdp ((A)->__vpair_vec_f64[0], \ - (B)->__vpair_vec_f64[0])), \ - (((R)->__vpair_vec_f64[1] \ - = __builtin_vsx_xvmaxdp ((A)->__vpair_vec_f64[1], \ - (B)->__vpair_vec_f64[1])))) - -#define vpair_f64_min(R,A,B) \ - (((R)->__vpair_vec_f64[0] \ - = __builtin_vsx_xvmindp ((A)->__vpair_vec_f64[0], \ - (B)->__vpair_vec_f64[0])), \ - (((R)->__vpair_vec_f64[1] \ - = __builtin_vsx_xvmindp ((A)->__vpair_vec_f64[1], \ - (B)->__vpair_vec_f64[1])))) - -#define vpair_f64_mul(R,A,B) \ - ((R)->__vpair_vp_f64 = (A)->__vpair_vp_f64 * (B)->__vpair_vp_f64) - -#define vpair_f64_sub(R,A,B) \ - ((R)->__vpair_vp_f64 = (A)->__vpair_vp_f64 - (B)->__vpair_vp_f64) - -#define vpair_f64_fma(R,A,B,C) \ - (((R)->__vpair_vec_f64[0] \ - = __builtin_vsx_xvmadddp ((A)->__vpair_vec_f64[0], \ - (B)->__vpair_vec_f64[0], \ - (C)->__vpair_vec_f64[0])), \ - (((R)->__vpair_vec_f64[1] \ - = __builtin_vsx_xvmadddp 
((A)->__vpair_vec_f64[1], \ - (B)->__vpair_vec_f64[1], \ - (C)->__vpair_vec_f64[1])))) - -#define vpair_f64_fms(R,A,B,C) \ - (((R)->__vpair_vec_f64[0] \ - = __builtin_vsx_xvmsubdp ((A)->__vpair_vec_f64[0], \ - (B)->__vpair_vec_f64[0], \ - (C)->__vpair_vec_f64[0])), \ - (((R)->__vpair_vec_f64[1] \ - = __builtin_vsx_xvmsubdp ((A)->__vpair_vec_f64[1], \ - (B)->__vpair_vec_f64[1], \ - (C)->__vpair_vec_f64[1])))) - -#define vpair_f64_nfma(R,A,B,C) \ - (((R)->__vpair_vec_f64[0] \ - = __builtin_vsx_xvnmadddp ((A)->__vpair_vec_f64[0], \ - (B)->__vpair_vec_f64[0], \ - (C)->__vpair_vec_f64[0])), \ - (((R)->__vpair_vec_f64[1] \ - = __builtin_vsx_xvnmadddp ((A)->__vpair_vec_f64[1], \ - (B)->__vpair_vec_f64[1], \ - (C)->__vpair_vec_f64[1])))) - -#define vpair_f64_nfms(R,A,B,C) \ - (((R)->__vpair_vec_f64[0] \ - = __builtin_vsx_xvnmsubdp ((A)->__vpair_vec_f64[0], \ - (B)->__vpair_vec_f64[0], \ - (C)->__vpair_vec_f64[0])), \ - (((R)->__vpair_vec_f64[1] \ - = __builtin_vsx_xvnmsubdp ((A)->__vpair_vec_f64[1], \ - (B)->__vpair_vec_f64[1], \ - (C)->__vpair_vec_f64[1])))) +#define vpair_f64_splat(R, A) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + __vr->__vdbl[0] = __vr->__vdbl[1] \ + = __builtin_vec_splats ((double)(A)); \ + } \ + while (0) + +#define vpair_f64_abs(R, A) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + __vr->__vdbl[0] = __builtin_vsx_xvabsdp (__va->__vdbl[0]); \ + __vr->__vdbl[1] = __builtin_vsx_xvabsdp (__va->__vdbl[1]); \ + } \ + while (0) + +#define vpair_f64_nabs(R, A) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + __vr->__vdbl[0] = __builtin_vsx_xvnabsdp (__va->__vdbl[0]); \ + __vr->__vdbl[1] = __builtin_vsx_xvnabsdp (__va->__vdbl[1]); \ + } \ + while (0) + +#define vpair_f64_neg(R, A) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union 
*__va = (union vpair_union *)(A); \ + __vr->__vdbl[0] = - __va->__vdbl[0]; \ + __vr->__vdbl[1] = - __va->__vdbl[1]; \ + } \ + while (0) + +#define vpair_f64_sqrt(R, A) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + __vr->__vdbl[0] = __builtin_vsx_xvsqrtdp (__va->__vdbl[0]); \ + __vr->__vdbl[1] = __builtin_vsx_xvsqrtdp (__va->__vdbl[1]); \ + } \ + while (0) + +#define vpair_f64_add(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vdbl[0] = __va->__vdbl[0] + __vb->__vdbl[0]; \ + __vr->__vdbl[1] = __va->__vdbl[1] + __vb->__vdbl[1]; \ + } \ + while (0) + +#define vpair_f64_div(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vdbl[0] = __va->__vdbl[0] / __vb->__vdbl[0]; \ + __vr->__vdbl[1] = __va->__vdbl[1] / __vb->__vdbl[1]; \ + } \ + while (0) + +#define vpair_f64_max(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vdbl[0] \ + = __builtin_vsx_xvmaxdp (__va->__vdbl[0], __vb->__vdbl[0]); \ + __vr->__vdbl[1] \ + = __builtin_vsx_xvmaxdp (__va->__vdbl[1], __vb->__vdbl[1]); \ + } \ + while (0) + +#define vpair_f64_min(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vdbl[0] \ + = __builtin_vsx_xvmindp (__va->__vdbl[0], __vb->__vdbl[0]); \ + __vr->__vdbl[1] \ + = __builtin_vsx_xvmindp (__va->__vdbl[1], __vb->__vdbl[1]); \ + } \ + while (0) + +#define vpair_f64_mul(R, A, B) \ + do \ + { \ + 
union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vdbl[0] = __va->__vdbl[0] * __vb->__vdbl[0]; \ + __vr->__vdbl[1] = __va->__vdbl[1] * __vb->__vdbl[1]; \ + } \ + while (0) + +#define vpair_f64_sub(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vdbl[0] = __va->__vdbl[0] - __vb->__vdbl[0]; \ + __vr->__vdbl[1] = __va->__vdbl[1] - __vb->__vdbl[1]; \ + } \ + while (0) + +#define vpair_f64_fma(R, A, B, C) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + union vpair_union *__vc = (union vpair_union *)(C); \ + __vr->__vdbl[0] \ + = __builtin_vsx_xvmadddp (__va->__vdbl[0], \ + __vb->__vdbl[0], \ + __vc->__vdbl[0]); \ + __vr->__vdbl[1] \ + = __builtin_vsx_xvmadddp (__va->__vdbl[1], \ + __vb->__vdbl[1], \ + __vc->__vdbl[1]); \ + } \ + while (0) + +#define vpair_f64_fms(R, A, B, C) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + union vpair_union *__vc = (union vpair_union *)(C); \ + __vr->__vdbl[0] \ + = __builtin_vsx_xvmsubdp (__va->__vdbl[0], \ + __vb->__vdbl[0], \ + __vc->__vdbl[0]); \ + __vr->__vdbl[1] \ + = __builtin_vsx_xvmsubdp (__va->__vdbl[1], \ + __vb->__vdbl[1], \ + __vc->__vdbl[1]); \ + } \ + while (0) + +#define vpair_f64_nfma(R, A, B, C) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + union vpair_union *__vc = (union vpair_union *)(C); \ + __vr->__vdbl[0] \ + = __builtin_vsx_xvnmadddp 
(__va->__vdbl[0], \ + __vb->__vdbl[0], \ + __vc->__vdbl[0]); \ + __vr->__vdbl[1] \ + = __builtin_vsx_xvnmadddp (__va->__vdbl[1], \ + __vb->__vdbl[1], \ + __vc->__vdbl[1]); \ + } \ + while (0) + +#define vpair_f64_nfms(R, A, B, C) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + union vpair_union *__vc = (union vpair_union *)(C); \ + __vr->__vdbl[0] \ + = __builtin_vsx_xvnmsubdp (__va->__vdbl[0], \ + __vb->__vdbl[0], \ + __vc->__vdbl[0]); \ + __vr->__vdbl[1] \ + = __builtin_vsx_xvnmsubdp (__va->__vdbl[1], \ + __vb->__vdbl[1], \ + __vc->__vdbl[1]); \ + } \ + while (0) /* vector pair float operations on power8/power9. */ -#define vpair_f32_splat(R,A) \ - ((R)->__vpair_vec_f32[0] = (R)->__vpair_vec_f32[1] \ - = __builtin_vec_splats ((float) (A))) - -#define vpair_f32_neg(R,A) \ - ((R)->__vpair_vp_f64 = - (A)->__vpair_vp_f64) - -#define vpair_f32_abs(R,A) \ - (((R)->__vpair_vec_f32[0] \ - = __builtin_vsx_xvnabssp ((A)->__vpair_vec_f32[0])), \ - ((R)->__vpair_vec_f32[1] \ - = __builtin_vsx_xvnabssp ((A)->__vpair_vec_f32[1]))) - -#define vpair_f32_nabs(R,A) \ - (((R)->__vpair_vec_f32[0] \ - = __builtin_vsx_xvnabssp ((A)->__vpair_vec_f32[0])), \ - ((R)->__vpair_vec_f32[1] \ - = __builtin_vsx_xvnabssp ((A)->__vpair_vec_f32[1]))) - -#define vpair_f32_sqrt(R,A) \ - (((R)->__vpair_vec_f32[0] \ - = __builtin_vsx_xvsqrtsp ((A)->__vpair_vec_f32[0])), \ - ((R)->__vpair_vec_f32[1] \ - = __builtin_vsx_xvsqrtsp ((A)->__vpair_vec_f32[1]))) - -#define vpair_f32_add(R,A,B) \ - ((R)->__vpair_vp_f32 = (A)->__vpair_vp_f32 + (B)->__vpair_vp_f32) - -#define vpair_f32_div(R,A,B) \ - ((R)->__vpair_vp_f32 = (A)->__vpair_vp_f32 / (B)->__vpair_vp_f32) - -#define vpair_f32_max(R,A,B) \ - (((R)->__vpair_vec_f32[0] \ - = __builtin_vsx_xvmaxsp ((A)->__vpair_vec_f32[0], \ - (B)->__vpair_vec_f32[0])), \ - (((R)->__vpair_vec_f32[1] \ - = __builtin_vsx_xvmaxsp 
((A)->__vpair_vec_f32[1], \ - (B)->__vpair_vec_f32[1])))) - -#define vpair_f32_min(R,A,B) \ - (((R)->__vpair_vec_f32[0] \ - = __builtin_vsx_xvminsp ((A)->__vpair_vec_f32[0], \ - (B)->__vpair_vec_f32[0])), \ - (((R)->__vpair_vec_f32[1] \ - = __builtin_vsx_xvminsp ((A)->__vpair_vec_f32[1], \ - (B)->__vpair_vec_f32[1])))) - -#define vpair_f32_mul(R,A,B) \ - ((R)->__vpair_vp_f32 = (A)->__vpair_vp_f32 * (B)->__vpair_vp_f32) - -#define vpair_f32_sub(R,A,B) \ - ((R)->__vpair_vp_f32 = (A)->__vpair_vp_f32 - (B)->__vpair_vp_f32) - -#define vpair_f32_fma(R,A,B,C) \ - (((R)->__vpair_vec_f32[0] \ - = __builtin_vsx_xvmaddsp ((A)->__vpair_vec_f32[0], \ - (B)->__vpair_vec_f32[0], \ - (C)->__vpair_vec_f32[0])), \ - (((R)->__vpair_vec_f32[1] \ - = __builtin_vsx_xvmaddsp ((A)->__vpair_vec_f32[1], \ - (B)->__vpair_vec_f32[1], \ - (C)->__vpair_vec_f32[1])))) - -#define vpair_f32_fms(R,A,B,C) \ - (((R)->__vpair_vec_f32[0] \ - = __builtin_vsx_xvmsubsp ((A)->__vpair_vec_f32[0], \ - (B)->__vpair_vec_f32[0], \ - (C)->__vpair_vec_f32[0])), \ - (((R)->__vpair_vec_f32[1] \ - = __builtin_vsx_xvmsubsp ((A)->__vpair_vec_f32[1], \ - (B)->__vpair_vec_f32[1], \ - (C)->__vpair_vec_f32[1])))) - -#define vpair_f32_nfma(R,A,B,C) \ - (((R)->__vpair_vec_f32[0] \ - = __builtin_vsx_xvnmaddsp ((A)->__vpair_vec_f32[0], \ - (B)->__vpair_vec_f32[0], \ - (C)->__vpair_vec_f32[0])), \ - (((R)->__vpair_vec_f32[1] \ - = __builtin_vsx_xvnmaddsp ((A)->__vpair_vec_f32[1], \ - (B)->__vpair_vec_f32[1], \ - (C)->__vpair_vec_f32[1])))) - -#define vpair_f32_nfms(R,A,B,C) \ - (((R)->__vpair_vec_f32[0] \ - = __builtin_vsx_xvnmsubsp ((A)->__vpair_vec_f32[0], \ - (B)->__vpair_vec_f32[0], \ - (C)->__vpair_vec_f32[0])), \ - (((R)->__vpair_vec_f32[1] \ - = __builtin_vsx_xvnmsubsp ((A)->__vpair_vec_f32[1], \ - (B)->__vpair_vec_f32[1], \ - (C)->__vpair_vec_f32[1])))) + +/* vector pair float operations on power8/power9. 
*/ +#define vpair_f32_splat(R, A) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + __vr->__vflt[0] = __vr->__vflt[1] \ + = __builtin_vec_splats ((float)(A)); \ + } \ + while (0) + +#define vpair_f32_abs(R, A) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + __vr->__vflt[0] = __builtin_vsx_xvabssp (__va->__vflt[0]); \ + __vr->__vflt[1] = __builtin_vsx_xvabssp (__va->__vflt[1]); \ + } \ + while (0) + +#define vpair_f32_nabs(R, A) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + __vr->__vflt[0] = __builtin_vsx_xvnabssp (__va->__vflt[0]); \ + __vr->__vflt[1] = __builtin_vsx_xvnabssp (__va->__vflt[1]); \ + } \ + while (0) + +#define vpair_f32_neg(R, A) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + __vr->__vflt[0] = - __va->__vflt[0]; \ + __vr->__vflt[1] = - __va->__vflt[1]; \ + } \ + while (0) + +#define vpair_f32_sqrt(R, A) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + __vr->__vflt[0] = __builtin_vsx_xvsqrtsp (__va->__vflt[0]); \ + __vr->__vflt[1] = __builtin_vsx_xvsqrtsp (__va->__vflt[1]); \ + } \ + while (0) + +#define vpair_f32_add(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vflt[0] = __va->__vflt[0] + __vb->__vflt[0]; \ + __vr->__vflt[1] = __va->__vflt[1] + __vb->__vflt[1]; \ + } \ + while (0) + +#define vpair_f32_div(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vflt[0] = __va->__vflt[0] / __vb->__vflt[0]; \ + 
__vr->__vflt[1] = __va->__vflt[1] / __vb->__vflt[1]; \ + } \ + while (0) + +#define vpair_f32_max(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vflt[0] \ + = __builtin_vsx_xvmaxsp (__va->__vflt[0], __vb->__vflt[0]); \ + __vr->__vflt[1] \ + = __builtin_vsx_xvmaxsp (__va->__vflt[1], __vb->__vflt[1]); \ + } \ + while (0) + +#define vpair_f32_min(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vflt[0] \ + = __builtin_vsx_xvminsp (__va->__vflt[0], __vb->__vflt[0]); \ + __vr->__vflt[1] \ + = __builtin_vsx_xvminsp (__va->__vflt[1], __vb->__vflt[1]); \ + } \ + while (0) + +#define vpair_f32_mul(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vflt[0] = __va->__vflt[0] * __vb->__vflt[0]; \ + __vr->__vflt[1] = __va->__vflt[1] * __vb->__vflt[1]; \ + } \ + while (0) + +#define vpair_f32_sub(R, A, B) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + __vr->__vflt[0] = __va->__vflt[0] - __vb->__vflt[0]; \ + __vr->__vflt[1] = __va->__vflt[1] - __vb->__vflt[1]; \ + } \ + while (0) + +#define vpair_f32_fma(R, A, B, C) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + union vpair_union *__vc = (union vpair_union *)(C); \ + __vr->__vflt[0] \ + = __builtin_vsx_xvmaddsp (__va->__vflt[0], \ + __vb->__vflt[0], \ + __vc->__vflt[0]); \ + __vr->__vflt[1] \ + = 
__builtin_vsx_xvmaddsp (__va->__vflt[1], \ + __vb->__vflt[1], \ + __vc->__vflt[1]); \ + } \ + while (0) + +#define vpair_f32_fms(R, A, B, C) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + union vpair_union *__vc = (union vpair_union *)(C); \ + __vr->__vflt[0] \ + = __builtin_vsx_xvmsubsp (__va->__vflt[0], \ + __vb->__vflt[0], \ + __vc->__vflt[0]); \ + __vr->__vflt[1] \ + = __builtin_vsx_xvmsubsp (__va->__vflt[1], \ + __vb->__vflt[1], \ + __vc->__vflt[1]); \ + } \ + while (0) + +#define vpair_f32_nfma(R, A, B, C) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + union vpair_union *__vc = (union vpair_union *)(C); \ + __vr->__vflt[0] \ + = __builtin_vsx_xvnmaddsp (__va->__vflt[0], \ + __vb->__vflt[0], \ + __vc->__vflt[0]); \ + __vr->__vflt[1] \ + = __builtin_vsx_xvnmaddsp (__va->__vflt[1], \ + __vb->__vflt[1], \ + __vc->__vflt[1]); \ + } \ + while (0) + +#define vpair_f32_nfms(R, A, B, C) \ + do \ + { \ + union vpair_union *__vr = (union vpair_union *)(R); \ + union vpair_union *__va = (union vpair_union *)(A); \ + union vpair_union *__vb = (union vpair_union *)(B); \ + union vpair_union *__vc = (union vpair_union *)(C); \ + __vr->__vflt[0] \ + = __builtin_vsx_xvnmsubsp (__va->__vflt[0], \ + __vb->__vflt[0], \ + __vc->__vflt[0]); \ + __vr->__vflt[1] \ + = __builtin_vsx_xvnmsubsp (__va->__vflt[1], \ + __vb->__vflt[1], \ + __vc->__vflt[1]); \ + } \ + while (0) #endif /* __MMA__. */