qiucf created this revision.
qiucf added reviewers: jsji, nemanjai, PowerPC, shchenz.
Herald added subscribers: kbarton, krytarowski.
qiucf requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D119407
Files:
clang/lib/Headers/ppc_wrappers/bmi2intrin.h
clang/lib/Headers/ppc_wrappers/bmiintrin.h
clang/lib/Headers/ppc_wrappers/emmintrin.h
clang/lib/Headers/ppc_wrappers/immintrin.h
clang/lib/Headers/ppc_wrappers/nmmintrin.h
clang/lib/Headers/ppc_wrappers/pmmintrin.h
clang/lib/Headers/ppc_wrappers/smmintrin.h
clang/lib/Headers/ppc_wrappers/tmmintrin.h
clang/lib/Headers/ppc_wrappers/x86gprintrin.h
clang/lib/Headers/ppc_wrappers/x86intrin.h
clang/lib/Headers/ppc_wrappers/xmmintrin.h
Index: clang/lib/Headers/ppc_wrappers/xmmintrin.h
===================================================================
--- clang/lib/Headers/ppc_wrappers/xmmintrin.h
+++ clang/lib/Headers/ppc_wrappers/xmmintrin.h
@@ -31,10 +31,8 @@
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
-#ifndef _XMMINTRIN_H_INCLUDED
-#define _XMMINTRIN_H_INCLUDED
-
-#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
+#ifndef XMMINTRIN_H_
+#define XMMINTRIN_H_
/* Define four value permute mask */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
@@ -52,6 +50,8 @@
#undef bool
#endif
+#include <assert.h>
+
/* We need type definitions from the MMX header file. */
#include <mmintrin.h>
@@ -62,13 +62,14 @@
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
-typedef vector float __m128 __attribute__((__may_alias__));
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
/* Unaligned version of the same type. */
-typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
+typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
+ __aligned__ (1)));
/* Internal data types for implementing the intrinsics. */
-typedef vector float __v4sf;
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
/* Create an undefined vector. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -89,6 +90,7 @@
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
+ assert(((unsigned long)__P & 0xfUL) == 0UL);
return ((__m128)vec_ld(0, (__v4sf*)__P));
}
@@ -145,6 +147,7 @@
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
+ assert(((unsigned long)__P & 0xfUL) == 0UL);
vec_st((__v4sf)__A, 0, (__v4sf*)__P);
}
@@ -881,7 +884,7 @@
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
- __m64 res = 0;
+ int res;
#ifdef _ARCH_PWR8
double dtmp;
__asm__(
@@ -914,8 +917,8 @@
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
- __m64 res = 0;
-#ifdef _ARCH_PWR8
+ long long res;
+#if defined (_ARCH_PWR8) && defined (__powerpc64__)
double dtmp;
__asm__(
#ifdef __LITTLE_ENDIAN__
@@ -1328,6 +1331,9 @@
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
+#ifdef _ARCH_PWR10
+ return vec_extractm ((__vector unsigned int) __A);
+#else
__vector unsigned long long result;
static const __vector unsigned int perm_mask =
{
@@ -1347,6 +1353,7 @@
#else
return result[0];
#endif
+#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@@ -1553,6 +1560,7 @@
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
+#ifdef __powerpc64__
unsigned long long p =
#ifdef __LITTLE_ENDIAN__
0x0008101820283038UL; // permute control for sign bits
@@ -1560,6 +1568,18 @@
0x3830282018100800UL; // permute control for sign bits
#endif
return __builtin_bpermd (p, __A);
+#else
+#ifdef __LITTLE_ENDIAN__
+ unsigned int mask = 0x20283038UL;
+ unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf;
+ unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf;
+#else
+ unsigned int mask = 0x38302820UL;
+ unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf;
+ unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf;
+#endif
+ return (r2 << 4) | r1;
+#endif
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1841,4 +1861,4 @@
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
-#endif /* _XMMINTRIN_H_INCLUDED */
+#endif /* XMMINTRIN_H_ */
Index: clang/lib/Headers/ppc_wrappers/x86intrin.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/ppc_wrappers/x86intrin.h
@@ -0,0 +1,28 @@
+/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to powerpc64le.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef X86INTRIN_H_
+#define X86INTRIN_H_
+
+#ifdef __ALTIVEC__
+#include <immintrin.h>
+#endif /* __ALTIVEC__ */
+
+#endif /* X86INTRIN_H_ */
Index: clang/lib/Headers/ppc_wrappers/x86gprintrin.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/ppc_wrappers/x86gprintrin.h
@@ -0,0 +1,17 @@
+/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef X86GPRINTRIN_H_
+#define X86GPRINTRIN_H_
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#endif /* X86GPRINTRIN_H_ */
Index: clang/lib/Headers/ppc_wrappers/tmmintrin.h
===================================================================
--- clang/lib/Headers/ppc_wrappers/tmmintrin.h
+++ clang/lib/Headers/ppc_wrappers/tmmintrin.h
@@ -28,6 +28,7 @@
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <altivec.h>
+#include <assert.h>
/* We need definitions from the SSE header files. */
#include <pmmintrin.h>
@@ -339,6 +340,7 @@
return (__m64) ((__v2du) (__C))[0];
}
+#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
@@ -350,7 +352,9 @@
__v16qi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
@@ -362,7 +366,9 @@
__v8hi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
@@ -374,7 +380,9 @@
__v4si __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
@@ -385,7 +393,9 @@
__C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
@@ -396,7 +406,9 @@
__C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
@@ -407,6 +419,7 @@
__C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
+#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
Index: clang/lib/Headers/ppc_wrappers/smmintrin.h
===================================================================
--- clang/lib/Headers/ppc_wrappers/smmintrin.h
+++ clang/lib/Headers/ppc_wrappers/smmintrin.h
@@ -34,77 +34,683 @@
#include <altivec.h>
#include <tmmintrin.h>
-extern __inline int
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_extract_epi8(__m128i __X, const int __N) {
- return (unsigned char)((__v16qi)__X)[__N & 15];
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_ZERO 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_NEG_INF 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+
+#define _MM_FROUND_NINT \
+ (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR \
+ (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL \
+ (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC \
+ (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT \
+ (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT \
+ (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NO_EXC 0x08
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_pd (__m128d __A, int __rounding)
+{
+ __v2df __r;
+ union {
+ double __fr;
+ long long __fpscr;
+ } __enables_save, __fpscr_save;
+
+ if (__rounding & _MM_FROUND_NO_EXC)
+ {
+ /* Save enabled exceptions, disable all exceptions,
+ and preserve the rounding mode. */
+#ifdef _ARCH_PWR9
+ __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
+ __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
+#else
+ __fpscr_save.__fr = __builtin_mffs ();
+ __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
+ __fpscr_save.__fpscr &= ~0xf8;
+ __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
+#endif
+ /* Insert an artificial "read/write" reference to the variable
+ read below, to ensure the compiler does not schedule
+ a read/use of the variable before the FPSCR is modified, above.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : "+wa" (__A));
+ }
+
+ switch (__rounding)
+ {
+ case _MM_FROUND_TO_NEAREST_INT:
+ __fpscr_save.__fr = __builtin_mffsl ();
+ __attribute__ ((fallthrough));
+ case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
+ __builtin_set_fpscr_rn (0b00);
+ /* Insert an artificial "read/write" reference to the variable
+ read below, to ensure the compiler does not schedule
+ a read/use of the variable before the FPSCR is modified, above.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : "+wa" (__A));
+
+ __r = vec_rint ((__v2df) __A);
+
+ /* Insert an artificial "read" reference to the variable written
+ above, to ensure the compiler does not schedule the computation
+ of the value after the manipulation of the FPSCR, below.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : : "wa" (__r));
+ __builtin_set_fpscr_rn (__fpscr_save.__fpscr);
+ break;
+ case _MM_FROUND_TO_NEG_INF:
+ case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
+ __r = vec_floor ((__v2df) __A);
+ break;
+ case _MM_FROUND_TO_POS_INF:
+ case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
+ __r = vec_ceil ((__v2df) __A);
+ break;
+ case _MM_FROUND_TO_ZERO:
+ case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
+ __r = vec_trunc ((__v2df) __A);
+ break;
+ case _MM_FROUND_CUR_DIRECTION:
+ __r = vec_rint ((__v2df) __A);
+ break;
+ }
+ if (__rounding & _MM_FROUND_NO_EXC)
+ {
+ /* Insert an artificial "read" reference to the variable written
+ above, to ensure the compiler does not schedule the computation
+ of the value after the manipulation of the FPSCR, below.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : : "wa" (__r));
+ /* Restore enabled exceptions. */
+ __fpscr_save.__fr = __builtin_mffsl ();
+ __fpscr_save.__fpscr |= __enables_save.__fpscr;
+ __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
+ }
+ return (__m128d) __r;
}
-extern __inline int
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_extract_epi32(__m128i __X, const int __N) {
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_sd (__m128d __A, __m128d __B, int __rounding)
+{
+ __B = _mm_round_pd (__B, __rounding);
+ __v2df __r = { ((__v2df) __B)[0], ((__v2df) __A)[1] };
+ return (__m128d) __r;
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_ps (__m128 __A, int __rounding)
+{
+ __v4sf __r;
+ union {
+ double __fr;
+ long long __fpscr;
+ } __enables_save, __fpscr_save;
+
+ if (__rounding & _MM_FROUND_NO_EXC)
+ {
+ /* Save enabled exceptions, disable all exceptions,
+ and preserve the rounding mode. */
+#ifdef _ARCH_PWR9
+ __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
+ __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
+#else
+ __fpscr_save.__fr = __builtin_mffs ();
+ __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
+ __fpscr_save.__fpscr &= ~0xf8;
+ __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
+#endif
+ /* Insert an artificial "read/write" reference to the variable
+ read below, to ensure the compiler does not schedule
+ a read/use of the variable before the FPSCR is modified, above.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : "+wa" (__A));
+ }
+
+ switch (__rounding)
+ {
+ case _MM_FROUND_TO_NEAREST_INT:
+ __fpscr_save.__fr = __builtin_mffsl ();
+ __attribute__ ((fallthrough));
+ case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
+ __builtin_set_fpscr_rn (0b00);
+ /* Insert an artificial "read/write" reference to the variable
+ read below, to ensure the compiler does not schedule
+ a read/use of the variable before the FPSCR is modified, above.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : "+wa" (__A));
+
+ __r = vec_rint ((__v4sf) __A);
+
+ /* Insert an artificial "read" reference to the variable written
+ above, to ensure the compiler does not schedule the computation
+ of the value after the manipulation of the FPSCR, below.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : : "wa" (__r));
+ __builtin_set_fpscr_rn (__fpscr_save.__fpscr);
+ break;
+ case _MM_FROUND_TO_NEG_INF:
+ case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
+ __r = vec_floor ((__v4sf) __A);
+ break;
+ case _MM_FROUND_TO_POS_INF:
+ case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
+ __r = vec_ceil ((__v4sf) __A);
+ break;
+ case _MM_FROUND_TO_ZERO:
+ case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
+ __r = vec_trunc ((__v4sf) __A);
+ break;
+ case _MM_FROUND_CUR_DIRECTION:
+ __r = vec_rint ((__v4sf) __A);
+ break;
+ }
+ if (__rounding & _MM_FROUND_NO_EXC)
+ {
+ /* Insert an artificial "read" reference to the variable written
+ above, to ensure the compiler does not schedule the computation
+ of the value after the manipulation of the FPSCR, below.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : : "wa" (__r));
+ /* Restore enabled exceptions. */
+ __fpscr_save.__fr = __builtin_mffsl ();
+ __fpscr_save.__fpscr |= __enables_save.__fpscr;
+ __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
+ }
+ return (__m128) __r;
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_ss (__m128 __A, __m128 __B, int __rounding)
+{
+ __B = _mm_round_ps (__B, __rounding);
+ __v4sf __r = (__v4sf) __A;
+ __r[0] = ((__v4sf) __B)[0];
+ return (__m128) __r;
+}
+
+#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
+
+#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
+{
+ __v16qi result = (__v16qi)__A;
+
+ result [__N & 0xf] = __D;
+
+ return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi32 (__m128i const __A, int const __D, int const __N)
+{
+ __v4si result = (__v4si)__A;
+
+ result [__N & 3] = __D;
+
+ return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N)
+{
+ __v2di result = (__v2di)__A;
+
+ result [__N & 1] = __D;
+
+ return (__m128i) result;
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi8 (__m128i __X, const int __N)
+{
+ return (unsigned char) ((__v16qi)__X)[__N & 15];
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi32 (__m128i __X, const int __N)
+{
return ((__v4si)__X)[__N & 3];
}
-extern __inline int
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_extract_epi64(__m128i __X, const int __N) {
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi64 (__m128i __X, const int __N)
+{
return ((__v2di)__X)[__N & 1];
}
-extern __inline int
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_extract_ps(__m128 __X, const int __N) {
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_ps (__m128 __X, const int __N)
+{
return ((__v4si)__X)[__N & 3];
}
+#ifdef _ARCH_PWR8
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8)
+{
+ __v16qi __charmask = vec_splats ((signed char) __imm8);
+ __charmask = vec_gb (__charmask);
+ __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
+ #ifdef __BIG_ENDIAN__
+ __shortmask = vec_reve (__shortmask);
+ #endif
+ return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
+}
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
+{
+#ifdef _ARCH_PWR10
+ return (__m128i) vec_blendv ((__v16qi) __A, (__v16qi) __B, (__v16qu) __mask);
+#else
+ const __v16qu __seven = vec_splats ((unsigned char) 0x07);
+ __v16qu __lmask = vec_sra ((__v16qu) __mask, __seven);
+ return (__m128i) vec_sel ((__v16qi) __A, (__v16qi) __B, __lmask);
+#endif
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
+{
+ __v16qu __pcv[] =
+ {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 },
+ { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 },
+ { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 },
+ { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 },
+ { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
+ };
+ __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
+ return (__m128) __r;
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
+{
+#ifdef _ARCH_PWR10
+ return (__m128) vec_blendv ((__v4sf) __A, (__v4sf) __B, (__v4su) __mask);
+#else
+ const __v4si __zero = {0};
+ const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero);
+ return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask);
+#endif
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
+{
+ __v16qu __pcv[] =
+ {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }
+ };
+ __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
+ return (__m128d) __r;
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
+{
+#ifdef _ARCH_PWR10
+ return (__m128d) vec_blendv ((__v2df) __A, (__v2df) __B, (__v2du) __mask);
+#else
+ const __v2di __zero = {0};
+ const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero);
+ return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
+#endif
+}
+#endif
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_si128 (__m128i __A, __m128i __B)
+{
+ /* Note: This implementation does NOT set "zero" or "carry" flags. */
+ const __v16qu __zero = {0};
+ return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __B), __zero);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_si128 (__m128i __A, __m128i __B)
+{
+ /* Note: This implementation does NOT set "zero" or "carry" flags. */
+ const __v16qu __zero = {0};
+ const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A);
+ return vec_all_eq (vec_and ((__v16qu) __notA, (__v16qu) __B), __zero);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_si128 (__m128i __A, __m128i __B)
+{
+ /* Note: This implementation does NOT set "zero" or "carry" flags. */
+ return _mm_testz_si128 (__A, __B) == 0 && _mm_testc_si128 (__A, __B) == 0;
+}
+
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+#define _mm_test_all_ones(V) \
+ _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
+
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
+
+#ifdef _ARCH_PWR8
extern __inline __m128i
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
- __v16qi __charmask = vec_splats((signed char)__imm8);
- __charmask = vec_gb(__charmask);
- __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
-#ifdef __BIG_ENDIAN__
- __shortmask = vec_reve(__shortmask);
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y);
+}
#endif
- return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_min ((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
- const __v16qu __seven = vec_splats((unsigned char)0x07);
- __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
- return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_min ((__v8hu)__X, (__v8hu)__Y);
}
extern __inline __m128i
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
- __v16qi result = (__v16qi)__A;
- result[__N & 0xf] = __D;
- return (__m128i)result;
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_min ((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
- __v4si result = (__v4si)__A;
- result[__N & 3] = __D;
- return (__m128i)result;
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_min ((__v4su)__X, (__v4su)__Y);
}
extern __inline __m128i
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
- __v2di result = (__v2di)__A;
- result[__N & 1] = __D;
- return (__m128i)result;
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_max ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_max ((__v8hu)__X, (__v8hu)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_max ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_max ((__v4su)__X, (__v4su)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y);
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y);
+}
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi16 (__m128i __A)
+{
+ return (__m128i) vec_unpackh ((__v16qi) __A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi32 (__m128i __A)
+{
+ __A = (__m128i) vec_unpackh ((__v16qi) __A);
+ return (__m128i) vec_unpackh ((__v8hi) __A);
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi64 (__m128i __A)
+{
+ __A = (__m128i) vec_unpackh ((__v16qi) __A);
+ __A = (__m128i) vec_unpackh ((__v8hi) __A);
+ return (__m128i) vec_unpackh ((__v4si) __A);
+}
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi32 (__m128i __A)
+{
+ return (__m128i) vec_unpackh ((__v8hi) __A);
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi64 (__m128i __A)
+{
+ __A = (__m128i) vec_unpackh ((__v8hi) __A);
+ return (__m128i) vec_unpackh ((__v4si) __A);
+}
+#endif
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi64 (__m128i __A)
+{
+ return (__m128i) vec_unpackh ((__v4si) __A);
}
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi16 (__m128i __A)
+{
+ const __v16qu __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi32 (__m128i __A)
+{
+ const __v16qu __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
+ __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
+ __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi64 (__m128i __A)
+{
+ const __v16qu __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
+ __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
+ __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
+ __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
+ __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi32 (__m128i __A)
+{
+ const __v8hu __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi64 (__m128i __A)
+{
+ const __v8hu __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
+ __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
+ __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_epi64 (__m128i __A)
+{
+ const __v4su __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v4su) __A, __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v4su) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+/* Return horizontal packed word minimum and its index in bits [15:0]
+ and bits [18:16] respectively. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_minpos_epu16 (__m128i __A)
+{
+ union __u
+ {
+ __m128i __m;
+ __v8hu __uh;
+ };
+ union __u __u = { .__m = __A }, __r = { .__m = {0} };
+ unsigned short __ridx = 0;
+ unsigned short __rmin = __u.__uh[__ridx];
+ for (unsigned long __i = 1; __i < 8; __i++)
+ {
+ if (__u.__uh[__i] < __rmin)
+ {
+ __rmin = __u.__uh[__i];
+ __ridx = __i;
+ }
+ }
+ __r.__uh[0] = __rmin;
+ __r.__uh[1] = __ridx;
+ return __r.__m;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y);
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y);
+}
+#endif
#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
-#endif /* _SMMINTRIN_H_ */
+#endif /* SMMINTRIN_H_ */
Index: clang/lib/Headers/ppc_wrappers/pmmintrin.h
===================================================================
--- clang/lib/Headers/ppc_wrappers/pmmintrin.h
+++ clang/lib/Headers/ppc_wrappers/pmmintrin.h
@@ -111,17 +111,21 @@
vec_mergel ((__v2df) __X, (__v2df)__Y));
}
+#ifdef _ARCH_PWR8
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
}
+#endif
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)
Index: clang/lib/Headers/ppc_wrappers/nmmintrin.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/ppc_wrappers/nmmintrin.h
@@ -0,0 +1,26 @@
+/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to powerpc64le.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+#endif
+
+#ifndef NMMINTRIN_H_
+#define NMMINTRIN_H_
+
+/* We just include SSE4.1 header file. */
+#include <smmintrin.h>
+
+#endif /* NMMINTRIN_H_ */
Index: clang/lib/Headers/ppc_wrappers/immintrin.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/ppc_wrappers/immintrin.h
@@ -0,0 +1,27 @@
+/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef IMMINTRIN_H_
+#define IMMINTRIN_H_
+
+#include <x86gprintrin.h>
+
+#include <mmintrin.h>
+
+#include <xmmintrin.h>
+
+#include <emmintrin.h>
+
+#include <pmmintrin.h>
+
+#include <tmmintrin.h>
+
+#include <smmintrin.h>
+
+#endif /* IMMINTRIN_H_ */
Index: clang/lib/Headers/ppc_wrappers/emmintrin.h
===================================================================
--- clang/lib/Headers/ppc_wrappers/emmintrin.h
+++ clang/lib/Headers/ppc_wrappers/emmintrin.h
@@ -38,6 +38,7 @@
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <altivec.h>
+#include <assert.h>
/* We need definitions from the SSE header files. */
#include <xmmintrin.h>
@@ -127,6 +128,7 @@
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
+ assert(((unsigned long)__P & 0xfUL) == 0UL);
return ((__m128d)vec_ld(0, (__v16qu*)__P));
}
@@ -169,6 +171,7 @@
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
+ assert(((unsigned long)__P & 0xfUL) == 0UL);
vec_st((__v16qu)__A, 0, (__v16qu*)__P);
}
@@ -405,20 +408,10 @@
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
-#if _ARCH_PWR8
__v2du c, d;
/* Compare against self will return false (0's) if NAN. */
c = (__v2du)vec_cmpeq (__A, __A);
d = (__v2du)vec_cmpeq (__B, __B);
-#else
- __v2du a, b;
- __v2du c, d;
- const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
- a = (__v2du)vec_abs ((__v2df)__A);
- b = (__v2du)vec_abs ((__v2df)__B);
- c = (__v2du)vec_cmpgt (double_exp_mask, a);
- d = (__v2du)vec_cmpgt (double_exp_mask, b);
-#endif
/* A != NAN and B != NAN. */
return ((__m128d)vec_and(c, d));
}
@@ -777,6 +770,7 @@
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
+ assert(((unsigned long )__P & 0xfUL) == 0UL);
vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
}
@@ -861,7 +855,11 @@
: );
#ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
+#else
+ temp = vec_mergee (temp, temp);
+#endif
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@@ -896,7 +894,11 @@
: );
#ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
+#else
+ temp = vec_mergee (temp, temp);
+#endif
result = (__v4sf) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@@ -925,7 +927,11 @@
: );
#ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
+#else
+ temp = vec_mergee (temp, temp);
+#endif
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@@ -1205,6 +1211,9 @@
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
+#ifdef _ARCH_PWR10
+ return vec_extractm ((__v2du) __A);
+#else
__vector unsigned long long result;
static const __vector unsigned int perm_mask =
{
@@ -1224,6 +1233,7 @@
#else
return result[0];
#endif
+#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@@ -1434,6 +1444,7 @@
return ((__m64)a * (__m64)b);
}
+#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
@@ -1460,6 +1471,7 @@
return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}
+#endif
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
@@ -1749,7 +1761,7 @@
lshift = vec_splat ((__v2du) __B, 0);
shmask = vec_cmplt (lshift, shmax);
result = vec_sl ((__v2du) __A, lshift);
- result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
+ result = vec_sel ((__v2du) shmask, result, shmask);
return (__m128i) result;
}
@@ -1843,7 +1855,7 @@
rshift = vec_splat ((__v2du) __B, 0);
shmask = vec_cmplt (rshift, shmax);
result = vec_sr ((__v2du) __A, rshift);
- result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
+ result = vec_sel ((__v2du) shmask, result, shmask);
return (__m128i) result;
}
@@ -1995,10 +2007,14 @@
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */
-/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
+/* Return a mask created from the most significant bit of each 8-bit
+ element in A. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
+#ifdef _ARCH_PWR10
+ return vec_extractm ((__v16qu) __A);
+#else
__vector unsigned long long result;
static const __vector unsigned char perm_mask =
{
@@ -2015,6 +2031,7 @@
#else
return result[0];
#endif
+#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@@ -2158,27 +2175,37 @@
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
__v16qu a, b;
- __v16qu vmin, vmax, vabsdiff;
+ __v16qu vabsdiff;
__v4si vsum;
const __v4su zero = { 0, 0, 0, 0 };
__v4si result;
a = (__v16qu) __A;
b = (__v16qu) __B;
- vmin = vec_min (a, b);
- vmax = vec_max (a, b);
+#ifndef _ARCH_PWR9
+ __v16qu vmin = vec_min (a, b);
+ __v16qu vmax = vec_max (a, b);
vabsdiff = vec_sub (vmax, vmin);
+#else
+ vabsdiff = vec_absd (a, b);
+#endif
/* Sum four groups of bytes into integers. */
vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
+#ifdef __LITTLE_ENDIAN__
+ /* Sum across four integers with two integer results. */
+ asm ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero));
+ /* Note: vec_sum2s could be used here, but on little-endian, vector
+ shifts are added that are not needed for this use-case.
+ A vector shift to correctly position the 32-bit integer results
+ (currently at [0] and [2]) to [1] and [3] would then need to be
+ swapped back again since the desired results are two 64-bit
+ integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */
+#else
/* Sum across four integers with two integer results. */
result = vec_sum2s (vsum, (__vector signed int) zero);
/* Rotate the sums into the correct position. */
-#ifdef __LITTLE_ENDIAN__
- result = vec_sld (result, result, 4);
-#else
result = vec_sld (result, result, 6);
#endif
- /* Rotate the sums into the correct position. */
return (__m128i) result;
}
Index: clang/lib/Headers/ppc_wrappers/bmiintrin.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/ppc_wrappers/bmiintrin.h
@@ -0,0 +1,165 @@
+/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined X86GPRINTRIN_H_
+#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef BMIINTRIN_H_
+#define BMIINTRIN_H_
+
+extern __inline unsigned short
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __tzcnt_u16(unsigned short __X) {
+ return __builtin_ctz(__X);
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __andn_u32(unsigned int __X, unsigned int __Y) {
+ return (~__X & __Y);
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) {
+ return ((__X << (32 - (__L + __P))) >> (32 - __L));
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __bextr_u32(unsigned int __X, unsigned int __Y) {
+ unsigned int __P, __L;
+ __P = __Y & 0xFF;
+ __L = (__Y >> 8) & 0xFF;
+ return (_bextr_u32(__X, __P, __L));
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsi_u32(unsigned int __X) {
+ return (__X & -__X);
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsi_u32(unsigned int __X) {
+ return __blsi_u32(__X);
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsmsk_u32(unsigned int __X) {
+ return (__X ^ (__X - 1));
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsmsk_u32(unsigned int __X) {
+ return __blsmsk_u32(__X);
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsr_u32(unsigned int __X) {
+ return (__X & (__X - 1));
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsr_u32(unsigned int __X) {
+ return __blsr_u32(__X);
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __tzcnt_u32(unsigned int __X) {
+ return __builtin_ctz(__X);
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _tzcnt_u32(unsigned int __X) {
+ return __builtin_ctz(__X);
+}
+
+/* use the 64-bit shift, rotate, and count leading zeros instructions
+ for long long. */
+#ifdef __PPC64__
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __andn_u64(unsigned long long __X, unsigned long long __Y) {
+ return (~__X & __Y);
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) {
+ return ((__X << (64 - (__L + __P))) >> (64 - __L));
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __bextr_u64(unsigned long long __X, unsigned long long __Y) {
+ unsigned int __P, __L;
+ __P = __Y & 0xFF;
+ __L = (__Y & 0xFF00) >> 8;
+ return (_bextr_u64(__X, __P, __L));
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsi_u64(unsigned long long __X) {
+ return __X & -__X;
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsi_u64(unsigned long long __X) {
+ return __blsi_u64(__X);
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsmsk_u64(unsigned long long __X) {
+ return (__X ^ (__X - 1));
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsmsk_u64(unsigned long long __X) {
+ return __blsmsk_u64(__X);
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsr_u64(unsigned long long __X) {
+ return (__X & (__X - 1));
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsr_u64(unsigned long long __X) {
+ return __blsr_u64(__X);
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __tzcnt_u64(unsigned long long __X) {
+ return __builtin_ctzll(__X);
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _tzcnt_u64(unsigned long long __X) {
+ return __builtin_ctzll(__X);
+}
+#endif /* __PPC64__ */
+
+#endif /* BMIINTRIN_H_ */
Index: clang/lib/Headers/ppc_wrappers/bmi2intrin.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/ppc_wrappers/bmi2intrin.h
@@ -0,0 +1,133 @@
+/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined X86GPRINTRIN_H_
+#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef BMI2INTRIN_H_
+#define BMI2INTRIN_H_
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _bzhi_u32(unsigned int __X, unsigned int __Y) {
+ return ((__X << (32 - __Y)) >> (32 - __Y));
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
+ unsigned long long __res = (unsigned long long)__X * __Y;
+ *__P = (unsigned int)(__res >> 32);
+ return (unsigned int)__res;
+}
+
+#ifdef __PPC64__
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
+ return ((__X << (64 - __Y)) >> (64 - __Y));
+}
+
+/* __int128 requires base 64-bit. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mulx_u64(unsigned long long __X, unsigned long long __Y,
+ unsigned long long *__P) {
+ unsigned __int128 __res = (unsigned __int128)__X * __Y;
+ *__P = (unsigned long long)(__res >> 64);
+ return (unsigned long long)__res;
+}
+
+#ifdef _ARCH_PWR7
+/* popcount and bpermd require power7 minimum. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _pdep_u64(unsigned long long __X, unsigned long long __M) {
+ unsigned long result = 0x0UL;
+ const unsigned long mask = 0x8000000000000000UL;
+ unsigned long m = __M;
+ unsigned long c, t;
+ unsigned long p;
+
+ /* The pop-count of the mask gives the number of the bits from
+ source to process. This is also needed to shift bits from the
+ source into the correct position for the result. */
+ p = 64 - __builtin_popcountl(__M);
+
+ /* The loop is for the number of '1' bits in the mask and clearing
+ each mask bit as it is processed. */
+ while (m != 0) {
+ c = __builtin_clzl(m);
+ t = __X << (p - c);
+ m ^= (mask >> c);
+ result |= (t & (mask >> c));
+ p++;
+ }
+ return (result);
+}
+
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _pext_u64(unsigned long long __X, unsigned long long __M) {
+ unsigned long p = 0x4040404040404040UL; // initial bit permute control
+ const unsigned long mask = 0x8000000000000000UL;
+ unsigned long m = __M;
+ unsigned long c;
+ unsigned long result;
+
+ /* if the mask is constant and selects 8 bits or less we can use
+ the Power8 Bit permute instruction. */
+ if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
+ /* Also if the pext mask is constant, then the popcount is
+ constant, we can evaluate the following loop at compile
+ time and use a constant bit permute vector. */
+ for (long i = 0; i < __builtin_popcountl(__M); i++) {
+ c = __builtin_clzl(m);
+ p = (p << 8) | c;
+ m ^= (mask >> c);
+ }
+ result = __builtin_bpermd(p, __X);
+ } else {
+ p = 64 - __builtin_popcountl(__M);
+ result = 0;
+ /* We could a use a for loop here, but that combined with
+ -funroll-loops can expand to a lot of code. The while
+ loop avoids unrolling and the compiler commons the xor
+ from clearing the mask bit with the (m != 0) test. The
+ result is a more compact loop setup and body. */
+ while (m != 0) {
+ unsigned long t;
+ c = __builtin_clzl(m);
+ t = (__X & (mask >> c)) >> (p - c);
+ m ^= (mask >> c);
+ result |= (t);
+ p++;
+ }
+ }
+ return (result);
+}
+
+/* these 32-bit implementations depend on 64-bit pdep/pext
+ which depend on _ARCH_PWR7. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _pdep_u32(unsigned int __X, unsigned int __Y) {
+ return _pdep_u64(__X, __Y);
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _pext_u32(unsigned int __X, unsigned int __Y) {
+ return _pext_u64(__X, __Y);
+}
+#endif /* _ARCH_PWR7 */
+#endif /* __PPC64__ */
+
+#endif /* BMI2INTRIN_H_ */
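
A minimal usage sketch (not part of the diff; the file name and build line below are illustrative assumptions only): on a powerpc64le Linux target, clang resolves <smmintrin.h> to the ppc_wrappers version, so code ported from x86 can exercise the SSE4.1 rounding wrappers added here once -DNO_WARN_X86_INTRINSICS is passed to silence the porting warning.

  /* round_example.c -- hypothetical example, not part of this revision.
     Assumed build: clang -target powerpc64le-linux-gnu -mcpu=pwr8 \
                    -DNO_WARN_X86_INTRINSICS round_example.c -o round_example */
  #include <smmintrin.h>  /* picks up ppc_wrappers/smmintrin.h on PowerPC */
  #include <stdio.h>

  int main(void) {
    __m128 v = _mm_set_ps(2.5f, -1.25f, 0.75f, -0.5f);
    __m128 f = _mm_floor_ps(v);  /* wrappers introduced by this patch */
    __m128 c = _mm_ceil_ps(v);
    float out_f[4], out_c[4];
    _mm_storeu_ps(out_f, f);
    _mm_storeu_ps(out_c, c);
    for (int i = 0; i < 4; i++)
      printf("floor=%g ceil=%g\n", out_f[i], out_c[i]);
    return 0;
  }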