On Mon, Oct 30, 2023 at 3:47 PM Haochen Jiang <haochen.ji...@intel.com> wrote: > > Hi all, > > This patch fixes two obvious bugs in the current evex512 implementation. > > Also, I moved the AVX512CD+AVX512VL part out of the AVX512VL section to avoid > accidentally missing the avx512cd handling in the future. > > Ok for trunk? Ok. > > BRs, > Haochen > > gcc/ChangeLog: > > * config/i386/avx512cdintrin.h (target): Push evex512 for > avx512cd. > * config/i386/avx512vlintrin.h (target): Split avx512cdvl part > out from avx512vl. > * config/i386/i386-builtin.def (BDESC): Do not check evex512 > for builtins not needed. > --- > gcc/config/i386/avx512cdintrin.h | 2 +- > gcc/config/i386/avx512vlintrin.h | 1792 +++++++++++++++--------------- > gcc/config/i386/i386-builtin.def | 4 +- > 3 files changed, 899 insertions(+), 899 deletions(-) > > diff --git a/gcc/config/i386/avx512cdintrin.h > b/gcc/config/i386/avx512cdintrin.h > index a5f5eabb68d..56a786aa9a3 100644 > --- a/gcc/config/i386/avx512cdintrin.h > +++ b/gcc/config/i386/avx512cdintrin.h > @@ -30,7 +30,7 @@ > > #ifndef __AVX512CD__ > #pragma GCC push_options > -#pragma GCC target("avx512cd") > +#pragma GCC target("avx512cd,evex512") > #define __DISABLE_AVX512CD__ > #endif /* __AVX512CD__ */ > > diff --git a/gcc/config/i386/avx512vlintrin.h > b/gcc/config/i386/avx512vlintrin.h > index 08e49e8d8ab..a40aa91b948 100644 > --- a/gcc/config/i386/avx512vlintrin.h > +++ b/gcc/config/i386/avx512vlintrin.h > @@ -8396,1281 +8396,1003 @@ _mm_mask_min_epu32 (__m128i __W, __mmask8 __M, > __m128i __A, > (__v4si) __W, __M); > } > > -#ifndef __AVX512CD__ > -#pragma GCC push_options > -#pragma GCC target("avx512vl,avx512cd") > -#define __DISABLE_AVX512VLCD__ > -#endif > - > -extern __inline __m128i > +extern __inline __m256d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_broadcastmb_epi64 (__mmask8 __A) > +_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A, > + __m256d __B) > { > - return (__m128i) 
__builtin_ia32_broadcastmb128 (__A); > + return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, > + (__v4df) __B, > + (__v4df) __W, > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m256d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_broadcastmb_epi64 (__mmask8 __A) > +_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B) > { > - return (__m256i) __builtin_ia32_broadcastmb256 (__A); > + return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, > + (__v4df) __B, > + (__v4df) > + _mm256_setzero_pd (), > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m128d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_broadcastmw_epi32 (__mmask16 __A) > +_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A, > + __m128d __B) > { > - return (__m128i) __builtin_ia32_broadcastmw128 (__A); > + return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, > + (__v2df) __B, > + (__v2df) __W, > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m128d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_broadcastmw_epi32 (__mmask16 __A) > +_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B) > { > - return (__m256i) __builtin_ia32_broadcastmw256 (__A); > + return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, > + (__v2df) __B, > + (__v2df) > + _mm_setzero_pd (), > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m256 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_lzcnt_epi32 (__m256i __A) > +_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A, > + __m256 __B) > { > - return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, > - (__v8si) > - _mm256_setzero_si256 (), > - (__mmask8) -1); > + return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, > + (__v8sf) __B, > + (__v8sf) __W, > + (__mmask8) 
__U); > } > > -extern __inline __m256i > +extern __inline __m256d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) > +_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A, > + __m256d __B) > { > - return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, > - (__v8si) __W, > - (__mmask8) __U); > + return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, > + (__v4df) __B, > + (__v4df) __W, > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m256d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) > +_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B) > { > - return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, > - (__v8si) > - _mm256_setzero_si256 (), > - (__mmask8) __U); > + return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, > + (__v4df) __B, > + (__v4df) > + _mm256_setzero_pd (), > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m128d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_lzcnt_epi64 (__m256i __A) > +_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A, > + __m128d __B) > { > - return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, > - (__v4di) > - _mm256_setzero_si256 (), > - (__mmask8) -1); > + return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, > + (__v2df) __B, > + (__v2df) __W, > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m128d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) > +_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B) > { > - return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, > - (__v4di) __W, > - (__mmask8) __U); > + return (__m128d) __builtin_ia32_unpckhpd128_mask 
((__v2df) __A, > + (__v2df) __B, > + (__v2df) > + _mm_setzero_pd (), > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m256 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) > +_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A, > + __m256 __B) > { > - return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, > - (__v4di) > - _mm256_setzero_si256 (), > - (__mmask8) __U); > + return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, > + (__v8sf) __B, > + (__v8sf) __W, > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m256 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_conflict_epi64 (__m256i __A) > +_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B) > { > - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, > - (__v4di) > - _mm256_setzero_si256 > (), > - (__mmask8) -1); > + return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, > + (__v8sf) __B, > + (__v8sf) > + _mm256_setzero_ps (), > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m128 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) > +_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) > { > - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, > - (__v4di) __W, > - (__mmask8) > - __U); > + return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, > + (__v4sf) __B, > + (__v4sf) __W, > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m128 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) > +_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B) > { > - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, > - (__v4di) 
> - _mm256_setzero_si256 > (), > - (__mmask8) > - __U); > + return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, > + (__v4sf) __B, > + (__v4sf) > + _mm_setzero_ps (), > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m128 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_conflict_epi32 (__m256i __A) > +_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A) > { > - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, > - (__v8si) > - _mm256_setzero_si256 > (), > - (__mmask8) -1); > + return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, > + (__v4sf) __W, > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m128 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) > +_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A) > { > - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, > - (__v8si) __W, > - (__mmask8) > - __U); > + return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, > + (__v4sf) > + _mm_setzero_ps (), > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m256 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) > +_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B) > { > - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, > - (__v8si) > - _mm256_setzero_si256 > (), > - (__mmask8) > - __U); > + return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, > + (__v8sf) __B, > + (__v8sf) > + _mm256_setzero_ps (), > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m256 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_lzcnt_epi32 (__m128i __A) > +_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A) > { > - return (__m128i) __builtin_ia32_vplzcntd_128_mask 
((__v4si) __A, > - (__v4si) > - _mm_setzero_si128 (), > - (__mmask8) -1); > + return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, > + (__v8sf) __W, > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m256 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) > +_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) > { > - return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, > - (__v4si) __W, > - (__mmask8) __U); > + return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, > + (__v8sf) > + _mm256_setzero_ps (), > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m128 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) > +_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) > { > - return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, > - (__v4si) > - _mm_setzero_si128 (), > - (__mmask8) __U); > + return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, > + (__v4sf) __B, > + (__v4sf) __W, > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m128 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_lzcnt_epi64 (__m128i __A) > +_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B) > { > - return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, > - (__v2di) > - _mm_setzero_si128 (), > - (__mmask8) -1); > + return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, > + (__v4sf) __B, > + (__v4sf) > + _mm_setzero_ps (), > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) > +_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A, > + __m128i __B) > { > - return (__m128i) 
__builtin_ia32_vplzcntq_128_mask ((__v2di) __A, > - (__v2di) __W, > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, > + (__v4si) __B, > + (__v8si) __W, > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) > +_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B) > { > - return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, > - (__v2di) > - _mm_setzero_si128 (), > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, > + (__v4si) __B, > + (__v8si) > + _mm256_setzero_si256 (), > + (__mmask8) __U); > } > > extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_conflict_epi64 (__m128i __A) > +_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A, > + __m128i __B) > { > - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, > - (__v2di) > - _mm_setzero_si128 (), > - (__mmask8) -1); > + return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, > + (__v4si) __B, > + (__v4si) __W, > + (__mmask8) __U); > } > > extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) > +_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B) > { > - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, > - (__v2di) __W, > - (__mmask8) > - __U); > + return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, > + (__v4si) __B, > + (__v4si) > + _mm_setzero_si128 (), > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) > +_mm256_sra_epi64 (__m256i __A, __m128i __B) > { > - return (__m128i) 
__builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, > - (__v2di) > - _mm_setzero_si128 (), > - (__mmask8) > - __U); > + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, > + (__v2di) __B, > + (__v4di) > + _mm256_setzero_si256 (), > + (__mmask8) -1); > } > > -extern __inline __m128i > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_conflict_epi32 (__m128i __A) > +_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A, > + __m128i __B) > { > - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, > - (__v4si) > - _mm_setzero_si128 (), > - (__mmask8) -1); > + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, > + (__v2di) __B, > + (__v4di) __W, > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) > +_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B) > { > - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, > - (__v4si) __W, > - (__mmask8) > - __U); > + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, > + (__v2di) __B, > + (__v4di) > + _mm256_setzero_si256 (), > + (__mmask8) __U); > } > > extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) > +_mm_sra_epi64 (__m128i __A, __m128i __B) > { > - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, > - (__v4si) > - _mm_setzero_si128 (), > - (__mmask8) > - __U); > + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, > + (__v2di) __B, > + (__v2di) > + _mm_setzero_si128 (), > + (__mmask8) -1); > } > > -#ifdef __DISABLE_AVX512VLCD__ > -#pragma GCC pop_options > -#endif > - > -extern __inline __m256d > +extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > 
-_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A, > - __m256d __B) > +_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A, > + __m128i __B) > { > - return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, > - (__v4df) __B, > - (__v4df) __W, > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, > + (__v2di) __B, > + (__v2di) __W, > + (__mmask8) __U); > } > > -extern __inline __m256d > +extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B) > +_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B) > { > - return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, > - (__v4df) __B, > - (__v4df) > - _mm256_setzero_pd (), > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, > + (__v2di) __B, > + (__v2di) > + _mm_setzero_si128 (), > + (__mmask8) __U); > } > > -extern __inline __m128d > +extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A, > - __m128d __B) > +_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A, > + __m128i __B) > { > - return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, > - (__v2df) __B, > - (__v2df) __W, > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, > + (__v4si) __B, > + (__v4si) __W, > + (__mmask8) __U); > } > > -extern __inline __m128d > +extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B) > +_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B) > { > - return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, > - (__v2df) __B, > - (__v2df) > - _mm_setzero_pd (), > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, > + (__v4si) 
__B, > + (__v4si) > + _mm_setzero_si128 (), > + (__mmask8) __U); > } > > -extern __inline __m256 > +extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A, > - __m256 __B) > +_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A, > + __m128i __B) > { > - return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, > - (__v8sf) __B, > - (__v8sf) __W, > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, > + (__v2di) __B, > + (__v2di) __W, > + (__mmask8) __U); > } > > -extern __inline __m256d > +extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A, > - __m256d __B) > +_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B) > { > - return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, > - (__v4df) __B, > - (__v4df) __W, > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, > + (__v2di) __B, > + (__v2di) > + _mm_setzero_si128 (), > + (__mmask8) __U); > } > > -extern __inline __m256d > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B) > +_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A, > + __m128i __B) > { > - return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, > - (__v4df) __B, > - (__v4df) > - _mm256_setzero_pd (), > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, > + (__v4si) __B, > + (__v8si) __W, > + (__mmask8) __U); > } > > -extern __inline __m128d > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A, > - __m128d __B) > +_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B) > { > - 
return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, > - (__v2df) __B, > - (__v2df) __W, > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, > + (__v4si) __B, > + (__v8si) > + _mm256_setzero_si256 (), > + (__mmask8) __U); > } > > -extern __inline __m128d > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B) > +_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A, > + __m128i __B) > { > - return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, > - (__v2df) __B, > - (__v2df) > - _mm_setzero_pd (), > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, > + (__v2di) __B, > + (__v4di) __W, > + (__mmask8) __U); > } > > -extern __inline __m256 > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A, > - __m256 __B) > +_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B) > { > - return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, > - (__v8sf) __B, > - (__v8sf) __W, > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, > + (__v2di) __B, > + (__v4di) > + _mm256_setzero_si256 (), > + (__mmask8) __U); > } > > extern __inline __m256 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B) > +_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X, > + __m256 __Y) > { > - return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, > - (__v8sf) __B, > - (__v8sf) > - _mm256_setzero_ps (), > - (__mmask8) __U); > + return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, > + (__v8si) __X, > + (__v8sf) __W, > + (__mmask8) __U); > } > > -extern __inline __m128 > +extern __inline __m256 > __attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) > -_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) > +_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y) > { > - return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, > - (__v4sf) __B, > - (__v4sf) __W, > - (__mmask8) __U); > + return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, > + (__v8si) __X, > + (__v8sf) > + _mm256_setzero_ps (), > + (__mmask8) __U); > } > > -extern __inline __m128 > +extern __inline __m256d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B) > +_mm256_permutexvar_pd (__m256i __X, __m256d __Y) > { > - return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, > - (__v4sf) __B, > - (__v4sf) > - _mm_setzero_ps (), > - (__mmask8) __U); > + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, > + (__v4di) __X, > + (__v4df) > + _mm256_setzero_pd (), > + (__mmask8) -1); > } > > -extern __inline __m128 > +extern __inline __m256d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A) > +_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, > + __m256d __Y) > { > - return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, > - (__v4sf) __W, > - (__mmask8) __U); > + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, > + (__v4di) __X, > + (__v4df) __W, > + (__mmask8) __U); > } > > -extern __inline __m128 > +extern __inline __m256d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A) > +_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) > { > - return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, > - (__v4sf) > - _mm_setzero_ps (), > - (__mmask8) __U); > + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, > + (__v4di) __X, > + (__v4df) > + 
_mm256_setzero_pd (), > + (__mmask8) __U); > } > > -extern __inline __m256 > +extern __inline __m256d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B) > +_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A, > + __m256i __C) > { > - return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, > - (__v8sf) __B, > - (__v8sf) > - _mm256_setzero_ps (), > - (__mmask8) __U); > + return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, > + (__v4di) __C, > + (__v4df) __W, > + (__mmask8) > + __U); > } > > -extern __inline __m256 > +extern __inline __m256d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A) > +_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C) > { > - return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, > - (__v8sf) __W, > - (__mmask8) __U); > + return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, > + (__v4di) __C, > + (__v4df) > + _mm256_setzero_pd (), > + (__mmask8) > + __U); > } > > extern __inline __m256 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) > -{ > - return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, > - (__v8sf) > - _mm256_setzero_ps (), > - (__mmask8) __U); > -} > - > -extern __inline __m128 > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) > +_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A, > + __m256i __C) > { > - return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, > - (__v4sf) __B, > - (__v4sf) __W, > - (__mmask8) __U); > + return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, > + (__v8si) __C, > + (__v8sf) __W, > + (__mmask8) __U); > } > > -extern __inline __m128 > +extern __inline 
__m256 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B) > +_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C) > { > - return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, > - (__v4sf) __B, > - (__v4sf) > - _mm_setzero_ps (), > - (__mmask8) __U); > + return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, > + (__v8si) __C, > + (__v8sf) > + _mm256_setzero_ps (), > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m128d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A, > - __m128i __B) > +_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A, > + __m128i __C) > { > - return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, > - (__v4si) __B, > - (__v8si) __W, > - (__mmask8) __U); > + return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, > + (__v2di) __C, > + (__v2df) __W, > + (__mmask8) __U); > } > > -extern __inline __m256i > +extern __inline __m128d > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B) > +_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C) > { > - return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, > - (__v4si) __B, > - (__v8si) > - _mm256_setzero_si256 (), > - (__mmask8) __U); > + return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, > + (__v2di) __C, > + (__v2df) > + _mm_setzero_pd (), > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m128 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A, > - __m128i __B) > +_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A, > + __m128i __C) > { > - return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, > - (__v4si) __B, > - 
(__v4si) __W, > - (__mmask8) __U); > + return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, > + (__v4si) __C, > + (__v4sf) __W, > + (__mmask8) __U); > } > > -extern __inline __m128i > +extern __inline __m128 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B) > +_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C) > { > - return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, > - (__v4si) __B, > - (__v4si) > - _mm_setzero_si128 (), > - (__mmask8) __U); > + return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, > + (__v4si) __C, > + (__v4sf) > + _mm_setzero_ps (), > + (__mmask8) __U); > } > > extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_sra_epi64 (__m256i __A, __m128i __B) > +_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B) > { > - return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, > - (__v2di) __B, > - (__v4di) > - _mm256_setzero_si256 (), > - (__mmask8) -1); > + return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, > + (__v8si) __B, > + (__v8si) > + _mm256_setzero_si256 (), > + __M); > } > > extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A, > - __m128i __B) > +_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, > - (__v2di) __B, > - (__v4di) __W, > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, > + (__v4di) __X, > + (__v4di) > + _mm256_setzero_si256 (), > + __M); > } > > extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B) > +_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A, > + __m256i __B) > { > - 
return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, > - (__v2di) __B, > - (__v4di) > - _mm256_setzero_si256 (), > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, > + (__v8si) __B, > + (__v8si) __W, __M); > } > > extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_sra_epi64 (__m128i __A, __m128i __B) > +_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) > { > - return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, > - (__v2di) __B, > - (__v2di) > - _mm_setzero_si128 (), > - (__mmask8) -1); > + return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, > + (__v4si) __B, > + (__v4si) > + _mm_setzero_si128 (), > + __M); > } > > extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A, > - __m128i __B) > +_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A, > + __m128i __B) > { > - return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, > - (__v2di) __B, > - (__v2di) __W, > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, > + (__v4si) __B, > + (__v4si) __W, __M); > } > > -extern __inline __m128i > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B) > +_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, > + __m256i __Y) > { > - return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, > - (__v2di) __B, > - (__v2di) > - _mm_setzero_si128 (), > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, > + (__v8si) __Y, > + (__v4di) __W, __M); > } > > -extern __inline __m128i > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A, > - __m128i __B) > 
+_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, > - (__v4si) __B, > - (__v4si) __W, > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, > + (__v8si) __Y, > + (__v4di) > + _mm256_setzero_si256 (), > + __M); > } > > extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B) > +_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, > + __m128i __Y) > { > - return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, > - (__v4si) __B, > - (__v4si) > - _mm_setzero_si128 (), > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, > + (__v4si) __Y, > + (__v2di) __W, __M); > } > > extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A, > - __m128i __B) > +_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) > { > - return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, > - (__v2di) __B, > - (__v2di) __W, > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, > + (__v4si) __Y, > + (__v2di) > + _mm_setzero_si128 (), > + __M); > } > > -extern __inline __m128i > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B) > +_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y) > { > - return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, > - (__v2di) __B, > - (__v2di) > - _mm_setzero_si128 (), > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, > + (__v4di) __X, > + (__v4di) > + _mm256_setzero_si256 (), > + (__mmask8) -1); > } > > extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > 
-_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A, > - __m128i __B) > +_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, > + __m256i __Y) > { > - return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, > - (__v4si) __B, > - (__v8si) __W, > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, > + (__v4di) __X, > + (__v4di) __W, > + __M); > } > > extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B) > +_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, > + __m256i __Y) > { > - return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, > - (__v4si) __B, > - (__v8si) > - _mm256_setzero_si256 (), > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, > + (__v8si) __Y, > + (__v4di) __W, __M); > } > > extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A, > - __m128i __B) > +_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, > - (__v2di) __B, > - (__v4di) __W, > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, > + (__v8si) __X, > + (__v8si) > + _mm256_setzero_si256 (), > + __M); > } > > extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B) > +_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, > - (__v2di) __B, > - (__v4di) > - _mm256_setzero_si256 (), > - (__mmask8) __U); > + return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, > + (__v8si) __Y, > + (__v4di) > + _mm256_setzero_si256 (), > + __M); > } > > -extern __inline 
__m256 > +extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X, > - __m256 __Y) > +_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, > + __m128i __Y) > { > - return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, > - (__v8si) __X, > - (__v8sf) __W, > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, > + (__v4si) __Y, > + (__v2di) __W, __M); > } > > -extern __inline __m256 > +extern __inline __m128i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y) > +_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) > { > - return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, > - (__v8si) __X, > - (__v8sf) > - _mm256_setzero_ps (), > - (__mmask8) __U); > + return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, > + (__v4si) __Y, > + (__v2di) > + _mm_setzero_si128 (), > + __M); > } > > -extern __inline __m256d > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_permutexvar_pd (__m256i __X, __m256d __Y) > +_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y) > { > - return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, > - (__v4di) __X, > - (__v4df) > - _mm256_setzero_pd (), > + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, > + (__v8si) __X, > + (__v8si) > + _mm256_setzero_si256 (), > (__mmask8) -1); > } > > -extern __inline __m256d > +extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, > - __m256d __Y) > +_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X, > + __m256i __Y) > { > - return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, > - (__v4di) __X, > - (__v4df) __W, > - 
(__mmask8) __U); > + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, > + (__v8si) __X, > + (__v8si) __W, > + __M); > } > > -extern __inline __m256d > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, > - (__v4di) __X, > - (__v4df) > - _mm256_setzero_pd (), > - (__mmask8) __U); > + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 4, > + (__mmask8) __M); > } > > -extern __inline __m256d > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A, > - __m256i __C) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y) > { > - return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, > - (__v4di) __C, > - (__v4df) __W, > - (__mmask8) > - __U); > + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 4, > + (__mmask8) -1); > } > > -extern __inline __m256d > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, > - (__v4di) __C, > - (__v4df) > - _mm256_setzero_pd (), > - (__mmask8) > - __U); > + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 1, > + (__mmask8) __M); > } > > -extern __inline __m256 > -__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A, > - __m256i __C) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y) > { > - return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, > - (__v8si) __C, > - (__v8sf) __W, > - (__mmask8) __U); > + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 1, > + (__mmask8) -1); > } > > -extern __inline __m256 > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, > - (__v8si) __C, > - (__v8sf) > - _mm256_setzero_ps (), > - (__mmask8) __U); > + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 5, > + (__mmask8) __M); > } > > -extern __inline __m128d > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A, > - __m128i __C) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y) > { > - return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, > - (__v2di) __C, > - (__v2df) __W, > - (__mmask8) __U); > + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 5, > + (__mmask8) -1); > } > > -extern __inline __m128d > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > 
+_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, > - (__v2di) __C, > - (__v2df) > - _mm_setzero_pd (), > - (__mmask8) __U); > + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 2, > + (__mmask8) __M); > } > > -extern __inline __m128 > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A, > - __m128i __C) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y) > { > - return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, > - (__v4si) __C, > - (__v4sf) __W, > - (__mmask8) __U); > + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 2, > + (__mmask8) -1); > } > > -extern __inline __m128 > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, > - (__v4si) __C, > - (__v4sf) > - _mm_setzero_ps (), > - (__mmask8) __U); > + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 4, > + (__mmask8) __M); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B) > -{ > - return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, > - (__v8si) __B, > - (__v8si) > - _mm256_setzero_si256 (), > - __M); > -} > - > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) > 
+extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, > - (__v4di) __X, > - (__v4di) > - _mm256_setzero_si256 (), > - __M); > + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 4, > + (__mmask8) -1); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A, > - __m256i __B) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, > - (__v8si) __B, > - (__v8si) __W, __M); > + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 1, > + (__mmask8) __M); > } > > -extern __inline __m128i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y) > { > - return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, > - (__v4si) __B, > - (__v4si) > - _mm_setzero_si128 (), > - __M); > + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 1, > + (__mmask8) -1); > } > > -extern __inline __m128i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A, > - __m128i __B) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, > - (__v4si) __B, 
> - (__v4si) __W, __M); > + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 5, > + (__mmask8) __M); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, > - __m256i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, > - (__v8si) __Y, > - (__v4di) __W, __M); > + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 5, > + (__mmask8) -1); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, > - (__v8si) __Y, > - (__v4di) > - _mm256_setzero_si256 (), > - __M); > + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 2, > + (__mmask8) __M); > } > > -extern __inline __m128i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, > - __m128i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y) > { > - return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, > - (__v4si) __Y, > - (__v2di) __W, __M); > + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 2, > + (__mmask8) -1); > } > > -extern __inline __m128i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, 
__m128i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, > - (__v4si) __Y, > - (__v2di) > - _mm_setzero_si128 (), > - __M); > + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 4, > + (__mmask8) __M); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, > - (__v4di) __X, > - (__v4di) > - _mm256_setzero_si256 (), > - (__mmask8) -1); > + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 4, > + (__mmask8) -1); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, > - __m256i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, > - (__v4di) __X, > - (__v4di) __W, > - __M); > + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 1, > + (__mmask8) __M); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, > - __m256i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_pmuludq256_mask 
((__v8si) __X, > - (__v8si) __Y, > - (__v4di) __W, __M); > + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 1, > + (__mmask8) -1); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, > - (__v8si) __X, > - (__v8si) > - _mm256_setzero_si256 (), > - __M); > + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 5, > + (__mmask8) __M); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, > - (__v8si) __Y, > - (__v4di) > - _mm256_setzero_si256 (), > - __M); > + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 5, > + (__mmask8) -1); > } > > -extern __inline __m128i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, > - __m128i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, > - (__v4si) __Y, > - (__v2di) __W, __M); > + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 2, > + (__mmask8) __M); > } > > -extern __inline __m128i > -__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) > -_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y) > { > - return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, > - (__v4si) __Y, > - (__v2di) > - _mm_setzero_si128 (), > - __M); > + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > + (__v8si) __Y, 2, > + (__mmask8) -1); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, > - (__v8si) __X, > - (__v8si) > - _mm256_setzero_si256 (), > - (__mmask8) -1); > + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 4, > + (__mmask8) __M); > } > > -extern __inline __m256i > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X, > - __m256i __Y) > +extern __inline __mmask8 > + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y) > { > - return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, > - (__v8si) __X, > - (__v8si) __W, > - __M); > + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 4, > + (__mmask8) -1); > } > > extern __inline __mmask8 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > +_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > { > - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > - (__v8si) 
__Y, 4, > - (__mmask8) __M); > + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 1, > + (__mmask8) __M); > } > > extern __inline __mmask8 > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y) > +_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y) > { > - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 4, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 1, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 1, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 5, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 5, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 2, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) > -_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 2, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 4, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 4, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 1, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 1, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 5, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 5, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) > -_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 2, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 2, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 4, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 4, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 1, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 1, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 5, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) > -_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 5, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 2, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, > - (__v8si) __Y, 2, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 4, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 4, > - (__mmask8) -1); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 1, > - (__mmask8) __M); > -} > - > -extern __inline __mmask8 > - __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y) > -{ > - return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, > - (__v4di) __Y, 1, > - (__mmask8) -1); > + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, > + (__v4di) __Y, 1, > + 
(__mmask8) -1); > } > > extern __inline __mmask8 > @@ -13861,4 +13583,282 @@ _mm256_permutex_pd (__m256d __X, const int __M) > #pragma GCC pop_options > #endif /* __DISABLE_AVX512VL__ */ > > +#if !defined (__AVX512CD__) || !defined (__AVX512VL__) > +#pragma GCC push_options > +#pragma GCC target("avx512vl,avx512cd") > +#define __DISABLE_AVX512VLCD__ > +#endif > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_broadcastmb_epi64 (__mmask8 __A) > +{ > + return (__m128i) __builtin_ia32_broadcastmb128 (__A); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_broadcastmb_epi64 (__mmask8 __A) > +{ > + return (__m256i) __builtin_ia32_broadcastmb256 (__A); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_broadcastmw_epi32 (__mmask16 __A) > +{ > + return (__m128i) __builtin_ia32_broadcastmw128 (__A); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_broadcastmw_epi32 (__mmask16 __A) > +{ > + return (__m256i) __builtin_ia32_broadcastmw256 (__A); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_lzcnt_epi32 (__m256i __A) > +{ > + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, > + (__v8si) > + _mm256_setzero_si256 (), > + (__mmask8) -1); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) > +{ > + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, > + (__v8si) __W, > + (__mmask8) __U); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) > +{ > + return (__m256i) __builtin_ia32_vplzcntd_256_mask 
((__v8si) __A, > + (__v8si) > + _mm256_setzero_si256 (), > + (__mmask8) __U); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_lzcnt_epi64 (__m256i __A) > +{ > + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, > + (__v4di) > + _mm256_setzero_si256 (), > + (__mmask8) -1); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) > +{ > + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, > + (__v4di) __W, > + (__mmask8) __U); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) > +{ > + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, > + (__v4di) > + _mm256_setzero_si256 (), > + (__mmask8) __U); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_conflict_epi64 (__m256i __A) > +{ > + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, > + (__v4di) > + _mm256_setzero_si256 > (), > + (__mmask8) -1); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) > +{ > + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, > + (__v4di) __W, > + (__mmask8) > + __U); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) > +{ > + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, > + (__v4di) > + _mm256_setzero_si256 > (), > + (__mmask8) > + __U); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_conflict_epi32 (__m256i __A) > +{ > + 
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, > + (__v8si) > + _mm256_setzero_si256 > (), > + (__mmask8) -1); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) > +{ > + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, > + (__v8si) __W, > + (__mmask8) > + __U); > +} > + > +extern __inline __m256i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) > +{ > + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, > + (__v8si) > + _mm256_setzero_si256 > (), > + (__mmask8) > + __U); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_lzcnt_epi32 (__m128i __A) > +{ > + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, > + (__v4si) > + _mm_setzero_si128 (), > + (__mmask8) -1); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) > +{ > + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, > + (__v4si) __W, > + (__mmask8) __U); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) > +{ > + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, > + (__v4si) > + _mm_setzero_si128 (), > + (__mmask8) __U); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_lzcnt_epi64 (__m128i __A) > +{ > + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, > + (__v2di) > + _mm_setzero_si128 (), > + (__mmask8) -1); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > 
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) > +{ > + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, > + (__v2di) __W, > + (__mmask8) __U); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) > +{ > + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, > + (__v2di) > + _mm_setzero_si128 (), > + (__mmask8) __U); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_conflict_epi64 (__m128i __A) > +{ > + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, > + (__v2di) > + _mm_setzero_si128 (), > + (__mmask8) -1); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) > +{ > + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, > + (__v2di) __W, > + (__mmask8) > + __U); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) > +{ > + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, > + (__v2di) > + _mm_setzero_si128 (), > + (__mmask8) > + __U); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_conflict_epi32 (__m128i __A) > +{ > + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, > + (__v4si) > + _mm_setzero_si128 (), > + (__mmask8) -1); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) > +{ > + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, > + (__v4si) __W, > + (__mmask8) > + __U); > +} > + > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) > +_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) > +{ > + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, > + (__v4si) > + _mm_setzero_si128 (), > + (__mmask8) > + __U); > +} > + > +#ifdef __DISABLE_AVX512VLCD__ > +#pragma GCC pop_options > +#endif > + > #endif /* _AVX512VLINTRIN_H_INCLUDED */ > diff --git a/gcc/config/i386/i386-builtin.def > b/gcc/config/i386/i386-builtin.def > index b90d5ccc969..19fa5c107c7 100644 > --- a/gcc/config/i386/i386-builtin.def > +++ b/gcc/config/i386/i386-builtin.def > @@ -1615,8 +1615,8 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktestqi, > "__builtin_ia32_ktestcqi", > BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktestqi, > "__builtin_ia32_ktestzqi", IX86_BUILTIN_KTESTZ8, UNKNOWN, (int) > UQI_FTYPE_UQI_UQI) > BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktesthi, > "__builtin_ia32_ktestchi", IX86_BUILTIN_KTESTC16, UNKNOWN, (int) > UHI_FTYPE_UHI_UHI) > BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_ktesthi, > "__builtin_ia32_ktestzhi", IX86_BUILTIN_KTESTZ16, UNKNOWN, (int) > UHI_FTYPE_UHI_UHI) > -BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestsi, > "__builtin_ia32_ktestcsi", IX86_BUILTIN_KTESTC32, UNKNOWN, (int) > USI_FTYPE_USI_USI) > -BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestsi, > "__builtin_ia32_ktestzsi", IX86_BUILTIN_KTESTZ32, UNKNOWN, (int) > USI_FTYPE_USI_USI) > +BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_ktestsi, > "__builtin_ia32_ktestcsi", IX86_BUILTIN_KTESTC32, UNKNOWN, (int) > USI_FTYPE_USI_USI) > +BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_ktestsi, > "__builtin_ia32_ktestzsi", IX86_BUILTIN_KTESTZ32, UNKNOWN, (int) > USI_FTYPE_USI_USI) > BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestdi, > "__builtin_ia32_ktestcdi", IX86_BUILTIN_KTESTC64, UNKNOWN, (int) > UDI_FTYPE_UDI_UDI) > BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_ktestdi, > 
"__builtin_ia32_ktestzdi", IX86_BUILTIN_KTESTZ64, UNKNOWN, (int) > UDI_FTYPE_UDI_UDI) > BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_kortestqi, > "__builtin_ia32_kortestcqi", IX86_BUILTIN_KORTESTC8, UNKNOWN, (int) > UQI_FTYPE_UQI_UQI) > -- > 2.31.1 >
-- BR, Hongtao