On Thu, Jul 05, 2018 at 11:57:26PM +0300, Grazvydas Ignotas wrote:
> I think it would be more efficient if you took care of it. I won't
> have time for at least a few days anyway.
Here is what I have ATM, but will still need to work on the testsuite side for it and bootstrap/regtest it. In addition to the checks I've posted I've also done: echo `sed -n '/^_mm.*__mmask/,/^}/p' config/i386/*.h | sed 's/^}/@@@/'` | sed 's/@@@/}\n/g' > /tmp/11 echo `sed -n '/^#define[ \t]_mm/,/)$/p' config/i386/*.h | sed 's/)$/@@@/' | sed 's/\\$//'` | sed 's/@@@/)\n/g' | grep __mmask >> /tmp/11 for i in `grep '__builtin.*_UQI)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask8; done for i in `grep '__builtin.*_UHI)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask16; done for i in `grep '__builtin.*_USI)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask32; done for i in `grep '__builtin.*_UDI)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask64; done for i in `grep '__builtin.*_QI)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask8; done for i in `grep '__builtin.*_HI)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask16; done for i in `grep '__builtin.*_SI)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask32; done for i in `grep '__builtin.*_DI)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask64; done for i in `grep '__builtin.*_UQI_INT)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask8; done for i in `grep '__builtin.*_UHI_INT)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; 
do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask16; done for i in `grep '__builtin.*_USI_INT)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask32; done for i in `grep '__builtin.*_UDI_INT)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask64; done for i in `grep '__builtin.*_QI_INT)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask8; done for i in `grep '__builtin.*_HI_INT)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask16; done for i in `grep '__builtin.*_SI_INT)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask32; done for i in `grep '__builtin.*_DI_INT)' config/i386/i386-builtin.def | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask64; done and finally manual checking (could be automated too) of: for i in `grep '__builtin.*_INT)' config/i386/i386-builtin.def | grep -v '_U\?[QHSD]I_INT)' | sed 's/^[^"]*"//;s/".*$//' | sort -u`; do grep $i'[ \t(].*__mmask' /tmp/11 | grep -v __mmask32; done For this last one, it is about trying to verify what kind of '__v\(2\|4\|8\|16\|32\|64\)[qhsd][if]' is used with the different __mmask, 2/4/8 should be used with __mmask8, 16 with __mmask16, 32 with __mmask32 and 64 with __mmask64. Some fixes in the patch are mostly for consistency and harmless for code generation (e.g. when argument should have been __mmask8 but was __mmask16), but several changes should fix wrong-code bugs. 
--- gcc/config/i386/avx512bwintrin.h.jj 2018-01-03 10:20:06.699535804 +0100 +++ gcc/config/i386/avx512bwintrin.h 2018-07-06 09:53:44.657235040 +0200 @@ -3043,7 +3043,7 @@ _mm512_cmp_epi16_mask (__m512i __X, __m5 extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmp_epi8_mask (__mmask32 __U, __m512i __X, __m512i __Y, +_mm512_mask_cmp_epi8_mask (__mmask64 __U, __m512i __X, __m512i __Y, const int __P) { return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, @@ -3081,7 +3081,7 @@ _mm512_cmp_epu16_mask (__m512i __X, __m5 extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmp_epu8_mask (__mmask32 __U, __m512i __X, __m512i __Y, +_mm512_mask_cmp_epu8_mask (__mmask64 __U, __m512i __X, __m512i __Y, const int __P) { return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, --- gcc/config/i386/avx512bitalgintrin.h.jj 2018-01-26 12:43:26.374922539 +0100 +++ gcc/config/i386/avx512bitalgintrin.h 2018-07-06 09:51:19.536082408 +0200 @@ -107,7 +107,7 @@ _mm512_bitshuffle_epi64_mask (__m512i __ extern __inline __mmask64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_bitshuffle_epi64_mask (__mmask8 __M, __m512i __A, __m512i __B) +_mm512_mask_bitshuffle_epi64_mask (__mmask64 __M, __m512i __A, __m512i __B) { return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A, (__v64qi) __B, --- gcc/config/i386/avx512fintrin.h.jj 2018-05-21 13:15:43.494581775 +0200 +++ gcc/config/i386/avx512fintrin.h 2018-07-06 11:41:19.879801396 +0200 @@ -7377,7 +7377,7 @@ _mm512_xor_epi64 (__m512i __A, __m512i _ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_xor_epi64 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +_mm512_mask_xor_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, (__v8di) __B, @@ -7387,7 
+7387,7 @@ _mm512_mask_xor_epi64 (__m512i __W, __mm extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_xor_epi64 (__mmask16 __U, __m512i __A, __m512i __B) +_mm512_maskz_xor_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, (__v8di) __B, @@ -9615,7 +9615,7 @@ _mm512_cmpneq_epu32_mask (__m512i __X, _ extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cmpneq_epi64_mask (__mmask16 __M, __m512i __X, __m512i __Y) +_mm512_mask_cmpneq_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) { return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, (__v8di) __Y, 4, @@ -10877,22 +10877,22 @@ _mm512_mask_insertf32x4 (__m512 __A, __m #define _mm512_maskz_insertf32x4(A, X, Y, C) \ ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), \ (__v4sf)(__m128) (Y), (int) (C), (__v16sf)_mm512_setzero_ps(), \ - (__mmask8)(A))) + (__mmask16)(A))) #define _mm512_maskz_inserti32x4(A, X, Y, C) \ ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \ (__v4si)(__m128i) (Y), (int) (C), (__v16si)_mm512_setzero_si512 (), \ - (__mmask8)(A))) + (__mmask16)(A))) #define _mm512_mask_insertf32x4(A, B, X, Y, C) \ ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), \ (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (A), \ - (__mmask8)(B))) + (__mmask16)(B))) #define _mm512_mask_inserti32x4(A, B, X, Y, C) \ ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \ (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (A), \ - (__mmask8)(B))) + (__mmask16)(B))) #endif extern __inline __m512i --- gcc/config/i386/avx512vlintrin.h.jj 2018-01-03 10:20:06.152535716 +0100 +++ gcc/config/i386/avx512vlintrin.h 2018-07-06 10:13:02.209506028 +0200 @@ -466,7 +466,7 @@ _mm256_maskz_add_pd (__mmask8 __U, __m25 extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) -_mm_mask_add_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) +_mm_mask_add_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, (__v4sf) __B, @@ -476,7 +476,7 @@ _mm_mask_add_ps (__m128 __W, __mmask16 _ extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_add_ps (__mmask16 __U, __m128 __A, __m128 __B) +_mm_maskz_add_ps (__mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, (__v4sf) __B, @@ -487,7 +487,7 @@ _mm_maskz_add_ps (__mmask16 __U, __m128 extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_add_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) +_mm256_mask_add_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, (__v8sf) __B, @@ -497,7 +497,7 @@ _mm256_mask_add_ps (__m256 __W, __mmask1 extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_add_ps (__mmask16 __U, __m256 __A, __m256 __B) +_mm256_maskz_add_ps (__mmask8 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, (__v8sf) __B, @@ -551,7 +551,7 @@ _mm256_maskz_sub_pd (__mmask8 __U, __m25 extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sub_ps (__m128 __W, __mmask16 __U, __m128 __A, __m128 __B) +_mm_mask_sub_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, (__v4sf) __B, @@ -561,7 +561,7 @@ _mm_mask_sub_ps (__m128 __W, __mmask16 _ extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sub_ps (__mmask16 __U, __m128 __A, __m128 __B) +_mm_maskz_sub_ps (__mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, (__v4sf) __B, 
@@ -572,7 +572,7 @@ _mm_maskz_sub_ps (__mmask16 __U, __m128 extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_sub_ps (__m256 __W, __mmask16 __U, __m256 __A, __m256 __B) +_mm256_mask_sub_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, (__v8sf) __B, @@ -582,7 +582,7 @@ _mm256_mask_sub_ps (__m256 __W, __mmask1 extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_sub_ps (__mmask16 __U, __m256 __A, __m256 __B) +_mm256_maskz_sub_ps (__mmask8 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, (__v8sf) __B, @@ -1320,7 +1320,7 @@ _mm256_mask_cvtepi32_ps (__m256 __W, __m extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) +_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) { return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, (__v8sf) @@ -1339,7 +1339,7 @@ _mm_mask_cvtepi32_ps (__m128 __W, __mmas extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) +_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) { return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, (__v4sf) --- gcc/config/i386/avx512vlbwintrin.h.jj 2018-01-03 10:20:06.598535787 +0100 +++ gcc/config/i386/avx512vlbwintrin.h 2018-07-06 12:16:56.109933815 +0200 @@ -1467,7 +1467,7 @@ _mm256_cmp_epi16_mask (__m256i __X, __m2 extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmp_epi8_mask (__mmask8 __U, __m128i __X, __m128i __Y, +_mm_mask_cmp_epi8_mask (__mmask16 __U, __m128i __X, __m128i __Y, const int __P) { return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, @@ -1486,7 +1486,7 @@ _mm_cmp_epi8_mask (__m128i __X, __m128i extern __inline __mmask32 
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmp_epi8_mask (__mmask16 __U, __m256i __X, __m256i __Y, +_mm256_mask_cmp_epi8_mask (__mmask32 __U, __m256i __X, __m256i __Y, const int __P) { return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, @@ -1543,7 +1543,7 @@ _mm256_cmp_epu16_mask (__m256i __X, __m2 extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmp_epu8_mask (__mmask8 __U, __m128i __X, __m128i __Y, +_mm_mask_cmp_epu8_mask (__mmask16 __U, __m128i __X, __m128i __Y, const int __P) { return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, @@ -1562,7 +1562,7 @@ _mm_cmp_epu8_mask (__m128i __X, __m128i extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmp_epu8_mask (__mmask16 __U, __m256i __X, __m256i __Y, +_mm256_mask_cmp_epu8_mask (__mmask32 __U, __m256i __X, __m256i __Y, const int __P) { return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, @@ -1998,7 +1998,7 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m1 #define _mm_mask_cmp_epi16_mask(M, X, Y, P) \ ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X), \ (__v8hi)(__m128i)(Y), (int)(P),\ - (__mmask16)(M))) + (__mmask8)(M))) #define _mm_mask_cmp_epi8_mask(M, X, Y, P) \ ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X), \ @@ -2430,7 +2430,7 @@ _mm_maskz_mullo_epi16 (__mmask8 __U, __m extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) +_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask16 __U, __m128i __A) { return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, (__v16hi) __W, @@ -2449,7 +2449,7 @@ _mm256_maskz_cvtepi8_epi16 (__mmask16 __ extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) 
+_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, (__v8hi) __W, @@ -2468,7 +2468,7 @@ _mm_maskz_cvtepi8_epi16 (__mmask8 __U, _ extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask32 __U, __m128i __A) +_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask16 __U, __m128i __A) { return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, (__v16hi) __W, @@ -2487,7 +2487,7 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __ extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask32 __U, __m128i __A) +_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, (__v8hi) __W, @@ -4541,148 +4541,148 @@ _mm_mask_cmple_epi16_mask (__mmask8 __M, (__mmask8) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epu8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpneq_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 4, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epu8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmplt_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 1, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epu8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpge_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 5, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epu8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmple_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 2, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epu16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpneq_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 4, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epu16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmplt_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 1, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epu16_mask 
(__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpge_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 5, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epu16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmple_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 2, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epi8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpneq_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 4, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epi8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmplt_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 1, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epi8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpge_epi8_mask (__mmask32 __M, __m256i 
__X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 5, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epi8_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmple_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpb256_mask ((__v32qi) __X, - (__v32qi) __Y, 2, - (__mmask8) __M); + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpneq_epi16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpneq_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 4, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmplt_epi16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmplt_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 1, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmpge_epi16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmpge_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpw256_mask ((__v16hi) __X, - (__v16hi) 
__Y, 5, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) __M); } -extern __inline __mmask8 +extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cmple_epi16_mask (__mmask8 __M, __m256i __X, __m256i __Y) +_mm256_mask_cmple_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) { - return (__mmask8) __builtin_ia32_cmpw256_mask ((__v16hi) __X, - (__v16hi) __Y, 2, - (__mmask8) __M); + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) __M); } #ifdef __DISABLE_AVX512VLBW__ --- gcc/config/i386/avx512vbmi2vlintrin.h.jj 2018-01-03 10:20:06.085535705 +0100 +++ gcc/config/i386/avx512vbmi2vlintrin.h 2018-07-06 12:09:13.797471324 +0200 @@ -541,7 +541,7 @@ _mm_shldi_epi64 (__m128i __A, __m128i __ (__v4si)(__m128i)(B),(int)(C)) #define _mm_mask_shrdi_epi32(A, B, C, D, E) \ ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C), \ - (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask16)(B)) + (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask8)(B)) #define _mm_maskz_shrdi_epi32(A, B, C, D) \ ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B), \ (__v4si)(__m128i)(C),(int)(D), \ @@ -601,7 +601,7 @@ _mm_shldi_epi64 (__m128i __A, __m128i __ (__v4si)(__m128i)(B),(int)(C)) #define _mm_mask_shldi_epi32(A, B, C, D, E) \ ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), \ - (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask16)(B)) + (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask8)(B)) #define _mm_maskz_shldi_epi32(A, B, C, D) \ ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B), \ (__v4si)(__m128i)(C),(int)(D), \ Jakub