[PATCH] Add myself for write after approval
ChangeLog: * MAINTAINERS (Write After Approval): Add myself. --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 49aa6bae73b..90e2c81f0c2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -460,6 +460,7 @@ Matthew Hiller Kazu Hirata Manfred Hollstein Cong Hou +Lin Hu Falk Hueffner Andrew John Hughes Dominique d'Humieres -- 2.31.1
[PATCH] i386: refactor macros.
Hi, all This patch aims to refactor macros in case something else is added to AMX_TILE_SET in the future. OK for trunk? BRs, Lin gcc/ChangeLog: * common/config/i386/i386-common.cc (OPTION_MASK_ISA2_AMX_INT8_SET): Change OPTION_MASK_ISA2_AMX_TILE to OPTION_MASK_ISA2_AMX_TILE_SET. (OPTION_MASK_ISA2_AMX_BF16_SET): Ditto. (OPTION_MASK_ISA2_AMX_FP16_SET): Ditto. (OPTION_MASK_ISA2_AMX_COMPLEX_SET): Ditto. (OPTION_MASK_ISA_ABM_SET): Change OPTION_MASK_ISA_POPCNT to OPTION_MASK_ISA_POPCNT_SET. --- gcc/common/config/i386/i386-common.cc | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index bf126f14073..4f79afba917 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -107,18 +107,18 @@ along with GCC; see the file COPYING3. If not see #define OPTION_MASK_ISA2_AVX512VP2INTERSECT_SET OPTION_MASK_ISA2_AVX512VP2INTERSECT #define OPTION_MASK_ISA2_AMX_TILE_SET OPTION_MASK_ISA2_AMX_TILE #define OPTION_MASK_ISA2_AMX_INT8_SET \ - (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_INT8) + (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_INT8) #define OPTION_MASK_ISA2_AMX_BF16_SET \ - (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_BF16) + (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_BF16) #define OPTION_MASK_ISA2_AVXVNNIINT8_SET OPTION_MASK_ISA2_AVXVNNIINT8 #define OPTION_MASK_ISA2_AVXNECONVERT_SET OPTION_MASK_ISA2_AVXNECONVERT #define OPTION_MASK_ISA2_CMPCCXADD_SET OPTION_MASK_ISA2_CMPCCXADD #define OPTION_MASK_ISA2_AMX_FP16_SET \ - (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_FP16) + (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_FP16) #define OPTION_MASK_ISA2_PREFETCHI_SET OPTION_MASK_ISA2_PREFETCHI #define OPTION_MASK_ISA2_RAOINT_SET OPTION_MASK_ISA2_RAOINT #define OPTION_MASK_ISA2_AMX_COMPLEX_SET \ - (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_COMPLEX) + (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_COMPLEX) /* SSE4 includes both SSE4.1 and 
SSE4.2. -msse4 should be the same as -msse4.2. */ @@ -143,7 +143,7 @@ along with GCC; see the file COPYING3. If not see (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2_SET) #define OPTION_MASK_ISA_ABM_SET \ - (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT) + (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT_SET) #define OPTION_MASK_ISA2_PCONFIG_SET OPTION_MASK_ISA2_PCONFIG #define OPTION_MASK_ISA2_WBNOINVD_SET OPTION_MASK_ISA2_WBNOINVD -- 2.31.1
[PATCH] i386:Add missing OPTION_MASK_ISA_AVX512VL in i386-builtin.def for VAES builtins
The implementation of these builtins requires support for both AVX512VL and VAES. However, the builtins didn't request AVX512VL. As a result, compiling pr109117-1.c with the options -mvaes -mno-avx512vl caused an ICE. This patch aims to fix the bug. gcc/ChangeLog: PR target/109117 * config/i386/i386-builtin.def (__builtin_ia32_vaesdec_v16qi, __builtin_ia32_vaesdeclast_v16qi,__builtin_ia32_vaesenc_v16qi, __builtin_ia32_vaesenclast_v16qi): Require OPTION_MASK_ISA_AVX512VL. gcc/testsuite/ChangeLog: PR target/109117 * gcc.target/i386/pr109117-1.c: New test. --- gcc/config/i386/i386-builtin.def | 8 gcc/testsuite/gcc.target/i386/pr109117-1.c | 14 ++ 2 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr109117-1.c diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index f1c295c34f6..17dfe40fac7 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2797,16 +2797,16 @@ BDESC (0, OPTION_MASK_ISA2_AVX5124VNNIW, CODE_FOR_avx5124vnniw_vp4dpwssds_mask, BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, "__builtin_ia32_rdpid", IX86_BUILTIN_RDPID, UNKNOWN, (int) UNSIGNED_FTYPE_VOID) /* VAES. 
*/ -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC (0, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) diff --git a/gcc/testsuite/gcc.target/i386/pr109117-1.c b/gcc/testsuite/gcc.target/i386/pr109117-1.c new file mode 100644 index 000..87a5c0e7fc9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109117-1.c @@ -0,0 +1,14 @@ +/* PR target/109117 */ +/* { dg-do compile } */ +/* { dg-options "-mvaes -mno-avx512vl" } */ + +typedef char __v16qi __attribute__ ((__vector_size__(16))); +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); +volatile __v16qi x, y; +volatile __m128i res; + +void +foo (void) +{ + res = __builtin_ia32_vaesdec_v16qi (x, y); /* { dg-warning "implicit declaration of function" } */ +} /* { dg-error "incompatible types when assigning to type" "" { target *-*-* } .-1 } */ -- 2.31.1
RE: [PATCH] i386:Add missing OPTION_MASK_ISA_AVX512VL in i386-builtin.def for VAES builtins
It has regtested on x86_64-pc-linux-gnu. OK for trunk? Thanks. Lin -Original Message- From: Uros Bizjak Sent: Tuesday, March 14, 2023 3:05 PM To: Hu, Lin1 Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao Subject: Re: [PATCH] i386:Add missing OPTION_MASK_ISA_AVX512VL in i386-builtin.def for VAES builtins On Tue, Mar 14, 2023 at 7:2 AM Hu, Lin1 wrote: > > The implementation of these builtins requires support for both > AVX512VL and VAES. However, the builtins didn't request AVX512VL. As a > result, compiling pr109117-1.c with the options -mvaes -mno-avx512vl caused > an ICE. > > This patch aims to fix the bug. > > gcc/ChangeLog: > > PR target/109117 > * config/i386/i386-builtin.def (__builtin_ia32_vaesdec_v16qi, > __builtin_ia32_vaesdeclast_v16qi,__builtin_ia32_vaesenc_v16qi, > __builtin_ia32_vaesenclast_v16qi): Require OPTION_MASK_ISA_AVX512VL. > > gcc/testsuite/ChangeLog: > > PR target/109117 > * gcc.target/i386/pr109117-1.c: New test. OK. Thanks, Uros. > --- > gcc/config/i386/i386-builtin.def | 8 > gcc/testsuite/gcc.target/i386/pr109117-1.c | 14 ++ > 2 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 > gcc/testsuite/gcc.target/i386/pr109117-1.c > > diff --git a/gcc/config/i386/i386-builtin.def > b/gcc/config/i386/i386-builtin.def > index f1c295c34f6..17dfe40fac7 100644 > --- a/gcc/config/i386/i386-builtin.def > +++ b/gcc/config/i386/i386-builtin.def > @@ -2797,16 +2797,16 @@ BDESC (0, OPTION_MASK_ISA2_AVX5124VNNIW, > CODE_FOR_avx5124vnniw_vp4dpwssds_mask, > BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, > "__builtin_ia32_rdpid", IX86_BUILTIN_RDPID, UNKNOWN, (int) > UNSIGNED_FTYPE_VOID) > > /* VAES. 
*/ > -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, > "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) > V16QI_FTYPE_V16QI_V16QI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, > +CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", > +IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, > "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) > V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, > CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", > IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC > (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, > "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, > UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, > +CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", > +IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, > "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, > UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, > OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, > "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, > UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC (0, > OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, > "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) > V16QI_FTYPE_V16QI_V16QI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, > +CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", > +IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, > "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) > V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, > CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", > IX86_BUILTIN_VAESENC64, 
UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC > (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, > "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, > UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, > +CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", > +IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, > "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, > UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, > OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, > "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, > UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) > > diff --git a/gcc/testsuite/gcc.target/i386/pr109117-1.c > b/gcc/testsuite/gcc.target/i386/pr109117-1.c > new file mode 100644 > index 000..87a5c0e7fc9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr109117-1.c > @@ -0,0 +1,14 @@ > +/* PR target/109117 */ > +/* { dg-do compile } */ > +/* { dg-options "-mvaes -mno-avx512vl" } */ > + > +typedef char __v16qi __attribute__ ((__vector_size__(16))); typedef > +long long __m128i __at
[PATCH] i386: Fix incorrect intrinsic signature for AVX512 s{lli|rai|rli}
Hi all, This patch aims to fix the incorrect intrinsic signatures for _mm{512|256|}_s{lli|rai|rli}_epi*. It has been tested on x86_64-pc-linux-gnu. OK for trunk? BRs, Lin gcc/ChangeLog: PR target/109173 PR target/109174 * config/i386/avx512bwintrin.h (_mm512_srli_epi16): Change type from const int to const unsigned int. (_mm512_mask_srli_epi16): Ditto. (_mm512_slli_epi16): Ditto. (_mm512_mask_slli_epi16): Ditto. (_mm512_maskz_slli_epi16): Ditto. (_mm512_srai_epi16): Ditto. (_mm512_mask_srai_epi16): Ditto. (_mm512_maskz_srai_epi16): Ditto. * config/i386/avx512vlintrin.h (_mm256_mask_srli_epi32): Ditto. (_mm256_maskz_srli_epi32): Ditto. (_mm_mask_srli_epi32): Ditto. (_mm_maskz_srli_epi32): Ditto. (_mm256_mask_srli_epi64): Ditto. (_mm256_maskz_srli_epi64): Ditto. (_mm_mask_srli_epi64): Ditto. (_mm_maskz_srli_epi64): Ditto. (_mm256_mask_srai_epi32): Ditto. (_mm256_maskz_srai_epi32): Ditto. (_mm_mask_srai_epi32): Ditto. (_mm_maskz_srai_epi32): Ditto. (_mm256_srai_epi64): Ditto. (_mm256_mask_srai_epi64): Ditto. (_mm256_maskz_srai_epi64): Ditto. (_mm_srai_epi64): Ditto. (_mm_mask_srai_epi64): Ditto. (_mm_maskz_srai_epi64): Ditto. (_mm_mask_slli_epi32): Ditto. (_mm_maskz_slli_epi32): Ditto. (_mm_mask_slli_epi64): Ditto. (_mm_maskz_slli_epi64): Ditto. (_mm256_mask_slli_epi32): Ditto. (_mm256_maskz_slli_epi32): Ditto. (_mm256_mask_slli_epi64): Ditto. (_mm256_maskz_slli_epi64): Ditto. (_mm_mask_srai_epi16): Ditto. (_mm_maskz_srai_epi16): Ditto. (_mm256_srai_epi16): Ditto. (_mm256_mask_srai_epi16): Ditto. (_mm_mask_slli_epi16): Ditto. (_mm_maskz_slli_epi16): Ditto. (_mm256_mask_slli_epi16): Ditto. (_mm256_maskz_slli_epi16): Ditto. gcc/testsuite/ChangeLog: PR target/109173 PR target/109174 * gcc.target/i386/pr109173-1.c: New test. * gcc.target/i386/pr109174-1.c: Ditto. 
--- gcc/config/i386/avx512bwintrin.h | 32 +++--- gcc/config/i386/avx512fintrin.h| 58 +++ gcc/config/i386/avx512vlbwintrin.h | 36 --- gcc/config/i386/avx512vlintrin.h | 112 +++-- gcc/testsuite/gcc.target/i386/pr109173-1.c | 57 +++ gcc/testsuite/gcc.target/i386/pr109174-1.c | 45 + 6 files changed, 236 insertions(+), 104 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr109173-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr109174-1.c diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h index 89790f7917b..791d4e35f32 100644 --- a/gcc/config/i386/avx512bwintrin.h +++ b/gcc/config/i386/avx512bwintrin.h @@ -2880,7 +2880,7 @@ _mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, __m512i __B, extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_srli_epi16 (__m512i __A, const int __imm) +_mm512_srli_epi16 (__m512i __A, const unsigned int __imm) { return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, (__v32hi) @@ -2891,7 +2891,7 @@ _mm512_srli_epi16 (__m512i __A, const int __imm) extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - const int __imm) + const unsigned int __imm) { return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, (__v32hi) __W, @@ -2910,7 +2910,7 @@ _mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, const int __imm) extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_slli_epi16 (__m512i __A, const int __B) +_mm512_slli_epi16 (__m512i __A, const unsigned int __B) { return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, (__v32hi) @@ -2921,7 +2921,7 @@ _mm512_slli_epi16 (__m512i __A, const int __B) extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - const int __B) 
+ const unsigned int __B) { return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, (__v32hi) __W, @@ -2930,7 +2930,7 @@ _mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B) +_mm512_maskz_slli_epi16 (__mmask32 __U, __m51
RE: [PATCH] i386: Fix incorrect intrinsic signature for AVX512 s{lli|rai|rli}
OK, I update the change log and modify a part of format. The attached file is the new version. -Original Message- From: Hongtao Liu Sent: Thursday, May 25, 2023 11:40 AM To: Hu, Lin1 Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; ubiz...@gmail.com Subject: Re: [PATCH] i386: Fix incorrect intrinsic signature for AVX512 s{lli|rai|rli} On Thu, May 25, 2023 at 10:55 AM Hu, Lin1 via Gcc-patches wrote: > > Hi all, > > This patch aims to fix incorrect intrinsic signature for > _mm{512|256|}_s{lli|rai|rli}_epi*. And it has been tested on > x86_64-pc-linux-gnu. OK for trunk? > > BRs, > Lin > > gcc/ChangeLog: > > PR target/109173 > PR target/109174 > * config/i386/avx512bwintrin.h (_mm512_srli_epi16): Change type from > int to const int. int to unsigned int or const int to const unsigned int. Others LGTM. > (_mm512_mask_srli_epi16): Ditto. > (_mm512_slli_epi16): Ditto. > (_mm512_mask_slli_epi16): Ditto. > (_mm512_maskz_slli_epi16): Ditto. > (_mm512_srai_epi16): Ditto. > (_mm512_mask_srai_epi16): Ditto. > (_mm512_maskz_srai_epi16): Ditto. > * config/i386/avx512vlintrin.h (_mm256_mask_srli_epi32): Ditto. > (_mm256_maskz_srli_epi32): Ditto. > (_mm_mask_srli_epi32): Ditto. > (_mm_maskz_srli_epi32): Ditto. > (_mm256_mask_srli_epi64): Ditto. > (_mm256_maskz_srli_epi64): Ditto. > (_mm_mask_srli_epi64): Ditto. > (_mm_maskz_srli_epi64): Ditto. > (_mm256_mask_srai_epi32): Ditto. > (_mm256_maskz_srai_epi32): Ditto. > (_mm_mask_srai_epi32): Ditto. > (_mm_maskz_srai_epi32): Ditto. > (_mm256_srai_epi64): Ditto. > (_mm256_mask_srai_epi64): Ditto. > (_mm256_maskz_srai_epi64): Ditto. > (_mm_srai_epi64): Ditto. > (_mm_mask_srai_epi64): Ditto. > (_mm_maskz_srai_epi64): Ditto. > (_mm_mask_slli_epi32): Ditto. > (_mm_maskz_slli_epi32): Ditto. > (_mm_mask_slli_epi64): Ditto. > (_mm_maskz_slli_epi64): Ditto. > (_mm256_mask_slli_epi32): Ditto. > (_mm256_maskz_slli_epi32): Ditto. > (_mm256_mask_slli_epi64): Ditto. > (_mm256_maskz_slli_epi64): Ditto. > (_mm_mask_srai_epi16): Ditto. 
> (_mm_maskz_srai_epi16): Ditto. > (_mm256_srai_epi16): Ditto. > (_mm256_mask_srai_epi16): Ditto. > (_mm_mask_slli_epi16): Ditto. > (_mm_maskz_slli_epi16): Ditto. > (_mm256_mask_slli_epi16): Ditto. > (_mm256_maskz_slli_epi16): Ditto. > > gcc/testsuite/ChangeLog: > > PR target/109173 > PR target/109174 > * gcc.target/i386/pr109173-1.c: New test. > * gcc.target/i386/pr109174-1.c: Ditto. > --- > gcc/config/i386/avx512bwintrin.h | 32 +++--- > gcc/config/i386/avx512fintrin.h| 58 +++ > gcc/config/i386/avx512vlbwintrin.h | 36 --- > gcc/config/i386/avx512vlintrin.h | 112 +++-- > gcc/testsuite/gcc.target/i386/pr109173-1.c | 57 +++ > gcc/testsuite/gcc.target/i386/pr109174-1.c | 45 + > 6 files changed, 236 insertions(+), 104 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr109173-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr109174-1.c > > diff --git a/gcc/config/i386/avx512bwintrin.h > b/gcc/config/i386/avx512bwintrin.h > index 89790f7917b..791d4e35f32 100644 > --- a/gcc/config/i386/avx512bwintrin.h > +++ b/gcc/config/i386/avx512bwintrin.h > @@ -2880,7 +2880,7 @@ _mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, > __m512i __B, > > extern __inline __m512i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm512_srli_epi16 (__m512i __A, const int __imm) > +_mm512_srli_epi16 (__m512i __A, const unsigned int __imm) > { >return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, > (__v32hi) > @@ -2891,7 +2891,7 @@ _mm512_srli_epi16 (__m512i __A, const int __imm) > extern __inline __m512i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > _mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, > - const int __imm) > + const unsigned int __imm) > { >return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, > (__v32hi) __W, > @@ -2910,7 +2910,7 @@ _mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, > const int __imm) > > extern __inline __m512i > __attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)
[PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))
Hi all, This patch aims to optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)). It reduces the number of instructions required to achieve the final result. Regtested on x86_64-pc-linux-gnu. Ok for trunk? BRs, Lin gcc/ChangeLog: PR target/94962 * config/i386/constraints.md (BH): New define_constraint. * config/i386/i386.cc (standard_sse_constant_p): Return 3 when operand matches a new predicate. (standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd". * config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): New define_predicate. (vector_all_ones_zero_extend_quarter_operand): Ditto. * config/i386/sse.md (mov_internal): Add new constraint BH. gcc/testsuite/ChangeLog: PR target/94962 * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. * gcc.target/i386/pr94962-1.c: New test. * gcc.target/i386/pr94962-2.c: Ditto. * gcc.target/i386/pr94962-3.c: Ditto. * gcc.target/i386/pr94962-4.c: Ditto. 
--- gcc/config/i386/constraints.md| 8 +++ gcc/config/i386/i386.cc | 26 +++- gcc/config/i386/predicates.md | 49 ++ gcc/config/i386/sse.md| 8 +-- .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- .../i386/avx256-unaligned-store-1.c | 4 +- .../i386/avx256-unaligned-store-2.c | 4 +- .../i386/avx256-unaligned-store-3.c | 4 +- gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 + gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++ gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++ 12 files changed, 235 insertions(+), 13 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644 --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -168,6 +168,9 @@ ;; z Constant call address operand. ;; C Integer SSE constant with all bits set operand. ;; F Floating-point SSE constant with all bits set operand. +;; H Integer SSE constant that is 128/256bit all ones +;; and zero-extand to 256/512bit, or 128bit all ones +;; and zero-extend to 512bit. ;; M x86-64 memory operand. (define_constraint "Bf" @@ -233,6 +236,11 @@ (and (match_test "TARGET_SSE") (match_operand 0 "float_vector_all_ones_operand"))) +(define_constraint "BH" + "@internal integer constant with last half/quarter bits set operand." + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") + (match_operand 0 "vector_all_ones_zero_extend_quarter_operand"))) + ;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64 ;; to prevent LRA from converting the operand to the form '(mem (reg X))' ;; where X is a base register. 
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index dadf453d6c0..ca799da5d7e 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) XFmode); } -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 +/* Return 1 if X is all bits 0, 2 if X is all bits 1 + and 3 if X is all bits 1 with zero extend in supported SSE/AVX vector mode. */ int @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) } } + if (vector_all_ones_zero_extend_half_operand (x, mode) + || vector_all_ones_zero_extend_quarter_operand (x, mode)) +return 3; + return 0; } @@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) gcc_unreachable (); } } + else if (vector_all_ones_zero_extend_half_operand (x, mode)) +{ + if (GET_MODE_SIZE (mode) == 64) + { + gcc_assert (TARGET_AVX512F); + return "vpcmpeqd \t %t0, %t0, %t0"; + } + else if (GET_MODE_SIZE (mode) == 32) + { + gcc_assert (TARGET_AVX); + return "vpcmpeqd \t %x0, %x0, %x0"; + } + gcc_unreachable (); +} + else if (vector_all_ones_zero_extend_quarter_operand (x, mode)) +{ + gcc_assert (TARGET_AVX512F); + return "vpcmpeqd \t %x0, %x0, %x0"; +} gcc_unreachable (); } diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 4f16bb748b5..655eabf793b 100644 --- a/gcc/config/i386/predicates.md +++
RE: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))
Hi, Hongtao I have modified this patch and regtested it on x86_64-pc-linux-gnu. BRs. Lin -Original Message- From: Hongtao Liu Sent: Friday, September 23, 2022 9:48 AM To: Hu, Lin1 Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao Subject: Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches wrote: > > Hi all, > > This patch aims to optimize code generation of > __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of > instructions required to achieve the final result. > > Regtested on x86_64-pc-linux-gnu. Ok for trunk? > > BRs, > Lin > > gcc/ChangeLog: > > PR target/94962 > * config/i386/constraints.md (BH): New define_constraint. > * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when > operand matches new predicate. > (standard_sse_constant_opcode): Add new alternative branch to return > "vpcmpeqd". > * config/i386/predicates.md > (vector_all_ones_zero_extend_half_operand): New define_predicate. > (vector_all_ones_zero_extend_quarter_operand): Ditto. > * config/i386/sse.md: Add constraint to insn "mov_internal". (mov_internal): Add new constraint BH. Put the insn name at first. > > gcc/testsuite/ChangeLog: > > PR target/94962 > * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. > * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. > * gcc.target/i386/pr94962-1.c: New test. > * gcc.target/i386/pr94962-2.c: Ditto. > * gcc.target/i386/pr94962-3.c: Ditto. > * gcc.target/i386/pr94962-4.c: Ditto. 
> --- > gcc/config/i386/constraints.md| 8 +++ > gcc/config/i386/i386.cc | 26 +++- > gcc/config/i386/predicates.md | 49 ++ > gcc/config/i386/sse.md| 8 +-- > .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- > .../i386/avx256-unaligned-store-1.c | 4 +- > .../i386/avx256-unaligned-store-2.c | 4 +- > .../i386/avx256-unaligned-store-3.c | 4 +- > gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 > gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 + > gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++ > gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++ > 12 files changed, 235 insertions(+), 13 deletions(-) create mode > 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c > > diff --git a/gcc/config/i386/constraints.md > b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644 > --- a/gcc/config/i386/constraints.md > +++ b/gcc/config/i386/constraints.md > @@ -168,6 +168,9 @@ > ;; z Constant call address operand. > ;; C Integer SSE constant with all bits set operand. > ;; F Floating-point SSE constant with all bits set operand. > +;; H Integer SSE constant that is 128/256bit all ones > +;; and zero-extand to 256/512bit, or 128bit all ones > +;; and zero-extend to 512bit. > ;; M x86-64 memory operand. > > (define_constraint "Bf" > @@ -233,6 +236,11 @@ >(and (match_test "TARGET_SSE") > (match_operand 0 "float_vector_all_ones_operand"))) > > +(define_constraint "BH" > + "@internal integer constant with last half/quarter bits set operand." > + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") > + (match_operand 0 > +"vector_all_ones_zero_extend_quarter_operand"))) > + > ;; NB: Similar to 'm', but don't use define_memory_constraint on > x86-64 ;; to prevent LRA from converting the operand to the form '(mem (reg > X))' > ;; where X is a base register. 
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index > dadf453d6c0..ca799da5d7e 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) >XFmode); } > > -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 > +/* Return 1 if X is all bits 0, 2 if X is all bits 1 > + and 3 if X is all bits 1 with zero extend > in supported SSE/AVX vector mode. */ > > int > @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machin
[PATCH] testsuite: Fix up avx256-unaligned-store-3.c test.
Hi all, This patch aims to fix the problem that the avx256-unaligned-store-3.c test reports two unexpected failures under "-march=cascadelake". Regtested on x86_64-pc-linux-gnu. Ok for trunk? BRs, Lin gcc/testsuite/ChangeLog: PR target/94962 * gcc.target/i386/avx256-unaligned-store-3.c: Add -mno-avx512f. --- gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c index f909099bcb1..67635fb9e66 100644 --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store -mtune=generic -fno-common" } */ +/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store -mtune=generic -fno-common -mno-avx512f" } */ #define N 1024 -- 2.18.2
[PATCH 1/4] i386: Remove Meteorlake's family_model
Hi all, This patch aims to modify meteorlake's family_model. Regtested on x86_64-pc-linux-gnu. Ok for trunk? BRs, Lin gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_intel_cpu): Remove case 0xb5 for meteorlake. --- gcc/common/config/i386/cpuinfo.h | 1 - 1 file changed, 1 deletion(-) diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 099a02467e6..bde231c07ee 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -540,7 +540,6 @@ get_intel_cpu (struct __processor_model *cpu_model, /* Alder Lake. */ case 0xb7: /* Raptor Lake. */ -case 0xb5: case 0xaa: case 0xac: /* Meteor Lake. */ -- 2.18.2
[PATCH 2/4] Initial Emeraldrapids Support
gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_intel_cpu): Handle Emeraldrapids. * common/config/i386/i386-common.cc: Add Emeraldrapids. --- gcc/common/config/i386/cpuinfo.h | 2 ++ gcc/common/config/i386/i386-common.cc | 2 ++ 2 files changed, 4 insertions(+) diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index bde231c07ee..3729b0f14a5 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -551,6 +551,8 @@ get_intel_cpu (struct __processor_model *cpu_model, break; case 0x8f: /* Sapphire Rapids. */ +case 0xcf: + /* Emerald Rapids. */ cpu = "sapphirerapids"; CHECK___builtin_cpu_is ("corei7"); CHECK___builtin_cpu_is ("sapphirerapids"); diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 7751265aff4..026926d8b41 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -2465,6 +2465,8 @@ const pta processor_alias_table[] = M_CPU_SUBTYPE (INTEL_COREI7_COOPERLAKE), P_PROC_AVX512F}, {"sapphirerapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, PTA_SAPPHIRERAPIDS, M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F}, + {"emeraldrapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, PTA_SAPPHIRERAPIDS, +M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F}, {"alderlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, {"raptorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, -- 2.18.2
RE: [PATCH 2/4] Initial Emeraldrapids Support
"PATCH 2 Initial Emeraldrapids Support" aims to support Emeraldrapids for GCC. It's my mistake, resulting in the omission of its information. -Original Message- From: Liu, Hongtao Sent: Tuesday, January 3, 2023 4:48 PM To: Hu, Lin1 ; gcc-patches@gcc.gnu.org Cc: ubiz...@gmail.com Subject: RE: [PATCH 2/4] Initial Emeraldrapids Support There are actually only two patches, not four, and the subject *Patch 2/4* should be a typo. > -Original Message- > From: Hu, Lin1 > Sent: Tuesday, January 3, 2023 4:37 PM > To: gcc-patches@gcc.gnu.org > Cc: Liu, Hongtao ; ubiz...@gmail.com > Subject: [PATCH 2/4] Initial Emeraldrapids Support > > gcc/ChangeLog: > > * common/config/i386/cpuinfo.h (get_intel_cpu): Handle Emeraldrapids. > * common/config/i386/i386-common.cc: Add Emeraldrapids. > --- > gcc/common/config/i386/cpuinfo.h | 2 ++ > gcc/common/config/i386/i386-common.cc | 2 ++ > 2 files changed, 4 insertions(+) > > diff --git a/gcc/common/config/i386/cpuinfo.h > b/gcc/common/config/i386/cpuinfo.h > index bde231c07ee..3729b0f14a5 100644 > --- a/gcc/common/config/i386/cpuinfo.h > +++ b/gcc/common/config/i386/cpuinfo.h > @@ -551,6 +551,8 @@ get_intel_cpu (struct __processor_model *cpu_model, >break; > case 0x8f: >/* Sapphire Rapids. */ > +case 0xcf: > + /* Emerald Rapids. 
*/ >cpu = "sapphirerapids"; >CHECK___builtin_cpu_is ("corei7"); >CHECK___builtin_cpu_is ("sapphirerapids"); diff --git > a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386- > common.cc index 7751265aff4..026926d8b41 100644 > --- a/gcc/common/config/i386/i386-common.cc > +++ b/gcc/common/config/i386/i386-common.cc > @@ -2465,6 +2465,8 @@ const pta processor_alias_table[] = > M_CPU_SUBTYPE (INTEL_COREI7_COOPERLAKE), P_PROC_AVX512F}, >{"sapphirerapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, > PTA_SAPPHIRERAPIDS, > M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F}, > + {"emeraldrapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, > PTA_SAPPHIRERAPIDS, > +M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F}, >{"alderlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, > M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, >{"raptorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, > -- > 2.18.2
RE: [PATCH] loading float member of parameter stored via int registers
Sorry for sending this mail. I entered the wrong command line. -Original Message- From: Gcc-patches On Behalf Of Segher Boessenkool Sent: Tuesday, January 3, 2023 5:00 PM To: Andrew Pinski Cc: Jiufu Guo ; Jiufu Guo via Gcc-patches ; Richard Biener ; Richard Biener ; dje@gmail.com; li...@gcc.gnu.org; jeffreya...@gmail.com Subject: Re: [PATCH] loading float member of parameter stored via int registers Hi! On Fri, Dec 30, 2022 at 12:30:04AM -0800, Andrew Pinski wrote: > On Thu, Dec 29, 2022 at 11:45 PM Segher Boessenkool > wrote: > > Ah! This simply shows rs6000_modes_tieable_p is decidedly non-optimal: > > it does not allow tying a scalar float to anything else. No such > > thing is required, or good apparently. I wonder why we have such > > restrictions at all in rs6000; is it just unfortunate history, was > > it good at one point in time? > > The documentation for TARGET_MODES_TIEABLE_P says the following: > If TARGET_HARD_REGNO_MODE_OK (r, mode1) and TARGET_HARD_REGNO_MODE_OK > (r, mode2) are always the same for any r, then TARGET_MODES_TIEABLE_P > (mode1, mode2) should be true. If they differ for any r, you should > define this hook to return false unless some other mechanism ensures > the accessibility of the value in a narrower mode. > > even though rs6000_hard_regno_mode_ok_uncached's comment has the following: > /* The float registers (except for VSX vector modes) can only hold floating > modes and DImode. */ That comment is incorrect. See fctiw for example, which defines only the SImode part of the result (the other bits are undefined). > TARGET_P8_VECTOR and TARGET_P9_VECTOR has special cased different modes now: > if (TARGET_P8_VECTOR && (mode == SImode)) > return 1; > > if (TARGET_P9_VECTOR && (mode == QImode || mode == HImode)) > return 1; > Which I suspect that means rs6000_modes_tieable_p should return true > for SImode and SFmode if TARGET_P8_VECTOR is true. Likewise for > TARGET_P9_VECTOR and SFmode and QImode/HImode too. 
It means that older CPUs do not have as many instructions to do scalar integer operations in vector registers, making it (almost) always a losing proposition to put scalar integers there. On newer CPUs it is not quite as bad, there is a full(er) complement of instructions to do such things in vector regs, just a bit slower than on GPRs. But yeah we might need to fix hard_regno_mode_ok if we change tieable. Segher
[r14-4046 Regression] FAIL: 23_containers/vector/bool/110807.cc -std=gnu++17 (test for excess errors) on Linux/x86_64
On Linux/x86_64, 3a0e01f6bb1d6ec444001f2caea6ef43a4a83e3a is the first bad commit commit 3a0e01f6bb1d6ec444001f2caea6ef43a4a83e3a Author: Jonathan Wakely Date: Fri Sep 1 21:27:57 2023 +0100 libstdc++: Add support for running tests with multiple -std options caused FAIL: 23_containers/vector/bool/110807.cc -std=gnu++17 (test for excess errors) with GCC configured with ../../gcc/configure --prefix=/export/users/haochenj/src/gcc-bisect/master/master/r14-4046/usr --enable-clocale=gnu --with-system-zlib --with-demangler-in-ld --with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl --enable-libmpx x86_64-linux --disable-bootstrap To reproduce: $ cd {build_dir}/x86_64-linux/libstdc++-v3/testsuite && make check RUNTESTFLAGS="conformance.exp=23_containers/vector/bool/110807.cc --target_board='unix{-m32}'" $ cd {build_dir}/x86_64-linux/libstdc++-v3/testsuite && make check RUNTESTFLAGS="conformance.exp=23_containers/vector/bool/110807.cc --target_board='unix{-m32\ -march=cascadelake}'" (Please do not reply to this email; for questions about this report, contact me at lin1 dot hu at intel.com.) (If you run into cascadelake-related problems, disabling AVX512F on the command line might work around them.) (However, please make sure that there are no potential problems with AVX512.)
[PATCH] i386: Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm
Hi, all The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128. And it has regtested on x86_64-pc-linux-gnu. OK for trunk? Thanks. Lin vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk. We can optimze them to vblend, vmovaps when there's no cross-lane. gcc/ChangeLog: * config/i386/sse.md: Modify insn vperm{i,f} and vshuf{i,f}. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test. * gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto. * gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto. * gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto. * gcc.target/i386/opt-vperm-vshuf-1.c: New test. * gcc.target/i386/opt-vperm-vshuf-2.c: Ditto. * gcc.target/i386/opt-vperm-vshuf-3.c: Ditto. --- gcc/config/i386/sse.md| 36 -- .../gcc.target/i386/avx512vl-vshuff32x4-1.c | 2 +- .../gcc.target/i386/avx512vl-vshuff64x2-1.c | 2 +- .../gcc.target/i386/avx512vl-vshufi32x4-1.c | 2 +- .../gcc.target/i386/avx512vl-vshufi64x2-1.c | 2 +- .../gcc.target/i386/opt-vperm-vshuf-1.c | 51 ++ .../gcc.target/i386/opt-vperm-vshuf-2.c | 68 +++ .../gcc.target/i386/opt-vperm-vshuf-3.c | 63 + 8 files changed, 218 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 513960e8f33..5b6b2427460 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -18437,6 +18437,8 @@ mask = INTVAL (operands[3]) / 2; mask |= (INTVAL (operands[5]) - 4) / 2 << 1; operands[3] = GEN_INT (mask); + if (INTVAL (operands[3]) == 2 && !) +return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; return "vshuf64x2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -18595,6 +18597,9 @@ mask |= (INTVAL (operands[7]) - 8) / 4 << 1; operands[3] = GEN_INT (mask); + if (INTVAL (operands[3]) == 2 && !) 
+return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + return "vshuf32x4\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -25663,7 +25668,28 @@ (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_VPERMTI))] "TARGET_AVX2" - "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}" + { +int mask = INTVAL (operands[3]); +if ((mask & 0xbb) == 16) + { + if (rtx_equal_p (operands[0], operands[1])) + return ""; + else + return "vmovaps\t{%1, %0|%0, %1}"; + } +if ((mask & 0xbb) == 50) + { + if (rtx_equal_p (operands[0], operands[2])) + return ""; + else + return "vmovaps\t{%2, %0|%0, %2}"; + } +if ((mask & 0xbb) == 18) + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; +if ((mask & 0xbb) == 48) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; +return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + } [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -26226,9 +26252,11 @@ && avx_vperm2f128_parallel (operands[3], mode)" { int mask = avx_vperm2f128_parallel (operands[3], mode) - 1; - if (mask == 0x12) -return "vinsert\t{$0, %x2, %1, %0|%0, %1, %x2, 0}"; - if (mask == 0x20) + if ((mask & 0xbb) == 0x12) +return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; + if ((mask & 0xbb) == 0x30) +return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + if ((mask & 0xbb) == 0x20) return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}"; operands[3] = GEN_INT (mask); return "vperm2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c index 6c2fb2f184a..02aecf4edce 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_f32x4 (x, x, 2); + x = _mm256_shuffle_f32x4 (x, x, 3); x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2); x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2); } diff --git 
a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c index 1191b400134..563ded5d9df 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_f64x2 (x, x, 2); + x = _mm256_shuffle_f64x2 (x, x, 3); x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2); x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2); } diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c
[PATCH] i386: Add reduce_*_ep[i|u][8|16] series intrinsics
Hi all, The patch aims to support reduce_*_ep[i|u][8|16] series intrinsics, and has been tested on x86_64-pc-linux-gnu. OK for trunk? BRs, Lin gcc/ChangeLog: * config/i386/avx2intrin.h (_MM_REDUCE_OPERATOR_BASIC_EPI16): New macro. (_MM_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto. (_MM256_REDUCE_OPERATOR_BASIC_EPI16): Ditto. (_MM256_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto. (_MM_REDUCE_OPERATOR_BASIC_EPI8): Ditto. (_MM_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto. (_MM256_REDUCE_OPERATOR_BASIC_EPI8): Ditto. (_MM256_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto. (_mm_reduce_add_epi16): New instrinsics. (_mm_reduce_mul_epi16): Ditto. (_mm_reduce_and_epi16): Ditto. (_mm_reduce_or_epi16): Ditto. (_mm_reduce_max_epi16): Ditto. (_mm_reduce_max_epu16): Ditto. (_mm_reduce_min_epi16): Ditto. (_mm_reduce_min_epu16): Ditto. (_mm256_reduce_add_epi16): Ditto. (_mm256_reduce_mul_epi16): Ditto. (_mm256_reduce_and_epi16): Ditto. (_mm256_reduce_or_epi16): Ditto. (_mm256_reduce_max_epi16): Ditto. (_mm256_reduce_max_epu16): Ditto. (_mm256_reduce_min_epi16): Ditto. (_mm256_reduce_min_epu16): Ditto. (_mm_reduce_add_epi8): Ditto. (_mm_reduce_mul_epi8): Ditto. (_mm_reduce_and_epi8): Ditto. (_mm_reduce_or_epi8): Ditto. (_mm_reduce_max_epi8): Ditto. (_mm_reduce_max_epu8): Ditto. (_mm_reduce_min_epi8): Ditto. (_mm_reduce_min_epu8): Ditto. (_mm256_reduce_add_epi8): Ditto. (_mm256_reduce_mul_epi8): Ditto. (_mm256_reduce_and_epi8): Ditto. (_mm256_reduce_or_epi8): Ditto. (_mm256_reduce_max_epi8): Ditto. (_mm256_reduce_max_epu8): Ditto. (_mm256_reduce_min_epi8): Ditto. (_mm256_reduce_min_epu8): Ditto. * config/i386/avx512vlbwintrin.h: (_mm_mask_reduce_add_epi16): Ditto. (_mm_mask_reduce_mul_epi16): Ditto. (_mm_mask_reduce_and_epi16): Ditto. (_mm_mask_reduce_or_epi16): Ditto. (_mm_mask_reduce_max_epi16): Ditto. (_mm_mask_reduce_max_epu16): Ditto. (_mm_mask_reduce_min_epi16): Ditto. (_mm_mask_reduce_min_epu16): Ditto. (_mm256_mask_reduce_add_epi16): Ditto. (_mm256_mask_reduce_mul_epi16): Ditto. 
(_mm256_mask_reduce_and_epi16): Ditto. (_mm256_mask_reduce_or_epi16): Ditto. (_mm256_mask_reduce_max_epi16): Ditto. (_mm256_mask_reduce_max_epu16): Ditto. (_mm256_mask_reduce_min_epi16): Ditto. (_mm256_mask_reduce_min_epu16): Ditto. (_mm_mask_reduce_add_epi8): Ditto. (_mm_mask_reduce_mul_epi8): Ditto. (_mm_mask_reduce_and_epi8): Ditto. (_mm_mask_reduce_or_epi8): Ditto. (_mm_mask_reduce_max_epi8): Ditto. (_mm_mask_reduce_max_epu8): Ditto. (_mm_mask_reduce_min_epi8): Ditto. (_mm_mask_reduce_min_epu8): Ditto. (_mm256_mask_reduce_add_epi8): Ditto. (_mm256_mask_reduce_mul_epi8): Ditto. (_mm256_mask_reduce_and_epi8): Ditto. (_mm256_mask_reduce_or_epi8): Ditto. (_mm256_mask_reduce_max_epi8): Ditto. (_mm256_mask_reduce_max_epu8): Ditto. (_mm256_mask_reduce_min_epi8): Ditto. (_mm256_mask_reduce_min_epu8): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vlbw-reduce-op-1.c: New test. --- gcc/config/i386/avx2intrin.h | 347 ++ gcc/config/i386/avx512vlbwintrin.h| 256 + .../gcc.target/i386/avx512vlbw-reduce-op-1.c | 206 +++ 3 files changed, 809 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h index 1b9c8169a96..9b8c13b7233 100644 --- a/gcc/config/i386/avx2intrin.h +++ b/gcc/config/i386/avx2intrin.h @@ -1915,6 +1915,353 @@ _mm256_mask_i64gather_epi32 (__m128i __src, int const *__base, (int) (SCALE)) #endif /* __OPTIMIZE__ */ +#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) \ + __v8hi __T1 = (__v8hi)__W; \ + __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, 6, 7); \ + __v8hi __T3 = __T1 op __T2; \ + __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, 6, 7); \ + __v8hi __T5 = __T3 op __T4; \ + __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 1, 1, 2, 3, 4, 5, 6, 7); \ + __v8hi __T7 = __T5 op __T6; \ + return __T7[0] + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_add_epi16 
(__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI16 (+); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_mul_epi16 (__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI16 (*); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __alway
RE: [PATCH] i386: Add reduce_*_ep[i|u][8|16] series intrinsics
More details: Intrinsics guide add these 128/256-bit intrinsics as follow: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=reduce_&ig_expand=5814. So we intend to enable these intrinsics for GCC-14. -Original Message- From: Gcc-patches On Behalf Of Hu, Lin1 via Gcc-patches Sent: Tuesday, April 18, 2023 3:03 PM To: gcc-patches@gcc.gnu.org Cc: Liu, Hongtao ; ubiz...@gmail.com Subject: [PATCH] i386: Add reduce_*_ep[i|u][8|16] series intrinsics Hi all, The patch aims to support reduce_*_ep[i|u][8|16] series intrinsics, and has been tested on x86_64-pc-linux-gnu. OK for trunk? BRs, Lin gcc/ChangeLog: * config/i386/avx2intrin.h (_MM_REDUCE_OPERATOR_BASIC_EPI16): New macro. (_MM_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto. (_MM256_REDUCE_OPERATOR_BASIC_EPI16): Ditto. (_MM256_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto. (_MM_REDUCE_OPERATOR_BASIC_EPI8): Ditto. (_MM_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto. (_MM256_REDUCE_OPERATOR_BASIC_EPI8): Ditto. (_MM256_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto. (_mm_reduce_add_epi16): New instrinsics. (_mm_reduce_mul_epi16): Ditto. (_mm_reduce_and_epi16): Ditto. (_mm_reduce_or_epi16): Ditto. (_mm_reduce_max_epi16): Ditto. (_mm_reduce_max_epu16): Ditto. (_mm_reduce_min_epi16): Ditto. (_mm_reduce_min_epu16): Ditto. (_mm256_reduce_add_epi16): Ditto. (_mm256_reduce_mul_epi16): Ditto. (_mm256_reduce_and_epi16): Ditto. (_mm256_reduce_or_epi16): Ditto. (_mm256_reduce_max_epi16): Ditto. (_mm256_reduce_max_epu16): Ditto. (_mm256_reduce_min_epi16): Ditto. (_mm256_reduce_min_epu16): Ditto. (_mm_reduce_add_epi8): Ditto. (_mm_reduce_mul_epi8): Ditto. (_mm_reduce_and_epi8): Ditto. (_mm_reduce_or_epi8): Ditto. (_mm_reduce_max_epi8): Ditto. (_mm_reduce_max_epu8): Ditto. (_mm_reduce_min_epi8): Ditto. (_mm_reduce_min_epu8): Ditto. (_mm256_reduce_add_epi8): Ditto. (_mm256_reduce_mul_epi8): Ditto. (_mm256_reduce_and_epi8): Ditto. (_mm256_reduce_or_epi8): Ditto. (_mm256_reduce_max_epi8): Ditto. (_mm256_reduce_max_epu8): Ditto. 
(_mm256_reduce_min_epi8): Ditto. (_mm256_reduce_min_epu8): Ditto. * config/i386/avx512vlbwintrin.h: (_mm_mask_reduce_add_epi16): Ditto. (_mm_mask_reduce_mul_epi16): Ditto. (_mm_mask_reduce_and_epi16): Ditto. (_mm_mask_reduce_or_epi16): Ditto. (_mm_mask_reduce_max_epi16): Ditto. (_mm_mask_reduce_max_epu16): Ditto. (_mm_mask_reduce_min_epi16): Ditto. (_mm_mask_reduce_min_epu16): Ditto. (_mm256_mask_reduce_add_epi16): Ditto. (_mm256_mask_reduce_mul_epi16): Ditto. (_mm256_mask_reduce_and_epi16): Ditto. (_mm256_mask_reduce_or_epi16): Ditto. (_mm256_mask_reduce_max_epi16): Ditto. (_mm256_mask_reduce_max_epu16): Ditto. (_mm256_mask_reduce_min_epi16): Ditto. (_mm256_mask_reduce_min_epu16): Ditto. (_mm_mask_reduce_add_epi8): Ditto. (_mm_mask_reduce_mul_epi8): Ditto. (_mm_mask_reduce_and_epi8): Ditto. (_mm_mask_reduce_or_epi8): Ditto. (_mm_mask_reduce_max_epi8): Ditto. (_mm_mask_reduce_max_epu8): Ditto. (_mm_mask_reduce_min_epi8): Ditto. (_mm_mask_reduce_min_epu8): Ditto. (_mm256_mask_reduce_add_epi8): Ditto. (_mm256_mask_reduce_mul_epi8): Ditto. (_mm256_mask_reduce_and_epi8): Ditto. (_mm256_mask_reduce_or_epi8): Ditto. (_mm256_mask_reduce_max_epi8): Ditto. (_mm256_mask_reduce_max_epu8): Ditto. (_mm256_mask_reduce_min_epi8): Ditto. (_mm256_mask_reduce_min_epu8): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vlbw-reduce-op-1.c: New test. 
--- gcc/config/i386/avx2intrin.h | 347 ++ gcc/config/i386/avx512vlbwintrin.h| 256 + .../gcc.target/i386/avx512vlbw-reduce-op-1.c | 206 +++ 3 files changed, 809 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h index 1b9c8169a96..9b8c13b7233 100644 --- a/gcc/config/i386/avx2intrin.h +++ b/gcc/config/i386/avx2intrin.h @@ -1915,6 +1915,353 @@ _mm256_mask_i64gather_epi32 (__m128i __src, int const *__base, (int) (SCALE)) #endif /* __OPTIMIZE__ */ +#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) \ + __v8hi __T1 = (__v8hi)__W; \ + __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, +6, 7); \ + __v8hi __T3 = __T1 op __T2; \ + __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, +6, 7); \ + __v8hi __T5 = __T3 op __T4; \ + __v8hi __T6 = __builtin_shufflevector (__T5,