On Tue, May 28, 2024 at 4:00 PM Hu, Lin1 <lin1...@intel.com> wrote: > > Hi all, > > This patch aims to achieve EQ/NE comparison between avx512 kmask and -1 > by using kortest and checking CF. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk? Ok. > > BRs, > Lin > > gcc/ChangeLog: > > PR target/113609 > * config/i386/sse.md > (*kortest_cmp<mode>_setcc): New define_insn_and_split. > (*kortest_cmp<mode>_jcc): Ditto. > > gcc/testsuite/ChangeLog: > > PR target/113609 > * gcc.target/i386/pr113609-1.c: New test. > * gcc.target/i386/pr113609-2.c: Ditto. > --- > gcc/config/i386/sse.md | 67 +++++++ > gcc/testsuite/gcc.target/i386/pr113609-1.c | 194 +++++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr113609-2.c | 161 +++++++++++++++++ > 3 files changed, 422 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/i386/pr113609-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr113609-2.c > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index b59c988fc31..34fd2e4afac 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -2201,6 +2201,73 @@ (define_expand "kortest<mode>" > UNSPEC_KORTEST))] > "TARGET_AVX512F") > > +;; Optimize cmp + setcc with mask register by kortest + setcc. 
> +(define_insn_and_split "*kortest_cmp<mode>_setcc" > + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm, qm") > + (match_operator:QI 1 "bt_comparison_operator" > + [(match_operand:SWI1248_AVX512BWDQ_64 2 "register_operand" "?k, > <r>") > + (const_int -1)])) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_AVX512BW" > + "#" > + "&& reload_completed" > + [(const_int 0)] > +{ > + if (MASK_REGNO_P (REGNO (operands[2]))) > + { > + emit_insn (gen_kortest<mode>_ccc (operands[2], operands[2])); > + operands[4] = gen_rtx_REG (CCCmode, FLAGS_REG); > + } > + else > + { > + operands[4] = gen_rtx_REG (CCZmode, FLAGS_REG); > + emit_insn (gen_rtx_SET (operands[4], > + gen_rtx_COMPARE (CCZmode, > + operands[2], > + constm1_rtx))); > + } > + ix86_expand_setcc (operands[0], > + GET_CODE (operands[1]), > + operands[4], > + const0_rtx); > + DONE; > +}) > + > +;; Optimize cmp + jcc with mask register by kortest + jcc. > +(define_insn_and_split "*kortest_cmp<mode>_jcc" > + [(set (pc) > + (if_then_else > + (match_operator 0 "bt_comparison_operator" > + [(match_operand:SWI1248_AVX512BWDQ_64 1 "register_operand" "?k, > <r>") > + (const_int -1)]) > + (label_ref (match_operand 2)) > + (pc))) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_AVX512BW" > + "#" > + "&& reload_completed" > + [(const_int 0)] > +{ > + if (MASK_REGNO_P (REGNO (operands[1]))) > + { > + emit_insn (gen_kortest<mode>_ccc (operands[1], operands[1])); > + operands[4] = gen_rtx_REG (CCCmode, FLAGS_REG); > + } > + else > + { > + operands[4] = gen_rtx_REG (CCZmode, FLAGS_REG); > + emit_insn (gen_rtx_SET (operands[4], > + gen_rtx_COMPARE (CCZmode, > + operands[1], > + constm1_rtx))); > + } > + ix86_expand_branch (GET_CODE (operands[0]), > + operands[4], > + const0_rtx, > + operands[2]); > + DONE; > +}) > + > (define_insn "kunpckhi" > [(set (match_operand:HI 0 "register_operand" "=k") > (ior:HI > diff --git a/gcc/testsuite/gcc.target/i386/pr113609-1.c > b/gcc/testsuite/gcc.target/i386/pr113609-1.c > new file mode 100644 > 
index 00000000000..f0639b8500a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr113609-1.c > @@ -0,0 +1,194 @@ > +/* PR target/113609 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=x86-64-v4" } */ > +/* { dg-final { scan-assembler-not "^cmp" } } */ > +/* { dg-final { scan-assembler-not "\[ \\t\]+sete" { target { ! ia32 } } } } > */ > +/* { dg-final { scan-assembler-not "\[ \\t\]+setne" { target { ! ia32 } } } > } */ > +/* { dg-final { scan-assembler-not "\[ \\t\]+je" { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-not "\[ \\t\]+jne" { target { ! ia32 } } } } > */ > +/* { dg-final { scan-assembler-times "\[ \\t\]+sete" 1 { target { ia32 } } } > } */ > +/* { dg-final { scan-assembler-times "\[ \\t\]+setne" 1 { target { ia32 } } > } } */ > +/* { dg-final { scan-assembler-times "\[ \\t\]+je" 1 { target { ia32 } } } } > */ > +/* { dg-final { scan-assembler-times "\[ \\t\]+jne" 2 { target { ia32 } } } > } */ > +/* { dg-final { scan-assembler-times "kortest" 12 { target { ia32 } } } } */ > +/* { dg-final { scan-assembler-times "kortest" 17 { target { ! 
ia32 } } } } > */ > + > +#include <immintrin.h> > + > +unsigned int > +cmp_vector_sete_mask8(__m128i a, __m128i b) > +{ > + __mmask8 k = _mm_cmpeq_epi16_mask (a, b); > + if (k == (__mmask8) -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_vector_sete_mask16(__m128i a, __m128i b) > +{ > + __mmask16 k = _mm_cmpeq_epi8_mask (a, b); > + if (k == (__mmask16) -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_vector_sete_mask32(__m256i a, __m256i b) > +{ > + __mmask32 k = _mm256_cmpeq_epi8_mask (a, b); > + if (k == (__mmask32) -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_vector_sete_mask64(__m512i a, __m512i b) > +{ > + __mmask64 k = _mm512_cmpeq_epi8_mask (a, b); > + if (k == (__mmask64) -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_vector_setne_mask8(__m128i a, __m128i b) > +{ > + __mmask8 k = _mm_cmpeq_epi16_mask (a, b); > + if (k != (__mmask8) -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_vector_setne_mask16(__m128i a, __m128i b) > +{ > + __mmask16 k = _mm_cmpeq_epi8_mask (a, b); > + if (k != (__mmask16) -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_vector_setne_mask32(__m256i a, __m256i b) > +{ > + __mmask32 k = _mm256_cmpeq_epi8_mask (a, b); > + if (k != (__mmask32) -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_vector_setne_mask64(__m512i a, __m512i b) > +{ > + __mmask64 k = _mm512_cmpeq_epi8_mask (a, b); > + if (k != (__mmask64) -1) > + return 1; > + else > + return 0; > +} > + > +__m128i > +cmp_vector_je_mask8(__m128i a, __m128i b) { > + __mmask8 k = _mm_cmpeq_epi16_mask (a, b); > + if (k == (__mmask8) -1) { > + a[0] = a[0] + 1; > + } > + else { > + a[0] = a[0] - 1; > + } > + return a; > +} > + > +__m128i > +cmp_vector_je_mask16(__m128i a, __m128i b) { > + __mmask16 k = _mm_cmpeq_epi8_mask (a, b); > + if (k == (__mmask16) -1) { > + a[0] = a[0] + 1; > + } > + else { > + a[0] = a[0] - 
1; > + } > + return a; > +} > + > +__m256i > +cmp_vector_je_mask32(__m256i a, __m256i b) { > + __mmask32 k = _mm256_cmpeq_epi8_mask (a, b); > + if (k == (__mmask32) -1) { > + a[0] = a[0] + 1; > + } > + else { > + a[0] = a[0] - 1; > + } > + return a; > +} > + > +__m512i > +cmp_vector_je_mask64(__m512i a, __m512i b) { > + __mmask64 k = _mm512_cmpeq_epi8_mask (a, b); > + if (k == (__mmask64) -1) { > + a[0] = a[0] + 1; > + } > + else { > + a[0] = a[0] - 5; > + } > + return a; > +} > + > +__m128i > +cmp_vector_jne_mask8(__m128i a, __m128i b) { > + __mmask8 k = _mm_cmpeq_epi16_mask (a, b); > + if (k == (__mmask8) -1) { > + a[0] = a[0] + 1; > + } > + a[0] = a[0] - 4; > + return a; > +} > + > +__m128i > +cmp_vector_jne_mask16(__m128i a, __m128i b) { > + __mmask16 k = _mm_cmpeq_epi8_mask (a, b); > + if (k == (__mmask16) -1) { > + a[0] = a[0] + 1; > + } > + a[0] = a[0] - 4; > + return a; > +} > + > +__m256i > +cmp_vector_jne_mask32(__m256i a, __m256i b) { > + __mmask32 k = _mm256_cmpeq_epi8_mask (a, b); > + if (k == (__mmask32) -1) { > + a[0] = a[0] + 1; > + } > + a[0] = a[0] - 4; > + return a; > +} > + > +__m512i > +cmp_vector_jne_mask64(__m512i a, __m512i b) { > + __mmask64 k = _mm512_cmpeq_epi8_mask (a, b); > + if (k == (__mmask64) -1) { > + a[0] = a[0] + 1; > + } > + a[0] = a[0] - 4; > + return a; > +} > + > +__m512i > +mask_cmp_vector_jne_mask64(__m512i a, __m512i b) { > + __mmask64 k = _mm512_mask_cmpeq_epi8_mask > ((__mmask64)0xffffffefffffffff, a, b); > + if (k == (__mmask64) -1) { > + a[0] = a[0] + 1; > + } > + a[0] = a[0] - 4; > + return a; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr113609-2.c > b/gcc/testsuite/gcc.target/i386/pr113609-2.c > new file mode 100644 > index 00000000000..e9503f51538 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr113609-2.c > @@ -0,0 +1,161 @@ > +/* PR target/113609 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=x86-64-v4" } */ > +/* { dg-final { scan-assembler-times "\[ \\t\]+sete" 4 } } */ > +/* { 
dg-final { scan-assembler-times "\[ \\t\]+setne" 4 } } */ > +/* { dg-final { scan-assembler-times "\[ \\t\]+je" 4 } } */ > +/* { dg-final { scan-assembler-times "\[ \\t\]+jne" 4 } } */ > + > +#include <immintrin.h> > + > +unsigned int > +cmp_pi8_setcc(char a) > +{ > + if (a == -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_pi16_setcc(short a) > +{ > + if (a == -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_pi32_setcc(int a) > +{ > + if (a == -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_pi64_setcc(long long a) > +{ > + if (a == -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_pi8_setne(char a) > +{ > + if (a != -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_pi16_setne(short a) > +{ > + if (a != -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_pi32_setne(int a) > +{ > + if (a != -1) > + return 1; > + else > + return 0; > +} > + > +unsigned int > +cmp_pi64_setne(long long a) > +{ > + if (a != -1) > + return 1; > + else > + return 0; > +} > + > +__m128i > +cmp_pi8_je(__m128i a, char b) { > + if (b == -1) { > + a[0] = a[0] + 1; > + } > + else { > + a[0] = a[0] - 1; > + } > + return a; > +} > + > +__m128i > +cmp_pi16_je(__m128i a, short b) { > + if (b == -1) { > + a[0] = a[0] + 1; > + } > + else { > + a[0] = a[0] - 1; > + } > + return a; > +} > + > +__m128i > +cmp_pi32_je(__m128i a, int b) { > + if (b == -1) { > + a[0] = a[0] + 1; > + } > + else { > + a[0] = a[0] - 1; > + } > + return a; > +} > + > +__m128i > +cmp_pi64_je(__m128i a, long long b) { > + if (b == -1) { > + a[0] = a[0] + 1; > + } > + else { > + a[0] = a[0] - 1; > + } > + return a; > +} > + > +__m128i > +cmp_pi8_jne(__m128i a, char b) { > + if (b == -1) { > + a[0] = a[0] + 1; > + } > + a[0] = a[0] - 4; > + return a; > +} > + > +__m128i > +cmp_pi16_jne(__m128i a, short b) { > + if (b == -1) { > + a[0] = a[0] + 1; > + } > + a[0] = a[0] - 4; > + return a; > +} 
> + > +__m128i > +cmp_pi32_jne(__m128i a, int b) { > + if (b == -1) { > + a[0] = a[0] + 1; > + } > + a[0] = a[0] - 4; > + return a; > +} > + > +__m128i > +cmp_pi64_jne(__m128i a, long long b) { > + if (b == -1) { > + a[0] = a[0] + 1; > + } > + a[0] = a[0] - 4; > + return a; > +} > -- > 2.31.1 >
-- BR, Hongtao