Hi all, This patch aims to achieve EQ/NE comparison between an AVX512 kmask and -1 by using kortest and checking CF.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk? BRs, Lin gcc/ChangeLog: PR target/113609 * config/i386/sse.md (*kortest_cmp<mode>_setcc): New define_insn_and_split. (*kortest_cmp<mode>_jcc): Ditto. gcc/testsuite/ChangeLog: PR target/113609 * gcc.target/i386/pr113609-1.c: New test. * gcc.target/i386/pr113609-2.c: Ditto. --- gcc/config/i386/sse.md | 67 +++++++ gcc/testsuite/gcc.target/i386/pr113609-1.c | 194 +++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr113609-2.c | 161 +++++++++++++++++ 3 files changed, 422 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr113609-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr113609-2.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b59c988fc31..34fd2e4afac 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2201,6 +2201,73 @@ (define_expand "kortest<mode>" UNSPEC_KORTEST))] "TARGET_AVX512F") +;; Optimize cmp + setcc with mask register by kortest + setcc. +(define_insn_and_split "*kortest_cmp<mode>_setcc" + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm, qm") + (match_operator:QI 1 "bt_comparison_operator" + [(match_operand:SWI1248_AVX512BWDQ_64 2 "register_operand" "?k, <r>") + (const_int -1)])) + (clobber (reg:CC FLAGS_REG))] + "TARGET_AVX512BW" + "#" + "&& reload_completed" + [(const_int 0)] +{ + if (MASK_REGNO_P (REGNO (operands[2]))) + { + emit_insn (gen_kortest<mode>_ccc (operands[2], operands[2])); + operands[4] = gen_rtx_REG (CCCmode, FLAGS_REG); + } + else + { + operands[4] = gen_rtx_REG (CCZmode, FLAGS_REG); + emit_insn (gen_rtx_SET (operands[4], + gen_rtx_COMPARE (CCZmode, + operands[2], + constm1_rtx))); + } + ix86_expand_setcc (operands[0], + GET_CODE (operands[1]), + operands[4], + const0_rtx); + DONE; +}) + +;; Optimize cmp + jcc with mask register by kortest + jcc. 
+(define_insn_and_split "*kortest_cmp<mode>_jcc" + [(set (pc) + (if_then_else + (match_operator 0 "bt_comparison_operator" + [(match_operand:SWI1248_AVX512BWDQ_64 1 "register_operand" "?k, <r>") + (const_int -1)]) + (label_ref (match_operand 2)) + (pc))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_AVX512BW" + "#" + "&& reload_completed" + [(const_int 0)] +{ + if (MASK_REGNO_P (REGNO (operands[1]))) + { + emit_insn (gen_kortest<mode>_ccc (operands[1], operands[1])); + operands[4] = gen_rtx_REG (CCCmode, FLAGS_REG); + } + else + { + operands[4] = gen_rtx_REG (CCZmode, FLAGS_REG); + emit_insn (gen_rtx_SET (operands[4], + gen_rtx_COMPARE (CCZmode, + operands[1], + constm1_rtx))); + } + ix86_expand_branch (GET_CODE (operands[0]), + operands[4], + const0_rtx, + operands[2]); + DONE; +}) + (define_insn "kunpckhi" [(set (match_operand:HI 0 "register_operand" "=k") (ior:HI diff --git a/gcc/testsuite/gcc.target/i386/pr113609-1.c b/gcc/testsuite/gcc.target/i386/pr113609-1.c new file mode 100644 index 00000000000..f0639b8500a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr113609-1.c @@ -0,0 +1,194 @@ +/* PR target/113609 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4" } */ +/* { dg-final { scan-assembler-not "^cmp" } } */ +/* { dg-final { scan-assembler-not "\[ \\t\]+sete" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not "\[ \\t\]+setne" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not "\[ \\t\]+je" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not "\[ \\t\]+jne" { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+sete" 1 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+setne" 1 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+je" 1 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+jne" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "kortest" 12 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "kortest" 17 { target { ! ia32 } } } } */ + +#include <immintrin.h> + +unsigned int +cmp_vector_sete_mask8(__m128i a, __m128i b) +{ + __mmask8 k = _mm_cmpeq_epi16_mask (a, b); + if (k == (__mmask8) -1) + return 1; + else + return 0; +} + +unsigned int +cmp_vector_sete_mask16(__m128i a, __m128i b) +{ + __mmask16 k = _mm_cmpeq_epi8_mask (a, b); + if (k == (__mmask16) -1) + return 1; + else + return 0; +} + +unsigned int +cmp_vector_sete_mask32(__m256i a, __m256i b) +{ + __mmask32 k = _mm256_cmpeq_epi8_mask (a, b); + if (k == (__mmask32) -1) + return 1; + else + return 0; +} + +unsigned int +cmp_vector_sete_mask64(__m512i a, __m512i b) +{ + __mmask64 k = _mm512_cmpeq_epi8_mask (a, b); + if (k == (__mmask64) -1) + return 1; + else + return 0; +} + +unsigned int +cmp_vector_setne_mask8(__m128i a, __m128i b) +{ + __mmask8 k = _mm_cmpeq_epi16_mask (a, b); + if (k != (__mmask8) -1) + return 1; + else + return 0; +} + +unsigned int +cmp_vector_setne_mask16(__m128i a, __m128i b) +{ + __mmask16 k = _mm_cmpeq_epi8_mask (a, b); + if (k != (__mmask16) -1) + return 1; + else + return 0; +} + +unsigned int +cmp_vector_setne_mask32(__m256i a, __m256i b) +{ + __mmask32 k = _mm256_cmpeq_epi8_mask (a, b); + if (k != (__mmask32) -1) + return 1; + else + return 0; +} + +unsigned int +cmp_vector_setne_mask64(__m512i a, __m512i b) +{ + __mmask64 k = _mm512_cmpeq_epi8_mask (a, b); + if (k != (__mmask64) -1) + return 1; + else + return 0; +} + +__m128i +cmp_vector_je_mask8(__m128i a, __m128i b) { + __mmask8 k = _mm_cmpeq_epi16_mask (a, 
b); + if (k == (__mmask8) -1) { + a[0] = a[0] + 1; + } + else { + a[0] = a[0] - 1; + } + return a; +} + +__m128i +cmp_vector_je_mask16(__m128i a, __m128i b) { + __mmask16 k = _mm_cmpeq_epi8_mask (a, b); + if (k == (__mmask16) -1) { + a[0] = a[0] + 1; + } + else { + a[0] = a[0] - 1; + } + return a; +} + +__m256i +cmp_vector_je_mask32(__m256i a, __m256i b) { + __mmask32 k = _mm256_cmpeq_epi8_mask (a, b); + if (k == (__mmask32) -1) { + a[0] = a[0] + 1; + } + else { + a[0] = a[0] - 1; + } + return a; +} + +__m512i +cmp_vector_je_mask64(__m512i a, __m512i b) { + __mmask64 k = _mm512_cmpeq_epi8_mask (a, b); + if (k == (__mmask64) -1) { + a[0] = a[0] + 1; + } + else { + a[0] = a[0] - 5; + } + return a; +} + +__m128i +cmp_vector_jne_mask8(__m128i a, __m128i b) { + __mmask8 k = _mm_cmpeq_epi16_mask (a, b); + if (k == (__mmask8) -1) { + a[0] = a[0] + 1; + } + a[0] = a[0] - 4; + return a; +} + +__m128i +cmp_vector_jne_mask16(__m128i a, __m128i b) { + __mmask16 k = _mm_cmpeq_epi8_mask (a, b); + if (k == (__mmask16) -1) { + a[0] = a[0] + 1; + } + a[0] = a[0] - 4; + return a; +} + +__m256i +cmp_vector_jne_mask32(__m256i a, __m256i b) { + __mmask32 k = _mm256_cmpeq_epi8_mask (a, b); + if (k == (__mmask32) -1) { + a[0] = a[0] + 1; + } + a[0] = a[0] - 4; + return a; +} + +__m512i +cmp_vector_jne_mask64(__m512i a, __m512i b) { + __mmask64 k = _mm512_cmpeq_epi8_mask (a, b); + if (k == (__mmask64) -1) { + a[0] = a[0] + 1; + } + a[0] = a[0] - 4; + return a; +} + +__m512i +mask_cmp_vector_jne_mask64(__m512i a, __m512i b) { + __mmask64 k = _mm512_mask_cmpeq_epi8_mask ((__mmask64)0xffffffefffffffff, a, b); + if (k == (__mmask64) -1) { + a[0] = a[0] + 1; + } + a[0] = a[0] - 4; + return a; +} diff --git a/gcc/testsuite/gcc.target/i386/pr113609-2.c b/gcc/testsuite/gcc.target/i386/pr113609-2.c new file mode 100644 index 00000000000..e9503f51538 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr113609-2.c @@ -0,0 +1,161 @@ +/* PR target/113609 */ +/* { dg-do compile } */ +/* { dg-options 
"-O2 -march=x86-64-v4" } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+sete" 4 } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+setne" 4 } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+je" 4 } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+jne" 4 } } */ + +#include <immintrin.h> + +unsigned int +cmp_pi8_setcc(char a) +{ + if (a == -1) + return 1; + else + return 0; +} + +unsigned int +cmp_pi16_setcc(short a) +{ + if (a == -1) + return 1; + else + return 0; +} + +unsigned int +cmp_pi32_setcc(int a) +{ + if (a == -1) + return 1; + else + return 0; +} + +unsigned int +cmp_pi64_setcc(long long a) +{ + if (a == -1) + return 1; + else + return 0; +} + +unsigned int +cmp_pi8_setne(char a) +{ + if (a != -1) + return 1; + else + return 0; +} + +unsigned int +cmp_pi16_setne(short a) +{ + if (a != -1) + return 1; + else + return 0; +} + +unsigned int +cmp_pi32_setne(int a) +{ + if (a != -1) + return 1; + else + return 0; +} + +unsigned int +cmp_pi64_setne(long long a) +{ + if (a != -1) + return 1; + else + return 0; +} + +__m128i +cmp_pi8_je(__m128i a, char b) { + if (b == -1) { + a[0] = a[0] + 1; + } + else { + a[0] = a[0] - 1; + } + return a; +} + +__m128i +cmp_pi16_je(__m128i a, short b) { + if (b == -1) { + a[0] = a[0] + 1; + } + else { + a[0] = a[0] - 1; + } + return a; +} + +__m128i +cmp_pi32_je(__m128i a, int b) { + if (b == -1) { + a[0] = a[0] + 1; + } + else { + a[0] = a[0] - 1; + } + return a; +} + +__m128i +cmp_pi64_je(__m128i a, long long b) { + if (b == -1) { + a[0] = a[0] + 1; + } + else { + a[0] = a[0] - 1; + } + return a; +} + +__m128i +cmp_pi8_jne(__m128i a, char b) { + if (b == -1) { + a[0] = a[0] + 1; + } + a[0] = a[0] - 4; + return a; +} + +__m128i +cmp_pi16_jne(__m128i a, short b) { + if (b == -1) { + a[0] = a[0] + 1; + } + a[0] = a[0] - 4; + return a; +} + +__m128i +cmp_pi32_jne(__m128i a, int b) { + if (b == -1) { + a[0] = a[0] + 1; + } + a[0] = a[0] - 4; + return a; +} + +__m128i +cmp_pi64_jne(__m128i a, long long b) { + if (b 
== -1) { + a[0] = a[0] + 1; + } + a[0] = a[0] - 4; + return a; +} -- 2.31.1