Hi all,

This patch aims to achieve EQ/NE comparison between an AVX512 kmask and -1
by using kortest and checking CF.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk?

BRs,
Lin

gcc/ChangeLog:

        PR target/113609
        * config/i386/sse.md
        (*kortest_cmp<mode>_setcc): New define_insn_and_split.
        (*kortest_cmp<mode>_jcc): Ditto.

gcc/testsuite/ChangeLog:

        PR target/113609
        * gcc.target/i386/pr113609-1.c: New test.
        * gcc.target/i386/pr113609-2.c: Ditto.
---
 gcc/config/i386/sse.md                     |  67 +++++++
 gcc/testsuite/gcc.target/i386/pr113609-1.c | 194 +++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr113609-2.c | 161 +++++++++++++++++
 3 files changed, 422 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr113609-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr113609-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b59c988fc31..34fd2e4afac 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2201,6 +2201,73 @@ (define_expand "kortest<mode>"
          UNSPEC_KORTEST))]
   "TARGET_AVX512F")
 
+;; Optimize cmp + setcc with mask register by kortest + setcc.
+(define_insn_and_split "*kortest_cmp<mode>_setcc"
+   [(set (match_operand:QI 0 "nonimmediate_operand" "=qm, qm")
+        (match_operator:QI 1 "bt_comparison_operator"
+           [(match_operand:SWI1248_AVX512BWDQ_64 2 "register_operand" "?k, <r>")
+            (const_int -1)]))
+  (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  if (MASK_REGNO_P (REGNO (operands[2])))
+    {
+      emit_insn (gen_kortest<mode>_ccc (operands[2], operands[2]));
+      operands[4] = gen_rtx_REG (CCCmode, FLAGS_REG);
+    }
+  else
+    {
+      operands[4] = gen_rtx_REG (CCZmode, FLAGS_REG);
+      emit_insn (gen_rtx_SET (operands[4],
+                             gen_rtx_COMPARE (CCZmode,
+                                              operands[2],
+                                              constm1_rtx)));
+    }
+  ix86_expand_setcc (operands[0],
+                    GET_CODE (operands[1]),
+                    operands[4],
+                    const0_rtx);
+  DONE;
+})
+
+;; Optimize cmp + jcc with mask register by kortest + jcc.
+(define_insn_and_split "*kortest_cmp<mode>_jcc"
+   [(set (pc)
+      (if_then_else
+       (match_operator 0 "bt_comparison_operator"
+         [(match_operand:SWI1248_AVX512BWDQ_64 1 "register_operand" "?k, <r>")
+          (const_int -1)])
+         (label_ref (match_operand 2))
+      (pc)))
+  (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  if (MASK_REGNO_P (REGNO (operands[1])))
+    {
+      emit_insn (gen_kortest<mode>_ccc (operands[1], operands[1]));
+      operands[4] = gen_rtx_REG (CCCmode, FLAGS_REG);
+    }
+  else
+    {
+      operands[4] = gen_rtx_REG (CCZmode, FLAGS_REG);
+      emit_insn (gen_rtx_SET (operands[4],
+                             gen_rtx_COMPARE (CCZmode,
+                                              operands[1],
+                                              constm1_rtx)));
+    }
+  ix86_expand_branch (GET_CODE (operands[0]),
+                     operands[4],
+                     const0_rtx,
+                     operands[2]);
+  DONE;
+})
+
 (define_insn "kunpckhi"
   [(set (match_operand:HI 0 "register_operand" "=k")
        (ior:HI
diff --git a/gcc/testsuite/gcc.target/i386/pr113609-1.c b/gcc/testsuite/gcc.target/i386/pr113609-1.c
new file mode 100644
index 00000000000..f0639b8500a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113609-1.c
@@ -0,0 +1,194 @@
+/* PR target/113609 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-not "^cmp" } } */
+/* { dg-final { scan-assembler-not "\[ \\t\]+sete" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "\[ \\t\]+setne" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "\[ \\t\]+je" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "\[ \\t\]+jne" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+sete" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+setne" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+je" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+jne" 2 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "kortest" 12 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "kortest" 17 { target { ! ia32 } } } } */
+
+#include <immintrin.h>
+
+unsigned int
+cmp_vector_sete_mask8(__m128i a, __m128i b)
+{
+    __mmask8 k = _mm_cmpeq_epi16_mask (a, b);
+    if (k == (__mmask8) -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_vector_sete_mask16(__m128i a, __m128i b)
+{
+    __mmask16 k = _mm_cmpeq_epi8_mask (a, b);
+    if (k == (__mmask16) -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_vector_sete_mask32(__m256i a, __m256i b)
+{
+    __mmask32 k = _mm256_cmpeq_epi8_mask (a, b);
+    if (k == (__mmask32) -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_vector_sete_mask64(__m512i a, __m512i b)
+{
+    __mmask64 k = _mm512_cmpeq_epi8_mask (a, b);
+    if (k == (__mmask64) -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_vector_setne_mask8(__m128i a, __m128i b)
+{
+    __mmask8 k = _mm_cmpeq_epi16_mask (a, b);
+    if (k != (__mmask8) -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_vector_setne_mask16(__m128i a, __m128i b)
+{
+    __mmask16 k = _mm_cmpeq_epi8_mask (a, b);
+    if (k != (__mmask16) -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_vector_setne_mask32(__m256i a, __m256i b)
+{
+    __mmask32 k = _mm256_cmpeq_epi8_mask (a, b);
+    if (k != (__mmask32) -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_vector_setne_mask64(__m512i a, __m512i b)
+{
+    __mmask64 k = _mm512_cmpeq_epi8_mask (a, b);
+    if (k != (__mmask64) -1)
+      return 1;
+    else
+      return 0;
+}
+
+__m128i
+cmp_vector_je_mask8(__m128i a, __m128i b) {
+    __mmask8 k = _mm_cmpeq_epi16_mask (a, b);
+    if (k == (__mmask8) -1) {
+       a[0] = a[0] + 1;
+    }
+    else {
+       a[0] = a[0] - 1;
+    }
+    return a; 
+}
+
+__m128i
+cmp_vector_je_mask16(__m128i a, __m128i b) {
+    __mmask16 k = _mm_cmpeq_epi8_mask (a, b);
+    if (k == (__mmask16) -1) {
+       a[0] = a[0] + 1;
+    }
+    else {
+       a[0] = a[0] - 1;
+    }
+    return a; 
+}
+
+__m256i
+cmp_vector_je_mask32(__m256i a, __m256i b) {
+    __mmask32 k = _mm256_cmpeq_epi8_mask (a, b);
+    if (k == (__mmask32) -1) {
+       a[0] = a[0] + 1;
+    }
+    else {
+       a[0] = a[0] - 1;
+    }
+    return a; 
+}
+
+__m512i
+cmp_vector_je_mask64(__m512i a, __m512i b) {
+    __mmask64 k = _mm512_cmpeq_epi8_mask (a, b);
+    if (k == (__mmask64) -1) {
+       a[0] = a[0] + 1;
+    }
+    else {
+       a[0] = a[0] - 5;
+    }
+    return a; 
+}
+
+__m128i
+cmp_vector_jne_mask8(__m128i a, __m128i b) {
+    __mmask8 k = _mm_cmpeq_epi16_mask (a, b);
+    if (k == (__mmask8) -1) {
+       a[0] = a[0] + 1;
+    }
+    a[0] = a[0] - 4;
+    return a; 
+}
+
+__m128i
+cmp_vector_jne_mask16(__m128i a, __m128i b) {
+    __mmask16 k = _mm_cmpeq_epi8_mask (a, b);
+    if (k == (__mmask16) -1) {
+       a[0] = a[0] + 1;
+    }
+    a[0] = a[0] - 4;
+    return a; 
+}
+
+__m256i
+cmp_vector_jne_mask32(__m256i a, __m256i b) {
+    __mmask32 k = _mm256_cmpeq_epi8_mask (a, b);
+    if (k == (__mmask32) -1) {
+       a[0] = a[0] + 1;
+    }
+    a[0] = a[0] - 4;
+    return a; 
+}
+
+__m512i
+cmp_vector_jne_mask64(__m512i a, __m512i b) {
+    __mmask64 k = _mm512_cmpeq_epi8_mask (a, b);
+    if (k == (__mmask64) -1) {
+       a[0] = a[0] + 1;
+    }
+    a[0] = a[0] - 4;
+    return a; 
+}
+
+__m512i
+mask_cmp_vector_jne_mask64(__m512i a, __m512i b) {
+    __mmask64 k = _mm512_mask_cmpeq_epi8_mask ((__mmask64)0xffffffefffffffff, a, b);
+    if (k == (__mmask64) -1) {
+       a[0] = a[0] + 1;
+    }
+    a[0] = a[0] - 4;
+    return a; 
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr113609-2.c b/gcc/testsuite/gcc.target/i386/pr113609-2.c
new file mode 100644
index 00000000000..e9503f51538
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113609-2.c
@@ -0,0 +1,161 @@
+/* PR target/113609 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+sete" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+setne" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+je" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+jne" 4 } } */
+
+#include <immintrin.h>
+
+unsigned int
+cmp_pi8_setcc(char a)
+{
+    if (a == -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_pi16_setcc(short a)
+{
+    if (a == -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_pi32_setcc(int a)
+{
+    if (a == -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_pi64_setcc(long long a)
+{
+    if (a == -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_pi8_setne(char a)
+{
+    if (a != -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_pi16_setne(short a)
+{
+    if (a != -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_pi32_setne(int a)
+{
+    if (a != -1)
+      return 1;
+    else
+      return 0;
+}
+
+unsigned int
+cmp_pi64_setne(long long a)
+{
+    if (a != -1)
+      return 1;
+    else
+      return 0;
+}
+
+__m128i
+cmp_pi8_je(__m128i a, char b) {
+    if (b == -1) {
+       a[0] = a[0] + 1;
+    }
+    else {
+       a[0] = a[0] - 1;
+    }
+    return a; 
+}
+
+__m128i
+cmp_pi16_je(__m128i a, short b) {
+    if (b == -1) {
+       a[0] = a[0] + 1;
+    }
+    else {
+       a[0] = a[0] - 1;
+    }
+    return a; 
+}
+
+__m128i
+cmp_pi32_je(__m128i a, int b) {
+    if (b == -1) {
+       a[0] = a[0] + 1;
+    }
+    else {
+       a[0] = a[0] - 1;
+    }
+    return a; 
+}
+
+__m128i
+cmp_pi64_je(__m128i a, long long b) {
+    if (b == -1) {
+       a[0] = a[0] + 1;
+    }
+    else {
+       a[0] = a[0] - 1;
+    }
+    return a; 
+}
+
+__m128i
+cmp_pi8_jne(__m128i a, char b) {
+    if (b == -1) {
+       a[0] = a[0] + 1;
+    }
+    a[0] = a[0] - 4;
+    return a; 
+}
+
+__m128i
+cmp_pi16_jne(__m128i a, short b) {
+    if (b == -1) {
+       a[0] = a[0] + 1;
+    }
+    a[0] = a[0] - 4;
+    return a; 
+}
+
+__m128i
+cmp_pi32_jne(__m128i a, int b) {
+    if (b == -1) {
+       a[0] = a[0] + 1;
+    }
+    a[0] = a[0] - 4;
+    return a; 
+}
+
+__m128i
+cmp_pi64_jne(__m128i a, long long b) {
+    if (b == -1) {
+       a[0] = a[0] + 1;
+    }
+    a[0] = a[0] - 4;
+    return a; 
+}
-- 
2.31.1

Reply via email to