Plus some optimizations to remove redundant vpcmpeq instructions (x86 uses
two vpcmpeq to implement neq).

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

        PR target/101639
        * config/i386/sse.md
        (VI_AVX): New mode iterator.
        (VI_AVX2_CMP): Ditto.
        (ssebytemode): Add V16HI, V32QI, V16QI.
        (reduc_sbool_and_scal_<mode>): New expander.
        (reduc_sbool_ior_scal_<mode>): Ditto.
        (reduc_sbool_xor_scal_<mode>): Ditto.
        (*eq<mode>3_2_negate): New pre_reload splitter.
        (*ptest<mode>_ccz): Ditto.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/pr101639_reduc_mask_vdi.c: New test.
        * gcc.target/i386/pr101639_reduc_mask_vqi.c: New test.
        * gcc.target/i386/pr101639_reduc_mask_vsi.c: New test.
        * gcc.target/i386/pr101639_reduc_mask_ior_vqi.c: New test.
        * gcc.target/i386/pr101639_reduc_mask_and_vqi.c: New test.
---
 gcc/config/i386/sse.md                        | 148 +++++++++++++++++-
 .../i386/pr101639_reduc_mask_and_vqi.c        |  14 ++
 .../i386/pr101639_reduc_mask_ior_vqi.c        |  14 ++
 .../gcc.target/i386/pr101639_reduc_mask_vdi.c |  31 ++++
 .../gcc.target/i386/pr101639_reduc_mask_vqi.c |  31 ++++
 .../gcc.target/i386/pr101639_reduc_mask_vsi.c |  31 ++++
 6 files changed, 265 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_and_vqi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_ior_vqi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vdi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vqi.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vsi.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 444dc7a7cbc..cd6e10e2dc3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -569,6 +569,18 @@ (define_mode_iterator VI_AVX2
    (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
    (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
 
+(define_mode_iterator VI_AVX
+  [(V32QI "TARGET_AVX") V16QI
+   (V16HI "TARGET_AVX") V8HI
+   (V8SI "TARGET_AVX") V4SI
+   (V4DI "TARGET_AVX") V2DI])
+
+(define_mode_iterator VI_AVX2_CMP
+  [(V32QI "TARGET_AVX2") V16QI
+   (V16HI "TARGET_AVX2") V8HI
+   (V8SI "TARGET_AVX2") V4SI
+   (V4DI "TARGET_AVX2") V2DI])
+
 (define_mode_iterator VI_AVX_AVX512F
   [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
    (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
@@ -896,7 +908,8 @@ (define_mode_attr ssedoublemode
 (define_mode_attr ssebytemode
   [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")
    (V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")
-   (V8HI "V16QI")])
+   (V16HI "V32QI") (V8HI "V16QI")
+   (V32QI "V32QI") (V16QI "V16QI")])
 
 (define_mode_attr sseintconvert
   [(V32HI "w") (V16HI "w") (V8HI "w")
@@ -4095,6 +4108,88 @@ (define_expand "reduc_sbool_xor_scal_<mode>"
   DONE;
 })
 
+(define_expand "reduc_sbool_and_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+  (match_operand:VI_AVX 1 "register_operand")]
+ "TARGET_SSE4_1"
+{
+  rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+  rtx op2, tmp;
+  if (TARGET_AVX2 || <MODE_SIZE> != 32)
+    {
+      op2 = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+      tmp = gen_reg_rtx (<MODE>mode);
+      rtx op1 = gen_rtx_EQ (<MODE>mode, operands[1], op2);
+      emit_insn (gen_vec_cmp<mode><mode> (tmp, op1, operands[1], op2));
+    }
+  else
+    {
+      op2 = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));
+      tmp = gen_reg_rtx (<MODE>mode);
+      rtx ops[3] = { tmp, operands[1], op2 };
+      ix86_expand_vector_logical_operator (XOR, <MODE>mode, ops);
+    }
+
+  tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec(2, tmp, tmp), UNSPEC_PTEST);
+  emit_insn (gen_rtx_SET (flags, tmp));
+  rtx ret = gen_rtx_fmt_ee (EQ, VOIDmode, flags, const0_rtx);
+  PUT_MODE (ret, QImode);
+  emit_insn (gen_rtx_SET (operands[0], ret));
+  DONE;
+
+})
+
+(define_expand "reduc_sbool_ior_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+  (match_operand:VI_AVX 1 "register_operand")]
+ "TARGET_SSE4_1"
+{
+  rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+  rtx tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec(2, operands[1], operands[1]), UNSPEC_PTEST);
+  emit_insn (gen_rtx_SET (flags, tmp));
+  rtx ret = gen_rtx_fmt_ee (NE, VOIDmode, flags, const0_rtx);
+  PUT_MODE (ret, QImode);
+  emit_insn (gen_rtx_SET (operands[0], ret));
+  DONE;
+})
+
+(define_expand "reduc_sbool_xor_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+  (match_operand:VI1_AVX2 1 "register_operand")]
+ "TARGET_SSE2 && TARGET_POPCNT"
+{
+  rtx popcnt1 = gen_reg_rtx (SImode);
+  emit_insn (gen_<sse2_avx2>_pmovmskb (popcnt1,operands[1]));
+
+  emit_insn (gen_popcountsi2 (popcnt1, popcnt1));
+  emit_insn (gen_andsi3 (popcnt1, popcnt1, GEN_INT (0x1)));
+
+  emit_move_insn (operands[0], gen_lowpart (QImode, popcnt1));
+  DONE;
+})
+
+(define_mode_attr ssefltvecmode
+  [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
+
+(define_expand "reduc_sbool_xor_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+  (match_operand:VI48_AVX 1 "register_operand")]
+ "TARGET_SSE2 && TARGET_POPCNT"
+{
+  rtx popcnt1 = gen_reg_rtx (SImode);
+  rtx tmp = gen_rtx_UNSPEC (SImode, gen_rtvec(1,
+                                   gen_lowpart (<ssefltvecmode>mode,
+                                   operands[1])),
+                           UNSPEC_MOVMSK);
+  emit_insn (gen_rtx_SET (popcnt1, tmp));
+
+  emit_insn (gen_popcountsi2 (popcnt1, popcnt1));
+  emit_insn (gen_andsi3 (popcnt1, popcnt1, GEN_INT (0x1)));
+
+  emit_move_insn (operands[0], gen_lowpart (QImode, popcnt1));
+  DONE;
+})
+
 (define_insn "<mask_codefor>reducep<mode><mask_name><round_saeonly_name>"
   [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
        (unspec:VFH_AVX512VL
@@ -18057,6 +18152,24 @@ (define_insn "*avx2_eq<mode>3"
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
 
+(define_insn_and_split "*eq<mode>3_2_negate"
+  [(set (match_operand:VI_AVX2_CMP 0 "register_operand")
+       (eq:VI_AVX2_CMP
+         (eq:VI_AVX2_CMP
+           (eq: VI_AVX2_CMP
+             (match_operand:VI_AVX2_CMP 1 "nonimmediate_operand")
+             (match_operand:VI_AVX2_CMP 2 "general_operand"))
+           (match_operand:VI_AVX2_CMP 3 "const0_operand"))
+         (match_operand:VI_AVX2_CMP 4 "const0_operand")))]
+  "TARGET_SSE4_1 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (eq:VI_AVX2_CMP (match_dup 1)
+                       (match_dup 5)))]
+ "operands[5] = force_reg (<MODE>mode, operands[2]);")
+
+
 (define_insn_and_split "*avx2_pcmp<mode>3_1"
  [(set (match_operand:VI_128_256  0 "register_operand")
        (vec_merge:VI_128_256
@@ -23747,9 +23860,6 @@ (define_insn_and_split "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_gt"
    (set_attr "btver2_decode" "vector,vector,vector")
    (set_attr "mode" "<MODE>")])
 
-(define_mode_attr ssefltvecmode
-  [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
-
 (define_insn_and_split "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint"
   [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x")
        (unspec:<ssebytemode>
@@ -25564,6 +25674,36 @@ (define_split
                           (match_dup 0)
                           (pc)))])
 
+
+;; With X = (eq (eq op0 const0) const0), (unspec:ccz [X X] unspec_ptest)
+;; is equivalent to (unspec:ccz [op0 op0] unspec_ptest).
+(define_insn_and_split "*ptest<mode>_ccz"
+  [(set (reg:CCZ FLAGS_REG)
+       (unspec:CCZ
+         [(eq:VI_AVX
+            (eq:VI_AVX
+              (match_operand:VI_AVX 0 "vector_operand")
+              (match_operand:VI_AVX 1 "const0_operand"))
+            (match_operand:VI_AVX 2 "const0_operand"))
+          (eq:VI_AVX
+            (eq:VI_AVX (match_dup 0) (match_dup 1))
+            (match_dup 2))]
+        UNSPEC_PTEST))]
+  "TARGET_SSE4_1
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCZ FLAGS_REG)
+       (unspec:CCZ
+         [(match_dup 3) (match_dup 3)]
+        UNSPEC_PTEST))]
+{
+  if (MEM_P (operands[0]))
+    operands[3] = force_reg (<MODE>mode, operands[0]);
+  else
+    operands[3] = operands[0];
+})
+
 (define_expand "nearbyint<mode>2"
   [(set (match_operand:VFH 0 "register_operand")
        (unspec:VFH
diff --git a/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_and_vqi.c b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_and_vqi.c
new file mode 100644
index 00000000000..23fc67e8ae9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_and_vqi.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vptest" 1 } } */
+/* { dg-final { scan-assembler-times "sete" 1 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */
+
+bool f2(char * p, long n)
+{
+  bool r = true;
+  for(long i = 0; i < 32; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_ior_vqi.c b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_ior_vqi.c
new file mode 100644
index 00000000000..e1deb2fe21d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_ior_vqi.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vptest" 1 } } */
+/* { dg-final { scan-assembler-times "setne" 1 } } */
+/* { dg-final { scan-assembler-not "vpcmpeq" } } */
+
+bool f2(char * p, long n)
+{
+  bool r = false;
+  for(long i = 0; i < 32; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vdi.c b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vdi.c
new file mode 100644
index 00000000000..ee526973006
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vdi.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vptest" 2 } } */
+/* { dg-final { scan-assembler-times "sete" 1 } } */
+/* { dg-final { scan-assembler-times "setne" 1 } } */
+/* { dg-final { scan-assembler-times "popcnt" 1 } } */
+/* { dg-final { scan-assembler-times "vmovmskpd" 1 } } */
+
+bool f(long long *p, long n)
+{
+  bool r = true;
+  for(long i = 0; i < 4; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool f2(long long *p, long n)
+{
+  bool r = false;
+  for(long i = 0; i < 4; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+bool f3(long long *p, long n)
+{
+  bool r = false;
+  for(long i = 0; i < 4; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vqi.c b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vqi.c
new file mode 100644
index 00000000000..1707f15ce58
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vqi.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vptest" 2 } } */
+/* { dg-final { scan-assembler-times "sete" 1 } } */
+/* { dg-final { scan-assembler-times "setne" 1 } } */
+/* { dg-final { scan-assembler-times "popcnt" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovmskb" 1 } } */
+
+bool f(char * p, long n)
+{
+  bool r = true;
+  for(long i = 0; i < 32; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool f2(char * p, long n)
+{
+  bool r = false;
+  for(long i = 0; i < 32; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+bool f3(char * p, long n)
+{
+  bool r = false;
+  for(long i = 0; i < 32; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vsi.c b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vsi.c
new file mode 100644
index 00000000000..2d4a39f71c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vsi.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vptest" 2 } } */
+/* { dg-final { scan-assembler-times "sete" 1 } } */
+/* { dg-final { scan-assembler-times "setne" 1 } } */
+/* { dg-final { scan-assembler-times "popcnt" 1 } } */
+/* { dg-final { scan-assembler-times "vmovmskps" 1 } } */
+
+bool f(int * p, long n)
+{
+  bool r = true;
+  for(long i = 0; i < 8; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool f2(int * p, long n)
+{
+  bool r = false;
+  for(long i = 0; i < 8; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+bool f3(int * p, long n)
+{
+  bool r = false;
+  for(long i = 0; i < 8; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
-- 
2.34.1

Reply via email to