Also add some optimizations to remove redundant vpcmpeq instructions (x86
uses 2 vpcmpeq to implement neq).
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
gcc/ChangeLog:
PR target/101639
* config/i386/sse.md
(VI_AVX): New mode iterator.
(VI_AVX2_CMP): Ditto.
(ssebytemode): Add V16HI, V32QI, V16QI.
(reduc_sbool_and_scal_<mode>): New expander.
(reduc_sbool_ior_scal_<mode>): Ditto.
(reduc_sbool_xor_scal_<mode>): Ditto.
(*eq<mode>3_2_negate): New pre_reload splitter.
(*ptest<mode>_ccz): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr101639_reduc_mask_vdi.c: New test.
* gcc.target/i386/pr101639_reduc_mask_vqi.c: New test.
* gcc.target/i386/pr101639_reduc_mask_vsi.c: New test.
* gcc.target/i386/pr101639_reduc_mask_ior_vqi.c: New test.
* gcc.target/i386/pr101639_reduc_mask_and_vqi.c: New test.
---
gcc/config/i386/sse.md | 148 +++++++++++++++++-
.../i386/pr101639_reduc_mask_and_vqi.c | 14 ++
.../i386/pr101639_reduc_mask_ior_vqi.c | 14 ++
.../gcc.target/i386/pr101639_reduc_mask_vdi.c | 31 ++++
.../gcc.target/i386/pr101639_reduc_mask_vqi.c | 31 ++++
.../gcc.target/i386/pr101639_reduc_mask_vsi.c | 31 ++++
6 files changed, 265 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_and_vqi.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_ior_vqi.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vdi.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vqi.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vsi.c
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 444dc7a7cbc..cd6e10e2dc3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -569,6 +569,18 @@ (define_mode_iterator VI_AVX2
(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
+;; Integer vector modes used by the PTEST-based boolean reductions; the
+;; V32QI, V16HI, V8SI and V4DI entries are enabled only under TARGET_AVX.
+(define_mode_iterator VI_AVX
+ [(V32QI "TARGET_AVX") V16QI
+ (V16HI "TARGET_AVX") V8HI
+ (V8SI "TARGET_AVX") V4SI
+ (V4DI "TARGET_AVX") V2DI])
+
+;; Same mode list as VI_AVX, except that the V32QI, V16HI, V8SI and V4DI
+;; entries require TARGET_AVX2.  Used by the *eq<mode>3_2_negate splitter.
+(define_mode_iterator VI_AVX2_CMP
+ [(V32QI "TARGET_AVX2") V16QI
+ (V16HI "TARGET_AVX2") V8HI
+ (V8SI "TARGET_AVX2") V4SI
+ (V4DI "TARGET_AVX2") V2DI])
+
(define_mode_iterator VI_AVX_AVX512F
[(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
@@ -896,7 +908,8 @@ (define_mode_attr ssedoublemode
(define_mode_attr ssebytemode
[(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")
(V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")
- (V8HI "V16QI")])
+ (V16HI "V32QI") (V8HI "V16QI")
+ (V32QI "V32QI") (V16QI "V16QI")])
(define_mode_attr sseintconvert
[(V32HI "w") (V16HI "w") (V8HI "w")
@@ -4095,6 +4108,88 @@ (define_expand "reduc_sbool_xor_scal_<mode>"
DONE;
})
+;; Expand an AND reduction of the vector in operand 1 into the QImode
+;; register in operand 0.  A probe vector is derived from operand 1, PTESTed
+;; against itself into the CCZ flags, and operand 0 receives (eq flags 0).
+;; NOTE(review): presumably every element of operand 1 is all-ones or
+;; all-zero, which is what makes the two probe forms below agree -- confirm.
+(define_expand "reduc_sbool_and_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:VI_AVX 1 "register_operand")]
+  "TARGET_SSE4_1"
+{
+  rtx src = operands[1];
+  rtx probe;
+
+  if (<MODE_SIZE> == 32 && !TARGET_AVX2)
+    {
+      /* 32-byte mode without AVX2: build the probe as SRC XOR all-ones
+         rather than with an integer compare.  */
+      rtx all_ones = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));
+      probe = gen_reg_rtx (<MODE>mode);
+      rtx xor_ops[3] = { probe, src, all_ones };
+      ix86_expand_vector_logical_operator (XOR, <MODE>mode, xor_ops);
+    }
+  else
+    {
+      /* Otherwise the probe is the result of comparing SRC with zero.  */
+      rtx zero = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+      probe = gen_reg_rtx (<MODE>mode);
+      rtx cmp = gen_rtx_EQ (<MODE>mode, src, zero);
+      emit_insn (gen_vec_cmp<mode><mode> (probe, cmp, src, zero));
+    }
+
+  /* flags = PTEST (probe, probe); operands[0] = (flags == 0).  */
+  rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+  rtx ptest = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, probe, probe),
+                              UNSPEC_PTEST);
+  emit_insn (gen_rtx_SET (flags, ptest));
+  emit_insn (gen_rtx_SET (operands[0],
+                          gen_rtx_EQ (QImode, flags, const0_rtx)));
+  DONE;
+})
+
+;; Expand an OR reduction of the vector in operand 1 into the QImode
+;; register in operand 0: PTEST operand 1 against itself into the CCZ flags
+;; and store (ne flags 0), so no compare of the input is emitted here.
+(define_expand "reduc_sbool_ior_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:VI_AVX 1 "register_operand")]
+  "TARGET_SSE4_1"
+{
+  rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+  rtx ptest = gen_rtx_UNSPEC (CCZmode,
+                              gen_rtvec (2, operands[1], operands[1]),
+                              UNSPEC_PTEST);
+  emit_insn (gen_rtx_SET (flags, ptest));
+  /* Build the QImode comparison directly instead of patching the mode of
+     a VOIDmode rtx afterwards.  */
+  rtx ret = gen_rtx_NE (QImode, flags, const0_rtx);
+  emit_insn (gen_rtx_SET (operands[0], ret));
+  DONE;
+})
+
+;; Expand an XOR reduction of the byte vector in operand 1 into the QImode
+;; register in operand 0: take the pmovmskb bitmask, count its set bits and
+;; keep the low bit of the count.
+(define_expand "reduc_sbool_xor_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:VI1_AVX2 1 "register_operand")]
+  "TARGET_SSE2 && TARGET_POPCNT"
+{
+  /* A single SImode pseudo carries the value through every step.  */
+  rtx acc = gen_reg_rtx (SImode);
+
+  emit_insn (gen_<sse2_avx2>_pmovmskb (acc, operands[1]));
+  emit_insn (gen_popcountsi2 (acc, acc));
+  emit_insn (gen_andsi3 (acc, acc, const1_rtx));
+
+  emit_move_insn (operands[0], gen_lowpart (QImode, acc));
+  DONE;
+})
+
+;; Float vector mode paired with each listed integer vector mode
+;; (V2DI -> V2DF, V4DI -> V4DF, V4SI -> V4SF, V8SI -> V8SF).
+(define_mode_attr ssefltvecmode
+ [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
+
+;; Expand an XOR reduction of the SI/DI-element vector in operand 1 into the
+;; QImode register in operand 0: view the input in its <ssefltvecmode>,
+;; take an UNSPEC_MOVMSK of it, count the set bits and keep the low bit of
+;; the count.
+(define_expand "reduc_sbool_xor_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:VI48_AVX 1 "register_operand")]
+  "TARGET_SSE2 && TARGET_POPCNT"
+{
+  /* A single SImode pseudo carries the value through every step.  */
+  rtx acc = gen_reg_rtx (SImode);
+  rtx fp_view = gen_lowpart (<ssefltvecmode>mode, operands[1]);
+  rtx movmsk = gen_rtx_UNSPEC (SImode, gen_rtvec (1, fp_view),
+                               UNSPEC_MOVMSK);
+
+  emit_insn (gen_rtx_SET (acc, movmsk));
+  emit_insn (gen_popcountsi2 (acc, acc));
+  emit_insn (gen_andsi3 (acc, acc, const1_rtx));
+
+  emit_move_insn (operands[0], gen_lowpart (QImode, acc));
+  DONE;
+})
+
(define_insn "<mask_codefor>reducep<mode><mask_name><round_saeonly_name>"
[(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
(unspec:VFH_AVX512VL
@@ -18057,6 +18152,24 @@ (define_insn "*avx2_eq<mode>3"
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
+;; Pre-reload splitter: a compare (eq op1 op2) wrapped in two further
+;; EQ-with-zero operations is replaced by the inner compare alone, with
+;; op2 forced into a register (operand 5).
+;; NOTE(review): relies on a vector EQ result being all-ones or zero per
+;; element, so that comparing it with zero twice restores it -- confirm.
+(define_insn_and_split "*eq<mode>3_2_negate"
+  [(set (match_operand:VI_AVX2_CMP 0 "register_operand")
+        (eq:VI_AVX2_CMP
+          (eq:VI_AVX2_CMP
+            (eq:VI_AVX2_CMP
+              (match_operand:VI_AVX2_CMP 1 "nonimmediate_operand")
+              (match_operand:VI_AVX2_CMP 2 "general_operand"))
+            (match_operand:VI_AVX2_CMP 3 "const0_operand"))
+          (match_operand:VI_AVX2_CMP 4 "const0_operand")))]
+  "TARGET_SSE4_1 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+        (eq:VI_AVX2_CMP (match_dup 1)
+                        (match_dup 5)))]
+  "operands[5] = force_reg (<MODE>mode, operands[2]);")
+
+
(define_insn_and_split "*avx2_pcmp<mode>3_1"
[(set (match_operand:VI_128_256 0 "register_operand")
(vec_merge:VI_128_256
@@ -23747,9 +23860,6 @@ (define_insn_and_split
"*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_gt"
(set_attr "btver2_decode" "vector,vector,vector")
(set_attr "mode" "<MODE>")])
-(define_mode_attr ssefltvecmode
- [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
-
(define_insn_and_split
"*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint"
[(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x")
(unspec:<ssebytemode>
@@ -25564,6 +25674,36 @@ (define_split
(match_dup 0)
(pc)))])
+
+;; Pre-reload splitter that drops a double EQ-with-zero from both inputs of
+;; a PTEST whose result is only used in CCZ mode:
+;;   (unspec:ccz [(eq (eq op0 const0) const0)
+;;                (eq (eq op0 const0) const0)] unspec_ptest)
+;; is equal to (unspec:ccz [op0 op0] unspec_ptest).
+;; Rationale (assuming PTEST in CCZ mode only reports whether the AND of its
+;; two inputs is all-zero): (eq (eq op0 const0) const0) is nonzero in exactly
+;; the elements where op0 is nonzero, so both forms are all-zero together.
+(define_insn_and_split "*ptest<mode>_ccz"
+ [(set (reg:CCZ FLAGS_REG)
+ (unspec:CCZ
+ [(eq:VI_AVX
+ (eq:VI_AVX
+ (match_operand:VI_AVX 0 "vector_operand")
+ (match_operand:VI_AVX 1 "const0_operand"))
+ (match_operand:VI_AVX 2 "const0_operand"))
+ (eq:VI_AVX
+ (eq:VI_AVX (match_dup 0) (match_dup 1))
+ (match_dup 2))]
+ UNSPEC_PTEST))]
+ "TARGET_SSE4_1
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (reg:CCZ FLAGS_REG)
+ (unspec:CCZ
+ [(match_dup 3) (match_dup 3)]
+ UNSPEC_PTEST))]
+{
+ /* Copy a memory operand into a register; any other operand is reused
+    unchanged as operand 3.  */
+ if (MEM_P (operands[0]))
+ operands[3] = force_reg (<MODE>mode, operands[0]);
+ else
+ operands[3] = operands[0];
+})
+
(define_expand "nearbyint<mode>2"
[(set (match_operand:VFH 0 "register_operand")
(unspec:VFH
diff --git a/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_and_vqi.c
b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_and_vqi.c
new file mode 100644
index 00000000000..23fc67e8ae9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_and_vqi.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vptest" 1 } } */
+/* { dg-final { scan-assembler-times "sete" 1 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */
+
+bool f2(char * p, long n) /* AND-reduces (p[i] != 0) over p[0..31]; n is unused.  */
+{
+ bool r = true;
+ for(long i = 0; i < 32; ++i)
+ r &= (p[i] != 0);
+ return r;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_ior_vqi.c
b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_ior_vqi.c
new file mode 100644
index 00000000000..e1deb2fe21d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_ior_vqi.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vptest" 1 } } */
+/* { dg-final { scan-assembler-times "setne" 1 } } */
+/* { dg-final { scan-assembler-not "vpcmpeq" } } */
+
+bool f2(char * p, long n) /* OR-reduces (p[i] != 0) over p[0..31]; n is unused.  */
+{
+ bool r = false;
+ for(long i = 0; i < 32; ++i)
+ r |= (p[i] != 0);
+ return r;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vdi.c
b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vdi.c
new file mode 100644
index 00000000000..ee526973006
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vdi.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vptest" 2 } } */
+/* { dg-final { scan-assembler-times "sete" 1 } } */
+/* { dg-final { scan-assembler-times "setne" 1 } } */
+/* { dg-final { scan-assembler-times "popcnt" 1 } } */
+/* { dg-final { scan-assembler-times "vmovmskpd" 1 } } */
+
+bool f(long long *p, long n) /* AND-reduces (p[i] != 0) over p[0..3]; n is unused.  */
+{
+ bool r = true;
+ for(long i = 0; i < 4; ++i)
+ r &= (p[i] != 0);
+ return r;
+}
+
+bool f2(long long *p, long n) /* OR-reduces (p[i] != 0) over p[0..3]; n is unused.  */
+{
+ bool r = false;
+ for(long i = 0; i < 4; ++i)
+ r |= (p[i] != 0);
+ return r;
+}
+
+bool f3(long long *p, long n) /* XOR-reduces (p[i] != 0) over p[0..3]; n is unused.  */
+{
+ bool r = false;
+ for(long i = 0; i < 4; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vqi.c
b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vqi.c
new file mode 100644
index 00000000000..1707f15ce58
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vqi.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vptest" 2 } } */
+/* { dg-final { scan-assembler-times "sete" 1 } } */
+/* { dg-final { scan-assembler-times "setne" 1 } } */
+/* { dg-final { scan-assembler-times "popcnt" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovmskb" 1 } } */
+
+bool f(char * p, long n) /* AND-reduces (p[i] != 0) over p[0..31]; n is unused.  */
+{
+ bool r = true;
+ for(long i = 0; i < 32; ++i)
+ r &= (p[i] != 0);
+ return r;
+}
+
+bool f2(char * p, long n) /* OR-reduces (p[i] != 0) over p[0..31]; n is unused.  */
+{
+ bool r = false;
+ for(long i = 0; i < 32; ++i)
+ r |= (p[i] != 0);
+ return r;
+}
+
+bool f3(char * p, long n) /* XOR-reduces (p[i] != 0) over p[0..31]; n is unused.  */
+{
+ bool r = false;
+ for(long i = 0; i < 32; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vsi.c
b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vsi.c
new file mode 100644
index 00000000000..2d4a39f71c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101639_reduc_mask_vsi.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2" } */
+/* { dg-final { scan-assembler-times "vptest" 2 } } */
+/* { dg-final { scan-assembler-times "sete" 1 } } */
+/* { dg-final { scan-assembler-times "setne" 1 } } */
+/* { dg-final { scan-assembler-times "popcnt" 1 } } */
+/* { dg-final { scan-assembler-times "vmovmskps" 1 } } */
+
+bool f(int * p, long n) /* AND-reduces (p[i] != 0) over p[0..7]; n is unused.  */
+{
+ bool r = true;
+ for(long i = 0; i < 8; ++i)
+ r &= (p[i] != 0);
+ return r;
+}
+
+bool f2(int * p, long n) /* OR-reduces (p[i] != 0) over p[0..7]; n is unused.  */
+{
+ bool r = false;
+ for(long i = 0; i < 8; ++i)
+ r |= (p[i] != 0);
+ return r;
+}
+
+bool f3(int * p, long n) /* XOR-reduces (p[i] != 0) over p[0..7]; n is unused.  */
+{
+ bool r = false;
+ for(long i = 0; i < 8; ++i)
+ r ^= (p[i] != 0);
+ return r;
+}
--
2.34.1