On Tue, Dec 21, 2021 at 2:27 PM liuhongt <hongtao....@intel.com> wrote: > > The purpose of those define_insn_and_split: > 1. Combine vpcmpuw and zero_extend into vpcmpuw. > 2. Canonicalize vpcmpuw pattern so CSE can replace duplicate vpcmpuw to just > kmov > 3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant > kmov. Using DImode as the dest of zero_extend is too aggressive and causes several regressions. The new patch adds define_insn_and_split patterns that just combine vpcmpuw and zero_extend into vpcmpuw. Here's the patch I'm checking in. > > It should partially fix the issue in PR. > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ready to push to trunk. > > gcc/ChangeLog: > > PR target/103750 > * config/i386/sse.md > (*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>): > New define_insn_and_split. > (*<avx512>_cmp<mode>3): Ditto. > (*<avx512>_cmp<mode>3_zero_extenddi): New define_insn. > (*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>): > New define_insn_and_split. > (*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>): > Ditto. > (*<avx512>_ucmp<mode>3): Ditto. > (*<avx512>_ucmp<mode>3_zero_extenddi): New define_insn. > (*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>): > New define_insn_and_split. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/bitwise_mask_op-3.c: Adjust test/ > * g++.target/i386/pr103750-1.C: New test. 
> --- > gcc/config/i386/sse.md | 267 ++++++++++++++++++ > gcc/testsuite/g++.target/i386/pr103750-1.C | 50 ++++ > .../gcc.target/i386/bitwise_mask_op-3.c | 6 +- > 3 files changed, 320 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 5196149ee32..fb885d58272 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -3702,6 +3702,75 @@ (define_insn > "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>" > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > > +;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can > transfrom > +;; duplicated vpcmpuw to vpcmpuw and kmov > +;; Choose biggest mode(DImode) as dest, so kmov can be optimized by > cprop_hardreg. > +(define_insn_and_split > "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec:<V48H_AVX512VL:avx512fmaskmode> > + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") > + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<V48H_AVX512VL:cmp_imm_predicate>" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) > + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" > + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& <SWI248x:MODE>mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<V48H_AVX512VL:avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")]) > + > +(define_insn_and_split "*<avx512>_cmp<mode>3" > + [(set 
(match_operand:<avx512fmaskmode> 0 "register_operand" "=k") > + (unspec:<avx512fmaskmode> > + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") > + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<MODE>mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], > <avx512fmaskmode>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") > + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < 64" > + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_insn_and_split "*<avx512>_cmp<mode>3" > [(set (match_operand:<avx512fmaskmode> 0 "register_operand") > (not:<avx512fmaskmode> > @@ -3735,6 +3804,72 @@ (define_insn > "<avx512>_cmp<mode>3<mask_scalar_merge_name>" > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn_and_split > "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec:<VI12_AVX512VL:avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + 
(match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<VI12_AVX512VL:cmp_imm_predicate>" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) > + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" > + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& <SWI248x:MODE>mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<VI12_AVX512VL:avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")]) > + > +(define_insn_and_split "*<avx512>_cmp<mode>3" > + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") > + (unspec:<avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], > <avx512fmaskmode>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS 
(<VI12_AVX512VL:MODE>mode) < 64" > + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_int_iterator UNSPEC_PCMP_ITER > [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP]) > > @@ -3771,6 +3906,72 @@ (define_insn > "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn_and_split > "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec:<VI12_AVX512VL:avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "const_0_to_7_operand" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) > + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" > + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& <SWI248x:MODE>mode != E_DImode" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<VI12_AVX512VL:avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")]) > + > +(define_insn_and_split "*<avx512>_ucmp<mode>3" > + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") > + (unspec:<avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_UNSIGNED_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" > + "#" > + "&& 1" > + [(set 
(match_dup 0) > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], > <avx512fmaskmode>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") > + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" > + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" > [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") > (unspec:<avx512fmaskmode> > @@ -3785,6 +3986,72 @@ (define_insn > "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn_and_split > "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>" > + [(set (match_operand:SWI248x 0 "register_operand" "=k") > + (zero_extend:SWI248x > + (unspec:<VI48_AVX512VL:avx512fmaskmode> > + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") > + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "const_0_to_7_operand" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) > + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" > + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + "&& <SWI248x:MODE>mode != E_DImode" > + [(set 
(match_dup 0) > + (zero_extend:DI > + (unspec:<VI48_AVX512VL:avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")]) > + > +(define_insn_and_split "*<avx512>_ucmp<mode>3" > + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") > + (unspec:<avx512fmaskmode> > + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") > + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_UNSIGNED_PCMP))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3)] > + UNSPEC_UNSIGNED_PCMP)))] > + "operands[0] = lowpart_subreg (DImode, operands[0], > <avx512fmaskmode>mode);" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi" > + [(set (match_operand:DI 0 "register_operand" "=k") > + (zero_extend:DI > + (unspec:<avx512fmaskmode> > + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") > + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") > + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] > + UNSPEC_UNSIGNED_PCMP)))] > + "TARGET_AVX512BW > + && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64" > + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" > + [(set_attr "type" "ssecmp") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_insn_and_split "*<avx512>_ucmp<mode>3" > [(set (match_operand:<avx512fmaskmode> 0 "register_operand") > 
(not:<avx512fmaskmode> > diff --git a/gcc/testsuite/g++.target/i386/pr103750-1.C > b/gcc/testsuite/g++.target/i386/pr103750-1.C > new file mode 100644 > index 00000000000..83f471331b3 > --- /dev/null > +++ b/gcc/testsuite/g++.target/i386/pr103750-1.C > @@ -0,0 +1,50 @@ > +/* PR target/103750 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */ > +/* { dg-final { scan-assembler-times "kmovw" 2 } } */ > +/* { dg-final { scan-assembler-times "kmovd" 2 } } */ > +/* There shouldn't be any kmovw/kmovd inside the loop. */ > +#include <immintrin.h> > + > +const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept > +{ > + __m256i mch256 = _mm256_set1_epi16(c); > + for ( ; n < e; n += 32) { > + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); > + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) > + 1); > + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); > + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); > + if (_kortestz_mask16_u8(mask1, mask2)) > + continue; > + > + unsigned idx = _tzcnt_u32(mask1); > + if (mask1 == 0) { > + idx = __tzcnt_u16(mask2); > + n += 16; > + } > + return n + idx; > + } > + return e; > +} > + > +const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept > +{ > + __m256i mch256 = _mm256_set1_epi16(c); > + for ( ; n < e; n += 32) { > + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); > + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) > + 1); > + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); > + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); > + if (_kortestz_mask32_u8(mask1, mask2)) > + continue; > + > + unsigned idx = _tzcnt_u32(mask1); > + if (mask1 == 0) { > + idx = __tzcnt_u16(mask2); > + n += 16; > + } > + return n + idx; > + } > + return e; > +} > + > diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c > 
b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c > index 352c49d6c6b..82bb99e30af 100644 > --- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c > +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c > @@ -12,7 +12,7 @@ foo_orb (__m512i a, __m512i b) > foo = m1 | m2; > } > > -/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } } > */ > +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { > ! ia32 } } } } } */ > > void > foo_xorb (__m512i a, __m512i b) > @@ -22,7 +22,7 @@ foo_xorb (__m512i a, __m512i b) > foo = m1 ^ m2; > } > > -/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } } > */ > +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { > ! ia32 } } } } } */ > > void > foo_andb (__m512i a, __m512i b) > @@ -40,4 +40,4 @@ foo_andnb (__m512i a, __m512i b) > foo = m1 & ~m2; > } > > -/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } } > */ > +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { > ! ia32 } } } } } */ > -- > 2.18.1 >
-- BR, Hongtao
0001-i386-Combine-vpcmpuw-zero_extend-to-vpcmpuw.patch
Description: Binary data