> Am 27.10.2023 um 09:13 schrieb Hongtao Liu <crazy...@gmail.com>:
>
> On Fri, Oct 27, 2023 at 2:49 PM Richard Biener
> <richard.guent...@gmail.com> wrote:
>>
>>
>>
>>>> Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao....@intel.com>:
>>>
>>> When 2 vectors are equal, kmask is allones and kortest will set CF,
>>> else CF will be cleared.
>>>
>>> So CF bit can be used to check for the result of the comparison.
>>>
>>> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>>> Ok for trunk?
>>
>> Is that also profitable for 256bit aka AVX10?
> Yes, it's also available for both 128-bit and 256-bit with AVX10, from
> performance perspective it's better.
> AVX10:
> vpcmp + kortest
> vs
> AVX2:
> vpxor + vptest
>
> vptest is more expensive than vpcmp + kortest
>
>> Is there a jump on carry in case the result feeds control flow rather than a
>> value and is using ktest better then (does combine figure this out?)
> There are JC and JNC, there're many pattern matches for ptest which
> can't be automatically adjusted to kortest by combining, backend needs
> to manually transform them.
> That's why my patch only handles 64-bit vectors(to avoid regressing
> those pattern match stuff).
Ah, I see. That’s exactly what I was wondering.
Richard
>
>>
>>> Before:
>>> vmovdqu (%rsi), %ymm0
>>> vpxorq (%rdi), %ymm0, %ymm0
>>> vptest %ymm0, %ymm0
>>> jne .L2
>>> vmovdqu 32(%rsi), %ymm0
>>> vpxorq 32(%rdi), %ymm0, %ymm0
>>> vptest %ymm0, %ymm0
>>> je .L5
>>> .L2:
>>> movl $1, %eax
>>> xorl $1, %eax
>>> vzeroupper
>>> ret
>>>
>>> After:
>>> vmovdqu64 (%rsi), %zmm0
>>> xorl %eax, %eax
>>> vpcmpeqd (%rdi), %zmm0, %k0
>>> kortestw %k0, %k0
>>> setc %al
>>> vzeroupper
>>> ret
>>>
>>> gcc/ChangeLog:
>>>
>>> PR target/104610
>>> * config/i386/i386-expand.cc (ix86_expand_branch): Handle
>>> 512-bit vector with vpcmpeq + kortest.
>>> * config/i386/i386.md (cbranchxi4): New expander.
>>> * config/i386/sse.md: (cbranch<mode>4): Extend to V16SImode
>>> and V8DImode.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> * gcc.target/i386/pr104610-2.c: New test.
>>> ---
>>> gcc/config/i386/i386-expand.cc | 55 +++++++++++++++-------
>>> gcc/config/i386/i386.md | 16 +++++++
>>> gcc/config/i386/sse.md | 36 +++++++++++---
>>> gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
>>> 4 files changed, 99 insertions(+), 22 deletions(-)
>>> create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
>>>
>>> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
>>> index 1eae9d7c78c..c664cb61e80 100644
>>> --- a/gcc/config/i386/i386-expand.cc
>>> +++ b/gcc/config/i386/i386-expand.cc
>>> @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0,
>>> rtx op1, rtx label)
>>> rtx tmp;
>>>
>>> /* Handle special case - vector comparsion with boolean result, transform
>>> - it using ptest instruction. */
>>> + it using ptest instruction or vpcmpeq + kortest. */
>>> if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
>>> || (mode == TImode && !TARGET_64BIT)
>>> - || mode == OImode)
>>> + || mode == OImode
>>> + || GET_MODE_SIZE (mode) == 64)
>>> {
>>> - rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
>>> - machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode :
>>> V2DImode;
>>> + unsigned msize = GET_MODE_SIZE (mode);
>>> + machine_mode p_mode
>>> + = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
>>> + /* kortest set CF when result is 0xFFFF (op0 == op1). */
>>> + rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
>>>
>>> gcc_assert (code == EQ || code == NE);
>>>
>>> - if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
>>> + /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors. */
>>> + if (msize == 64)
>>> {
>>> - op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> - op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> - mode = p_mode;
>>> + if (mode != V16SImode)
>>> + {
>>> + op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> + op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> + }
>>> +
>>> + tmp = gen_reg_rtx (HImode);
>>> + emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
>>> + emit_insn (gen_kortesthi_ccc (tmp, tmp));
>>> + }
>>> + /* Using ptest for 128/256-bit vectors. */
>>> + else
>>> + {
>>> + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
>>> + {
>>> + op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> + op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> + mode = p_mode;
>>> + }
>>> +
>>> + /* Generate XOR since we can't check that one operand is zero
>>> + vector. */
>>> + tmp = gen_reg_rtx (mode);
>>> + emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
>>> + tmp = gen_lowpart (p_mode, tmp);
>>> + emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
>>> + gen_rtx_UNSPEC (CCZmode,
>>> + gen_rtvec (2, tmp, tmp),
>>> + UNSPEC_PTEST)));
>>> }
>>> - /* Generate XOR since we can't check that one operand is zero
>>> vector. */
>>> - tmp = gen_reg_rtx (mode);
>>> - emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
>>> - tmp = gen_lowpart (p_mode, tmp);
>>> - emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
>>> - gen_rtx_UNSPEC (CCZmode,
>>> - gen_rtvec (2, tmp, tmp),
>>> - UNSPEC_PTEST)));
>>> tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
>>> tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
>>> gen_rtx_LABEL_REF (VOIDmode, label),
>>> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
>>> index abaf2f311e8..51d8d0c3b97 100644
>>> --- a/gcc/config/i386/i386.md
>>> +++ b/gcc/config/i386/i386.md
>>> @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
>>> DONE;
>>> })
>>>
>>> +(define_expand "cbranchxi4"
>>> + [(set (reg:CC FLAGS_REG)
>>> + (compare:CC (match_operand:XI 1 "nonimmediate_operand")
>>> + (match_operand:XI 2 "nonimmediate_operand")))
>>> + (set (pc) (if_then_else
>>> + (match_operator 0 "bt_comparison_operator"
>>> + [(reg:CC FLAGS_REG) (const_int 0)])
>>> + (label_ref (match_operand 3))
>>> + (pc)))]
>>> + "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
>>> +{
>>> + ix86_expand_branch (GET_CODE (operands[0]),
>>> + operands[1], operands[2], operands[3]);
>>> + DONE;
>>> +})
>>> +
>>> (define_expand "cstore<mode>4"
>>> [(set (reg:CC FLAGS_REG)
>>> (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
>>> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>>> index c988935d4df..88fb1154699 100644
>>> --- a/gcc/config/i386/sse.md
>>> +++ b/gcc/config/i386/sse.md
>>> @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
>>> (set_attr "type" "msklog")
>>> (set_attr "prefix" "vex")])
>>>
>>> -(define_insn "kortest<mode>"
>>> - [(set (reg:CC FLAGS_REG)
>>> - (unspec:CC
>>> +(define_insn "*kortest<mode>"
>>> + [(set (reg FLAGS_REG)
>>> + (unspec
>>> [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
>>> (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
>>> UNSPEC_KORTEST))]
>>> @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
>>> (set_attr "type" "msklog")
>>> (set_attr "prefix" "vex")])
>>>
>>> +(define_insn "kortest<mode>_ccc"
>>> + [(set (reg:CCC FLAGS_REG)
>>> + (unspec:CCC
>>> + [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> + (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> + UNSPEC_KORTEST))]
>>> + "TARGET_AVX512F")
>>> +
>>> +(define_insn "kortest<mode>_ccz"
>>> + [(set (reg:CCZ FLAGS_REG)
>>> + (unspec:CCZ
>>> + [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> + (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> + UNSPEC_KORTEST))]
>>> + "TARGET_AVX512F")
>>> +
>>> +(define_expand "kortest<mode>"
>>> + [(set (reg:CC FLAGS_REG)
>>> + (unspec:CC
>>> + [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> + (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> + UNSPEC_KORTEST))]
>>> + "TARGET_AVX512F")
>>> +
>>> (define_insn "kunpckhi"
>>> [(set (match_operand:HI 0 "register_operand" "=k")
>>> (ior:HI
>>> @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
>>>
>>> (define_expand "cbranch<mode>4"
>>> [(set (reg:CC FLAGS_REG)
>>> - (compare:CC (match_operand:VI48_AVX 1 "register_operand")
>>> - (match_operand:VI48_AVX 2 "nonimmediate_operand")))
>>> + (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
>>> + (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
>>> (set (pc) (if_then_else
>>> (match_operator 0 "bt_comparison_operator"
>>> [(reg:CC FLAGS_REG) (const_int 0)])
>>> (label_ref (match_operand 3))
>>> (pc)))]
>>> - "TARGET_SSE4_1"
>>> + "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
>>> {
>>> ix86_expand_branch (GET_CODE (operands[0]),
>>> operands[1], operands[2], operands[3]);
>>> diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> b/gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> new file mode 100644
>>> index 00000000000..999ef926a18
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> @@ -0,0 +1,14 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
>>> +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
>>> +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
>>> +
>>> +int compare (const char* s1, const char* s2)
>>> +{
>>> + return __builtin_memcmp (s1, s2, 64) == 0;
>>> +}
>>> +
>>> +int compare1 (const char* s1, const char* s2)
>>> +{
>>> + return __builtin_memcmp (s1, s2, 64) != 0;
>>> +}
>>> --
>>> 2.31.1
>>>
>
>
>
> --
> BR,
> Hongtao