> Am 27.10.2023 um 09:13 schrieb Hongtao Liu <crazy...@gmail.com>:
> 
> On Fri, Oct 27, 2023 at 2:49 PM Richard Biener
> <richard.guent...@gmail.com> wrote:
>> 
>> 
>> 
>>>> Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao....@intel.com>:
>>> 
>>> When 2 vectors are equal, kmask is all ones and kortest will set CF,
>>> else CF will be cleared.
>>> 
>>> So CF bit can be used to check for the result of the comparison.
>>> 
>>> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>>> Ok for trunk?
>> 
>> Is that also profitable for 256bit aka AVX10?
> Yes, it's also available for both 128-bit and 256-bit with AVX10; from a
> performance perspective it's better.
> AVX10:
>  vpcmp + kortest
> vs
> AVX2:
> vpxor + vptest
> 
> vptest is more expensive than vpcmp + kortest
> 
>> Is there a jump on carry in case the result feeds control flow rather than a 
>> value and is using ktest better then (does combine figure this out?)
> There are JC and JNC, there're many pattern matches for ptest which
> can't be automatically adjusted to kortest by combining, backend needs
> to manually transform them.
> That's why my patch only handles 64-byte (512-bit) vectors (to avoid
> regressing those ptest pattern matches).

Ah, I see.  That’s exactly what I was wondering.

Richard 

> 
>> 
>>> Before:
>>>       vmovdqu (%rsi), %ymm0
>>>       vpxorq  (%rdi), %ymm0, %ymm0
>>>       vptest  %ymm0, %ymm0
>>>       jne     .L2
>>>       vmovdqu 32(%rsi), %ymm0
>>>       vpxorq  32(%rdi), %ymm0, %ymm0
>>>       vptest  %ymm0, %ymm0
>>>       je      .L5
>>> .L2:
>>>       movl    $1, %eax
>>>       xorl    $1, %eax
>>>       vzeroupper
>>>       ret
>>> 
>>> After:
>>>       vmovdqu64       (%rsi), %zmm0
>>>       xorl    %eax, %eax
>>>       vpcmpeqd        (%rdi), %zmm0, %k0
>>>       kortestw        %k0, %k0
>>>       setc    %al
>>>       vzeroupper
>>>       ret
>>> 
>>> gcc/ChangeLog:
>>> 
>>>   PR target/104610
>>>   * config/i386/i386-expand.cc (ix86_expand_branch): Handle
>>>   512-bit vector with vpcmpeq + kortest.
>>>   * config/i386/i386.md (cbranchxi4): New expander.
>>>   * config/i386/sse.md (cbranch<mode>4): Extend to V16SImode
>>>   and V8DImode.
>>> 
>>> gcc/testsuite/ChangeLog:
>>> 
>>>   * gcc.target/i386/pr104610-2.c: New test.
>>> ---
>>> gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
>>> gcc/config/i386/i386.md                    | 16 +++++++
>>> gcc/config/i386/sse.md                     | 36 +++++++++++---
>>> gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
>>> 4 files changed, 99 insertions(+), 22 deletions(-)
>>> create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> 
>>> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
>>> index 1eae9d7c78c..c664cb61e80 100644
>>> --- a/gcc/config/i386/i386-expand.cc
>>> +++ b/gcc/config/i386/i386-expand.cc
>>> @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
>>>  rtx tmp;
>>> 
>>>  /* Handle special case - vector comparsion with boolean result, transform
>>> -     it using ptest instruction.  */
>>> +     it using ptest instruction or vpcmpeq + kortest.  */
>>>  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
>>>      || (mode == TImode && !TARGET_64BIT)
>>> -      || mode == OImode)
>>> +      || mode == OImode
>>> +      || GET_MODE_SIZE (mode) == 64)
>>>    {
>>> -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
>>> -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
>>> +      unsigned msize = GET_MODE_SIZE (mode);
>>> +      machine_mode p_mode
>>> +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
>>> +      /* kortest sets CF when the result is 0xFFFF (op0 == op1).  */
>>> +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
>>> 
>>>      gcc_assert (code == EQ || code == NE);
>>> 
>>> -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
>>> +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
>>> +      if (msize == 64)
>>>   {
>>> -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> -      mode = p_mode;
>>> +      if (mode != V16SImode)
>>> +        {
>>> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> +        }
>>> +
>>> +      tmp = gen_reg_rtx (HImode);
>>> +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
>>> +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
>>> +    }
>>> +      /* Using ptest for 128/256-bit vectors.  */
>>> +      else
>>> +    {
>>> +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
>>> +        {
>>> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
>>> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
>>> +          mode = p_mode;
>>> +        }
>>> +
>>> +      /* Generate XOR since we can't check that one operand is zero
>>> +         vector.  */
>>> +      tmp = gen_reg_rtx (mode);
>>> +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
>>> +      tmp = gen_lowpart (p_mode, tmp);
>>> +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
>>> +                  gen_rtx_UNSPEC (CCZmode,
>>> +                          gen_rtvec (2, tmp, tmp),
>>> +                          UNSPEC_PTEST)));
>>>   }
>>> -      /* Generate XOR since we can't check that one operand is zero vector.  */
>>> -      tmp = gen_reg_rtx (mode);
>>> -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
>>> -      tmp = gen_lowpart (p_mode, tmp);
>>> -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
>>> -                  gen_rtx_UNSPEC (CCZmode,
>>> -                          gen_rtvec (2, tmp, tmp),
>>> -                          UNSPEC_PTEST)));
>>>      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
>>>      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
>>>                 gen_rtx_LABEL_REF (VOIDmode, label),
>>> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
>>> index abaf2f311e8..51d8d0c3b97 100644
>>> --- a/gcc/config/i386/i386.md
>>> +++ b/gcc/config/i386/i386.md
>>> @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
>>>  DONE;
>>> })
>>> 
>>> +(define_expand "cbranchxi4"
>>> +  [(set (reg:CC FLAGS_REG)
>>> +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
>>> +            (match_operand:XI 2 "nonimmediate_operand")))
>>> +   (set (pc) (if_then_else
>>> +           (match_operator 0 "bt_comparison_operator"
>>> +        [(reg:CC FLAGS_REG) (const_int 0)])
>>> +           (label_ref (match_operand 3))
>>> +           (pc)))]
>>> +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
>>> +{
>>> +  ix86_expand_branch (GET_CODE (operands[0]),
>>> +              operands[1], operands[2], operands[3]);
>>> +  DONE;
>>> +})
>>> +
>>> (define_expand "cstore<mode>4"
>>>  [(set (reg:CC FLAGS_REG)
>>>   (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
>>> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>>> index c988935d4df..88fb1154699 100644
>>> --- a/gcc/config/i386/sse.md
>>> +++ b/gcc/config/i386/sse.md
>>> @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
>>>   (set_attr "type" "msklog")
>>>   (set_attr "prefix" "vex")])
>>> 
>>> -(define_insn "kortest<mode>"
>>> -  [(set (reg:CC FLAGS_REG)
>>> -    (unspec:CC
>>> +(define_insn "*kortest<mode>"
>>> +  [(set (reg FLAGS_REG)
>>> +    (unspec
>>>     [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
>>>      (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
>>>     UNSPEC_KORTEST))]
>>> @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
>>>   (set_attr "type" "msklog")
>>>   (set_attr "prefix" "vex")])
>>> 
>>> +(define_insn "kortest<mode>_ccc"
>>> +  [(set (reg:CCC FLAGS_REG)
>>> +    (unspec:CCC
>>> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> +      UNSPEC_KORTEST))]
>>> +  "TARGET_AVX512F")
>>> +
>>> +(define_insn "kortest<mode>_ccz"
>>> +  [(set (reg:CCZ FLAGS_REG)
>>> +    (unspec:CCZ
>>> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> +      UNSPEC_KORTEST))]
>>> +  "TARGET_AVX512F")
>>> +
>>> +(define_expand "kortest<mode>"
>>> +  [(set (reg:CC FLAGS_REG)
>>> +    (unspec:CC
>>> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
>>> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
>>> +      UNSPEC_KORTEST))]
>>> +  "TARGET_AVX512F")
>>> +
>>> (define_insn "kunpckhi"
>>>  [(set (match_operand:HI 0 "register_operand" "=k")
>>>   (ior:HI
>>> @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
>>> 
>>> (define_expand "cbranch<mode>4"
>>>  [(set (reg:CC FLAGS_REG)
>>> -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
>>> -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
>>> +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
>>> +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
>>>   (set (pc) (if_then_else
>>>          (match_operator 0 "bt_comparison_operator"
>>>       [(reg:CC FLAGS_REG) (const_int 0)])
>>>          (label_ref (match_operand 3))
>>>          (pc)))]
>>> -  "TARGET_SSE4_1"
>>> +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
>>> {
>>>  ix86_expand_branch (GET_CODE (operands[0]),
>>>             operands[1], operands[2], operands[3]);
>>> diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> new file mode 100644
>>> index 00000000000..999ef926a18
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
>>> @@ -0,0 +1,14 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
>>> +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
>>> +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
>>> +
>>> +int compare (const char* s1, const char* s2)
>>> +{
>>> +  return __builtin_memcmp (s1, s2, 64) == 0;
>>> +}
>>> +
>>> +int compare1 (const char* s1, const char* s2)
>>> +{
>>> +  return __builtin_memcmp (s1, s2, 64) != 0;
>>> +}
>>> --
>>> 2.31.1
>>> 
> 
> 
> 
> -- 
> BR,
> Hongtao

Reply via email to