On Mon, Jun 21, 2021 at 3:28 PM Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On Mon, Jun 21, 2021 at 6:56 AM liuhongt <hongtao....@intel.com> wrote: > > > > The avx512 supports bitwise operations with mask registers, but the > > throughput of those instructions is much lower than that of the > > corresponding gpr version, so we would additionally disparages > > slightly the mask register alternative for bitwise operations in the > > LRA. > > This was the reason for UNSPEC tagged instructions with mask > registers, used mainly for builtins. > > Also, care should be taken if we want mask registers to be used under > GPR pressure, or it is better to spill GPR registers. In the past, > DImode values caused a lot of problems with MMX registers on x86-64, > but we were able to hand-tune the allocator in the way you propose. > > Let's try the proposed approach to see what happens. > > > Also when allocano cost of GENERAL_REGS is same as MASK_REGS, allocate > > MASK_REGS first since it has already been disparaged. > > > > gcc/ChangeLog: > > > > PR target/101142 > > * config/i386/i386.md: (*anddi_1): Disparage slightly the mask > > register alternative. > > (*and<mode>_1): Ditto. > > (*andqi_1): Ditto. > > (*andn<mode>_1): Ditto. > > (*<code><mode>_1): Ditto. > > (*<code>qi_1): Ditto. > > (*one_cmpl<mode>2_1): Ditto. > > (*one_cmplsi2_1_zext): Ditto. > > (*one_cmplqi2_1): Ditto. > > * config/i386/i386.c (x86_order_regs_for_local_alloc): Change > > the order of mask registers to be before general registers. > > > > gcc/testsuite/ChangeLog: > > > > PR target/101142 > > * gcc.target/i386/spill_to_mask-1.c: Adjust testcase. > > * gcc.target/i386/spill_to_mask-2.c: Adjust testcase. > > * gcc.target/i386/spill_to_mask-3.c: Adjust testcase. > > * gcc.target/i386/spill_to_mask-4.c: Adjust testcase. > > OK with a comment addition, see inline. > > Thanks, > Uros. 
> > > --- > > gcc/config/i386/i386.c | 8 +- > > gcc/config/i386/i386.md | 20 ++--- > > .../gcc.target/i386/spill_to_mask-1.c | 89 +++++++++++++------ > > .../gcc.target/i386/spill_to_mask-2.c | 11 ++- > > .../gcc.target/i386/spill_to_mask-3.c | 11 ++- > > .../gcc.target/i386/spill_to_mask-4.c | 11 ++- > > 6 files changed, 91 insertions(+), 59 deletions(-) > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > index a61255857ff..a651853ca3b 100644 > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -20463,6 +20463,10 @@ x86_order_regs_for_local_alloc (void) > > int pos = 0; > > int i; > > > > + /* Mask register. */ > > + for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) > > + reg_alloc_order [pos++] = i; > > Please add a comment why mask registers should come first. Thanks for the review, this is the patch i'm check in. > > > /* First allocate the local general purpose registers. */ > > for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) > > if (GENERAL_REGNO_P (i) && call_used_or_fixed_reg_p (i)) > > @@ -20489,10 +20493,6 @@ x86_order_regs_for_local_alloc (void) > > for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) > > reg_alloc_order [pos++] = i; > > > > - /* Mask register. */ > > - for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) > > - reg_alloc_order [pos++] = i; > > - > > /* x87 registers. 
*/ > > if (TARGET_SSE_MATH) > > for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > > index 6e4abf32e7c..3eef56b27d7 100644 > > --- a/gcc/config/i386/i386.md > > +++ b/gcc/config/i386/i386.md > > @@ -9138,7 +9138,7 @@ (define_insn_and_split "*anddi3_doubleword" > > }) > > > > (define_insn "*anddi_1" > > - [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k") > > + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,?k") > > (and:DI > > (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k") > > (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k"))) > > @@ -9226,7 +9226,7 @@ (define_insn "*andsi_1_zext" > > (set_attr "mode" "SI")]) > > > > (define_insn "*and<mode>_1" > > - [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k") > > + [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,?k") > > (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" > > "%0,0,qm,k") > > (match_operand:SWI24 2 "<general_operand>" > > "r<i>,m,L,k"))) > > (clobber (reg:CC FLAGS_REG))] > > @@ -9255,7 +9255,7 @@ (define_insn "*and<mode>_1" > > (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")]) > > > > (define_insn "*andqi_1" > > - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k") > > + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k") > > (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k") > > (match_operand:QI 2 "general_operand" "qn,m,rn,k"))) > > (clobber (reg:CC FLAGS_REG))] > > @@ -9651,7 +9651,7 @@ (define_split > > }) > > > > (define_insn "*andn<mode>_1" > > - [(set (match_operand:SWI48 0 "register_operand" "=r,r,k") > > + [(set (match_operand:SWI48 0 "register_operand" "=r,r,?k") > > (and:SWI48 > > (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k")) > > (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k"))) > > @@ -9667,7 +9667,7 @@ (define_insn "*andn<mode>_1" > > (set_attr "mode" "<MODE>")]) > > > > (define_insn 
"*andn<mode>_1" > > - [(set (match_operand:SWI12 0 "register_operand" "=r,k") > > + [(set (match_operand:SWI12 0 "register_operand" "=r,?k") > > (and:SWI12 > > (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k")) > > (match_operand:SWI12 2 "register_operand" "r,k"))) > > @@ -9757,7 +9757,7 @@ (define_insn_and_split "*<code>di3_doubleword" > > }) > > > > (define_insn "*<code><mode>_1" > > - [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k") > > + [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k") > > (any_or:SWI248 > > (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k") > > (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k"))) > > @@ -9847,7 +9847,7 @@ (define_insn "*<code>si_1_zext_imm" > > (set_attr "mode" "SI")]) > > > > (define_insn "*<code>qi_1" > > - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k") > > + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k") > > (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k") > > (match_operand:QI 2 "general_operand" "qn,m,rn,k"))) > > (clobber (reg:CC FLAGS_REG))] > > @@ -10603,7 +10603,7 @@ (define_insn_and_split "*one_cmpldi2_doubleword" > > "split_double_mode (DImode, &operands[0], 2, &operands[0], > > &operands[2]);") > > > > (define_insn "*one_cmpl<mode>2_1" > > - [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k") > > + [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,?k") > > (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))] > > "ix86_unary_operator_ok (NOT, <MODE>mode, operands)" > > "@ > > @@ -10620,7 +10620,7 @@ (define_insn "*one_cmpl<mode>2_1" > > (set_attr "mode" "<MODE>")]) > > > > (define_insn "*one_cmplsi2_1_zext" > > - [(set (match_operand:DI 0 "register_operand" "=r,k") > > + [(set (match_operand:DI 0 "register_operand" "=r,?k") > > (zero_extend:DI > > (not:SI (match_operand:SI 1 "register_operand" "0,k"))))] > > "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)" > > @@ 
-10632,7 +10632,7 @@ (define_insn "*one_cmplsi2_1_zext" > > (set_attr "mode" "SI,SI")]) > > > > (define_insn "*one_cmplqi2_1" > > - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k") > > + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,?k") > > (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))] > > "ix86_unary_operator_ok (NOT, QImode, operands)" > > "@ > > diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c > > b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c > > index c5043e224ea..94d6764fc56 100644 > > --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c > > +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c > > @@ -1,15 +1,31 @@ > > /* { dg-do compile } */ > > -/* { dg-options "-O2 -march=skylake-avx512" } */ > > - > > -#ifndef DTYPE > > -#define DTYPE u32 > > -#endif > > +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE32" } */ > > > > typedef unsigned long long u64; > > typedef unsigned int u32; > > typedef unsigned short u16; > > typedef unsigned char u8; > > > > +#ifdef DTYPE32 > > +typedef u32 DTYPE; > > +#define byteswap byteswapu32 > > +#endif > > + > > +#ifdef DTYPE16 > > +typedef u16 DTYPE; > > +#define byteswap byteswapu16 > > +#endif > > + > > +#ifdef DTYPE8 > > +typedef u16 DTYPE; > > +#define byteswap byteswapu8 > > +#endif > > + > > +#ifdef DTYPE64 > > +typedef u16 DTYPE; > > +#define byteswap byteswapu64 > > +#endif > > + > > #define R(x,n) ( (x >> n) | (x << (32 - n))) > > > > #define S0(x) (R(x, 2) ^ R(x,13) ^ R(x,22)) > > @@ -23,36 +39,51 @@ typedef unsigned char u8; > > d += tmp1; \ > > } > > > > -static inline DTYPE byteswap(DTYPE x) > > +static inline u32 byteswapu32(u32 x) > > { > > - x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16; > > - x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8; > > - return x; > > + x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16; > > + x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8; > > + return x; > > } > > > > -#define BE_LOAD32(n,b,i) (n) = 
byteswap(*(DTYPE *)(b + i)) > > +static inline u16 byteswapu16(u16 x) > > +{ > > + x = (x & 0x00FF) << 8 | (x & 0xFF00) >> 8; > > + return x; > > +} > > + > > +static inline u8 byteswapu8(u8 x) > > +{ > > + return x; > > +} > > + > > +static inline u64 byteswapu64(u64 x) > > +{ > > + x = ((u64)(byteswapu32 (x & 0x00000000FFFFFFFF))) << 32 | byteswapu32((x > > & 0xFFFFFFFF00000000) >> 32); > > + return x; > > +} > > > > -void foo (u8 *in, DTYPE out[8], const DTYPE C[16]) > > +void foo (DTYPE in[16], DTYPE out[8], const DTYPE C[16]) > > { > > DTYPE tmp1 = 0, tmp2 = 0, a, b, c, d, e, f, g, h; > > DTYPE w0, w1, w2, w3, w4, w5, w6, w7, > > w8, w9, w10, w11, w12, w13, w14, w15; > > - w0 = byteswap(*(DTYPE *)(in + 0)); > > - w1 = byteswap(*(DTYPE *)(in + 4)); > > - w2 = byteswap(*(DTYPE *)(in + 8)); > > - w3 = byteswap(*(DTYPE *)(in + 12)); > > - w4 = byteswap(*(DTYPE *)(in + 16)); > > - w5 = byteswap(*(DTYPE *)(in + 20)); > > - w6 = byteswap(*(DTYPE *)(in + 24)); > > - w7 = byteswap(*(DTYPE *)(in + 28)); > > - w8 = byteswap(*(DTYPE *)(in + 32)); > > - w9 = byteswap(*(DTYPE *)(in + 36)); > > - w10 = byteswap(*(DTYPE *)(in + 40)); > > - w11 = byteswap(*(DTYPE *)(in + 44)); > > - w12 = byteswap(*(DTYPE *)(in + 48)); > > - w13 = byteswap(*(DTYPE *)(in + 52)); > > - w14 = byteswap(*(DTYPE *)(in + 56)); > > - w15 = byteswap(*(DTYPE *)(in + 60)); > > + w0 = byteswap(in[0]); > > + w1 = byteswap(in[1]); > > + w2 = byteswap(in[2]); > > + w3 = byteswap(in[3]); > > + w4 = byteswap(in[4]); > > + w5 = byteswap(in[5]); > > + w6 = byteswap(in[6]); > > + w7 = byteswap(in[7]); > > + w8 = byteswap(in[8]); > > + w9 = byteswap(in[9]); > > + w10 = byteswap(in[10]); > > + w11 = byteswap(in[11]); > > + w12 = byteswap(in[12]); > > + w13 = byteswap(in[13]); > > + w14 = byteswap(in[14]); > > + w15 = byteswap(in[15]); > > a = out[0]; > > b = out[1]; > > c = out[2]; > > @@ -90,3 +121,7 @@ void foo (u8 *in, DTYPE out[8], const DTYPE C[16]) > > } > > > > /* { dg-final { scan-assembler "kmovd" } } */ > 
> +/* { dg-final { scan-assembler-not "knot" } } */ > > +/* { dg-final { scan-assembler-not "kxor" } } */ > > +/* { dg-final { scan-assembler-not "kor" } } */ > > +/* { dg-final { scan-assembler-not "kandn" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c > > b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c > > index 1f0c6b474d6..c7d09769ed5 100644 > > --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c > > +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c > > @@ -1,10 +1,9 @@ > > /* { dg-do compile } */ > > -/* { dg-options "-O2 -march=skylake-avx512" } */ > > - > > -#ifndef DTYPE > > -#define DTYPE u16 > > -#endif > > +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE16" } */ > > > > #include "spill_to_mask-1.c" > > > > -/* { dg-final { scan-assembler "kmovw" } } */ > > +/* { dg-final { scan-assembler-not "knot" } } */ > > +/* { dg-final { scan-assembler-not "kxor" } } */ > > +/* { dg-final { scan-assembler-not "kor" } } */ > > +/* { dg-final { scan-assembler-not "kandn" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c > > b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c > > index 5b59090c296..b7a383fe12f 100644 > > --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c > > +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c > > @@ -1,10 +1,9 @@ > > /* { dg-do compile } */ > > -/* { dg-options "-O2 -march=skylake-avx512" } */ > > - > > -#ifndef DTYPE > > -#define DTYPE u8 > > -#endif > > +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE8" } */ > > > > #include "spill_to_mask-1.c" > > > > -/* { dg-final { scan-assembler "kmovb" } } */ > > +/* { dg-final { scan-assembler-not "knot" } } */ > > +/* { dg-final { scan-assembler-not "kxor" } } */ > > +/* { dg-final { scan-assembler-not "kor" } } */ > > +/* { dg-final { scan-assembler-not "kandn" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c > > b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c > > index f111cf42b36..d67038d5b88 100644 > 
> --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c > > +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c > > @@ -1,10 +1,9 @@ > > /* { dg-do compile { target { ! ia32 } } } */ > > -/* { dg-options "-O2 -march=skylake-avx512" } */ > > - > > -#ifndef DTYPE > > -#define DTYPE u64 > > -#endif > > +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE64" } */ > > > > #include "spill_to_mask-1.c" > > > > -/* { dg-final { scan-assembler "kmovq" } } */ > > +/* { dg-final { scan-assembler-not "knot" } } */ > > +/* { dg-final { scan-assembler-not "kxor" } } */ > > +/* { dg-final { scan-assembler-not "kor" } } */ > > +/* { dg-final { scan-assembler-not "kandn" } } */ > > -- > > 2.18.1 > >
-- BR, Hongtao
From d7e2dc0a63f5820df8b8dd686277043b04177d1c Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao....@intel.com> Date: Tue, 15 Jun 2021 16:25:16 +0800 Subject: [PATCH] Disparage slightly the mask register alternative for bitwise operations. The avx512 supports bitwise operations with mask registers, but the throughput of those instructions is much lower than that of the corresponding gpr version, so we would additionally disparage slightly the mask register alternative for bitwise operations in the LRA. Also when the allocno cost of GENERAL_REGS is the same as MASK_REGS, allocate MASK_REGS first since it has already been disparaged. gcc/ChangeLog: PR target/101142 * config/i386/i386.md: (*anddi_1): Disparage slightly the mask register alternative. (*and<mode>_1): Ditto. (*andqi_1): Ditto. (*andn<mode>_1): Ditto. (*<code><mode>_1): Ditto. (*<code>qi_1): Ditto. (*one_cmpl<mode>2_1): Ditto. (*one_cmplsi2_1_zext): Ditto. (*one_cmplqi2_1): Ditto. * config/i386/i386.c (x86_order_regs_for_local_alloc): Change the order of mask registers to be before general registers. gcc/testsuite/ChangeLog: PR target/101142 * gcc.target/i386/spill_to_mask-1.c: Adjust testcase. * gcc.target/i386/spill_to_mask-2.c: Adjust testcase. * gcc.target/i386/spill_to_mask-3.c: Adjust testcase. * gcc.target/i386/spill_to_mask-4.c: Adjust testcase. 
--- gcc/config/i386/i386.c | 13 ++- gcc/config/i386/i386.md | 20 ++--- .../gcc.target/i386/spill_to_mask-1.c | 89 +++++++++++++------ .../gcc.target/i386/spill_to_mask-2.c | 11 ++- .../gcc.target/i386/spill_to_mask-3.c | 11 ++- .../gcc.target/i386/spill_to_mask-4.c | 11 ++- 6 files changed, 96 insertions(+), 59 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7d0d4143bca..c3740ff8ec4 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -20475,6 +20475,15 @@ x86_order_regs_for_local_alloc (void) int pos = 0; int i; + /* When the allocno cost of GENERAL_REGS is the same as MASK_REGS, allocate + MASK_REGS first since it has already been disparaged. This is for + testcase bitwise_mask_op3.c where the input is allocated as mask + registers, then mask bitwise instructions should be used there. + Refer to pr101142. */ + /* Mask register. */ + for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) + reg_alloc_order [pos++] = i; + /* First allocate the local general purpose registers. */ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) if (GENERAL_REGNO_P (i) && call_used_or_fixed_reg_p (i)) @@ -20501,10 +20510,6 @@ x86_order_regs_for_local_alloc (void) for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) reg_alloc_order [pos++] = i; - /* Mask register. */ - for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) - reg_alloc_order [pos++] = i; - /* x87 registers. 
*/ if (TARGET_SSE_MATH) for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 48532eb7ddf..7041789fa04 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -9138,7 +9138,7 @@ (define_insn_and_split "*anddi3_doubleword" }) (define_insn "*anddi_1" - [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k") + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,?k") (and:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k") (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k"))) @@ -9226,7 +9226,7 @@ (define_insn "*andsi_1_zext" (set_attr "mode" "SI")]) (define_insn "*and<mode>_1" - [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k") + [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,?k") (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k") (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k"))) (clobber (reg:CC FLAGS_REG))] @@ -9255,7 +9255,7 @@ (define_insn "*and<mode>_1" (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")]) (define_insn "*andqi_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k") + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k") (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k") (match_operand:QI 2 "general_operand" "qn,m,rn,k"))) (clobber (reg:CC FLAGS_REG))] @@ -9651,7 +9651,7 @@ (define_split }) (define_insn "*andn<mode>_1" - [(set (match_operand:SWI48 0 "register_operand" "=r,r,k") + [(set (match_operand:SWI48 0 "register_operand" "=r,r,?k") (and:SWI48 (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k")) (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k"))) @@ -9667,7 +9667,7 @@ (define_insn "*andn<mode>_1" (set_attr "mode" "<MODE>")]) (define_insn "*andn<mode>_1" - [(set (match_operand:SWI12 0 "register_operand" "=r,k") + [(set (match_operand:SWI12 0 "register_operand" "=r,?k") (and:SWI12 (not:SWI12 (match_operand:SWI12 1 
"register_operand" "r,k")) (match_operand:SWI12 2 "register_operand" "r,k"))) @@ -9757,7 +9757,7 @@ (define_insn_and_split "*<code>di3_doubleword" }) (define_insn "*<code><mode>_1" - [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k") + [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k") (any_or:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k") (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k"))) @@ -9847,7 +9847,7 @@ (define_insn "*<code>si_1_zext_imm" (set_attr "mode" "SI")]) (define_insn "*<code>qi_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k") + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k") (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k") (match_operand:QI 2 "general_operand" "qn,m,rn,k"))) (clobber (reg:CC FLAGS_REG))] @@ -10603,7 +10603,7 @@ (define_insn_and_split "*one_cmpldi2_doubleword" "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);") (define_insn "*one_cmpl<mode>2_1" - [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k") + [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,?k") (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))] "ix86_unary_operator_ok (NOT, <MODE>mode, operands)" "@ @@ -10620,7 +10620,7 @@ (define_insn "*one_cmpl<mode>2_1" (set_attr "mode" "<MODE>")]) (define_insn "*one_cmplsi2_1_zext" - [(set (match_operand:DI 0 "register_operand" "=r,k") + [(set (match_operand:DI 0 "register_operand" "=r,?k") (zero_extend:DI (not:SI (match_operand:SI 1 "register_operand" "0,k"))))] "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)" @@ -10632,7 +10632,7 @@ (define_insn "*one_cmplsi2_1_zext" (set_attr "mode" "SI,SI")]) (define_insn "*one_cmplqi2_1" - [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k") + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,?k") (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))] "ix86_unary_operator_ok (NOT, QImode, operands)" "@ 
diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c index c5043e224ea..94d6764fc56 100644 --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c @@ -1,15 +1,31 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -march=skylake-avx512" } */ - -#ifndef DTYPE -#define DTYPE u32 -#endif +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE32" } */ typedef unsigned long long u64; typedef unsigned int u32; typedef unsigned short u16; typedef unsigned char u8; +#ifdef DTYPE32 +typedef u32 DTYPE; +#define byteswap byteswapu32 +#endif + +#ifdef DTYPE16 +typedef u16 DTYPE; +#define byteswap byteswapu16 +#endif + +#ifdef DTYPE8 +typedef u16 DTYPE; +#define byteswap byteswapu8 +#endif + +#ifdef DTYPE64 +typedef u16 DTYPE; +#define byteswap byteswapu64 +#endif + #define R(x,n) ( (x >> n) | (x << (32 - n))) #define S0(x) (R(x, 2) ^ R(x,13) ^ R(x,22)) @@ -23,36 +39,51 @@ typedef unsigned char u8; d += tmp1; \ } -static inline DTYPE byteswap(DTYPE x) +static inline u32 byteswapu32(u32 x) { - x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16; - x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8; - return x; + x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16; + x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8; + return x; } -#define BE_LOAD32(n,b,i) (n) = byteswap(*(DTYPE *)(b + i)) +static inline u16 byteswapu16(u16 x) +{ + x = (x & 0x00FF) << 8 | (x & 0xFF00) >> 8; + return x; +} + +static inline u8 byteswapu8(u8 x) +{ + return x; +} + +static inline u64 byteswapu64(u64 x) +{ + x = ((u64)(byteswapu32 (x & 0x00000000FFFFFFFF))) << 32 | byteswapu32((x & 0xFFFFFFFF00000000) >> 32); + return x; +} -void foo (u8 *in, DTYPE out[8], const DTYPE C[16]) +void foo (DTYPE in[16], DTYPE out[8], const DTYPE C[16]) { DTYPE tmp1 = 0, tmp2 = 0, a, b, c, d, e, f, g, h; DTYPE w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - w0 = byteswap(*(DTYPE *)(in + 0)); - w1 = 
byteswap(*(DTYPE *)(in + 4)); - w2 = byteswap(*(DTYPE *)(in + 8)); - w3 = byteswap(*(DTYPE *)(in + 12)); - w4 = byteswap(*(DTYPE *)(in + 16)); - w5 = byteswap(*(DTYPE *)(in + 20)); - w6 = byteswap(*(DTYPE *)(in + 24)); - w7 = byteswap(*(DTYPE *)(in + 28)); - w8 = byteswap(*(DTYPE *)(in + 32)); - w9 = byteswap(*(DTYPE *)(in + 36)); - w10 = byteswap(*(DTYPE *)(in + 40)); - w11 = byteswap(*(DTYPE *)(in + 44)); - w12 = byteswap(*(DTYPE *)(in + 48)); - w13 = byteswap(*(DTYPE *)(in + 52)); - w14 = byteswap(*(DTYPE *)(in + 56)); - w15 = byteswap(*(DTYPE *)(in + 60)); + w0 = byteswap(in[0]); + w1 = byteswap(in[1]); + w2 = byteswap(in[2]); + w3 = byteswap(in[3]); + w4 = byteswap(in[4]); + w5 = byteswap(in[5]); + w6 = byteswap(in[6]); + w7 = byteswap(in[7]); + w8 = byteswap(in[8]); + w9 = byteswap(in[9]); + w10 = byteswap(in[10]); + w11 = byteswap(in[11]); + w12 = byteswap(in[12]); + w13 = byteswap(in[13]); + w14 = byteswap(in[14]); + w15 = byteswap(in[15]); a = out[0]; b = out[1]; c = out[2]; @@ -90,3 +121,7 @@ void foo (u8 *in, DTYPE out[8], const DTYPE C[16]) } /* { dg-final { scan-assembler "kmovd" } } */ +/* { dg-final { scan-assembler-not "knot" } } */ +/* { dg-final { scan-assembler-not "kxor" } } */ +/* { dg-final { scan-assembler-not "kor" } } */ +/* { dg-final { scan-assembler-not "kandn" } } */ diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c index 1f0c6b474d6..c7d09769ed5 100644 --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c @@ -1,10 +1,9 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -march=skylake-avx512" } */ - -#ifndef DTYPE -#define DTYPE u16 -#endif +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE16" } */ #include "spill_to_mask-1.c" -/* { dg-final { scan-assembler "kmovw" } } */ +/* { dg-final { scan-assembler-not "knot" } } */ +/* { dg-final { scan-assembler-not "kxor" } } */ +/* { dg-final { scan-assembler-not "kor" } } */ +/* { 
dg-final { scan-assembler-not "kandn" } } */ diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c index 5b59090c296..b7a383fe12f 100644 --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c @@ -1,10 +1,9 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -march=skylake-avx512" } */ - -#ifndef DTYPE -#define DTYPE u8 -#endif +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE8" } */ #include "spill_to_mask-1.c" -/* { dg-final { scan-assembler "kmovb" } } */ +/* { dg-final { scan-assembler-not "knot" } } */ +/* { dg-final { scan-assembler-not "kxor" } } */ +/* { dg-final { scan-assembler-not "kor" } } */ +/* { dg-final { scan-assembler-not "kandn" } } */ diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c index f111cf42b36..d67038d5b88 100644 --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c @@ -1,10 +1,9 @@ /* { dg-do compile { target { ! ia32 } } } */ -/* { dg-options "-O2 -march=skylake-avx512" } */ - -#ifndef DTYPE -#define DTYPE u64 -#endif +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE64" } */ #include "spill_to_mask-1.c" -/* { dg-final { scan-assembler "kmovq" } } */ +/* { dg-final { scan-assembler-not "knot" } } */ +/* { dg-final { scan-assembler-not "kxor" } } */ +/* { dg-final { scan-assembler-not "kor" } } */ +/* { dg-final { scan-assembler-not "kandn" } } */ -- 2.18.1