[PATCH] i386: Sync tune_string with arch_string for target attribute arch=*

2023-06-25 Thread Hongyu Wang via Gcc-patches
Hi,

For a function with target attribute arch=*, the current logic sets its
tune to the -mtune value from the command line, so all target_clones get the
same tuning flags, which can affect the performance of each clone. Override
tune with arch if tune was not explicitly specified to get proper tuning
flags for target_clones.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}

Ok for trunk and backport to active release branches?

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_valid_target_attribute_tree):
Override tune_string with arch_string if tune_string is not
explicitly specified.

gcc/testsuite/ChangeLog:

* gcc.target/i386/mvc17.c: New test.
---
 gcc/config/i386/i386-options.cc   |  6 +-
 gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++
 2 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 2cb0bddcd35..7f593cebe76 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1400,7 +1400,11 @@ ix86_valid_target_attribute_tree (tree fndecl, tree args,
   if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
opts->x_ix86_tune_string
  = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
-  else if (orig_tune_defaulted)
+  /* If we have explicit arch string and no tune string specified, set
+tune_string to NULL and later it will be overriden by arch_string
+so target clones can get proper optimization.  */
+  else if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
+  || orig_tune_defaulted)
opts->x_ix86_tune_string = NULL;
 
   /* If fpmath= is not set, and we now have sse2 on 32-bit, use it.  */
diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c 
b/gcc/testsuite/gcc.target/i386/mvc17.c
new file mode 100644
index 000..2c7cc2fdace
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/mvc17.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-require-ifunc "" } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-times "rep mov" 1 } } */
+
+__attribute__((target_clones("default","arch=icelake-server")))
+void
+foo (char *a, char *b, int size)
+{
+  __builtin_memcpy (a, b, size & 0x7F);
+}
-- 
2.31.1



[PATCH] i386: Relax inline requirement for functions with different target attrs

2023-06-25 Thread Hongyu Wang via Gcc-patches
Hi,

For functions with different target attributes, the current logic refuses to
inline the callee when any arch or tune is mismatched. Relax the
condition to honor just prefer_vector_width_type and other flags that
may cause safety issues, so the caller can get more optimization opportunities.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}

Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_can_inline_p): Do not check arch or
tune directly, just check prefer_vector_width_type and make sure
not to inline if they mismatch.

gcc/testsuite/ChangeLog:

* gcc.target/i386/inline-target-attr.c: New test.
---
 gcc/config/i386/i386.cc   | 11 +
 .../gcc.target/i386/inline-target-attr.c  | 24 +++
 2 files changed, 30 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/inline-target-attr.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 0761965344b..1d86384ac06 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -605,11 +605,12 @@ ix86_can_inline_p (tree caller, tree callee)
   != (callee_opts->x_target_flags & ~always_inline_safe_mask))
 ret = false;
 
-  /* See if arch, tune, etc. are the same.  */
-  else if (caller_opts->arch != callee_opts->arch)
-ret = false;
-
-  else if (!always_inline && caller_opts->tune != callee_opts->tune)
+  /* Do not inline when specified perfer-vector-width mismatched between
+ callee and caller.  */
+  else if ((callee_opts->x_prefer_vector_width_type != PVW_NONE
+  && caller_opts->x_prefer_vector_width_type != PVW_NONE)
+  && callee_opts->x_prefer_vector_width_type
+ != caller_opts->x_prefer_vector_width_type)
 ret = false;
 
   else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
diff --git a/gcc/testsuite/gcc.target/i386/inline-target-attr.c 
b/gcc/testsuite/gcc.target/i386/inline-target-attr.c
new file mode 100644
index 000..995502165f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/inline-target-attr.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "call\[ \t\]callee" } } */
+
+__attribute__((target("arch=skylake")))
+int callee (int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; i++)
+{
+  if (i % 2 == 0)
+   sum +=i;
+  else
+   sum += (i - 1);
+}
+  return sum + n;
+}
+
+__attribute__((target("arch=icelake-server")))
+int caller (int n)
+{
+  return callee (n) + n;
+}
+
-- 
2.31.1



Re: [PATCH] i386: Sync tune_string with arch_string for target attribute arch=*

2023-06-26 Thread Hongyu Wang via Gcc-patches
Thanks, I'll backport it down to GCC10 after this passed all bootstrap/regtest.

Uros Bizjak via Gcc-patches  于2023年6月26日周一 14:05写道:
>
> On Mon, Jun 26, 2023 at 4:31 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > For function with target attribute arch=*, current logic will set its
> > tune to -mtune from command line so all target_clones will get same
> > tuning flags which would affect the performance for each clone. Override
> > tune with arch if tune was not explicitly specified to get proper tuning
> > flags for target_clones.
> >
> > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}
> >
> > Ok for trunk and backport to active release branches?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386-options.cc (ix86_valid_target_attribute_tree):
> > Override tune_string with arch_string if tune_string is not
> > explicitly specified.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/mvc17.c: New test.
>
> LGTM.
>
> Thanks,
> Uros.
>
> > ---
> >  gcc/config/i386/i386-options.cc   |  6 +-
> >  gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++
> >  2 files changed, 16 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c
> >
> > diff --git a/gcc/config/i386/i386-options.cc 
> > b/gcc/config/i386/i386-options.cc
> > index 2cb0bddcd35..7f593cebe76 100644
> > --- a/gcc/config/i386/i386-options.cc
> > +++ b/gcc/config/i386/i386-options.cc
> > @@ -1400,7 +1400,11 @@ ix86_valid_target_attribute_tree (tree fndecl, tree 
> > args,
> >if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
> > opts->x_ix86_tune_string
> >   = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
> > -  else if (orig_tune_defaulted)
> > +  /* If we have explicit arch string and no tune string specified, set
> > +tune_string to NULL and later it will be overriden by arch_string
> > +so target clones can get proper optimization.  */
> > +  else if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
> > +  || orig_tune_defaulted)
> > opts->x_ix86_tune_string = NULL;
> >
> >/* If fpmath= is not set, and we now have sse2 on 32-bit, use it.  */
> > diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c 
> > b/gcc/testsuite/gcc.target/i386/mvc17.c
> > new file mode 100644
> > index 000..2c7cc2fdace
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/mvc17.c
> > @@ -0,0 +1,11 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-ifunc "" } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-final { scan-assembler-times "rep mov" 1 } } */
> > +
> > +__attribute__((target_clones("default","arch=icelake-server")))
> > +void
> > +foo (char *a, char *b, int size)
> > +{
> > +  __builtin_memcpy (a, b, size & 0x7F);
> > +}
> > --
> > 2.31.1
> >


Re: [PATCH] i386: Relax inline requirement for functions with different target attrs

2023-06-27 Thread Hongyu Wang via Gcc-patches
> I don't think this is desirable. If we inline something with different
> ISAs, we get some strange mix of ISAs when the function is inlined.
> OTOH - we already inline with mismatched tune flags if the function is
> marked with always_inline.

Previously ix86_can_inline_p has

if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
 != callee_opts->x_ix86_isa_flags)
|| ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
!= callee_opts->x_ix86_isa_flags2))
  ret = false;

It makes sure the caller's ISA is a superset of the callee's, and the inlined
code should follow the caller's ISA specification.

IMHO I cannot give a real example where the caller's performance is
harmed after inlining. I added PVW since there might
be some callee that wants to limit its vector size while the caller has a
larger preferred vector size. At least with the current change
we get more optimization opportunities for different target_clones.

But I agree the tuning setting may be a factor that affects
performance. One possible choice is that if the
tune for the callee is unspecified or default, we just inline it into the
caller with the specified arch and tune.

Uros Bizjak via Gcc-patches  于2023年6月27日周二 17:16写道:



>
> On Mon, Jun 26, 2023 at 4:36 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > For function with different target attributes, current logic rejects to
> > inline the callee when any arch or tune is mismatched. Relax the
> > condition to honor just prefer_vecotr_width_type and other flags that
> > may cause safety issue so caller can get more optimization opportunity.
>
> I don't think this is desirable. If we inline something with different
> ISAs, we get some strange mix of ISAs when the function is inlined.
> OTOH - we already inline with mismatched tune flags if the function is
> marked with always_inline.
>
> Uros.
>
> > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}
> >
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386.cc (ix86_can_inline_p): Do not check arch or
> > tune directly, just check prefer_vector_width_type and make sure
> > not to inline if they mismatch.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/inline-target-attr.c: New test.
> > ---
> >  gcc/config/i386/i386.cc   | 11 +
> >  .../gcc.target/i386/inline-target-attr.c  | 24 +++
> >  2 files changed, 30 insertions(+), 5 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/inline-target-attr.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 0761965344b..1d86384ac06 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -605,11 +605,12 @@ ix86_can_inline_p (tree caller, tree callee)
> >!= (callee_opts->x_target_flags & ~always_inline_safe_mask))
> >  ret = false;
> >
> > -  /* See if arch, tune, etc. are the same.  */
> > -  else if (caller_opts->arch != callee_opts->arch)
> > -ret = false;
> > -
> > -  else if (!always_inline && caller_opts->tune != callee_opts->tune)
> > +  /* Do not inline when specified perfer-vector-width mismatched between
> > + callee and caller.  */
> > +  else if ((callee_opts->x_prefer_vector_width_type != PVW_NONE
> > +  && caller_opts->x_prefer_vector_width_type != PVW_NONE)
> > +  && callee_opts->x_prefer_vector_width_type
> > + != caller_opts->x_prefer_vector_width_type)
> >  ret = false;
> >
> >else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
> > diff --git a/gcc/testsuite/gcc.target/i386/inline-target-attr.c 
> > b/gcc/testsuite/gcc.target/i386/inline-target-attr.c
> > new file mode 100644
> > index 000..995502165f0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/inline-target-attr.c
> > @@ -0,0 +1,24 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-final { scan-assembler-not "call\[ \t\]callee" } } */
> > +
> > +__attribute__((target("arch=skylake")))
> > +int callee (int n)
> > +{
> > +  int sum = 0;
> > +  for (int i = 0; i < n; i++)
> > +{
> > +  if (i % 2 == 0)
> > +   sum +=i;
> > +  else
> > +   sum += (i - 1);
> > +}
> > +  return sum + n;
> > +}
> > +
> > +__attribute__((target("arch=icelake-server")))
> > +int caller (int n)
> > +{
> > +  return callee (n) + n;
> > +}
> > +
> > --
> > 2.31.1
> >


Re: [PATCH] i386: Sync tune_string with arch_string for target attribute arch=*

2023-06-27 Thread Hongyu Wang via Gcc-patches
The testcase fails with a --with-arch=native build on cascadelake. Here
is the patch to adjust it.

gcc/testsuite/ChangeLog:

* gcc.target/i386/mvc17.c: Add -march=x86-64 to dg-options.
---
 gcc/testsuite/gcc.target/i386/mvc17.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c
b/gcc/testsuite/gcc.target/i386/mvc17.c
index 2c7cc2fdace..8b83c1aecb3 100644
--- a/gcc/testsuite/gcc.target/i386/mvc17.c
+++ b/gcc/testsuite/gcc.target/i386/mvc17.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-ifunc "" } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -march=x86-64" } */
 /* { dg-final { scan-assembler-times "rep mov" 1 } } */

 __attribute__((target_clones("default","arch=icelake-server")))
---

Will push it as an obvious fix, also will apply to the pending backports.

Hongyu Wang  于2023年6月27日周二 13:43写道:
>
> Thanks, I'll backport it down to GCC10 after this passed all 
> bootstrap/regtest.
>
> Uros Bizjak via Gcc-patches  于2023年6月26日周一 14:05写道:
> >
> > On Mon, Jun 26, 2023 at 4:31 AM Hongyu Wang  wrote:
> > >
> > > Hi,
> > >
> > > For function with target attribute arch=*, current logic will set its
> > > tune to -mtune from command line so all target_clones will get same
> > > tuning flags which would affect the performance for each clone. Override
> > > tune with arch if tune was not explicitly specified to get proper tuning
> > > flags for target_clones.
> > >
> > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}
> > >
> > > Ok for trunk and backport to active release branches?
> > >
> > > gcc/ChangeLog:
> > >
> > > * config/i386/i386-options.cc (ix86_valid_target_attribute_tree):
> > > Override tune_string with arch_string if tune_string is not
> > > explicitly specified.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > * gcc.target/i386/mvc17.c: New test.
> >
> > LGTM.
> >
> > Thanks,
> > Uros.
> >
> > > ---
> > >  gcc/config/i386/i386-options.cc   |  6 +-
> > >  gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++
> > >  2 files changed, 16 insertions(+), 1 deletion(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c
> > >
> > > diff --git a/gcc/config/i386/i386-options.cc 
> > > b/gcc/config/i386/i386-options.cc
> > > index 2cb0bddcd35..7f593cebe76 100644
> > > --- a/gcc/config/i386/i386-options.cc
> > > +++ b/gcc/config/i386/i386-options.cc
> > > @@ -1400,7 +1400,11 @@ ix86_valid_target_attribute_tree (tree fndecl, 
> > > tree args,
> > >if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
> > > opts->x_ix86_tune_string
> > >   = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
> > > -  else if (orig_tune_defaulted)
> > > +  /* If we have explicit arch string and no tune string specified, 
> > > set
> > > +tune_string to NULL and later it will be overriden by arch_string
> > > +so target clones can get proper optimization.  */
> > > +  else if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
> > > +  || orig_tune_defaulted)
> > > opts->x_ix86_tune_string = NULL;
> > >
> > >/* If fpmath= is not set, and we now have sse2 on 32-bit, use it.  
> > > */
> > > diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c 
> > > b/gcc/testsuite/gcc.target/i386/mvc17.c
> > > new file mode 100644
> > > index 000..2c7cc2fdace
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/mvc17.c
> > > @@ -0,0 +1,11 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-require-ifunc "" } */
> > > +/* { dg-options "-O2" } */
> > > +/* { dg-final { scan-assembler-times "rep mov" 1 } } */
> > > +
> > > +__attribute__((target_clones("default","arch=icelake-server")))
> > > +void
> > > +foo (char *a, char *b, int size)
> > > +{
> > > +  __builtin_memcpy (a, b, size & 0x7F);
> > > +}
> > > --
> > > 2.31.1
> > >


Re: [PATCH] i386: Relax inline requirement for functions with different target attrs

2023-06-28 Thread Hongyu Wang via Gcc-patches
> If the user specified a different arch for callee than the caller,
> then the compiler will switch on different ISAs (-march is just a
> shortcut for different ISA packs), and the programmer is aware that
> inlining isn't intended here (we have -mtune, which is not as strong
> as -march, but even functions with different -mtune are not inlined
> without always_inline attribute). This is documented as:

The original issue comes from a case like

float callee (float a, float b, float c, float d,
float e, float f, float g, float h)
{
return a * b + c * d + e * f + g + h + a * c + b * c
+ a * d + b * e + a * f + c * h +
b * (a - 0.4f) * (c + h) * (b + e * d) - a / f * h;
}

__attribute__((target_clones("default","arch=icelake-server")))
void caller (int n, float *a,
float c1, float c2, float c3,
float c4, float c5, float c6,
float c7)
{
  for (int i = 0; i < n; i++)
  {
a[i] = callee (a[i], c1, c2, c3, c4, c5, c6, c7);
  }
}

With current gcc, the .icelake_server clone fails to inline callee due
to a target-specific option mismatch, while the .default clone
succeeds and the loop gets vectorized. I think it is not reasonable
that the specific clone with a higher arch cannot produce better code.
So I think we can at least decide to inline those callees without any
arch/tune specified, but for now they are rejected by the strict arch=
and tune= checks.

Uros Bizjak  于2023年6月28日周三 14:43写道:
>
> On Wed, Jun 28, 2023 at 3:56 AM Hongyu Wang  wrote:
> >
> > > I don't think this is desirable. If we inline something with different
> > > ISAs, we get some strange mix of ISAs when the function is inlined.
> > > OTOH - we already inline with mismatched tune flags if the function is
> > > marked with always_inline.
> >
> > Previously ix86_can_inline_p has
> >
> > if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
> >  != callee_opts->x_ix86_isa_flags)
> > || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
> > != callee_opts->x_ix86_isa_flags2))
> >   ret = false;
> >
> > It make sure caller ISA is a super set of callee, and the inlined one
> > should follow caller's ISA specification.
> >
> > IMHO I cannot give a real example that after inline the caller's
> > performance get harmed, I added PVW since there might
> > be some callee want to limit its vector size and caller may have
> > larger preferred vector size. At least with current change
> > we get more optimization opportunity for different target_clones.
> >
> > But I agree the tuning setting may be a factor that affect the
> > performance. One possible choice is that if the
> > tune for callee is unspecified or default, just inline it to the
> > caller with specified arch and tune.
>
> If the user specified a different arch for callee than the caller,
> then the compiler will switch on different ISAs (-march is just a
> shortcut for different ISA packs), and the programmer is aware that
> inlining isn't intended here (we have -mtune, which is not as strong
> as -march, but even functions with different -mtune are not inlined
> without always_inline attribute). This is documented as:
>
> --q--
> On the x86, the inliner does not inline a function that has different
> target options than the caller, unless the callee has a subset of the
> target options of the caller. For example a function declared with
> target("sse3") can inline a function with target("sse2"), since -msse3
> implies -msse2.
> --/q--
>
> I don't think arch=skylake can be considered as a subset of 
> arch=icelake-server.
>
> I agree that the compiler should reject functions with different PVW.
> This is also in accordance with the documentation.
>
> Uros.
>
> >
> > Uros Bizjak via Gcc-patches  于2023年6月27日周二 17:16写道:
> >
> >
> >
> > >
> > > On Mon, Jun 26, 2023 at 4:36 AM Hongyu Wang  wrote:
> > > >
> > > > Hi,
> > > >
> > > > For function with different target attributes, current logic rejects to
> > > > inline the callee when any arch or tune is mismatched. Relax the
> > > > condition to honor just prefer_vecotr_width_type and other flags that
> > > > may cause safety issue so caller can get more optimization opportunity.
> > >
> > > I don't think this is desirable. If we inline something with different
> > > ISAs, we get some strange mix of ISAs when the function is inlined.
> > > OTOH - we already inline with mismatched tune flags if the function is
> > > marked with always_inline.
> > >
> > > Uros.
> > >
> > > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}
> > > >
> > > > Ok for trunk?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > * config/i386/i386.cc (ix86_can_inline_p): Do not check arch or
> > > > tune directly, just check prefer_vector_width_type and make sure
> > > > not to inline if they mismatch.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > * gcc.target/i386/inline-target-attr.c: New test.
> > > > ---
> > > >  gc

[PATCH V2] i386: Inline function with default arch/tune to caller

2023-07-03 Thread Hongyu Wang via Gcc-patches
Hi,

For functions with different target attributes, the current logic refuses to
inline the callee when any arch or tune is mismatched. Relax the
condition to allow a callee with default arch/tune to be inlined.

Bootstrapped/regtested on x86-64-linux-gnu{-m32,}.

Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_can_inline_p): If callee has
default arch=x86-64 and tune=generic, do not block the
inlining to its caller.

gcc/testsuite/ChangeLog:

* gcc.target/i386/inline_target_clones.c: New test.
---
 gcc/config/i386/i386.cc   | 22 +++--
 .../gcc.target/i386/inline_target_clones.c| 24 +++
 2 files changed, 39 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/inline_target_clones.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 8989985700a..4741c9b5364 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -605,13 +605,6 @@ ix86_can_inline_p (tree caller, tree callee)
   != (callee_opts->x_target_flags & ~always_inline_safe_mask))
 ret = false;
 
-  /* See if arch, tune, etc. are the same.  */
-  else if (caller_opts->arch != callee_opts->arch)
-ret = false;
-
-  else if (!always_inline && caller_opts->tune != callee_opts->tune)
-ret = false;
-
   else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
   /* If the calle doesn't use FP expressions differences in
  ix86_fpmath can be ignored.  We are called from FEs
@@ -622,6 +615,21 @@ ix86_can_inline_p (tree caller, tree callee)
   || ipa_fn_summaries->get (callee_node)->fp_expressions))
 ret = false;
 
+  /* At this point we cannot identify whether arch or tune setting
+ comes from target attribute or not. So the most conservative way
+ is to allow the callee that uses default arch and tune string to
+ be inlined.  */
+  else if (!strcmp (callee_opts->x_ix86_arch_string, "x86-64")
+  && !strcmp (callee_opts->x_ix86_tune_string, "generic"))
+ret = true;
+
+  /* See if arch, tune, etc. are the same.  */
+  else if (caller_opts->arch != callee_opts->arch)
+ret = false;
+
+  else if (!always_inline && caller_opts->tune != callee_opts->tune)
+ret = false;
+
   else if (!always_inline
   && caller_opts->branch_cost != callee_opts->branch_cost)
 ret = false;
diff --git a/gcc/testsuite/gcc.target/i386/inline_target_clones.c 
b/gcc/testsuite/gcc.target/i386/inline_target_clones.c
new file mode 100644
index 000..53db1600ce5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/inline_target_clones.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-require-ifunc "" } */
+/* { dg-options "-O3 -march=x86-64" } */
+/* { dg-final { scan-assembler-not "call\[ \t\]+callee" } } */
+
+float callee (float a, float b, float c, float d,
+ float e, float f, float g, float h)
+{
+  return a * b + c * d + e * f + g + h + a * c + b * c
++ a * d + b * e + a * f + c * h + 
+b * (a - 0.4f) * (c + h) * (b + e * d) - a / f * h;
+}
+
+__attribute__((target_clones("default","arch=icelake-server")))
+void caller (int n, float *a,
+float c1, float c2, float c3,
+float c4, float c5, float c6,
+float c7)
+{
+  for (int i = 0; i < n; i++)
+{
+  a[i] = callee (a[i], c1, c2, c3, c4, c5, c6, c7);
+}
+}
-- 
2.31.1



Re: [PATCH V2] i386: Inline function with default arch/tune to caller

2023-07-04 Thread Hongyu Wang via Gcc-patches
> In a follow-up patch, can you please document inlining rules involving
> -march and -mtune to "x86 Function Attributes" section? Currently, the
> inlining rules at the end of "target function attribute" section does
> not even mention -march and -mtune. Maybe a subsubsection "Inlining
> rules" should be added (like AArch64 has) to mention that only default
> arch and tune are inlined by default (but inline can be forced with
> always_inline for different mtune flags).

The documentation has the text below at the end of the 'target (OPTIONS)' section

On the x86, the inliner does not inline a function that has
different target options than the caller, unless the callee
has a subset of the target options of the caller.  For example
a function declared with 'target("sse3")' can inline a
function with 'target("sse2")', since '-msse3' implies
'-msse2'.

Do we need to move this part to a new subsubsection and combine it with the
description of the -march and -mtune rules?

> Looking at the above, perhaps inlining of different arches can also be
> forced with always_inline? This would allow developers some control of
> inlining, and would not be surprising.

If so, I'd like to add the always_inline change for arch to the current
patch and leave the
documentation change for the next patch.

Uros Bizjak via Gcc-patches  于2023年7月4日周二 14:19写道:
>
> On Tue, Jul 4, 2023 at 5:12 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > For function with different target attributes, current logic rejects to
> > inline the callee when any arch or tune is mismatched. Relax the
> > condition to allow callee with default arch/tune to be inlined.
> >
> > Boostrapped/regtested on x86-64-linux-gnu{-m32,}.
> >
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386.cc (ix86_can_inline_p): If callee has
> > default arch=x86-64 and tune=generic, do not block the
> > inlining to its caller.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/inline_target_clones.c: New test.
>
> OK.
>
> In a follow-up patch, can you please document inlining rules involving
> -march and -mtune to "x86 Function Attributes" section? Currently, the
> inlining rules at the end of "target function attribute" section does
> not even mention -march and -mtune. Maybe a subsubsection "Inlining
> rules" should be added (like AArch64 has) to mention that only default
> arch and tune are inlined by default (but inline can be forced with
> always_inline for different mtune flags).
>
> Looking at the above, perhaps inlining of different arches can also be
> forced with always_inline? This would allow developers some control of
> inlining, and would not be surprising.
>
> Thanks,
> Uros.
>
> > ---
> >  gcc/config/i386/i386.cc   | 22 +++--
> >  .../gcc.target/i386/inline_target_clones.c| 24 +++
> >  2 files changed, 39 insertions(+), 7 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/inline_target_clones.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 8989985700a..4741c9b5364 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -605,13 +605,6 @@ ix86_can_inline_p (tree caller, tree callee)
> >!= (callee_opts->x_target_flags & ~always_inline_safe_mask))
> >  ret = false;
> >
> > -  /* See if arch, tune, etc. are the same.  */
> > -  else if (caller_opts->arch != callee_opts->arch)
> > -ret = false;
> > -
> > -  else if (!always_inline && caller_opts->tune != callee_opts->tune)
> > -ret = false;
> > -
> >else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
> >/* If the calle doesn't use FP expressions differences in
> >   ix86_fpmath can be ignored.  We are called from FEs
> > @@ -622,6 +615,21 @@ ix86_can_inline_p (tree caller, tree callee)
> >|| ipa_fn_summaries->get (callee_node)->fp_expressions))
> >  ret = false;
> >
> > +  /* At this point we cannot identify whether arch or tune setting
> > + comes from target attribute or not. So the most conservative way
> > + is to allow the callee that uses default arch and tune string to
> > + be inlined.  */
> > +  else if (!strcmp (callee_opts->x_ix86_arch_string, "x86-64")
> > +  && !strcmp (callee_opts->x_ix86_tune_string, "generic"))
> > +ret = true;
> > +
> > +  /* See if arch, tune, etc. are the same.  */
> > +  else if (caller_opts->arch != callee_opts->arch)
> > +ret = false;
> > +
> > +  else if (!always_inline && caller_opts->tune != callee_opts->tune)
> > +ret = false;
> > +
> >else if (!always_inline
> >&& caller_opts->branch_cost != callee_opts->branch_cost)
> >  ret = false;
> > diff --git a/gcc/testsuite/gcc.target/i386/inline_target_clones.c 
> > b/gcc/testsuite/gcc.target/i386/inline_target_clones.c
> > new file mode 100644
> > index 000..53db1600ce5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/in

Re: [PATCH V2] i386: Inline function with default arch/tune to caller

2023-07-05 Thread Hongyu Wang via Gcc-patches
Thanks, this is the updated patch I'm going to check in.

Uros Bizjak  于2023年7月4日周二 16:57写道:
>
> On Tue, Jul 4, 2023 at 10:32 AM Hongyu Wang  wrote:
> >
> > > In a follow-up patch, can you please document inlining rules involving
> > > -march and -mtune to "x86 Function Attributes" section? Currently, the
> > > inlining rules at the end of "target function attribute" section does
> > > not even mention -march and -mtune. Maybe a subsubsection "Inlining
> > > rules" should be added (like AArch64 has) to mention that only default
> > > arch and tune are inlined by default (but inline can be forced with
> > > always_inline for different mtune flags).
> >
> > The document has below at the end of 'target (OPTIONS)' section
> >
> > On the x86, the inliner does not inline a function that has
> > different target options than the caller, unless the callee
> > has a subset of the target options of the caller.  For example
> > a function declared with 'target("sse3")' can inline a
> > function with 'target("sse2")', since '-msse3' implies
> > '-msse2'.
> >
> > Do we need to move this part to a new section and combine with -march and
> > -mtune rule description to the new subsubsection?
> >
> > > Looking at the above, perhaps inlining of different arches can also be
> > > forced with always_inline? This would allow developers some control of
> > > inlining, and would not be surprising.
> >
> > If so, I'd like to add the always_inline change on arch to current
> > patch and leave the
> > document change alone in the next patch.
>
> Yes, this is OK.
>
> Thanks,
> Uros.
> >
> > Uros Bizjak via Gcc-patches  于2023年7月4日周二 14:19写道:
> > >
> > > On Tue, Jul 4, 2023 at 5:12 AM Hongyu Wang  wrote:
> > > >
> > > > Hi,
> > > >
> > > > For function with different target attributes, current logic rejects to
> > > > inline the callee when any arch or tune is mismatched. Relax the
> > > > condition to allow callee with default arch/tune to be inlined.
> > > >
> > > > Boostrapped/regtested on x86-64-linux-gnu{-m32,}.
> > > >
> > > > Ok for trunk?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > * config/i386/i386.cc (ix86_can_inline_p): If callee has
> > > > default arch=x86-64 and tune=generic, do not block the
> > > > inlining to its caller.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > * gcc.target/i386/inline_target_clones.c: New test.
> > >
> > > OK.
> > >
> > > In a follow-up patch, can you please document inlining rules involving
> > > -march and -mtune to "x86 Function Attributes" section? Currently, the
> > > inlining rules at the end of "target function attribute" section does
> > > not even mention -march and -mtune. Maybe a subsubsection "Inlining
> > > rules" should be added (like AArch64 has) to mention that only default
> > > arch and tune are inlined by default (but inline can be forced with
> > > always_inline for different mtune flags).
> > >
> > > Looking at the above, perhaps inlining of different arches can also be
> > > forced with always_inline? This would allow developers some control of
> > > inlining, and would not be surprising.
> > >
> > > Thanks,
> > > Uros.
> > >
> > > > ---
> > > >  gcc/config/i386/i386.cc   | 22 +++--
> > > >  .../gcc.target/i386/inline_target_clones.c| 24 +++
> > > >  2 files changed, 39 insertions(+), 7 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/inline_target_clones.c
> > > >
> > > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > > index 8989985700a..4741c9b5364 100644
> > > > --- a/gcc/config/i386/i386.cc
> > > > +++ b/gcc/config/i386/i386.cc
> > > > @@ -605,13 +605,6 @@ ix86_can_inline_p (tree caller, tree callee)
> > > >!= (callee_opts->x_target_flags & 
> > > > ~always_inline_safe_mask))
> > > >  ret = false;
> > > >
> > > > -  /* See if arch, tune, etc. are the same.  */
> > > > -  else if (caller_opts->arch != callee_opts->arch)
> > > > -ret = false;
> > > > -
> > > > -  else if (!always_inline && caller_opts->tune != callee_opts->tune)
> > > > -ret = false;
> > > > -
> > > >else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
> > > >/* If the calle doesn't use FP expressions differences in
> > > >   ix86_fpmath can be ignored.  We are called from FEs
> > > > @@ -622,6 +615,21 @@ ix86_can_inline_p (tree caller, tree callee)
> > > >|| ipa_fn_summaries->get (callee_node)->fp_expressions))
> > > >  ret = false;
> > > >
> > > > +  /* At this point we cannot identify whether arch or tune setting
> > > > + comes from target attribute or not. So the most conservative way
> > > > + is to allow the callee that uses default arch and tune string to
> > > > + be inlined.  */
> > > > +  else if (!strcmp (callee_opts->x_ix86_arch_string, "x86-64")
> > > > +  && !strcmp (callee_opts->x_ix86_tune_string, "generic"))
> > > > +ret = true;
> > > 

[PATCH] i386: Update document for inlining rules

2023-07-05 Thread Hongyu Wang via Gcc-patches
Hi,

This is a follow-up patch for
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623525.html
that updates the documentation on the x86 inlining rules.

Ok for trunk?

gcc/ChangeLog:

* doc/extend.texi: Move x86 inlining rule to a new subsubsection
and add description for inlining of functions with arch and tune
attributes.
---
 gcc/doc/extend.texi | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index d1b018ee6d6..d701b4d1d41 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -7243,11 +7243,6 @@ Prefer 256-bit vector width for instructions.
 Prefer 512-bit vector width for instructions.
 @end table
 
-On the x86, the inliner does not inline a
-function that has different target options than the caller, unless the
-callee has a subset of the target options of the caller.  For example
-a function declared with @code{target("sse3")} can inline a function
-with @code{target("sse2")}, since @code{-msse3} implies @code{-msse2}.
 @end table
 
 @cindex @code{indirect_branch} function attribute, x86
@@ -7361,6 +7356,20 @@ counterpart to option @option{-mno-direct-extern-access}.
 
 @end table
 
+@subsubsection Inlining rules
+On the x86, the inliner does not inline a
+function that has different target options than the caller, unless the
+callee has a subset of the target options of the caller.  For example
+a function declared with @code{target("sse3")} can inline a function
+with @code{target("sse2")}, since @code{-msse3} implies @code{-msse2}.
+
+Besides the basic rule, when a function specifies the
+@code{target("arch=@var{ARCH}")} or @code{target("tune=@var{TUNE}")}
+attribute, the inlining rule is different.  It allows inlining of
+a function that uses the default @option{-march=x86-64} and
+@option{-mtune=generic}, or a function that has a subset
+of the ISA features and is marked with @code{always_inline}.
+
 @node Xstormy16 Function Attributes
 @subsection Xstormy16 Function Attributes
 
-- 
2.31.1



[PATCH] libgomp: Fix default value of GOMP_SPINCOUNT [PR 109062]

2023-03-07 Thread Hongyu Wang via Gcc-patches
Hi,

When OMP_WAIT_POLICY is not specified, the current implementation leaves
the ICV flag GOMP_ICV_WAIT_POLICY unset, so the global variable wait_policy
retains its uninitialized value. Set it to -1 when the flag is not
specified to keep GOMP_SPINCOUNT behavior consistent with its description.

Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?

libgomp/ChangeLog:

PR libgomp/109062
* env.c (initialize_env): Set wait_policy to -1 if
OMP_WAIT_POLICY is not specified.
* testsuite/libgomp.c-c++-common/pr109062.c: New test.
---
 libgomp/env.c |  2 ++
 libgomp/testsuite/libgomp.c-c++-common/pr109062.c | 14 ++
 2 files changed, 16 insertions(+)
 create mode 100644 libgomp/testsuite/libgomp.c-c++-common/pr109062.c

diff --git a/libgomp/env.c b/libgomp/env.c
index c41c1f852cc..fa36a8697d6 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -2249,6 +2249,8 @@ initialize_env (void)
 wait_policy = none->icvs.wait_policy;
   else if (all != NULL && gomp_get_icv_flag (all->flags, GOMP_ICV_WAIT_POLICY))
 wait_policy = all->icvs.wait_policy;
+  else
+wait_policy = -1;
 
   if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
 {
diff --git a/libgomp/testsuite/libgomp.c-c++-common/pr109062.c 
b/libgomp/testsuite/libgomp.c-c++-common/pr109062.c
new file mode 100644
index 000..5c7c287dafd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/pr109062.c
@@ -0,0 +1,14 @@
+/* { dg-do run } */
+
+#include <omp.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  omp_display_env (1);
+
+  return 0;
+}
+
+/* { dg-output ".*\\\[host] GOMP_SPINCOUNT = '300000'.*" { target native } } */
-- 
2.31.1



Re: [PATCH] libgomp: Fix default value of GOMP_SPINCOUNT [PR 109062]

2023-03-08 Thread Hongyu Wang via Gcc-patches
> I think the right spot to fix this would be instead in initialize_icvs,
> change the
>   icvs->wait_policy = 0;
> in there to
>   icvs->wait_policy = -1;
> That way it will be the default for all the devices, not just the
> initial one.

It doesn't work, for the code that determines value of wait_policy:

if (none != NULL && gomp_get_icv_flag (none->flags, GOMP_ICV_WAIT_POLICY))
  wait_policy = none->icvs.wait_policy;
else if (all != NULL && gomp_get_icv_flag (all->flags, GOMP_ICV_WAIT_POLICY))
  wait_policy = all->icvs.wait_policy;

gomp_get_icv_flag (none->flags, GOMP_ICV_WAIT_POLICY) returns true only when
OMP_WAIT_POLICY is explicitly set, so the initial icvs->wait_policy
cannot affect the global wait_policy that is used to set
GOMP_SPINCOUNT.


Re: [PATCH] libgomp: Fix default value of GOMP_SPINCOUNT [PR 109062]

2023-03-08 Thread Hongyu Wang via Gcc-patches
Hongyu Wang  于2023年3月8日周三 16:07写道:
>
> > I think the right spot to fix this would be instead in initialize_icvs,
> > change the
> >   icvs->wait_policy = 0;
> > in there to
> >   icvs->wait_policy = -1;
> > That way it will be the default for all the devices, not just the
> > initial one.
>
> It doesn't work, for the code that determines value of wait_policy:
>
> if (none != NULL && gomp_get_icv_flag (none->flags, GOMP_ICV_WAIT_POLICY))
>   wait_policy = none->icvs.wait_policy;
> else if (all != NULL && gomp_get_icv_flag (all->flags, GOMP_ICV_WAIT_POLICY))
>   wait_policy = all->icvs.wait_policy;
>
> gomp_get_icv_flag (none->flags, GOMP_ICV_WAIT_POLICY) returns true only when
> OMP_WAIT_POLICY is explicitly set, so the initial icvs->wait_policy
> could not affect the global wait_policy that used to set
> GOMP_SPINCOUNT.

Also, the global variable wait_policy here is only used for setting
spin_count-related values that do not belong to any ICV, so there is
no need to set icvs->wait_policy, since OMP_WAIT_POLICY_(DEV|ALL)
itself only takes value 0 for passive and value 1 for active.


Re: [PATCH] libgomp: Fix default value of GOMP_SPINCOUNT [PR 109062]

2023-03-08 Thread Hongyu Wang via Gcc-patches
> Seems for many ICVs the default values are done through
> gomp_default_icv_values, but that doesn't cover wait_policy.
> For other vars, the defaults are provided through just initializers of
> those vars on the var definitions, e.g.:
> char *gomp_affinity_format_var = "level %L thread %i affinity %A";
> So, I'd do the initialize_icvs change and change
> static int wait_policy;
> to
> static int wait_policy = -1;

Agreed, here is the updated patch, ok for trunk?

When OMP_WAIT_POLICY is not specified, the current implementation leaves
the ICV flag GOMP_ICV_WAIT_POLICY unset, so the global variable wait_policy
retains its uninitialized value. Initialize it to -1 to make
GOMP_SPINCOUNT behavior consistent with its description.

libgomp/ChangeLog:

PR libgomp/109062
* env.c (wait_policy): Initialize to -1.
(initialize_icvs): Initialize icvs->wait_policy to -1.
* testsuite/libgomp.c-c++-common/pr109062.c: New test.
---
 libgomp/env.c |  4 ++--
 libgomp/testsuite/libgomp.c-c++-common/pr109062.c | 14 ++
 2 files changed, 16 insertions(+), 2 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.c-c++-common/pr109062.c

diff --git a/libgomp/env.c b/libgomp/env.c
index c41c1f852cc..e7a035b593c 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -124,7 +124,7 @@ int goacc_default_dims[GOMP_DIM_MAX];

 #ifndef LIBGOMP_OFFLOADED_ONLY

-static int wait_policy;
+static int wait_policy = -1;
 static unsigned long stacksize = GOMP_DEFAULT_STACKSIZE;

 static void
@@ -1981,7 +1981,7 @@ initialize_icvs (struct gomp_initial_icvs *icvs)
   icvs->bind_var = gomp_default_icv_values.bind_var;
   icvs->nteams_var = gomp_default_icv_values.nteams_var;
   icvs->teams_thread_limit_var =
gomp_default_icv_values.teams_thread_limit_var;
-  icvs->wait_policy = 0;
+  icvs->wait_policy = -1;
 }

 /* Helper function for initialize_env to add a device specific ICV value
diff --git a/libgomp/testsuite/libgomp.c-c++-common/pr109062.c
b/libgomp/testsuite/libgomp.c-c++-common/pr109062.c
new file mode 100644
index 000..5c7c287dafd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/pr109062.c
@@ -0,0 +1,14 @@
+/* { dg-do run } */
+
+#include <omp.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  omp_display_env (1);
+
+  return 0;
+}
+
+/* { dg-output ".*\\\[host] GOMP_SPINCOUNT = '300000'.*" { target native } } */
-- 
2.31.1


[PATCH] i386: Fix typos in amxbf16 runtime test.

2021-08-10 Thread Hongyu Wang via Gcc-patches
Hi,

This patch fixes some typos in the amxbf16-dpbf16ps-2 test.

Tested under sde/spr machine and passed.

OK for master and backport to GCC 11?

gcc/testsuite/ChangeLog:

* gcc.target/i386/amxbf16-dpbf16ps-2.c: Fix typos.
---
 gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c 
b/gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c
index 349ec58ada2..f7002ca5ea5 100644
--- a/gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c
+++ b/gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c
@@ -57,7 +57,7 @@ void calc_matrix_dpbf16ps (__tile *dst, __tile *src1, __tile 
*src2)
  (make_f32(src1_buf[i * 4 * N + 4 * j + t]) *
  make_f32(src2_buf[j * 4 * K + 4 * k + t])) +
  (make_f32(src1_buf[i * 4 * N + 4 * j + t + 1]) *
- make_f32(src1_buf[i * 4 * N + 4 * j + t + 1]));
+ make_f32(src2_buf[j * 4 * K + 4 * k + t + 1]));
  }
 
 }
@@ -72,8 +72,8 @@ void test_amx_bf16_dpbf16ps ()
   
   init_tile_config (&cfg);
   init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_buf);
-  init_tile_reg_and_src_with_buffer (2, dst, tmp_dst_buf);
-  init_tile_reg_and_src_with_buffer (3, dst, tmp_dst_buf);
+  init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+  init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
 
   calc_matrix_dpbf16ps (&dst, &src1, &src2);
   
-- 
2.18.1



[PATCH] i386: Add peephole for lea and zero extend [PR 101716]

2021-08-12 Thread Hongyu Wang via Gcc-patches
Hi,

For lea + zero_extendsidi insns, if the dest of lea and the src of zext are
the same, combine them into a single leal on 64-bit targets, since a write to
a 32-bit register is automatically zero-extended.

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for master?

gcc/ChangeLog:

PR target/101716
* config/i386/i386.md (*lea_zext): New define_insn.
(define_peephole2): New peephole2 to combine zero_extend
with lea.

gcc/testsuite/ChangeLog:

PR target/101716
* gcc.target/i386/pr101716.c: New test.
---
 gcc/config/i386/i386.md  | 20 
 gcc/testsuite/gcc.target/i386/pr101716.c | 11 +++
 2 files changed, 31 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101716.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 4a8e8fea290..6739dbd799b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5187,6 +5187,26 @@
(const_string "SI")
(const_string "")))])
 
+;; combine zero_extendsidi with lea to use leal.
+(define_insn "*lea_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (zero_extend:DI
+   (match_operand:SWI48 1 "address_no_seg_operand" "Ts")))]
+  "TARGET_64BIT"
+  "lea{l}\t{%E1, %k0|%k0,%E1}")
+
+(define_peephole2
+  [(set (match_operand:SWI48 0 "general_reg_operand")
+   (match_operand:SWI48 1 "address_no_seg_operand"))
+   (set (match_operand:DI 2 "general_reg_operand")
+   (zero_extend:DI (match_operand:SI 3 "general_reg_operand")))]
+  "TARGET_64BIT && ix86_hardreg_mov_ok (operands[2], operands[1])
+   && REGNO (operands[0]) == REGNO (operands[3])
+   && (REGNO (operands[2]) == REGNO (operands[3])
+  || peep2_reg_dead_p (2, operands[3]))"
+  [(set (match_dup 2)
+   (zero_extend:DI (match_dup 1)))])
+
 (define_peephole2
   [(set (match_operand:SWI48 0 "register_operand")
(match_operand:SWI48 1 "address_no_seg_operand"))]
diff --git a/gcc/testsuite/gcc.target/i386/pr101716.c 
b/gcc/testsuite/gcc.target/i386/pr101716.c
new file mode 100644
index 000..0b684755c2f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101716.c
@@ -0,0 +1,11 @@
+/* PR target/101716 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler "leal\[\\t \]\*eax" } } */
+/* { dg-final { scan-assembler-not "movl\[\\t \]\*eax" } } */
+
+unsigned long long sample1(unsigned long long m) {
+unsigned int t = -1;
+return (m << 1) & t;
+}
-- 
2.18.1



Re: [PATCH] i386: Add peephole for lea and zero extend [PR 101716]

2021-08-12 Thread Hongyu Wang via Gcc-patches
Sorry for the typo, scan-assembler should be

+/* { dg-final { scan-assembler "leal\[\\t \]\[^\\n\]*eax" } } */
+/* { dg-final { scan-assembler-not "movl\[\\t \]\[^\\n\]*eax" } } */

Hongyu Wang via Gcc-patches  于2021年8月13日周五 上午8:49写道:
>
> Hi,
>
> For lea + zero_extendsidi insns, if dest of lea and src of zext are the
> same, combine them with single leal under 64bit target since 32bit
> register will be automatically zero-extended.
>
> Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> Ok for master?
>
> gcc/ChangeLog:
>
> PR target/101716
> * config/i386/i386.md (*lea_zext): New define_insn.
> (define_peephole2): New peephole2 to combine zero_extend
> with lea.
>
> gcc/testsuite/ChangeLog:
>
> PR target/101716
> * gcc.target/i386/pr101716.c: New test.
> ---
>  gcc/config/i386/i386.md  | 20 
>  gcc/testsuite/gcc.target/i386/pr101716.c | 11 +++
>  2 files changed, 31 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101716.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 4a8e8fea290..6739dbd799b 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -5187,6 +5187,26 @@
> (const_string "SI")
> (const_string "")))])
>
> +;; combine zero_extendsidi with lea to use leal.
> +(define_insn "*lea_zext"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +   (zero_extend:DI
> +   (match_operand:SWI48 1 "address_no_seg_operand" "Ts")))]
> +  "TARGET_64BIT"
> +  "lea{l}\t{%E1, %k0|%k0,%E1}")
> +
> +(define_peephole2
> +  [(set (match_operand:SWI48 0 "general_reg_operand")
> +   (match_operand:SWI48 1 "address_no_seg_operand"))
> +   (set (match_operand:DI 2 "general_reg_operand")
> +   (zero_extend:DI (match_operand:SI 3 "general_reg_operand")))]
> +  "TARGET_64BIT && ix86_hardreg_mov_ok (operands[2], operands[1])
> +   && REGNO (operands[0]) == REGNO (operands[3])
> +   && (REGNO (operands[2]) == REGNO (operands[3])
> +  || peep2_reg_dead_p (2, operands[3]))"
> +  [(set (match_dup 2)
> +   (zero_extend:DI (match_dup 1)))])
> +
>  (define_peephole2
>[(set (match_operand:SWI48 0 "register_operand")
> (match_operand:SWI48 1 "address_no_seg_operand"))]
> diff --git a/gcc/testsuite/gcc.target/i386/pr101716.c 
> b/gcc/testsuite/gcc.target/i386/pr101716.c
> new file mode 100644
> index 000..0b684755c2f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101716.c
> @@ -0,0 +1,11 @@
> +/* PR target/101716 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2" } */
> +
> +/* { dg-final { scan-assembler "leal\[\\t \]\*eax" } } */
> +/* { dg-final { scan-assembler-not "movl\[\\t \]\*eax" } } */
> +
> +unsigned long long sample1(unsigned long long m) {
> +unsigned int t = -1;
> +return (m << 1) & t;
> +}
> --
> 2.18.1
>


Re: [PATCH] i386: Add peephole for lea and zero extend [PR 101716]

2021-08-16 Thread Hongyu Wang via Gcc-patches
> So, the question is if the combine pass really needs to zero-extend
> with 0xfffffffe, the left shift << 1 guarantees zero in the LSB, so
> 0xffffffff should be better and in line with canonical zero-extension
> RTX.

The shift mask is generated in simplify_shift_const_1:

mask_rtx = gen_int_mode (nonzero_bits (varop, int_varop_mode),
 int_result_mode);
rtx count_rtx = gen_int_shift_amount (int_result_mode, count);
mask_rtx
  = simplify_const_binary_operation (code, int_result_mode,
 mask_rtx, count_rtx);

Can we adjust the count for ashift if nonzero_bits overlaps it?

> Also, ix86_decompose_address accepts ASHIFT RTX when ASHIFT is
> embedded in the PLUS chain, but naked ASHIFT is rejected (c.f. the
> call in ix86_legitimate_address_p) for some (historic?) reason. It
> looks to me that this restriction is not necessary, since
> ix86_legitimize_address can canonicalize ASHIFT RTXes without
> problems. The attached patch that survives bootstrap and regtest can
> help in your case.

We have a split that transforms ashift to mult, so I'm afraid it would not
help with this issue.

Uros Bizjak via Gcc-patches  于2021年8月16日周一 下午4:12写道:

>
> On Fri, Aug 13, 2021 at 9:21 AM Uros Bizjak  wrote:
> >
> > On Fri, Aug 13, 2021 at 2:48 AM Hongyu Wang  wrote:
> > >
> > > Hi,
> > >
> > > For lea + zero_extendsidi insns, if dest of lea and src of zext are the
> > > same, combine them with single leal under 64bit target since 32bit
> > > register will be automatically zero-extended.
> > >
> > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > Ok for master?
> > >
> > > gcc/ChangeLog:
> > >
> > > PR target/101716
> > > * config/i386/i386.md (*lea_zext): New define_insn.
> > > (define_peephole2): New peephole2 to combine zero_extend
> > > with lea.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > PR target/101716
> > > * gcc.target/i386/pr101716.c: New test.
> >
> > This form should be covered by ix86_decompose_address via
> > address_no_seg_operand predicate. Combine creates:
> >
> > Trying 6 -> 7:
> >6: {r86:DI=r87:DI<<0x1;clobber flags:CC;}
> >  REG_DEAD r87:DI
> >  REG_UNUSED flags:CC
> >7: r85:DI=zero_extend(r86:DI#0)
> >  REG_DEAD r86:DI
> > Failed to match this instruction:
> > (set (reg:DI 85)
> >(and:DI (ashift:DI (reg:DI 87)
> >(const_int 1 [0x1]))
> > >(const_int 4294967294 [0xfffffffe])))
> >
> > which does not fit:
> >
> >   else if (GET_CODE (addr) == AND
> >&& const_32bit_mask (XEXP (addr, 1), DImode))
> >
> > After reload, we lose SUBREG, so REE does not trigger on:
> >
> > (insn 17 3 7 2 (set (reg:DI 0 ax [86])
> >(mult:DI (reg:DI 5 di [87])
> >(const_int 2 [0x2]))) "pr101716.c":4:13 204 {*leadi}
> > (nil))
> > (insn 7 17 13 2 (set (reg:DI 0 ax [85])
> >(zero_extend:DI (reg:SI 0 ax [86]))) "pr101716.c":4:19 136
> > {*zero_extendsidi2}
> > (nil))
> >
> > So, the question is if the combine pass really needs to zero-extend
> > > with 0xfffffffe, the left shift << 1 guarantees zero in the LSB, so
> > > 0xffffffff should be better and in line with canonical zero-extension
> > RTX.
>
> Also, ix86_decompose_address accepts ASHIFT RTX when ASHIFT is
> embedded in the PLUS chain, but naked ASHIFT is rejected (c.f. the
> call in ix86_legitimate_address_p) for some (historic?) reason. It
> looks to me that this restriction is not necessary, since
> ix86_legitimize_address can canonicalize ASHIFT RTXes without
> problems. The attached patch that survives bootstrap and regtest can
> help in your case.
>
> Uros.


Re: [PATCH] i386: Add peephole for lea and zero extend [PR 101716]

2021-08-24 Thread Hongyu Wang via Gcc-patches
Hi Uros,

Sorry for the late update. I have tried adjusting the combine pass but
found it is not easy to modify the shift constant, so I came up with an
alternative solution based on your patch. It matches the non-canonical
zero-extend in ix86_decompose_address and adjusts ix86_rtx_costs so that
combine can match the pattern below:

(set (reg:DI 85)
   (and:DI (ashift:DI (reg:DI 87)
   (const_int 1 [0x1]))
   (const_int 4294967294 [0xfffffffe])))

Survived bootstrap and regtest on x86-64-linux. Ok for master?

Uros Bizjak  于2021年8月16日周一 下午5:26写道:

>
> On Mon, Aug 16, 2021 at 11:18 AM Hongyu Wang  wrote:
> >
> > > So, the question is if the combine pass really needs to zero-extend
> > > > with 0xfffffffe, the left shift << 1 guarantees zero in the LSB, so
> > > > 0xffffffff should be better and in line with canonical zero-extension
> > > RTX.
> >
> > The shift mask is generated in simplify_shift_const_1:
> >
> > mask_rtx = gen_int_mode (nonzero_bits (varop, int_varop_mode),
> >  int_result_mode);
> > rtx count_rtx = gen_int_shift_amount (int_result_mode, count);
> > mask_rtx
> >   = simplify_const_binary_operation (code, int_result_mode,
> >  mask_rtx, count_rtx);
> >
> > Can we adjust the count for ashift if nonzero_bits overlaps it?
> >
> > > Also, ix86_decompose_address accepts ASHIFT RTX when ASHIFT is
> > > embedded in the PLUS chain, but naked ASHIFT is rejected (c.f. the
> > > call in ix86_legitimate_address_p) for some (historic?) reason. It
> > > looks to me that this restriction is not necessary, since
> > > ix86_legitimize_address can canonicalize ASHIFT RTXes without
> > > problems. The attached patch that survives bootstrap and regtest can
> > > help in your case.
> >
> > We have a split to transform ashift to mult, I'm afraid it could not
> > help this issue.
>
> If you want existing *lea to accept ASHIFT RTX, it uses
> address_no_seg_operand predicate which uses address_operand predicate,
> which calls ix86_legitimate_address_p, which ATM rejects ASHIFT RTXes.
>
> Uros.
From 4bcebb985439867d12f2038e97c72baaf092ffbf Mon Sep 17 00:00:00 2001
From: Hongyu Wang 
Date: Tue, 17 Aug 2021 16:53:46 +0800
Subject: [PATCH] i386: Optimize lea with zero-extend. [PR 101716]

For ASHIFT + ZERO_EXTEND pattern, combine pass failed to
match it to lea since it will generate non-canonical
zero-extend. Adjust predicate and cost_model to allow combine
for lea.

gcc/ChangeLog:

	PR target/101716
	* config/i386/i386.c (ix86_live_on_entry): Adjust comment.
	(ix86_decompose_address): Remove retval check for ASHIFT,
	allow non-canonical zero extend if AND mask covers ASHIFT
	count.
	(ix86_legitimate_address_p): Adjust condition for decompose.
	(ix86_rtx_costs): Adjust cost for lea with non-canonical
	zero-extend.

	Co-Authored by: Uros Bizjak 

gcc/testsuite/ChangeLog:

	PR target/101716
	* gcc.target/i386/pr101716.c: New test.
---
 gcc/config/i386/i386.c   | 36 
 gcc/testsuite/gcc.target/i386/pr101716.c | 11 
 2 files changed, 41 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101716.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5bff131f6d9..a997fc04004 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -10018,8 +10018,7 @@ ix86_live_on_entry (bitmap regs)
 
 /* Extract the parts of an RTL expression that is a valid memory address
for an instruction.  Return 0 if the structure of the address is
-   grossly off.  Return -1 if the address contains ASHIFT, so it is not
-   strictly valid, but still used for computing length of lea instruction.  */
+   grossly off.  */
 
 int
 ix86_decompose_address (rtx addr, struct ix86_address *out)
@@ -10029,7 +10028,6 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
   HOST_WIDE_INT scale = 1;
   rtx scale_rtx = NULL_RTX;
   rtx tmp;
-  int retval = 1;
   addr_space_t seg = ADDR_SPACE_GENERIC;
 
   /* Allow zero-extended SImode addresses,
@@ -10053,6 +10051,27 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
 	  if (CONST_INT_P (addr))
 	return 0;
 	}
+  else if (GET_CODE (addr) == AND)
+	{
+	  /* For ASHIFT inside AND, combine will not generate
+	 canonical zero-extend. Merge mask for AND and shift_count
+	 to check if it is canonical zero-extend.  */
+	  tmp = XEXP (addr, 0);
+	  rtx mask = XEXP (addr, 1);
+	  if (tmp && GET_CODE(tmp) == ASHIFT)
+	{
+	  rtx shift_val = XEXP (tmp, 1);
+	  if (CONST_INT_P (mask) && CONST_INT_P (shift_val)
+		  && (((unsigned HOST_WIDE_INT) INTVAL(mask)
+		  | (HOST_WIDE_INT_1U << (INTVAL(shift_val) - 1)))
+		  == 0xffffffff))
+		{
+		  addr = lowpart_subreg (SImode, XEXP (addr, 0),
+	 DImode);
+		}
+	}
+
+	}
 }
 
   /* Allow SImode subregs of DImode addresses,
@@ -10179,7 +10198,6 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
   if ((unsigned HOST_WIDE_INT) scale > 3)
 	return 0;
   scale = 

Re: [PATCH] ipa-inline: Improve growth accumulation for recursive calls

2020-09-16 Thread Hongyu Wang via Gcc-patches
Tamar Christina  于2020年9月12日周六 上午1:39写道:

> Hi Martin,
>
> >
> > can you please confirm that the difference between these two is all due
> to
> > the last option -fno-inline-functions-called-once ?  Is LTo necessary?
> I.e., can
> > you run the benchmark also built with the branch compiler and
> -mcpu=native
> > -Ofast -fomit-frame-pointer -fno-inline-functions-called-once ?
> >
>
> Done, see below.
>
> > >
> +--+--
> >
> +--+--+--+
> > > | Branch   | -mcpu=native -Ofast -fomit-frame-pointer -flto
> > | -24% |  |  |
> > >
> +--+--
> >
> +--+--+--+
> > > | Branch   | -mcpu=native -Ofast -fomit-frame-pointer
> > | -26% |  |  |
> > >
> +--+--
> >
> +--+--+--+
> >
> > >
> > > (Hopefully the table shows up correct)
> >
> > it does show OK for me, thanks.
> >
> > >
> > > It looks like your patch definitely does improve the basic cases. So
> > > there's not much difference between lto and non-lto anymore and it's
> > much Better than GCC 10. However it still contains the regression
> introduced
> > by Honza's changes.
> >
> > I assume these are rates, not times, so negative means bad.  But do I
> > understand it correctly that you're comparing against GCC 10 with the two
> > parameters set to rather special values?  Because your table seems to
> > indicate that even for you, the branch is faster than GCC 10 with just -
> > mcpu=native -Ofast -fomit-frame-pointer.
>
> Yes these are indeed rates, and indeed I am comparing against the same
> options
> we used to get the fastest rates on before which is the two parameters and
> the inline flag.
>
> >
> > So is the problem that the best obtainable run-time, even with obscure
> > options, from the branch is slower than the best obtainable run-time from
> > GCC 10?
> >
>
> Yeah that's the problem, when we compare the two we're still behind.
>
> I've done the additional two runs
>
>
> +--+--+--+
> | Compiler | Flags
>
> | diff GCC 10  |
>
> +--+--+--+
> | GCC 10   | -mcpu=native -Ofast -fomit-frame-pointer -flto --param
> ipa-cp-eval-threshold=1 --param   ipa-cp-unit-growth=80
> -fno-inline-functions-called-once |  |
>
> +--+--+--+
> | GCC 10   | -mcpu=native -Ofast -fomit-frame-pointer
>
>| -44% |
>
> +--+--+--+
> | GCC 10   | -mcpu=native -Ofast -fomit-frame-pointer -flto
>
>| -36% |
>
> +--+--+--+
> | GCC 11   | -mcpu=native -Ofast -fomit-frame-pointer -flto --param
> ipa-cp-eval-threshold=1 --param   ipa-cp-unit-growth=80
> -fno-inline-functions-called-once | -12% |
>
> +--+--+--+
> | Branch   | -mcpu=native -Ofast -fomit-frame-pointer -flto --param
> ipa-cp-eval-threshold=1 --param   ipa-cp-unit-growth=80
>| -22% |
>
> +--+--+--+
> | Branch   | -mcpu=native -Ofast -fomit-frame-pointer -flto --param
> ipa-cp-eval-threshold=1 --param   ipa-cp-unit-growth=80
> -fno-inline-functions-called-once | -12% |
>
> +--+--+--+
> | Branch   | -mcpu=native -Ofast -fomit-frame-pointer -flto
>
>| -24% |
>
> +--+--+--+
> | Branch   

Re: [PATCH] Enable GCC support for AMX

2020-09-18 Thread Hongyu Wang via Gcc-patches
Hi Kirill,

Thank you very much for your review again.

I have just updated the patch, adding the XSAVE dependency and using
__builtin_cpu_supports for the runtime test.

Re-based on Sept. 15 trunk and tested with sde. Kindly PING.


Hongyu Wang  于2020年9月12日周六 上午1:00写道:

> Hi
>
> Thanks for your review, and sorry for the late reply. It took a while
> to finish the runtime test.
>
> > > diff --git a/gcc/config.gcc b/gcc/config.gcc
> > > index 797f0ad5edd..d0e59e86a5c 100644
> > > --- a/gcc/config.gcc
> > > +++ b/gcc/config.gcc
> > > @@ -412,7 +412,7 @@ i[34567]86-*-*)
> > >  waitpkgintrin.h cldemoteintrin.h
> avx512bf16vlintrin.h
> > >  avx512bf16intrin.h enqcmdintrin.h
> serializeintrin.h
> > >  avx512vp2intersectintrin.h
> avx512vp2intersectvlintrin.h
> > > -tsxldtrkintrin.h"
> > > +tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h
> amxbf16intrin.h"
> >
> > Line more than 80 chars.
> >
> > >   ;;
> > >  x86_64-*-*)
> > >   cpu_type=i386
> > > @@ -447,7 +447,7 @@ x86_64-*-*)
> > >  waitpkgintrin.h cldemoteintrin.h
> avx512bf16vlintrin.h
> > >  avx512bf16intrin.h enqcmdintrin.h
> serializeintrin.h
> > >  avx512vp2intersectintrin.h
> avx512vp2intersectvlintrin.h
> > > -tsxldtrkintrin.h"
> > > +tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h
> amxbf16intrin.h"
> >
> > Ditto.
>
> Changed.
>
> >
> > > diff --git a/gcc/config/i386/amxbf16intrin.h
> b/gcc/config/i386/amxbf16intrin.h
> > > new file mode 100644
> > > index 000..df0e2262d50
> > > --- /dev/null
> > > +++ b/gcc/config/i386/amxbf16intrin.h
> > > @@ -0,0 +1,25 @@
> > > +#if !defined _IMMINTRIN_H_INCLUDED
> > > +#error "Never use <amxbf16intrin.h> directly; include <immintrin.h>
> instead."
> > > +#endif
> > > +
> > > +#ifndef _AMXBF16INTRIN_H_INCLUDED
> > > +#define _AMXBF16INTRIN_H_INCLUDED
> > > +
> > > +#if !defined(__AMX_BF16__)
> > > +#pragma GCC push_options
> > > +#pragma GCC target("amx-bf16")
> > > +#define __DISABLE_AMX_BF16__
> > > +#endif /* __AMX_BF16__ */
> > > +
> > > +#if defined(__x86_64__) && defined(__AMX_BF16__)
> > > +#define _tile_dpbf16ps(dst,src1,src2)
> \
> > > +  __asm__ volatile\
> > > +  ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1",
> %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
> > > +#endif
> >
> > I hope in future we'll replace it with unspecs at least...
>
> Currently we think it is redundant to add builtins with just const int
> parameters, which are supposed to be replaced in the future.
>
> >
> > > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> > > index c9f7195d423..9389dc24948 100644
> > > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > > index bca8c856dc8..a46e31f5862 100644
> > > --- a/gcc/doc/invoke.texi
> > > +++ b/gcc/doc/invoke.texi
> > > @@ -1357,6 +1357,7 @@ See RS/6000 and PowerPC Options.
> > >  -mvpclmulqdq  -mavx512bitalg  -mmovdiri  -mmovdir64b
> -mavx512vpopcntdq @gol
> > >  -mavx5124fmaps  -mavx512vnni  -mavx5124vnniw  -mprfchw  -mrdpid @gol
> > >  -mrdseed  -msgx -mavx512vp2intersect -mserialize -mtsxldtrk@gol
> > > +-mamx-tile -mamx-int8 -mamx-bf16@gol
> >
> > Add space please.
>
> Changed.
>
> >
> > > diff --git a/gcc/testsuite/gcc.target/i386/amxbf16-asmintel-2.c
> b/gcc/testsuite/gcc.target/i386/amxbf16-asmintel-2.c
> > > new file mode 100644
> > > index 000..605a44df3f8
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/amxbf16-asmintel-2.c
> > > @@ -0,0 +1,4 @@
> > > +/* { dg-do assemble { target { ! ia32 } } } */
> > > +/* { dg-options "-O2 -mamx-bf16 -masm=intel" } */
> > > +/* { dg-require-effective-target amx_bf16 } */
> > > +#include"amxbf16-asmintel-1.c"
> >
> > I didn't get it. We usually use the second testcase to actually execute
> > it and (well, a little) verify that semantics is ok. E.g. that
> > operands order is correct. Could you please do that?
> > This applies to all *-2.c cases.
> > I've checked and looks like public SDE simulat

Enable GCC support for Intel Key Locker extension

2020-09-20 Thread Hongyu Wang via Gcc-patches
Hi:

This patch adds support for the Intel Key Locker extension.

Key Locker provides a mechanism to encrypt and decrypt data with an AES key
without having access to the raw key value.

For more details, please refer to
https://software.intel.com/content/dam/develop/external/us/en/documents/343965-intel-key-locker-specification.pdf
.

Bootstrap ok, regression test on i386/x86 backend is ok.

OK for master?

gcc/ChangeLog

* common/config/i386/cpuinfo.h (get_available_features):
Detect KL, AESKLE and WIDEKL features.
* common/config/i386/i386-common.c
(OPTION_MASK_ISA_KL_SET): New.
(OPTION_MASK_ISA_WIDEKL_SET): Likewise.
(OPTION_MASK_ISA_KL_UNSET): Likewise.
(OPTION_MASK_ISA_WIDEKL_UNSET): Likewise.
(OPTION_MASK_ISA2_AVX2_UNSET): Likewise.
(OPTION_MASK_ISA2_AVX_UNSET): Likewise.
(OPTION_MASK_ISA2_SSE4_2_UNSET): Likewise.
(OPTION_MASK_ISA2_SSE4_1_UNSET): Likewise.
(OPTION_MASK_ISA2_SSE4_UNSET): Likewise.
(OPTION_MASK_ISA2_SSSE3_UNSET): Likewise.
(OPTION_MASK_ISA2_SSE3_UNSET): Likewise.
(OPTION_MASK_ISA2_SSE2_UNSET): Likewise.
(OPTION_MASK_ISA2_SSE_UNSET): Likewise.
(ix86_handle_option): Handle kl and widekl, add dependency chain
for KL and SSE2.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
(FEATURE_KL, FEATURE_AESKLE, FEATURE_WIDEKL): New.
* common/config/i386/i386-isas.h: Add ISA_NAMES_TABLE_ENTRY
for KL, AESKLE and WIDEKL.
* config.gcc: Add keylockerintrin.h.
* doc/invoke.texi: Document new option -mkl and -mwidekl.
* doc/extend.texi: Document kl and widekl.
* config/i386/constraints.md
(Y1, Y2, Y3, Y4, Y5, Y6, Y7): New register constraints.
* config/i386/cpuid.h (bit_KL, bit_AESKLE, bit_WIDEKL): New.
* config/i386/i386-builtin-types.def ((UINT, UINT, V2DI, V2DI,
PVOID),
(UINT, UINT, V2DI, PVOID), (VOID, V2DI, V2DI, V2DI, UINT),
(UINT8, PV2DI, V2DI, PCVOID), (UINT8, PV2DI, PCV2DI, PCVOID)):
New
function types.
* config/i386/i386-builtin.def: Add
__builtin_ia32_loadiwkey,
__builtin_ia32_aesdec128kl_u8,
__builtin_ia32_aesdec256kl_u8,
__builtin_ia32_aesenc128kl_u8,
__builtin_ia32_aesenc256kl_u8,
__builtin_ia32_aesdecwide128kl_u8,
__builtin_ia32_aesdecwide256kl_u8,
__builtin_ia32_aesencwide128kl_u8,
__builtin_ia32_aesencwide256kl_u8,
__builtin_ia32_encodekey128_u32,
__builtin_ia32_encodekey256_u32.
* config/i386/i386-c.c (ix86_target_macros_internal): Handle
kl and widekl.
* config/i386/i386-options.c (isa2_opts): Add -mkl and -mwidekl.
(ix86_option_override_internal): Handle KL and WIDEKL.
(ix86_valid_target_attribute_inner_p): Add attribute for kl and
widekl.
* config/i386/i386-expand.c
(ix86_expand_builtin): Expand Keylocker Builtins.
* config/i386/i386.h (TARGET_KL): New.
(TARGET_KL_P): Likewise.
(TARGET_WIDEKL): Likewise.
(TARGET_WIDEKL_P): Likewise.
(PTA_KL): Likewise.
(PTA_WIDEKL): Likewise.
(enum reg_class): Add 7 new SSE register classes.
(REG_CLASS_NAMES): Likewise.
(REG_CLASS_CONTENTS): Likewise.
* config/i386/i386.opt: Add new option mkl and mwidekl.
* config/i386/keylockerintrin.h: New header file for Keylocker.
* config/i386/immintrin.h: Include keylockerintrin.h.
* config/i386/sse.md (UNSPECV_LOADIWKEY): New.
(UNSPECV_AESDEC128KLU8): Likewise.
(UNSPECV_AESENC128KLU8): Likewise.
(UNSPECV_AESDEC256KLU8): Likewise.
(UNSPECV_AESENC256KLU8): Likewise.
(UNSPECV_AESDECWIDE128KLU8): Likewise.
(UNSPECV_AESENCWIDE128KLU8): Likewise.
(UNSPECV_AESDECWIDE256KLU8): Likewise.
(UNSPECV_AESENCWIDE256KLU8): Likewise.
(UNSPECV_ENCODEKEY128U32): Likewise.
(UNSPECV_ENCODEKEY256U32): Likewise.
(loadiwkey): New insn pattern.
(encodekey128u32): Likewise.
(encodekey256u32): Likewise.
(aesu8): Likewise.
(aesu8): Likewise.

gcc/testsuite/ChangeLog

* gcc.target/i386/keylocker-aesdec128kl.c: New test.
* gcc.target/i386/keylocker-aesdec256kl.c: Likewise.
* gcc.target/i386/keylocker-aesdecwide128kl.c: Likewise.
* gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
* gcc.target/i386/keylocker-aesenc128kl.c: Likewise.
* gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
* gcc.target

Re: [PATCH] Enable GCC support for AMX

2020-09-28 Thread Hongyu Wang via Gcc-patches
Thanks!  I'll ask my colleague to help check in the patch.

Kirill Yukhin  于2020年9月28日周一 下午7:38写道:

> Hello,
>
> On 12 сен 01:00, Hongyu Wang wrote:
> > Hi
> >
> > Thanks for your review, and sorry for the late reply. It took a while
> > to finish the runtime test.
>
> Thanks for your fixes! The patch is OK for trunk.
>
> --
> Thanks, K
>


-- 
Regards,

Hongyu, Wang


[PATCH] Add Missing FSF copyright notes for some x86 intrinsic headers

2020-09-28 Thread Hongyu Wang via Gcc-patches
Hi,

Some x86 intrinsic headers are missing FSF copyright notices. This patch adds
the missing notices to those headers.

OK for master?

gcc/ChangeLog:

* config/i386/amxbf16intrin.h: Add FSF copyright notes.
* config/i386/amxint8intrin.h: Ditto.
* config/i386/amxtileintrin.h: Ditto.
* config/i386/avx512vp2intersectintrin.h: Ditto.
* config/i386/avx512vp2intersectvlintrin.h: Ditto.
* config/i386/pconfigintrin.h: Ditto.
* config/i386/tsxldtrkintrin.h: Ditto.
* config/i386/wbnoinvdintrin.h: Ditto.

-- 
Regards,

Hongyu, Wang
From ec6263ba1d74953721dd274c301bdeeeb71d5e77 Mon Sep 17 00:00:00 2001
From: Hongyu Wang 
Date: Mon, 28 Sep 2020 22:22:28 +
Subject: [PATCH] Add missing FSF copyright notes for x86 intrinsic headers.

gcc/ChangeLog:

	* config/i386/amxbf16intrin.h: Add FSF copyright notes.
	* config/i386/amxint8intrin.h: Ditto.
	* config/i386/amxtileintrin.h: Ditto.
	* config/i386/avx512vp2intersectintrin.h: Ditto.
	* config/i386/avx512vp2intersectvlintrin.h: Ditto.
	* config/i386/pconfigintrin.h: Ditto.
	* config/i386/tsxldtrkintrin.h: Ditto.
	* config/i386/wbnoinvdintrin.h: Ditto.
---
 gcc/config/i386/amxbf16intrin.h  | 23 
 gcc/config/i386/amxint8intrin.h  | 23 
 gcc/config/i386/amxtileintrin.h  | 23 
 gcc/config/i386/avx512vp2intersectintrin.h   | 23 
 gcc/config/i386/avx512vp2intersectvlintrin.h | 23 
 gcc/config/i386/pconfigintrin.h  | 23 
 gcc/config/i386/tsxldtrkintrin.h | 23 
 gcc/config/i386/wbnoinvdintrin.h | 23 
 8 files changed, 184 insertions(+)

diff --git a/gcc/config/i386/amxbf16intrin.h b/gcc/config/i386/amxbf16intrin.h
index b1620963944..77cc395e86d 100644
--- a/gcc/config/i386/amxbf16intrin.h
+++ b/gcc/config/i386/amxbf16intrin.h
@@ -1,3 +1,26 @@
+/* Copyright (C) 2020 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
 #if !defined _IMMINTRIN_H_INCLUDED
 #error "Never use <amxbf16intrin.h> directly; include <immintrin.h> instead."
 #endif
diff --git a/gcc/config/i386/amxint8intrin.h b/gcc/config/i386/amxint8intrin.h
index 11adc1f1295..f4e410b6647 100644
--- a/gcc/config/i386/amxint8intrin.h
+++ b/gcc/config/i386/amxint8intrin.h
@@ -1,3 +1,26 @@
+/* Copyright (C) 2020 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
 #if !defined _IMMINTRIN_H_INCLUDED
 #error "Never use <amxint8intrin.h> directly; include <immintrin.h> instead."
 #endif
diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
index e78e5c04909..41fb9a5d86a 100644
--- a/gcc/config/i386/amxtileintrin.h
+++ b/gcc/config/i386/amxtileintrin.h
@@ -1,3 +1,26 @@
+/* Copyright (C) 2020 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without ev

Re: [committed] testsuite: Fix up amx* dg-do run tests with older binutils

2020-09-30 Thread Hongyu Wang via Gcc-patches
Thanks for the fix! I forgot that we don't have builtin check for
target-supports.exp.

Will update these once we implement AMX with builtins.

Jakub Jelinek  于2020年9月30日周三 下午7:51写道:

> On Fri, Sep 18, 2020 at 04:31:55PM +0800, Hongyu Wang via Gcc-patches
> wrote:
> > Very Appreciated for your review again
> >
> > I just update the patch with adding XSAVE dependency and use
> > __builtin_cpu_supports for runtime test.
>
> Several tests FAIL when using older binutils that don't support AMX.
>
> Fixed thusly, tested on x86_64-linux -m32/-m64, committed to trunk as
> obvious:
>
> 2020-09-30  Jakub Jelinek  
>
> * gcc.target/i386/amxint8-dpbssd-2.c: Require effective targets
> amx_tile and amx_int8.
> * gcc.target/i386/amxint8-dpbsud-2.c: Likewise.
> * gcc.target/i386/amxint8-dpbusd-2.c: Likewise.
> * gcc.target/i386/amxint8-dpbuud-2.c: Likewise.
> * gcc.target/i386/amxbf16-dpbf16ps-2.c: Require effective targets
> amx_tile and amx_bf16.
> * gcc.target/i386/amxtile-2.c: Require effective target amx_tile.
>
> --- gcc/testsuite/gcc.target/i386/amxint8-dpbssd-2.c.jj 2020-09-29
> 11:32:02.950602758 +0200
> +++ gcc/testsuite/gcc.target/i386/amxint8-dpbssd-2.c2020-09-30
> 13:16:08.186445881 +0200
> @@ -1,4 +1,6 @@
>  /* { dg-do run { target { ! ia32 } } } */
> +/* { dg-require-effective-target amx_tile } */
> +/* { dg-require-effective-target amx_int8 } */
>  /* { dg-options "-O2 -mamx-tile -mamx-int8" } */
>  #include 
>
> --- gcc/testsuite/gcc.target/i386/amxint8-dpbsud-2.c.jj 2020-09-29
> 11:32:02.950602758 +0200
> +++ gcc/testsuite/gcc.target/i386/amxint8-dpbsud-2.c2020-09-30
> 13:16:23.715221450 +0200
> @@ -1,4 +1,6 @@
>  /* { dg-do run { target { ! ia32 } } } */
> +/* { dg-require-effective-target amx_tile } */
> +/* { dg-require-effective-target amx_int8 } */
>  /* { dg-options "-O2 -mamx-tile -mamx-int8" } */
>  #include 
>
> --- gcc/testsuite/gcc.target/i386/amxint8-dpbusd-2.c.jj 2020-09-29
> 11:32:02.950602758 +0200
> +++ gcc/testsuite/gcc.target/i386/amxint8-dpbusd-2.c2020-09-30
> 13:16:28.998145100 +0200
> @@ -1,4 +1,6 @@
>  /* { dg-do run { target { ! ia32 } } } */
> +/* { dg-require-effective-target amx_tile } */
> +/* { dg-require-effective-target amx_int8 } */
>  /* { dg-options "-O2 -mamx-tile -mamx-int8" } */
>  #include 
>
> --- gcc/testsuite/gcc.target/i386/amxint8-dpbuud-2.c.jj 2020-09-29
> 11:32:02.950602758 +0200
> +++ gcc/testsuite/gcc.target/i386/amxint8-dpbuud-2.c2020-09-30
> 13:16:35.770047224 +0200
> @@ -1,4 +1,6 @@
>  /* { dg-do run { target { ! ia32 } } } */
> +/* { dg-require-effective-target amx_tile } */
> +/* { dg-require-effective-target amx_int8 } */
>  /* { dg-options "-O2 -mamx-tile -mamx-int8" } */
>  #include 
>
> --- gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c.jj   2020-09-29
> 11:32:02.949602773 +0200
> +++ gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c  2020-09-30
> 13:15:41.079837637 +0200
> @@ -1,4 +1,6 @@
>  /* { dg-do run { target { ! ia32 } } } */
> +/* { dg-require-effective-target amx_tile } */
> +/* { dg-require-effective-target amx_bf16 } */
>  /* { dg-options "-O2 -mamx-tile -mamx-bf16" } */
>  #include 
>
> --- gcc/testsuite/gcc.target/i386/amxtile-2.c.jj2020-09-29
> 11:32:02.950602758 +0200
> +++ gcc/testsuite/gcc.target/i386/amxtile-2.c   2020-09-30
> 13:16:57.972726339 +0200
> @@ -1,4 +1,5 @@
>  /* { dg-do run { target { ! ia32 } } } */
> +/* { dg-require-effective-target amx_tile } */
>  /* { dg-options "-O2 -mamx-tile " } */
>  #include 
>
>
>
> Jakub
>
>

-- 
Regards,

Hongyu, Wang


[PATCH] i386: Fix pr104551 testcase for solaris [PR 104726]

2022-03-01 Thread Hongyu Wang via Gcc-patches
Use the avx2-check mechanism to avoid an illegal instruction on non-AVX2 targets.

Tested by Rainer Orth on Solaris/x86. Pushed to trunk as obvious fix.

gcc/testsuite/ChangeLog:

PR target/104726
* gcc.target/i386/pr104551.c: Use avx2-check.h.
---
 gcc/testsuite/gcc.target/i386/pr104551.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr104551.c 
b/gcc/testsuite/gcc.target/i386/pr104551.c
index 6300f25c0d5..b839d5b2b0d 100644
--- a/gcc/testsuite/gcc.target/i386/pr104551.c
+++ b/gcc/testsuite/gcc.target/i386/pr104551.c
@@ -2,6 +2,8 @@
 /* { dg-options "-O3 -mavx2" } */
 /* { dg-require-effective-target avx2 } */
 
+#include "avx2-check.h"
+
 unsigned int
 __attribute__((noipa))
 test(unsigned int a, unsigned char p[16]) {
@@ -11,7 +13,8 @@ test(unsigned int a, unsigned char p[16]) {
   return res;
 }
 
-int main ()
+static void
+avx2_test (void)
 {
   unsigned int a = 16U;
   unsigned char p[16];
@@ -20,5 +23,4 @@ int main ()
   unsigned int res = test (a, p);
   if (res != 128)
 __builtin_abort ();
-  return 0;
 }
-- 
2.18.1



[PATCH] AVX512FP16: Fix vcvt[u]si2sh runtime tests for Solaris

2022-03-01 Thread Hongyu Wang via Gcc-patches
Use a standard C type instead of __int64_t, which doesn't work on Solaris.

Tested by Rainer Orth on Solaris/x86. Pushed to trunk as obvious fix.

gcc/testsuite/ChangeLog:

PR target/104724
* gcc.target/i386/avx512fp16-vcvtsi2sh-1b.c: Use long long
instead of __int64_t.
* gcc.target/i386/avx512fp16-vcvtsi2sh64-1b.c: Ditto.
* gcc.target/i386/avx512fp16-vcvtusi2sh-1b.c: Ditto.
* gcc.target/i386/avx512fp16-vcvtusi2sh64-1b.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/avx512fp16-vcvtsi2sh-1b.c| 2 +-
 gcc/testsuite/gcc.target/i386/avx512fp16-vcvtsi2sh64-1b.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512fp16-vcvtusi2sh-1b.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512fp16-vcvtusi2sh64-1b.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtsi2sh-1b.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtsi2sh-1b.c
index d9c9a853a17..7f7e6032e60 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtsi2sh-1b.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtsi2sh-1b.c
@@ -9,7 +9,7 @@
 
 void NOINLINE
 emulate_vcvtsi2sh(V512 *dest, V512 op1, 
- int value_32, __int64_t value_64, int bits)
+ int value_32, long long value_64, int bits)
 {
   V512 v1,v2,v5,v6;
   unpack_ph_2twops(op1, &v1, &v2);
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtsi2sh64-1b.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtsi2sh64-1b.c
index 6f66a87a8e7..5bca1905f82 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtsi2sh64-1b.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtsi2sh64-1b.c
@@ -9,7 +9,7 @@
 
 void NOINLINE
 emulate_vcvtsi2sh(V512 *dest, V512 op1, 
- int value_32, __int64_t value_64, int bits)
+ int value_32, long long value_64, int bits)
 {
   V512 v1,v2,v5,v6;
   unpack_ph_2twops(op1, &v1, &v2);
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtusi2sh-1b.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtusi2sh-1b.c
index d339f0a4043..e17579cfd3c 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtusi2sh-1b.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtusi2sh-1b.c
@@ -9,7 +9,7 @@
 
 void NOINLINE
 emulate_vcvtusi2sh(V512 *dest, V512 op1, 
-  int value_32, __int64_t value_64, int bits)
+  int value_32, long long value_64, int bits)
 {
   V512 v1,v2,v5,v6;
   unpack_ph_2twops(op1, &v1, &v2);
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtusi2sh64-1b.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtusi2sh64-1b.c
index 20e711e1b0e..42726bd3d67 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtusi2sh64-1b.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vcvtusi2sh64-1b.c
@@ -9,7 +9,7 @@
 
 void NOINLINE
 emulate_vcvtusi2sh(V512 *dest, V512 op1, 
-  int value_32, __int64_t value_64, int bits)
+  int value_32, long long value_64, int bits)
 {
   V512 v1,v2,v5,v6;
   unpack_ph_2twops(op1, &v1, &v2);
-- 
2.18.1



[PATCH] AVX512FP16: Fix masm=intel output for vfc?(madd|mul)csh [PR 104977]

2022-03-18 Thread Hongyu Wang via Gcc-patches
Hi, 

This patch fixes a typo in the subst for the scalar complex mask_round operand.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

Ok for master?

gcc/ChangeLog:

PR target/104977
* config/i386/sse.md

(avx512fp16_fmash_v8hf):
Correct round operand for intel dialect.

gcc/testsuite/ChangeLog:

PR target/104977
* gcc.target/i386/pr104977.c: New test.
---
 gcc/config/i386/sse.md   |  2 +-
 gcc/testsuite/gcc.target/i386/pr104977.c | 13 +
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104977.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ed98120be59..21bf3c55c95 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6723,7 +6723,7 @@ (define_insn 
"avx512fp16_fma_sh_v8hfsh\t{%2, %1, 
%0|%0, %1, 
%2}"
+  "vsh\t{%2, %1, 
%0|%0, %1, 
%2}"
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "V8HF")])
 
diff --git a/gcc/testsuite/gcc.target/i386/pr104977.c 
b/gcc/testsuite/gcc.target/i386/pr104977.c
new file mode 100644
index 000..9faa4db3b0d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104977.c
@@ -0,0 +1,13 @@
+/* PR target/104977 */
+/* { dg-do assemble } */
+/* { dg-options "-O2 -mavx512fp16 -masm=intel" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-require-effective-target masm_intel } */
+
+#include
+
+__m128h
+foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{
+  return _mm_fcmadd_round_sch (a, b, c, 8);
+}
-- 
2.18.1



[PATCH] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]

2022-03-18 Thread Hongyu Wang via Gcc-patches
Hi,

For complex scalar intrinsics like _mm_mask_fcmadd_sch, the
mask should be ANDed with 1 to ensure that only the lowest mask bit takes effect.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

Ok for master?

gcc/ChangeLog:

PR target/104978
* config/i386/sse.md
(avx512fp16_fmaddcsh_v8hf_mask1"
(match_operand:QI 4 "register_operand")]
   "TARGET_AVX512FP16 && "
 {
-  rtx op0, op1;
+  rtx op0, op1, mask;
 
   if ()
 emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (
@@ -6590,11 +6590,13 @@ (define_expand 
"avx512fp16_fmaddcsh_v8hf_mask1"
   {
 op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
 op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
+mask = gen_reg_rtx (QImode);
+emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
+emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
   }
   else
   {
-rtx mask, tmp, vec_mask;
+rtx tmp, vec_mask;
 mask = lowpart_subreg (SImode, operands[4], QImode),
 tmp = gen_reg_rtx (SImode);
 emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
@@ -6631,7 +6633,7 @@ (define_expand 
"avx512fp16_fcmaddcsh_v8hf_mask1"
(match_operand:QI 4 "register_operand")]
   "TARGET_AVX512FP16 && "
 {
-  rtx op0, op1;
+  rtx op0, op1, mask;
 
   if ()
 emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (
@@ -6645,11 +6647,13 @@ (define_expand 
"avx512fp16_fcmaddcsh_v8hf_mask1"
   {
 op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
 op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
+mask = gen_reg_rtx (QImode);
+emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
+emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
   }
   else
   {
-rtx mask, tmp, vec_mask;
+rtx tmp, vec_mask;
 mask = lowpart_subreg (SImode, operands[4], QImode),
 tmp = gen_reg_rtx (SImode);
 emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c 
b/gcc/testsuite/gcc.target/i386/pr104978.c
new file mode 100644
index 000..fd22a6c3f43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104978.c
@@ -0,0 +1,18 @@
+/* PR target/104978 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */
+
+#include
+
+__m128h
+foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{ 
+  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
+}
+
+__m128h
+foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{ 
+  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
+}
-- 
2.18.1



Re: [PATCH] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]

2022-03-20 Thread Hongyu Wang via Gcc-patches
> Would it be better to use vmovss under avx512vl without & 1 for mask.

vmovss clears the upper bits, but the intrinsic requires src1. We
still need either a mask move or blend for the high part.

LLVM generates mask & 1 for these intrinsics.

Hongtao Liu via Gcc-patches  于2022年3月21日周一 09:08写道:
>
> On Sat, Mar 19, 2022 at 8:09 AM Hongyu Wang via Gcc-patches
>  wrote:
> >
> > Hi,
> >
> > For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
> > mask should be and by 1 to ensure the mask is bind to lowest byte.
> >
> > Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > Ok for master?
> >
> > gcc/ChangeLog:
> >
> > PR target/104978
> > * config/i386/sse.md
> > (avx512fp16_fmaddcsh_v8hf_mask1 > Generate mask & 1 before move to dest under TARGET_AVX512VL.
> > (avx512fp16_fcmaddcsh_v8hf_mask1 >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/104978
> > * gcc.target/i386/pr104978.c: New test.
> > ---
> >  gcc/config/i386/sse.md   | 16 ++--
> >  gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++
> >  2 files changed, 28 insertions(+), 6 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c
> >
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index ed98120be59..cc4c5542ee6 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -6576,7 +6576,7 @@ (define_expand 
> > "avx512fp16_fmaddcsh_v8hf_mask1"
> > (match_operand:QI 4 "register_operand")]
> >"TARGET_AVX512FP16 && "
> >  {
> > -  rtx op0, op1;
> > +  rtx op0, op1, mask;
> >
> >if ()
> >  emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (
> > @@ -6590,11 +6590,13 @@ (define_expand 
> > "avx512fp16_fmaddcsh_v8hf_mask1"
> >{
> >  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> >  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > -emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > +mask = gen_reg_rtx (QImode);
> > +emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> > +emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
> >}
> >else
> >{
> > -rtx mask, tmp, vec_mask;
> > +rtx tmp, vec_mask;
> >  mask = lowpart_subreg (SImode, operands[4], QImode),
> >  tmp = gen_reg_rtx (SImode);
> >  emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > @@ -6631,7 +6633,7 @@ (define_expand 
> > "avx512fp16_fcmaddcsh_v8hf_mask1"
> > (match_operand:QI 4 "register_operand")]
> >"TARGET_AVX512FP16 && "
> >  {
> > -  rtx op0, op1;
> > +  rtx op0, op1, mask;
> >
> >if ()
> >  emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (
> > @@ -6645,11 +6647,13 @@ (define_expand 
> > "avx512fp16_fcmaddcsh_v8hf_mask1"
> >{
> >  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> >  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > -emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > +mask = gen_reg_rtx (QImode);
> > +emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> > +emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
> Would it be better to use vmovss under avx512vl without & 1 for mask.
> >}
> >else
> >{
> > -rtx mask, tmp, vec_mask;
> > +rtx tmp, vec_mask;
> >  mask = lowpart_subreg (SImode, operands[4], QImode),
> >  tmp = gen_reg_rtx (SImode);
> >  emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c 
> > b/gcc/testsuite/gcc.target/i386/pr104978.c
> > new file mode 100644
> > index 000..fd22a6c3f43
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr104978.c
> > @@ -0,0 +1,18 @@
> > +/* PR target/104978 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> > +/* { dg-final { scan-assembler-times "and\[^\\n\\r\]*\\\$1" 2 } } */
> > +
> > +#include
> > +
> > +__m128h
> > +foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
> > +{
> > +  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
> > +}
> > +
> > +__m128h
> > +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
> > +{
> > +  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
> > +}
> > --
> > 2.18.1
> >
>
>
> --
> BR,
> Hongtao


Re: [PATCH] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]

2022-03-20 Thread Hongyu Wang via Gcc-patches
> > > Would it be better to use vmovss under avx512vl without & 1 for mask.
> >
> > vmovss clears the upper bits, but the intrinsic requires src1. We
> > still need either a mask move or blend for the high part.
> not for __m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
> https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vmovss&ig_expand=3807,3081,3082,3084,3083,4837,4838

Oh, if this works, the non-avx512vl part could also be adjusted. Will
try this, thanks.

Hongtao Liu  于2022年3月21日周一 09:48写道:
>
> On Mon, Mar 21, 2022 at 9:22 AM Hongyu Wang  wrote:
> >
> > > Would it be better to use vmovss under avx512vl without & 1 for mask.
> >
> > vmovss clears the upper bits, but the intrinsic requires src1. We
> > still need either a mask move or blend for the high part.
> not for __m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
> https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vmovss&ig_expand=3807,3081,3082,3084,3083,4837,4838
> >
> > LLVM generates mask & 1 for these intrinsics.
> >
> > Hongtao Liu via Gcc-patches  于2022年3月21日周一 09:08写道:
> > >
> > > On Sat, Mar 19, 2022 at 8:09 AM Hongyu Wang via Gcc-patches
> > >  wrote:
> > > >
> > > > Hi,
> > > >
> > > > For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
> > > > mask should be and by 1 to ensure the mask is bind to lowest byte.
> > > >
> > > > Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> > > >
> > > > Ok for master?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > PR target/104978
> > > > * config/i386/sse.md
> > > > (avx512fp16_fmaddcsh_v8hf_mask1 > > > Generate mask & 1 before move to dest under TARGET_AVX512VL.
> > > > (avx512fp16_fcmaddcsh_v8hf_mask1 > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR target/104978
> > > > * gcc.target/i386/pr104978.c: New test.
> > > > ---
> > > >  gcc/config/i386/sse.md   | 16 ++--
> > > >  gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++
> > > >  2 files changed, 28 insertions(+), 6 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c
> > > >
> > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > > index ed98120be59..cc4c5542ee6 100644
> > > > --- a/gcc/config/i386/sse.md
> > > > +++ b/gcc/config/i386/sse.md
> > > > @@ -6576,7 +6576,7 @@ (define_expand 
> > > > "avx512fp16_fmaddcsh_v8hf_mask1"
> > > > (match_operand:QI 4 "register_operand")]
> > > >"TARGET_AVX512FP16 && "
> > > >  {
> > > > -  rtx op0, op1;
> > > > +  rtx op0, op1, mask;
> > > >
> > > >if ()
> > > >  emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (
> > > > @@ -6590,11 +6590,13 @@ (define_expand 
> > > > "avx512fp16_fmaddcsh_v8hf_mask1"
> > > >{
> > > >  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > > >  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > > > -emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, 
> > > > operands[4]));
> > > > +mask = gen_reg_rtx (QImode);
> > > > +emit_insn (gen_andqi3 (mask, operands[4], GEN_INT (1)));
> > > > +emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, mask));
> > > >}
> > > >else
> > > >{
> > > > -rtx mask, tmp, vec_mask;
> > > > +rtx tmp, vec_mask;
> > > >  mask = lowpart_subreg (SImode, operands[4], QImode),
> > > >  tmp = gen_reg_rtx (SImode);
> > > >  emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > > > @@ -6631,7 +6633,7 @@ (define_expand 
> > > > "avx512fp16_fcmaddcsh_v8hf_mask1"
> > > > (match_operand:QI 4 "register_operand")]
> > > >"TARGET_AVX512FP16 && "
> > > >  {
> > > > -  rtx op0, op1;
> > > > +  rtx op0, op1, mask;
> > > >
> > > >if ()
> > > >  emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (
> > > > @@ -6645,11 +6647,13 @@ (define_

[PATCH v2] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]

2022-03-21 Thread Hongyu Wang via Gcc-patches
Hi,

For complex scalar intrinsics like _mm_mask_fcmadd_sch, the
mask should be ANDed with 1 to ensure that only the lowest mask bit takes effect.
Use a masked vmovss to perform the same operation, since it ignores the higher
bits of the mask.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

Ok for master?

gcc/ChangeLog:

PR target/104978
* config/i386/sse.md
(avx512fp16_fmaddcsh_v8hf_mask1"
 emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
   operands[1], operands[2], operands[3], operands[4]));
 
-  if (TARGET_AVX512VL)
-  {
-op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
-  }
-  else
-  {
-rtx mask, tmp, vec_mask;
-mask = lowpart_subreg (SImode, operands[4], QImode),
-tmp = gen_reg_rtx (SImode);
-emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
-vec_mask = gen_reg_rtx (V4SImode);
-emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
-emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
-vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
-op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
-  }
+  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
+  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+  emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4]));
+  emit_move_insn (op0, op1);
   DONE;
 })
 
@@ -6641,26 +6625,10 @@ (define_expand 
"avx512fp16_fcmaddcsh_v8hf_mask1"
 emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0],
   operands[1], operands[2], operands[3], operands[4]));
 
-  if (TARGET_AVX512VL)
-  {
-op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
-  }
-  else
-  {
-rtx mask, tmp, vec_mask;
-mask = lowpart_subreg (SImode, operands[4], QImode),
-tmp = gen_reg_rtx (SImode);
-emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
-vec_mask = gen_reg_rtx (V4SImode);
-emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
-emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
-vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
-op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
-  }
+  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
+  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+  emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4]));
+  emit_move_insn (op0, op1);
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
index eb96588df39..0f87861f09b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
@@ -1,13 +1,13 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
+/* { dg-options "-mavx512fp16 -O2" } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
 \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vblendvps\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vmovss\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[
 \\t\]+#)" 2 } } */
 
 #include 
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
deleted file mode 

Re: [PATCH v2] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]

2022-03-21 Thread Hongyu Wang via Gcc-patches
> Considering ICE in PR104976, it's better to force_reg before lowpart_subreg.
> i.e.
> op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), V8HFmode);
> if (!MEM_P (operands[1]))
>   operands[1] = force_reg (V8HFmode, operands[1]);
> op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> rtx dest = gen_reg_rtx (V4SFmode);
> emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4]));
> emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode);

I think this is different from PR104976, since operands[0] and
operands[1] here are strictly V8HF operands coming from the builtin input.
I suppose there is no chance of a different-size subreg being passed
to the expander; otherwise the (__v8hf) conversion in the builtin would
fail first.

Hongtao Liu via Gcc-patches  于2022年3月21日周一 20:53写道:

>
> On Mon, Mar 21, 2022 at 7:52 PM Hongyu Wang via Gcc-patches
>  wrote:
> >
> > Hi,
> >
> > For complex scalar intrinsic like _mm_mask_fcmadd_sch, the
> > mask should be and by 1 to ensure the mask is bind to lowest byte.
> > Use masked vmovss to perform same operation which omits higher bits
> > of mask.
> >
> > Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > Ok for master?
> >
> > gcc/ChangeLog:
> >
> > PR target/104978
> > * config/i386/sse.md
> > > (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>):
> > > Use avx512f_movsf_mask instead of vmovaps or vblend.
> > > (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>): Likewise.
> > >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/104978
> > * gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c: Adjust scan.
> > * gcc.target/i386/avx512fp16-vfmaddcsh-1a.c: Ditto.
> > * gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c: Removed.
> > * gcc.target/i386/avx512fp16-vfmaddcsh-1c.c: Ditto.
> > * gcc.target/i386/pr104978.c: New test.
> > ---
> >  gcc/config/i386/sse.md| 48 ---
> >  .../i386/avx512fp16-vfcmaddcsh-1a.c   |  4 +-
> >  .../i386/avx512fp16-vfcmaddcsh-1c.c   | 13 -
> >  .../gcc.target/i386/avx512fp16-vfmaddcsh-1a.c |  4 +-
> >  .../gcc.target/i386/avx512fp16-vfmaddcsh-1c.c | 13 -
> >  gcc/testsuite/gcc.target/i386/pr104978.c  | 18 +++
> >  6 files changed, 30 insertions(+), 70 deletions(-)
> >  delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
> >  delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c
> >
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index 21bf3c55c95..1087a37812f 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -6586,26 +6586,10 @@ (define_expand 
> > "avx512fp16_fmaddcsh_v8hf_mask1"
> >  emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
> >operands[1], operands[2], operands[3], operands[4]));
> >
> > -  if (TARGET_AVX512VL)
> > -  {
> > -op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > -op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > -emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
> > -  }
> > -  else
> > -  {
> > -rtx mask, tmp, vec_mask;
> > -mask = lowpart_subreg (SImode, operands[4], QImode),
> > -tmp = gen_reg_rtx (SImode);
> > -emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
> > -vec_mask = gen_reg_rtx (V4SImode);
> > -emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
> > -emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
> > -vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
> > -op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > -op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > -emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
> > -  }
> > +  op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
> > +  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> > +  emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4]));
> > +  emit_move_insn (op0, op1);
> Considering ICE in PR104976, it's better to force_reg before lowpart_subreg.
> i.e.
> op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), V8HFmode);
> if (!MEM_P (operands[1]))
>   operands[1] = force_reg (V8HFmode, operands[1]);
> op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
> rtx dest = gen_reg_rtx (V4SFmode);
> emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[

[PATCH v3] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]

2022-03-21 Thread Hongyu Wang via Gcc-patches
Hi, here is the patch with force_reg before lowpart_subreg.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

Ok for master?

For complex scalar intrinsics like _mm_mask_fcmadd_sch, the
mask should be ANDed with 1 to ensure that only its lowest bit is used.
Use a masked vmovss to perform the same operation, since it ignores the
higher bits of the mask.

gcc/ChangeLog:

PR target/104978
* config/i386/sse.md
(avx512fp16_fmaddcsh_v8hf_mask1"
(match_operand:QI 4 "register_operand")]
   "TARGET_AVX512FP16 && "
 {
-  rtx op0, op1;
+  rtx op0, op1, dest;
 
   if ()
 emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (
@@ -6586,26 +6586,15 @@ (define_expand 
"avx512fp16_fmaddcsh_v8hf_mask1"
 emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
   operands[1], operands[2], operands[3], operands[4]));
 
-  if (TARGET_AVX512VL)
-  {
-op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
-  }
-  else
-  {
-rtx mask, tmp, vec_mask;
-mask = lowpart_subreg (SImode, operands[4], QImode),
-tmp = gen_reg_rtx (SImode);
-emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
-vec_mask = gen_reg_rtx (V4SImode);
-emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
-emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
-vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
-op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
-  }
+  op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]),
+   V8HFmode);
+  if (!MEM_P (operands[1]))
+operands[1] = force_reg (V8HFmode, operands[1]);
+  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+  dest = gen_reg_rtx (V4SFmode);
+  emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4]));
+  emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest,
+  V4SFmode));
   DONE;
 })
 
@@ -6631,7 +6620,7 @@ (define_expand 
"avx512fp16_fcmaddcsh_v8hf_mask1"
(match_operand:QI 4 "register_operand")]
   "TARGET_AVX512FP16 && "
 {
-  rtx op0, op1;
+  rtx op0, op1, dest;
 
   if ()
 emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (
@@ -6641,26 +6630,15 @@ (define_expand 
"avx512fp16_fcmaddcsh_v8hf_mask1"
 emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0],
   operands[1], operands[2], operands[3], operands[4]));
 
-  if (TARGET_AVX512VL)
-  {
-op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
-  }
-  else
-  {
-rtx mask, tmp, vec_mask;
-mask = lowpart_subreg (SImode, operands[4], QImode),
-tmp = gen_reg_rtx (SImode);
-emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
-vec_mask = gen_reg_rtx (V4SImode);
-emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
-emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
-vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
-op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
-  }
+  op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]),
+   V8HFmode);
+  if (!MEM_P (operands[1]))
+operands[1] = force_reg (V8HFmode, operands[1]);
+  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+  dest = gen_reg_rtx (V4SFmode);
+  emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4]));
+  emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest,
+  V4SFmode));
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
index eb96588df39..0f87861f09b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
@@ -1,13 +1,13 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
+/* { dg-options "-mavx512fp16 -O2" } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-

Re: [PATCH] x86: Use x constraint on KL patterns

2022-03-25 Thread Hongyu Wang via Gcc-patches
Is it possible to create a test case for which gas would throw an error
on invalid operands?

H.J. Lu via Gcc-patches  于2022年3月26日周六 04:50写道:
>
> Since KL instructions have no AVX512 version, replace the "v" register
> constraint with the "x" register constraint.
>
> PR target/105058
> * config/i386/sse.md (loadiwkey): Replace "v" with "x".
> (aesu8): Likewise.
> ---
>  gcc/config/i386/sse.md | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 29802d00ce6..33bd2c4768a 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -28364,8 +28364,8 @@ (define_insn "avx512f_dpbf16ps__mask"
>
>  ;; KEYLOCKER
>  (define_insn "loadiwkey"
> -  [(unspec_volatile:V2DI [(match_operand:V2DI 0 "register_operand" "v")
> - (match_operand:V2DI 1 "register_operand" "v")
> +  [(unspec_volatile:V2DI [(match_operand:V2DI 0 "register_operand" "x")
> + (match_operand:V2DI 1 "register_operand" "x")
>   (match_operand:V2DI 2 "register_operand" "Yz")
>   (match_operand:SI   3 "register_operand" "a")]
>  UNSPECV_LOADIWKEY)
> @@ -28498,7 +28498,7 @@ (define_int_attr aesklvariant
> (UNSPECV_AESENC256KLU8 "enc256kl")])
>
>  (define_insn "aesu8"
> -  [(set (match_operand:V2DI 0 "register_operand" "=v")
> +  [(set (match_operand:V2DI 0 "register_operand" "=x")
> (unspec_volatile:V2DI [(match_operand:V2DI 1 "register_operand" "0")
>(match_operand:BLK   2 "memory_operand" "m")]
>   AESDECENCKL))
> --
> 2.35.1
>


Re: [PATCH] x86: Use x constraint on KL patterns

2022-03-25 Thread Hongyu Wang via Gcc-patches
> > Is it possible to create a test case that gas would throw an error for
> > invalid operands?
>
> You can use -ffix-xmmN to disable XMM0-15.

I mean, can we create an intrinsic test for this PR that produces xmm16-31?
And is -ffix-xmmN an option for the assembler or for the compiler? I didn't
find it in the documentation.

H.J. Lu  于2022年3月26日周六 09:22写道:
>
> On Fri, Mar 25, 2022 at 6:08 PM Hongyu Wang  wrote:
> >
> > Is it possible to create a test case that gas would throw an error for
> > invalid operands?
>
> You can use -ffix-xmmN to disable XMM0-15.
>
> > H.J. Lu via Gcc-patches  于2022年3月26日周六 04:50写道:
> > >
> > > Since KL instructions have no AVX512 version, replace the "v" register
> > > constraint with the "x" register constraint.
> > >
> > > PR target/105058
> > > * config/i386/sse.md (loadiwkey): Replace "v" with "x".
> > > (aesu8): Likewise.
> > > ---
> > >  gcc/config/i386/sse.md | 6 +++---
> > >  1 file changed, 3 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index 29802d00ce6..33bd2c4768a 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -28364,8 +28364,8 @@ (define_insn "avx512f_dpbf16ps__mask"
> > >
> > >  ;; KEYLOCKER
> > >  (define_insn "loadiwkey"
> > > -  [(unspec_volatile:V2DI [(match_operand:V2DI 0 "register_operand" "v")
> > > - (match_operand:V2DI 1 "register_operand" "v")
> > > +  [(unspec_volatile:V2DI [(match_operand:V2DI 0 "register_operand" "x")
> > > + (match_operand:V2DI 1 "register_operand" "x")
> > >   (match_operand:V2DI 2 "register_operand" "Yz")
> > >   (match_operand:SI   3 "register_operand" "a")]
> > >  UNSPECV_LOADIWKEY)
> > > @@ -28498,7 +28498,7 @@ (define_int_attr aesklvariant
> > > (UNSPECV_AESENC256KLU8 "enc256kl")])
> > >
> > >  (define_insn "aesu8"
> > > -  [(set (match_operand:V2DI 0 "register_operand" "=v")
> > > +  [(set (match_operand:V2DI 0 "register_operand" "=x")
> > > (unspec_volatile:V2DI [(match_operand:V2DI 1 "register_operand" 
> > > "0")
> > >(match_operand:BLK   2 "memory_operand" 
> > > "m")]
> > >   AESDECENCKL))
> > > --
> > > 2.35.1
> > >
>
>
>
> --
> H.J.


[PATCH] i386: Fix infinite loop under -mrelax-cmpxchg-loop [PR 103069]

2022-04-13 Thread Hongyu Wang via Gcc-patches
Hi,

For -mrelax-cmpxchg-loop, which relaxes atomic_fetch_<op> loops,
there is a missing set to %eax when the compare fails, which results
in an infinite loop in some benchmarks. Add the set to %eax to avoid it.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}

Ok for master?

gcc/ChangeLog:

PR target/103069
* config/i386/i386-expand.cc (ix86_expand_cmpxchg_loop):
  Add missing set to target_val at pause label.
---
 gcc/config/i386/i386-expand.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 794315ee2f7..2144d3f968d 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23388,6 +23388,7 @@ void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx 
target_val,
 
 /* If mem is not expected, pause and loop back.  */
 emit_label (cmp_label);
+emit_move_insn (target_val, new_mem);
 emit_insn (gen_pause ());
 emit_jump_insn (gen_jump (loop_label));
 emit_barrier ();
-- 
2.18.1



[PATCH] i386: Disable stv under optimize_size [PR 105034]

2022-04-13 Thread Hongyu Wang via Gcc-patches
Hi,

From the -Os point of view, stv converts scalar registers to vector mode,
which introduces extra register conversions and increases instruction size.
Disabling stv under optimize_size avoids this code size increase,
and there is no need to touch ix86_size_cost, which has not been tuned for
a long time.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.

Ok for master?

gcc/ChangeLog:

PR target/105034
* config/i386/i386-features.cc (pass_stv::gate()): Block out
optimize_size.

gcc/testsuite/ChangeLog:

PR target/105034
* gcc.target/i386/pr105034.c: New test.
---
 gcc/config/i386/i386-features.cc |  3 ++-
 gcc/testsuite/gcc.target/i386/pr105034.c | 23 +++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105034.c

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 6fe41c3c24f..f57281e672f 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1911,7 +1911,8 @@ public:
   virtual bool gate (function *)
 {
   return ((!timode_p || TARGET_64BIT)
- && TARGET_STV && TARGET_SSE2 && optimize > 1);
+ && TARGET_STV && TARGET_SSE2 && optimize > 1
+ && !optimize_size);
 }
 
   virtual unsigned int execute (function *)
diff --git a/gcc/testsuite/gcc.target/i386/pr105034.c 
b/gcc/testsuite/gcc.target/i386/pr105034.c
new file mode 100644
index 000..d997e26e9ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105034.c
@@ -0,0 +1,23 @@
+/* PR target/105034 */
+/* { dg-do compile } */
+/* { dg-options "-Os -msse4.1" } */
+
+#define max(a,b) (((a) > (b))? (a) : (b))
+#define min(a,b) (((a) < (b))? (a) : (b))
+
+int foo(int x)
+{
+  return max(x,0);
+}
+
+int bar(int x)
+{
+  return min(x,0);
+}
+
+unsigned int baz(unsigned int x)
+{
+  return min(x,1);
+}
+
+/* { dg-final { scan-assembler-not "xmm" } } */
-- 
2.18.1



Re: [PATCH] i386: Disable stv under optimize_size [PR 105034]

2022-04-14 Thread Hongyu Wang via Gcc-patches
>
> optimize_function_for_speed ()?
>

Yes, updated patch with optimize_function_for_speed_p()

gcc/ChangeLog:

PR target/105034
* config/i386/i386-features.cc (pass_stv::gate()): Add
  optimize_function_for_speed_p ().

gcc/testsuite/ChangeLog:

PR target/105034
* gcc.target/i386/pr105034.c: New test.
---
 gcc/config/i386/i386-features.cc |  3 ++-
 gcc/testsuite/gcc.target/i386/pr105034.c | 23 +++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105034.c

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 6fe41c3c24f..a49c3aa1525 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1911,7 +1911,8 @@ public:
   virtual bool gate (function *)
 {
   return ((!timode_p || TARGET_64BIT)
-   && TARGET_STV && TARGET_SSE2 && optimize > 1);
+   && TARGET_STV && TARGET_SSE2 && optimize > 1
+   && optimize_function_for_speed_p (cfun));
 }

   virtual unsigned int execute (function *)
diff --git a/gcc/testsuite/gcc.target/i386/pr105034.c
b/gcc/testsuite/gcc.target/i386/pr105034.c
new file mode 100644
index 000..d997e26e9ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105034.c
@@ -0,0 +1,23 @@
+/* PR target/105034 */
+/* { dg-do compile } */
+/* { dg-options "-Os -msse4.1" } */
+
+#define max(a,b) (((a) > (b))? (a) : (b))
+#define min(a,b) (((a) < (b))? (a) : (b))
+
+int foo(int x)
+{
+  return max(x,0);
+}
+
+int bar(int x)
+{
+  return min(x,0);
+}
+
+unsigned int baz(unsigned int x)
+{
+  return min(x,1);
+}
+
+/* { dg-final { scan-assembler-not "xmm" } } */
-- 
2.18.1

Richard Biener via Gcc-patches  于2022年4月14日周四 14:56写道:
>
> On Thu, Apr 14, 2022 at 3:18 AM Hongyu Wang via Gcc-patches
>  wrote:
> >
> > Hi,
> >
> > From -Os point of view, stv converts scalar register to vector mode
> > which introduces extra reg conversion and increase instruction size.
> > Disabling stv under optimize_size would avoid such code size increment
> > and no need to touch ix86_size_cost that has not been tuned for long
> > time.
> >
> > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,},
> >
> > Ok for master?
> >
> > gcc/ChangeLog:
> >
> > PR target/105034
> > * config/i386/i386-features.cc (pass_stv::gate()): Block out
> > optimize_size.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/105034
> > * gcc.target/i386/pr105034.c: New test.
> > ---
> >  gcc/config/i386/i386-features.cc |  3 ++-
> >  gcc/testsuite/gcc.target/i386/pr105034.c | 23 +++
> >  2 files changed, 25 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105034.c
> >
> > diff --git a/gcc/config/i386/i386-features.cc 
> > b/gcc/config/i386/i386-features.cc
> > index 6fe41c3c24f..f57281e672f 100644
> > --- a/gcc/config/i386/i386-features.cc
> > +++ b/gcc/config/i386/i386-features.cc
> > @@ -1911,7 +1911,8 @@ public:
> >virtual bool gate (function *)
> >  {
> >return ((!timode_p || TARGET_64BIT)
> > - && TARGET_STV && TARGET_SSE2 && optimize > 1);
> > + && TARGET_STV && TARGET_SSE2 && optimize > 1
> > + && !optimize_size);
>
> optimize_function_for_speed ()?
>
> >  }
> >
> >virtual unsigned int execute (function *)
> > diff --git a/gcc/testsuite/gcc.target/i386/pr105034.c 
> > b/gcc/testsuite/gcc.target/i386/pr105034.c
> > new file mode 100644
> > index 000..d997e26e9ed
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr105034.c
> > @@ -0,0 +1,23 @@
> > +/* PR target/105034 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-Os -msse4.1" } */
> > +
> > +#define max(a,b) (((a) > (b))? (a) : (b))
> > +#define min(a,b) (((a) < (b))? (a) : (b))
> > +
> > +int foo(int x)
> > +{
> > +  return max(x,0);
> > +}
> > +
> > +int bar(int x)
> > +{
> > +  return min(x,0);
> > +}
> > +
> > +unsigned int baz(unsigned int x)
> > +{
> > +  return min(x,1);
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "xmm" } } */
> > --
> > 2.18.1
> >


Re: [PATCH] i386: Disable stv under optimize_size [PR 105034]

2022-04-14 Thread Hongyu Wang via Gcc-patches
> >virtual bool gate (function *)
>
> please name the parameter ...
>
> >  {
> >return ((!timode_p || TARGET_64BIT)
> > -   && TARGET_STV && TARGET_SSE2 && optimize > 1);
> > +   && TARGET_STV && TARGET_SSE2 && optimize > 1
> > +   && optimize_function_for_speed_p (cfun));
>
> ... and use it here instead of referencing 'cfun'

Updated. Thanks!

gcc/ChangeLog:

PR target/105034
* config/i386/i386-features.cc (pass_stv::gate()): Name param
to fun and add optimize_function_for_speed_p (fun).

gcc/testsuite/ChangeLog:

PR target/105034
* gcc.target/i386/pr105034.c: New test.
---
 gcc/config/i386/i386-features.cc |  5 +++--
 gcc/testsuite/gcc.target/i386/pr105034.c | 23 +++
 2 files changed, 26 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105034.c

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 6fe41c3c24f..26be2986486 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1908,10 +1908,11 @@ public:
   {}

   /* opt_pass methods: */
-  virtual bool gate (function *)
+  virtual bool gate (function *fun)
 {
   return ((!timode_p || TARGET_64BIT)
-  && TARGET_STV && TARGET_SSE2 && optimize > 1);
+  && TARGET_STV && TARGET_SSE2 && optimize > 1
+  && optimize_function_for_speed_p (fun));
 }

   virtual unsigned int execute (function *)
diff --git a/gcc/testsuite/gcc.target/i386/pr105034.c
b/gcc/testsuite/gcc.target/i386/pr105034.c
new file mode 100644
index 000..d997e26e9ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105034.c
@@ -0,0 +1,23 @@
+/* PR target/105034 */
+/* { dg-do compile } */
+/* { dg-options "-Os -msse4.1" } */
+
+#define max(a,b) (((a) > (b))? (a) : (b))
+#define min(a,b) (((a) < (b))? (a) : (b))
+
+int foo(int x)
+{
+  return max(x,0);
+}
+
+int bar(int x)
+{
+  return min(x,0);
+}
+
+unsigned int baz(unsigned int x)
+{
+  return min(x,1);
+}
+
+/* { dg-final { scan-assembler-not "xmm" } } */
-- 
2.18.1


Richard Biener  于2022年4月14日周四 16:06写道:
>
> On Thu, Apr 14, 2022 at 9:55 AM Hongyu Wang  wrote:
> >
> > >
> > > optimize_function_for_speed ()?
> > >
> >
> > Yes, updated patch with optimize_function_for_speed_p()
> >
> > gcc/ChangeLog:
> >
> > PR target/105034
> > * config/i386/i386-features.cc (pass_stv::gate()): Add
> >   optimize_function_for_speed_p ().
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/105034
> > * gcc.target/i386/pr105034.c: New test.
> > ---
> >  gcc/config/i386/i386-features.cc |  3 ++-
> >  gcc/testsuite/gcc.target/i386/pr105034.c | 23 +++
> >  2 files changed, 25 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105034.c
> >
> > diff --git a/gcc/config/i386/i386-features.cc 
> > b/gcc/config/i386/i386-features.cc
> > index 6fe41c3c24f..a49c3aa1525 100644
> > --- a/gcc/config/i386/i386-features.cc
> > +++ b/gcc/config/i386/i386-features.cc
> > @@ -1911,7 +1911,8 @@ public:
> >virtual bool gate (function *)
>
> please name the parameter ...
>
> >  {
> >return ((!timode_p || TARGET_64BIT)
> > -   && TARGET_STV && TARGET_SSE2 && optimize > 1);
> > +   && TARGET_STV && TARGET_SSE2 && optimize > 1
> > +   && optimize_function_for_speed_p (cfun));
>
> ... and use it here instead of referencing 'cfun'
>
> Richard.
>
> >  }
> >
> >virtual unsigned int execute (function *)
> > diff --git a/gcc/testsuite/gcc.target/i386/pr105034.c
> > b/gcc/testsuite/gcc.target/i386/pr105034.c
> > new file mode 100644
> > index 000..d997e26e9ed
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr105034.c
> > @@ -0,0 +1,23 @@
> > +/* PR target/105034 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-Os -msse4.1" } */
> > +
> > +#define max(a,b) (((a) > (b))? (a) : (b))
> > +#define min(a,b) (((a) < (b))? (a) : (b))
> > +
> > +int foo(int x)
> > +{
> > +  return max(x,0);
> > +}
> > +
> > +int bar(int x)
> > +{
> > +  return min(x,0);
> > +}
> > +
> > +unsigned int baz(unsigned int x)
> > +{
> > +  return min(x,1);
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "xmm" } } */
> > --
> > 2.18.1
> >
> >

Re: [PATCH] i386: Disable stv under optimize_size [PR 105034]

2022-04-14 Thread Hongyu Wang via Gcc-patches
> > > > * config/i386/i386-features.cc (pass_stv::gate()): Add
> > > >   optimize_function_for_speed_p ().
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR target/105034
> > > > * gcc.target/i386/pr105034.c: New test.
> > > > ---
> > > >  gcc/config/i386/i386-features.cc |  3 ++-
> > > >  gcc/testsuite/gcc.target/i386/pr105034.c | 23 +++
> > > >  2 files changed, 25 insertions(+), 1 deletion(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105034.c
> > > >
> > > > diff --git a/gcc/config/i386/i386-features.cc 
> > > > b/gcc/config/i386/i386-features.cc
> > > > index 6fe41c3c24f..a49c3aa1525 100644
> > > > --- a/gcc/config/i386/i386-features.cc
> > > > +++ b/gcc/config/i386/i386-features.cc
> > > > @@ -1911,7 +1911,8 @@ public:
> > > >virtual bool gate (function *)
> > >
> > > please name the parameter ...
> > >
> > > >  {
> > > >return ((!timode_p || TARGET_64BIT)
> > > > -   && TARGET_STV && TARGET_SSE2 && optimize > 1);
> > > > +   && TARGET_STV && TARGET_SSE2 && optimize > 1
> > > > +   && optimize_function_for_speed_p (cfun));
> > >
> > > ... and use it here instead of referencing 'cfun'
> > >
> > > Richard.
> > >
> > > >  }
> > > >
> > > >virtual unsigned int execute (function *)
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr105034.c
> > > > b/gcc/testsuite/gcc.target/i386/pr105034.c
> > > > new file mode 100644
> > > > index 000..d997e26e9ed
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr105034.c
> > > > @@ -0,0 +1,23 @@
> > > > +/* PR target/105034 */
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-Os -msse4.1" } */
> > > > +
> > > > +#define max(a,b) (((a) > (b))? (a) : (b))
> > > > +#define min(a,b) (((a) < (b))? (a) : (b))
> > > > +
> > > > +int foo(int x)
> > > > +{
> > > > +  return max(x,0);
> > > > +}
> > > > +
> > > > +int bar(int x)
> > > > +{
> > > > +  return min(x,0);
> > > > +}
> > > > +
> > > > +unsigned int baz(unsigned int x)
> > > > +{
> > > > +  return min(x,1);
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-assembler-not "xmm" } } */
> > > > --
> > > > 2.18.1
> > > >
> > > > Richard Biener via Gcc-patches  于2022年4月14日周四 
> > > > 14:56写道:
> > > > >
> > > > > On Thu, Apr 14, 2022 at 3:18 AM Hongyu Wang via Gcc-patches
> > > > >  wrote:
> > > > > >
> > > > > > Hi,
> > > > > >
> > > > > > From -Os point of view, stv converts scalar register to vector mode
> > > > > > which introduces extra reg conversion and increase instruction size.
> > > > > > Disabling stv under optimize_size would avoid such code size 
> > > > > > increment
> > > > > > and no need to touch ix86_size_cost that has not been tuned for long
> > > > > > time.
> > > > > >
> > > > > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,},
> > > > > >
> > > > > > Ok for master?
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > PR target/105034
> > > > > > * config/i386/i386-features.cc (pass_stv::gate()): Block out
> > > > > > optimize_size.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > PR target/105034
> > > > > > * gcc.target/i386/pr105034.c: New test.
> > > > > > ---
> > > > > >  gcc/config/i386/i386-features.cc |  3 ++-
> > > > > >  gcc/testsuite/gcc.target/i386/pr105034.c | 23 
> > > > > > +++
> > > > > >  2 files changed, 25 insertions(+), 1 deletion(-)
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105034.c
> > > > > >
> > > > > > diff --git a/gcc/config/i386/i386-features.cc 
> > > > > > b/gcc/config/i386/i386-features.cc
> > > > > > index 6fe41c3c24f..f57281e672f 100644
> > > > > > --- a/gcc/config/i386/i386-features.cc
> > > > > > +++ b/gcc/config/i386/i386-features.cc
> > > > > > @@ -1911,7 +1911,8 @@ public:
> > > > > >virtual bool gate (function *)
> > > > > >  {
> > > > > >return ((!timode_p || TARGET_64BIT)
> > > > > > - && TARGET_STV && TARGET_SSE2 && optimize > 1);
> > > > > > + && TARGET_STV && TARGET_SSE2 && optimize > 1
> > > > > > + && !optimize_size);
> > > > >
> > > > > optimize_function_for_speed ()?
> > > > >
> > > > > >  }
> > > > > >
> > > > > >virtual unsigned int execute (function *)
> > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr105034.c 
> > > > > > b/gcc/testsuite/gcc.target/i386/pr105034.c
> > > > > > new file mode 100644
> > > > > > index 000..d997e26e9ed
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr105034.c
> > > > > > @@ -0,0 +1,23 @@
> > > > > > +/* PR target/105034 */
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-Os -msse4.1" } */
> > > > > > +
> > > > > > +#define max(a,b) (((a) > (b))? (a) : (b))
> > > > > > +#define min(a,b) (((a) < (b))? (a) : (b))
> > > > > > +
> > > > > > +int foo(int x)
> > > > > > +{
> > > > > > +  return max(x,0);
> > > > > > +}
> > > > > > +
> > > > > > +int bar(int x)
> > > > > > +{
> > > > > > +  return min(x,0);
> > > > > > +}
> > > > > > +
> > > > > > +unsigned int baz(unsigned int x)
> > > > > > +{
> > > > > > +  return min(x,1);
> > > > > > +}
> > > > > > +
> > > > > > +/* { dg-final { scan-assembler-not "xmm" } } */
> > > > > > --
> > > > > > 2.18.1
> > > > > >


[PATCH] i386: Correct target attribute for crc32 intrinsics

2022-04-14 Thread Hongyu Wang via Gcc-patches
Hi,

Compiling the _mm_crc32_u8/16/32/64 intrinsics with -mcrc32
hits a target specific option mismatch. Correct the target pragma
to fix it.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.

Ok for master and backport to GCC 11?

gcc/ChangeLog:

* config/i386/smmintrin.h: Correct target pragma from sse4.1
and sse4.2 to crc32 for crc32 intrinsics.

gcc/testsuite/ChangeLog:

* gcc.target/i386/crc32-6.c: Adjust to call builtin.
* gcc.target/i386/crc32-7.c: New test.
---
 gcc/config/i386/smmintrin.h | 25 +-
 gcc/testsuite/gcc.target/i386/crc32-6.c |  2 +-
 gcc/testsuite/gcc.target/i386/crc32-7.c | 34 +
 3 files changed, 42 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/crc32-7.c

diff --git a/gcc/config/i386/smmintrin.h b/gcc/config/i386/smmintrin.h
index b42b212300f..eb6a451c10a 100644
--- a/gcc/config/i386/smmintrin.h
+++ b/gcc/config/i386/smmintrin.h
@@ -810,17 +810,11 @@ _mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
 
 #include 
 
-#ifndef __SSE4_1__
+#ifndef __CRC32__
 #pragma GCC push_options
-#pragma GCC target("sse4.1")
-#define __DISABLE_SSE4_1__
-#endif /* __SSE4_1__ */
-
-#ifndef __SSE4_2__
-#pragma GCC push_options
-#pragma GCC target("sse4.2")
-#define __DISABLE_SSE4_2__
-#endif /* __SSE4_1__ */
+#pragma GCC target("crc32")
+#define __DISABLE_CRC32__
+#endif /* __CRC32__ */
 
 /* Accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
 extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -849,14 +843,9 @@ _mm_crc32_u64 (unsigned long long __C, unsigned long long 
__V)
 }
 #endif
 
-#ifdef __DISABLE_SSE4_2__
-#undef __DISABLE_SSE4_2__
+#ifdef __DISABLE_CRC32__
+#undef __DISABLE_CRC32__
 #pragma GCC pop_options
-#endif /* __DISABLE_SSE4_2__ */
-
-#ifdef __DISABLE_SSE4_1__
-#undef __DISABLE_SSE4_1__
-#pragma GCC pop_options
-#endif /* __DISABLE_SSE4_1__ */
+#endif /* __DISABLE_CRC32__ */
 
 #endif /* _SMMINTRIN_H_INCLUDED */
diff --git a/gcc/testsuite/gcc.target/i386/crc32-6.c 
b/gcc/testsuite/gcc.target/i386/crc32-6.c
index 464e3444069..1f306534bb8 100644
--- a/gcc/testsuite/gcc.target/i386/crc32-6.c
+++ b/gcc/testsuite/gcc.target/i386/crc32-6.c
@@ -7,7 +7,7 @@
 unsigned int
 test_mm_crc32_u8 (unsigned int CRC, unsigned char V)
 {
-  return _mm_crc32_u8 (CRC, V);
+  return __builtin_ia32_crc32qi (CRC, V);
 }
 
 /* { dg-error "needs isa option -mcrc32" "" { target *-*-* } 0  } */
diff --git a/gcc/testsuite/gcc.target/i386/crc32-7.c 
b/gcc/testsuite/gcc.target/i386/crc32-7.c
new file mode 100644
index 000..2e310e38b82
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/crc32-7.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcrc32" } */
+/* { dg-final { scan-assembler "crc32b\[^\\n\]*eax" } } */
+/* { dg-final { scan-assembler "crc32w\[^\\n\]*eax" } } */
+/* { dg-final { scan-assembler "crc32l\[^\\n\]*eax" } } */
+/* { dg-final { scan-assembler "crc32q\[^\\n\]*rax" { target { ! ia32 } } } } 
*/
+
+#include 
+
+unsigned int
+test_mm_crc32_u8 (unsigned int CRC, unsigned char V)
+{
+  return _mm_crc32_u8 (CRC, V);
+}
+
+unsigned int
+test_mm_crc32_u16 (unsigned int CRC, unsigned short V)
+{
+  return _mm_crc32_u16 (CRC, V);
+}
+
+unsigned int
+test_mm_crc32_u32 (unsigned int CRC, unsigned int V)
+{
+  return _mm_crc32_u32 (CRC, V);
+}
+
+#ifdef __x86_64__
+unsigned long long
+test_mm_crc32_u64 (unsigned long long CRC, unsigned long long V)
+{
+  return _mm_crc32_u64 (CRC, V);
+}
+#endif
-- 
2.18.1



Re: [PATCH] i386: Correct target attribute for crc32 intrinsics

2022-04-15 Thread Hongyu Wang via Gcc-patches
> This test should not be changed, it correctly reports ISA mismatch. It
> even passes -mno-crc32.

The error message changes from "needs isa option -mcrc32" to "target
specific option mismatch" with the #pragma change.
I see that many of our intrinsics throw such an error; it has been a
long-term issue for intrinsic diagnostics.

So for this test I either change the dg-error message or change the call
to the builtin; otherwise it would fail.

Uros Bizjak via Gcc-patches  于2022年4月15日周五 15:54写道:
>
> On Fri, Apr 15, 2022 at 6:30 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > Compiling the _mm_crc32_u8/16/32/64 intrinsics with -mcrc32
> > hits a target specific option mismatch. Correct the target pragma
> > to fix this.
> >
> > Bootstrapped/regtest on x86_64-pc-linux-gnu{-m32,}.
> >
> > Ok for master and backport to GCC 11?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/smmintrin.h: Correct target pragma from sse4.1
> > and sse4.2 to crc32 for crc32 intrinsics.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/crc32-6.c: Adjust to call builtin.
> > * gcc.target/i386/crc32-7.c: New test.
> > ---
> >  gcc/config/i386/smmintrin.h | 25 +-
> >  gcc/testsuite/gcc.target/i386/crc32-6.c |  2 +-
> >  gcc/testsuite/gcc.target/i386/crc32-7.c | 34 +
> >  3 files changed, 42 insertions(+), 19 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/crc32-7.c
> >
> > diff --git a/gcc/config/i386/smmintrin.h b/gcc/config/i386/smmintrin.h
> > index b42b212300f..eb6a451c10a 100644
> > --- a/gcc/config/i386/smmintrin.h
> > +++ b/gcc/config/i386/smmintrin.h
> > @@ -810,17 +810,11 @@ _mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
> >
> >  #include 
> >
> > -#ifndef __SSE4_1__
> > +#ifndef __CRC32__
> >  #pragma GCC push_options
> > -#pragma GCC target("sse4.1")
> > -#define __DISABLE_SSE4_1__
> > -#endif /* __SSE4_1__ */
> > -
> > -#ifndef __SSE4_2__
> > -#pragma GCC push_options
> > -#pragma GCC target("sse4.2")
> > -#define __DISABLE_SSE4_2__
> > -#endif /* __SSE4_1__ */
> > +#pragma GCC target("crc32")
> > +#define __DISABLE_CRC32__
> > +#endif /* __CRC32__ */
> >
> >  /* Accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
> >  extern __inline unsigned int __attribute__((__gnu_inline__, 
> > __always_inline__, __artificial__))
> > @@ -849,14 +843,9 @@ _mm_crc32_u64 (unsigned long long __C, unsigned long 
> > long __V)
> >  }
> >  #endif
> >
> > -#ifdef __DISABLE_SSE4_2__
> > -#undef __DISABLE_SSE4_2__
> > +#ifdef __DISABLE_CRC32__
> > +#undef __DISABLE_CRC32__
> >  #pragma GCC pop_options
> > -#endif /* __DISABLE_SSE4_2__ */
> > -
> > -#ifdef __DISABLE_SSE4_1__
> > -#undef __DISABLE_SSE4_1__
> > -#pragma GCC pop_options
> > -#endif /* __DISABLE_SSE4_1__ */
> > +#endif /* __DISABLE_CRC32__ */
> >
> >  #endif /* _SMMINTRIN_H_INCLUDED */
> > diff --git a/gcc/testsuite/gcc.target/i386/crc32-6.c 
> > b/gcc/testsuite/gcc.target/i386/crc32-6.c
> > index 464e3444069..1f306534bb8 100644
> > --- a/gcc/testsuite/gcc.target/i386/crc32-6.c
> > +++ b/gcc/testsuite/gcc.target/i386/crc32-6.c
> > @@ -7,7 +7,7 @@
> >  unsigned int
> >  test_mm_crc32_u8 (unsigned int CRC, unsigned char V)
> >  {
> > -  return _mm_crc32_u8 (CRC, V);
> > +  return __builtin_ia32_crc32qi (CRC, V);
>
> This test should not be changed, it correctly reports ISA mismatch. It
> even passes -mno-crc32.
>
> Uros.
>
> >  }
> >
> >  /* { dg-error "needs isa option -mcrc32" "" { target *-*-* } 0  } */
> > diff --git a/gcc/testsuite/gcc.target/i386/crc32-7.c 
> > b/gcc/testsuite/gcc.target/i386/crc32-7.c
> > new file mode 100644
> > index 000..2e310e38b82
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/crc32-7.c
> > @@ -0,0 +1,34 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mcrc32" } */
> > +/* { dg-final { scan-assembler "crc32b\[^\\n\]*eax" } } */
> > +/* { dg-final { scan-assembler "crc32w\[^\\n\]*eax" } } */
> > +/* { dg-final { scan-assembler "crc32l\[^\\n\]*eax" } } */
> > +/* { dg-final { scan-assembler "crc32q\[^\\n\]*rax" { target { ! ia32 } } 
> > } } */
> > +
> > +#include 
> > +
> > +unsigned int
> > +test_mm_crc32_u8 (unsigned int CRC, unsigned char V)
> > +{
> > +  return _mm_crc32_u8 (CRC, V);
> > +}
> > +
> > +unsigned int
> > +test_mm_crc32_u16 (unsigned int CRC, unsigned short V)
> > +{
> > +  return _mm_crc32_u16 (CRC, V);
> > +}
> > +
> > +unsigned int
> > +test_mm_crc32_u32 (unsigned int CRC, unsigned int V)
> > +{
> > +  return _mm_crc32_u32 (CRC, V);
> > +}
> > +
> > +#ifdef __x86_64__
> > +unsigned long long
> > +test_mm_crc32_u64 (unsigned long long CRC, unsigned long long V)
> > +{
> > +  return _mm_crc32_u64 (CRC, V);
> > +}
> > +#endif
> > --
> > 2.18.1
> >


[PATCH] AVX512F: Add missing macro for mask(z?)_scalf_s[sd] [PR 105339]

2022-04-22 Thread Hongyu Wang via Gcc-patches
Hi,

Add the missing macros under -O0 and adjust the macro format for the
scalef intrinsics.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.

Ok for master and backport to GCC 9/10/11?

gcc/ChangeLog:

PR target/105339
* config/i386/avx512fintrin.h (_mm512_scalef_round_pd):
Add parentheses for parameters and adjust format.
(_mm512_mask_scalef_round_pd): Ditto.
(_mm512_maskz_scalef_round_pd): Ditto.
(_mm512_scalef_round_ps): Ditto.
(_mm512_mask_scalef_round_ps): Ditto.
(_mm512_maskz_scalef_round_ps): Ditto.
(_mm_scalef_round_sd): Use _mm_undefined_pd.
(_mm_scalef_round_ss): Use _mm_undefined_ps.
(_mm_mask_scalef_round_sd): New macro.
(_mm_mask_scalef_round_ss): Ditto.
(_mm_maskz_scalef_round_sd): Ditto.
(_mm_maskz_scalef_round_ss): Ditto.
---
 gcc/config/i386/avx512fintrin.h | 76 -
 1 file changed, 56 insertions(+), 20 deletions(-)

diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h
index 29511fd2831..6dc69ff0234 100644
--- a/gcc/config/i386/avx512fintrin.h
+++ b/gcc/config/i386/avx512fintrin.h
@@ -3286,31 +3286,67 @@ _mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, 
__m128 __B, const int __R)
  (__mmask8) __U, __R);
 }
 #else
-#define _mm512_scalef_round_pd(A, B, C)\
-(__m512d)__builtin_ia32_scalefpd512_mask(A, B, 
(__v8df)_mm512_undefined_pd(), -1, C)
-
-#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
-(__m512d)__builtin_ia32_scalefpd512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_scalef_round_pd(U, A, B, C)   \
-(__m512d)__builtin_ia32_scalefpd512_mask(A, B, 
(__v8df)_mm512_setzero_pd(), U, C)
+#define _mm512_scalef_round_pd(A, B, C)
\
+  ((__m512d)   \
+   __builtin_ia32_scalefpd512_mask((A), (B),   \
+  (__v8df) _mm512_undefined_pd(),  \
+  -1, (C)))
+
+#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
+  ((__m512d) __builtin_ia32_scalefpd512_mask((A), (B), (W), (U), (C)))
+
+#define _mm512_maskz_scalef_round_pd(U, A, B, C)   \
+  ((__m512d)   \
+   __builtin_ia32_scalefpd512_mask((A), (B),   \
+  (__v8df) _mm512_setzero_pd(),\
+  (U), (C)))
+
+#define _mm512_scalef_round_ps(A, B, C)
\
+  ((__m512)\
+   __builtin_ia32_scalefps512_mask((A), (B),   \
+  (__v16sf) _mm512_undefined_ps(), \
+  -1, (C)))
+
+#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
+  ((__m512) __builtin_ia32_scalefps512_mask((A), (B), (W), (U), (C)))
+
+#define _mm512_maskz_scalef_round_ps(U, A, B, C)   \
+  ((__m512)\
+   __builtin_ia32_scalefps512_mask((A), (B),   \
+  (__v16sf) _mm512_setzero_ps(),   \
+  (U), (C)))
+
+#define _mm_scalef_round_sd(A, B, C)   \
+  ((__m128d)   \
+   __builtin_ia32_scalefsd_mask_round ((A), (B),   \
+  (__v2df) _mm_undefined_pd (),\
+  -1, (C)))
 
-#define _mm512_scalef_round_ps(A, B, C)\
-(__m512)__builtin_ia32_scalefps512_mask(A, B, 
(__v16sf)_mm512_undefined_ps(), -1, C)
+#define _mm_scalef_round_ss(A, B, C)   \
+  ((__m128)\
+   __builtin_ia32_scalefss_mask_round ((A), (B),   \
+  (__v4sf) _mm_undefined_ps (),\
+  -1, (C)))
 
-#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
-(__m512)__builtin_ia32_scalefps512_mask(A, B, W, U, C)
+#define _mm_mask_scalef_round_sd(W, U, A, B, C)
\
+  ((__m128d)   \
+   __builtin_ia32_scalefsd_mask_round ((A), (B), (W), (U), (C)))
 
-#define _mm512_maskz_scalef_round_ps(U, A, B, C)   \
-(__m512)__builtin_ia32_scalefps512_mask(A, B, 
(__v16sf)_mm512_setzero_ps(), U, C)
+#define _mm_mask_scalef_round_ss(W, U, A, B, C)
\
+  ((__m128)\
+   __builtin_ia32_scalefss_mask_round ((A), (B), (W), (U), (C)))

Re: [PATCH] AVX512F: Add missing macro for mask(z?)_scalf_s[sd] [PR 105339]

2022-04-22 Thread Hongyu Wang via Gcc-patches
> Please add the corresponding intrinsic test in sse-14.c

Sorry for forgetting this part. Updated patch. Thanks.

Hongtao Liu via Gcc-patches  于2022年4月22日周五 16:49写道:
>
> On Fri, Apr 22, 2022 at 4:12 PM Hongyu Wang via Gcc-patches
>  wrote:
> >
> > Hi,
> >
> > Add missing macro under O0 and adjust macro format for scalf
> > intrinsics.
> >
> Please add the corresponding intrinsic test in sse-14.c.
> > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.
> >
> > Ok for master and backport to GCC 9/10/11?
> >
> > gcc/ChangeLog:
> >
> > PR target/105339
> > * config/i386/avx512fintrin.h (_mm512_scalef_round_pd):
> > Add parentheses for parameters and adjust format.
> > (_mm512_mask_scalef_round_pd): Ditto.
> > (_mm512_maskz_scalef_round_pd): Ditto.
> > (_mm512_scalef_round_ps): Ditto.
> > (_mm512_mask_scalef_round_ps): Ditto.
> > (_mm512_maskz_scalef_round_ps): Ditto.
> > (_mm_scalef_round_sd): Use _mm_undefined_pd.
> > (_mm_scalef_round_ss): Use _mm_undefined_ps.
> > (_mm_mask_scalef_round_sd): New macro.
> > (_mm_mask_scalef_round_ss): Ditto.
> > (_mm_maskz_scalef_round_sd): Ditto.
> > (_mm_maskz_scalef_round_ss): Ditto.
> > ---
> >  gcc/config/i386/avx512fintrin.h | 76 -
> >  1 file changed, 56 insertions(+), 20 deletions(-)
> >
> > diff --git a/gcc/config/i386/avx512fintrin.h 
> > b/gcc/config/i386/avx512fintrin.h
> > index 29511fd2831..6dc69ff0234 100644
> > --- a/gcc/config/i386/avx512fintrin.h
> > +++ b/gcc/config/i386/avx512fintrin.h
> > @@ -3286,31 +3286,67 @@ _mm_maskz_scalef_round_ss (__mmask8 __U, __m128 
> > __A, __m128 __B, const int __R)
> >   (__mmask8) __U, __R);
> >  }
> >  #else
> > -#define _mm512_scalef_round_pd(A, B, C)\
> > -(__m512d)__builtin_ia32_scalefpd512_mask(A, B, 
> > (__v8df)_mm512_undefined_pd(), -1, C)
> > -
> > -#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
> > -(__m512d)__builtin_ia32_scalefpd512_mask(A, B, W, U, C)
> > -
> > -#define _mm512_maskz_scalef_round_pd(U, A, B, C)   \
> > -(__m512d)__builtin_ia32_scalefpd512_mask(A, B, 
> > (__v8df)_mm512_setzero_pd(), U, C)
> > +#define _mm512_scalef_round_pd(A, B, C)
> > \
> > +  ((__m512d)   \
> > +   __builtin_ia32_scalefpd512_mask((A), (B),   \
> > +  (__v8df) _mm512_undefined_pd(),  \
> > +  -1, (C)))
> > +
> > +#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
> > +  ((__m512d) __builtin_ia32_scalefpd512_mask((A), (B), (W), (U), (C)))
> > +
> > +#define _mm512_maskz_scalef_round_pd(U, A, B, C)   \
> > +  ((__m512d)   \
> > +   __builtin_ia32_scalefpd512_mask((A), (B),   \
> > +  (__v8df) _mm512_setzero_pd(),\
> > +  (U), (C)))
> > +
> > +#define _mm512_scalef_round_ps(A, B, C)
> > \
> > +  ((__m512)\
> > +   __builtin_ia32_scalefps512_mask((A), (B),   \
> > +  (__v16sf) _mm512_undefined_ps(), \
> > +  -1, (C)))
> > +
> > +#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
> > +  ((__m512) __builtin_ia32_scalefps512_mask((A), (B), (W), (U), (C)))
> > +
> > +#define _mm512_maskz_scalef_round_ps(U, A, B, C)   \
> > +  ((__m512)\
> > +   __builtin_ia32_scalefps512_mask((A), (B),   \
> > +  (__v16sf) _mm512_setzero_ps(),   \
> > +  (U), (C)))
> > +
> > +#define _mm_scalef_round_sd(A, B, C)   \
> > +  ((__m128d)   \
> > +   __builtin_ia32_scalefsd_mask_round ((A), (B),   \
> > +  (__v2df) _mm_undefined_pd (),\
> > +  -1, (C)))
> 

[PATCH] i386: Relax cmpxchg instruction under -mrelax-cmpxchg-loop [PR 103069]

2022-02-21 Thread Hongyu Wang via Gcc-patches
Hi,

cmpxchg is commonly used in spin loops, and some user code, such as
pthread, directly uses cmpxchg as the loop condition, which causes
heavy cache bouncing.

This patch extends the previous implementation to relax all cmpxchg
instructions under -mrelax-cmpxchg-loop by adding an extra atomic load
and compare, and emulating the behavior of a failed cmpxchg.

For original spin loop which looks like

loop: mov    %eax,%r8d
      or     $1,%r8d
      lock cmpxchg %r8d,(%rdi)
      jne    loop

It will now turn into

loop: mov    %eax,%r8d
      or     $1,%r8d
      mov    (%r8),%rsi <--- load lock first
      cmp    %rsi,%rax <--- compare with expected input
      jne    .L2 <--- lock ne expected
      lock cmpxchg %r8d,(%rdi)
      jne    loop
  L2: mov    %rsi,%rax <--- perform the behavior of failed cmpxchg
      jne    loop

under -mrelax-cmpxchg-loop.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}

OK for master?

gcc/ChangeLog:

PR target/103069
* config/i386/i386-expand.cc (ix86_expand_atomic_fetch_op_loop):
Split atomic fetch and loop part.
(ix86_expand_cmpxchg_loop): New expander for cmpxchg loop.
* config/i386/i386-protos.h (ix86_expand_cmpxchg_loop): New
prototype.
* config/i386/sync.md (atomic_compare_and_swap): Call new
expander under TARGET_RELAX_CMPXCHG_LOOP.
(atomic_compare_and_swap): Likewise for doubleword modes.

gcc/testsuite/ChangeLog:

PR target/103069
* gcc.target/i386/pr103069-2.c: Adjust result check.
* gcc.target/i386/pr103069-3.c: New test.
* gcc.target/i386/pr103069-4.c: Likewise.
---
 gcc/config/i386/i386-expand.cc | 153 +++--
 gcc/config/i386/i386-protos.h  |   2 +
 gcc/config/i386/sync.md|  65 +
 gcc/testsuite/gcc.target/i386/pr103069-2.c |   4 +-
 gcc/testsuite/gcc.target/i386/pr103069-3.c |  24 
 gcc/testsuite/gcc.target/i386/pr103069-4.c |  43 ++
 6 files changed, 226 insertions(+), 65 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-4.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index ce9607e36de..6cf1a0b9cb6 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23203,16 +23203,14 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, 
rtx mem, rtx val,
   enum rtx_code code, bool after,
   bool doubleword)
 {
-  rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
-  rtx_code_label *loop_label, *pause_label, *done_label;
+  rtx old_reg, new_reg, old_mem, success;
   machine_mode mode = GET_MODE (target);
+  rtx_code_label *loop_label = NULL;
 
   old_reg = gen_reg_rtx (mode);
   new_reg = old_reg;
-  loop_label = gen_label_rtx ();
-  pause_label = gen_label_rtx ();
-  done_label = gen_label_rtx ();
   old_mem = copy_to_reg (mem);
+  loop_label = gen_label_rtx ();
   emit_label (loop_label);
   emit_move_insn (old_reg, old_mem);
 
@@ -23234,50 +23232,125 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, 
rtx mem, rtx val,
   if (after)
 emit_move_insn (target, new_reg);
 
-  /* Load memory again inside loop.  */
-  new_mem = copy_to_reg (mem);
-  /* Compare mem value with expected value.  */
+  success = NULL_RTX;
+
+  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
+   gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
+ SImode),
+   doubleword, loop_label);
+}
+
+/* Relax cmpxchg instruction, param loop_label indicates whether
+   the instruction should be relaxed with a pause loop.  If not,
+   it will be relaxed to an atomic load + compare, and skip
+   cmpxchg instruction if mem != exp_input.  */
+
+void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
+  rtx mem, rtx exp_input, rtx new_input,
+  rtx mem_model, bool doubleword,
+  rtx_code_label *loop_label)
+{
+  rtx_code_label *cmp_label = NULL;
+  rtx_code_label *done_label = NULL;
+  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
+  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
+  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
+  machine_mode mode = GET_MODE (target_val), hmode = mode;
+
+  if (*ptarget_bool == NULL)
+target_bool = gen_reg_rtx (QImode);
+  else
+target_bool = *ptarget_bool;
+
+  cmp_label = gen_label_rtx ();
+  done_label = gen_label_rtx ();
+
+  new_mem = gen_reg_rtx (mode);
+  /* Load memory first.  */
+  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
+
+  switch (mode)
+{
+case TImode:
+  gendw = gen_atomic_compare_and_swapti_doubleword;
+  hmode = DImode;
+  break;
+case DImode:
+  if (doubleword)
+   {
+ gendw = gen_atomic_compare_and_swapdi_double

[PATCH] AVX512F: Add helper enumeration for ternary logic intrinsics.

2022-02-25 Thread Hongyu Wang via Gcc-patches
Hi,

This patch syncs with the LLVM change in
https://reviews.llvm.org/D120307 to add an enumeration and truncate
the immediate to unsigned char, so users can use ~ on immediates.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for master?

gcc/ChangeLog:

* config/i386/avx512fintrin.h (_MM_TERNLOG_ENUM): New enum.
(_mm512_ternarylogic_epi64): Truncate imm to unsigned
char to avoid error when using ~enum as parameter.
(_mm512_mask_ternarylogic_epi64): Likewise.
(_mm512_maskz_ternarylogic_epi64): Likewise.
(_mm512_ternarylogic_epi32): Likewise.
(_mm512_mask_ternarylogic_epi32): Likewise.
(_mm512_maskz_ternarylogic_epi32): Likewise.
* config/i386/avx512vlintrin.h (_mm256_ternarylogic_epi64):
Adjust imm param type to unsigned char.
(_mm256_mask_ternarylogic_epi64): Likewise.
(_mm256_maskz_ternarylogic_epi64): Likewise.
(_mm256_ternarylogic_epi32): Likewise.
(_mm256_mask_ternarylogic_epi32): Likewise.
(_mm256_maskz_ternarylogic_epi32): Likewise.
(_mm_ternarylogic_epi64): Likewise.
(_mm_mask_ternarylogic_epi64): Likewise.
(_mm_maskz_ternarylogic_epi64): Likewise.
(_mm_ternarylogic_epi32): Likewise.
(_mm_mask_ternarylogic_epi32): Likewise.
(_mm_maskz_ternarylogic_epi32): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512f-vpternlogd-1.c: Use new enum.
* gcc.target/i386/avx512f-vpternlogq-1.c: Likewise.
* gcc.target/i386/avx512vl-vpternlogd-1.c: Likewise.
* gcc.target/i386/avx512vl-vpternlogq-1.c: Likewise.
* gcc.target/i386/testimm-10.c: Remove imm check for vpternlog
insns since the imm has been truncated in intrinsic.
---
 gcc/config/i386/avx512fintrin.h   | 132 ++---
 gcc/config/i386/avx512vlintrin.h  | 278 +++---
 .../gcc.target/i386/avx512f-vpternlogd-1.c|   7 +-
 .../gcc.target/i386/avx512f-vpternlogq-1.c|   7 +-
 .../gcc.target/i386/avx512vl-vpternlogd-1.c   |  13 +-
 .../gcc.target/i386/avx512vl-vpternlogq-1.c   |  14 +-
 gcc/testsuite/gcc.target/i386/testimm-10.c|   7 -
 7 files changed, 285 insertions(+), 173 deletions(-)

diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h
index bc10c823c76..29511fd2831 100644
--- a/gcc/config/i386/avx512fintrin.h
+++ b/gcc/config/i386/avx512fintrin.h
@@ -1639,16 +1639,27 @@ _mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, 
__m128 __B,
 
 #endif
 
+/* Constant helper to represent the ternary logic operations among
+   vector A, B and C.  */
+typedef enum
+{
+  _MM_TERNLOG_A = 0xF0,
+  _MM_TERNLOG_B = 0xCC,
+  _MM_TERNLOG_C = 0xAA
+} _MM_TERNLOG_ENUM;
+
 #ifdef __OPTIMIZE__
 extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C,
   const int __imm)
 {
-  return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A,
-(__v8di) __B,
-(__v8di) __C, __imm,
-(__mmask8) -1);
+  return (__m512i)
+__builtin_ia32_pternlogq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __C,
+ (unsigned char) __imm,
+ (__mmask8) -1);
 }
 
 extern __inline __m512i
@@ -1656,10 +1667,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B,
__m512i __C, const int __imm)
 {
-  return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A,
-(__v8di) __B,
-(__v8di) __C, __imm,
-(__mmask8) __U);
+  return (__m512i)
+__builtin_ia32_pternlogq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
 }
 
 extern __inline __m512i
@@ -1667,10 +1680,12 @@ __attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
 __m512i __C, const int __imm)
 {
-  return (__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di) __A,
- (__v8di) __B,
- (__v8di) __C,
- __imm, (__mmask8) __U);
+  return (__m512i)
+__builtin_ia32_pternlogq512_maskz ((__v8di) __A,
+ 

[PATCH] i386: Fix V8HF vector init under -mno-avx [PR 104664]

2022-02-28 Thread Hongyu Wang via Gcc-patches
Hi,

For V8HFmode vector init from an HFmode value, do not directly emit a
V8HF move with a subreg, which may cause reload to assign a general
register to the move source.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.

Ok for master?

gcc/ChangeLog:

PR target/104664
* config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
  Use vec_setv8hf_0 for HF to V8HFmode move instead of subreg.

gcc/testsuite/ChangeLog:

PR target/104664
* gcc.target/i386/pr104664.c: New test.
---
 gcc/config/i386/i386-expand.cc   |  7 ++-
 gcc/testsuite/gcc.target/i386/pr104664.c | 16 
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104664.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index faa0191c6dd..530f83fab88 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -14899,7 +14899,12 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
  dperm.one_operand_p = true;
 
  if (mode == V8HFmode)
-   tmp1 = lowpart_subreg (V8HFmode, force_reg (HFmode, val), HFmode);
+   {
+ tmp1 = force_reg (HFmode, val);
+ tmp2 = gen_reg_rtx (mode);
+ emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
+ tmp1 = gen_lowpart (mode, tmp2);
+   }
  else
{
  /* Extend to SImode using a paradoxical SUBREG.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr104664.c 
b/gcc/testsuite/gcc.target/i386/pr104664.c
new file mode 100644
index 000..8a3d6c7cc85
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104664.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-march=x86-64 -mtune=generic -Og -ffinite-math-only" } */
+
+typedef _Float128 __attribute__((__vector_size__ (16))) U;
+typedef _Float128 __attribute__((__vector_size__ (32))) V;
+typedef _Float16  __attribute__((__vector_size__ (16))) W;
+
+U u;
+V v;
+W w;
+
+void
+foo (void)
+{
+w *= (W)(u == __builtin_shufflevector (v, u, 2));
+}
-- 
2.18.1



[PATCH] i386: Fix wrong codegen for -mrelax-cmpxchg-loop

2021-11-17 Thread Hongyu Wang via Gcc-patches
Hi Uros,

The -mrelax-cmpxchg-loop option introduced by PR 103069/r12-5265 could
produce an infinite loop. The correct code should be

.L84:
        movl    (%rdi), %ecx
        movl    %eax, %edx
        orl     %esi, %edx
        cmpl    %eax, %ecx
        jne     .L82
        lock cmpxchgl   %edx, (%rdi)
        jne     .L84
        movl    %r8d, %eax  <<< retval is missing in previous impl
        ret
.L82:
        rep nop
        jmp     .L84

Adjust the corresponding expander to fix this issue, and fix the runtime
test so that the problem is exposed.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for master?

gcc/ChangeLog:

* config/i386/i386-expand.c (ix86_expand_atomic_fetch_op_loop):
Adjust generated cfg to avoid infinite loop.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr103069-2.c: Adjust.
---
 gcc/config/i386/i386-expand.c  |  7 ++-
 gcc/testsuite/gcc.target/i386/pr103069-2.c | 11 ++-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 3e4de64ec24..0d5d1a0e205 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -23143,13 +23143,14 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, 
rtx mem, rtx val,
   bool doubleword)
 {
   rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
-  rtx_code_label *loop_label, *pause_label;
+  rtx_code_label *loop_label, *pause_label, *done_label;
   machine_mode mode = GET_MODE (target);
 
   old_reg = gen_reg_rtx (mode);
   new_reg = old_reg;
   loop_label = gen_label_rtx ();
   pause_label = gen_label_rtx ();
+  done_label = gen_label_rtx ();
   old_mem = copy_to_reg (mem);
   emit_label (loop_label);
   emit_move_insn (old_reg, old_mem);
@@ -23207,11 +23208,15 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, 
rtx mem, rtx val,
   GET_MODE (success), 1, loop_label,
   profile_probability::guessed_never ());
 
+  emit_jump_insn (gen_jump (done_label));
+  emit_barrier ();
+
   /* If mem is not expected, pause and loop back.  */
   emit_label (pause_label);
   emit_insn (gen_pause ());
   emit_jump_insn (gen_jump (loop_label));
   emit_barrier ();
+  emit_label (done_label);
 }
 
 #include "gt-i386-expand.h"
diff --git a/gcc/testsuite/gcc.target/i386/pr103069-2.c 
b/gcc/testsuite/gcc.target/i386/pr103069-2.c
index 8ac824cc8e8..b3f2235fd55 100644
--- a/gcc/testsuite/gcc.target/i386/pr103069-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr103069-2.c
@@ -1,5 +1,5 @@
-/* PR target/103068 */
-/* { dg-do compile } */
+/* PR target/103069 */
+/* { dg-do run } */
 /* { dg-additional-options "-O2 -march=x86-64 -mtune=generic" } */ 
 
 #include 
@@ -37,13 +37,14 @@ FUNC_ATOMIC_RELAX (char, xor)
 #define TEST_ATOMIC_FETCH_LOGIC(TYPE, OP) \
 { \
   TYPE a = 11, b = 101, res, exp; \
+  TYPE c = 11, d = 101;\
   res = relax_##TYPE##_##OP##_fetch (&a, b); \
-  exp = f_##TYPE##_##OP##_fetch (&a, b);  \
+  exp = f_##TYPE##_##OP##_fetch (&c, d);  \
   if (res != exp) \
 abort (); \
-  a = 21, b = 92; \
+  a = c = 21, b = d = 92; \
   res = relax_##TYPE##_fetch_##OP (&a, b); \
-  exp = f_##TYPE##_fetch_##OP (&a, b);  \
+  exp = f_##TYPE##_fetch_##OP (&c, d);  \
   if (res != exp) \
 abort (); \
 }
-- 
2.18.1



[PATCH] Support Intel AVX-IFMA

2022-10-18 Thread Hongyu Wang via Gcc-patches
Hi,

Here is the updated patch that aligns the implementation with AVX-VNNI
and corrects some spelling errors in the AVX512IFMA patterns.

Bootstrapped/regtested on x86_64-pc-linux-gnu and sde. Ok for trunk?

gcc/

* common/config/i386/i386-common.cc
(OPTION_MASK_ISA_AVXIFMA_SET, OPTION_MASK_ISA2_AVXIFMA_UNSET,
OPTION_MASK_ISA2_AVX2_UNSET): New macro.
(ix86_handle_option): Handle -mavxifma.
* common/config/i386/i386-cpuinfo.h (processor_types): Add
FEATURE_AVXIFMA.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
avxifma.
* common/config/i386/cpuinfo.h (get_available_features):
Detect avxifma.
* config.gcc: Add avxifmaintrin.h
* config/i386/avx512ifmavlintrin.h: (_mm_madd52lo_epu64): Change
to macro.
(_mm_madd52hi_epu64): Likewise.
(_mm256_madd52lo_epu64): Likewise.
(_mm256_madd52hi_epu64): Likewise.
* config/i386/avxifmaintrin.h: New header.
* config/i386/cpuid.h (bit_AVXIFMA): New.
* config/i386/i386-builtin.def: Add new builtins, and correct
  pattern names for AVX512IFMA.
* config/i386/i386-builtins.cc (def_builtin): Handle AVX-IFMA
  builtins like AVX-VNNI.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__AVXIFMA__.
* config/i386/i386-expand.cc (ix86_check_builtin_isa_match):
  Relax ISA masks for AVXIFMA.
* config/i386/i386-isa.def: Add AVXIFMA.
* config/i386/i386-options.cc (isa2_opts): Add -mavxifma.
(ix86_valid_target_attribute_inner_p): Handle avxifma.
* config/i386/i386.md (isa): Add attr avxifma and avxifmavl.
* config/i386/i386.opt: Add option -mavxifma.
* config/i386/immintrin.h: Include avxifmaintrin.h.
* config/i386/sse.md (avx_vpmadd52_):
Remove.
(vpamdd52): Remove.
(vpamdd52huq_maskz): Rename to ...
(vpmadd52huq_maskz): ... this.
(vpamdd52luq_maskz): Rename to ...
(vpmadd52luq_maskz): ... this.
(vpmadd52): New define_insn.
(vpmadd52v8di): Likewise.
(vpmadd52_maskz_1): Likewise.
(vpamdd52_mask): Rename to ...
(vpmadd52_mask): ... this.
* doc/invoke.texi: Document -mavxifma.
* doc/extend.texi: Document avxifma.
* doc/sourcebuild.texi: Document target avxifma.

gcc/testsuite/

* gcc.target/i386/avx-check.h: Add avxifma check.
* gcc.target/i386/avx512ifma-vpmaddhuq-1.c: Rename..
* gcc.target/i386/avx512ifma-vpmaddhuq-1a.c: To this.
* gcc.target/i386/avx512ifma-vpmaddluq-1.c: Ditto.
* gcc.target/i386/avx512ifma-vpmaddluq-1a.c: Ditto.
* gcc.target/i386/avx512ifma-vpmaddhuq-1b.c: New Test.
* gcc.target/i386/avx512ifma-vpmaddluq-1b.c: Ditto.
* gcc.target/i386/avx-ifma-1.c: Ditto.
* gcc.target/i386/avx-ifma-2.c: Ditto.
* gcc.target/i386/avx-ifma-3.c: Ditto.
* gcc.target/i386/avx-ifma-4.c: Ditto.
* gcc.target/i386/avx-ifma-5.c: Ditto.
* gcc.target/i386/avx-ifma-6.c: Ditto.
* gcc.target/i386/avx-ifma-vpmaddhuq-2.c: Ditto.
* gcc.target/i386/avx-ifma-vpmaddluq-2.c: Ditto.
* gcc.target/i386/sse-12.c: Add -mavxifma.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* g++.dg/other/i386-2.C: Ditto.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* lib/target-supports.exp
(check_effective_target_avxifma): New.
---
 gcc/common/config/i386/cpuinfo.h  |  2 +
 gcc/common/config/i386/i386-common.cc | 20 -
 gcc/common/config/i386/i386-cpuinfo.h |  1 +
 gcc/common/config/i386/i386-isas.h|  1 +
 gcc/config.gcc|  3 +-
 gcc/config/i386/avx512ifmavlintrin.h  | 59 +-
 gcc/config/i386/avxifmaintrin.h   | 78 +++
 gcc/config/i386/cpuid.h   |  1 +
 gcc/config/i386/i386-builtin.def  | 28 ---
 gcc/config/i386/i386-builtins.cc  |  8 +-
 gcc/config/i386/i386-c.cc |  2 +
 gcc/config/i386/i386-expand.cc| 13 
 gcc/config/i386/i386-isa.def  |  1 +
 gcc/config/i386/i386-options.cc   |  4 +-
 gcc/config/i386/i386.md   |  6 +-
 gcc/config/i386/i386.opt  |  5 ++
 gcc/config/i386/immintrin.h   |  2 +
 gcc/config/i386/sse.md| 56 ++---
 gcc/doc/extend.texi   |  5 ++
 gcc/doc/invoke.texi   |  9 ++-
 gcc/doc/sourcebuild.texi  |  3 +
 gcc/testsuite/g++.dg/other/i386-2.C   |  2 +-
 gcc/testsuite/g++.dg/other/i386-3.C  

[PATCH] i386: Enable small loop unrolling for O2

2022-10-25 Thread Hongyu Wang via Gcc-patches
Hi,

Inspired by the rs6000 and s390 port changes, this patch
enables loop unrolling for small loops at -O2 by default.
The default behavior is to unroll loops with an unknown trip count and
fewer than 4 insns once.

This improves 548.exchange2 by 3.5% on icelake and 6% on zen3 with a
1.2% codesize increase. For other benchmarks the variations are minor
and overall codesize increased by 0.2%.

The kernel image size increased by 0.06%, and no impact on eembc.

Bootstrapped & regtested on x86_64-pc-linux-gnu.

Ok for trunk?

gcc/ChangeLog:

* common/config/i386/i386-common.cc (ix86_optimization_table):
Enable loop unroll and small loop unroll at O2 by default.
* config/i386/i386-options.cc
(ix86_override_options_after_change):
Disable small loop unroll when funroll-loops enabled, reset
cunroll_grow_size when it is not explicitly enabled.
(ix86_option_override_internal): Call
ix86_override_options_after_change instead of calling
ix86_recompute_optlev_based_flags and ix86_default_align
separately.
* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
factor if -munroll-only-small-loops enabled.
* config/i386/i386.opt: Add -munroll-only-small-loops,
-param=x86-small-unroll-ninsns= for loop insn limit,
-param=x86-small-unroll-factor= for unroll factor.
* doc/invoke.texi: Document -munroll-only-small-loops,
x86-small-unroll-ninsns and x86-small-unroll-factor.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
* gcc.target/i386/pr93002.c: Likewise.
---
 gcc/common/config/i386/i386-common.cc   |  6 
 gcc/config/i386/i386-options.cc | 40 ++---
 gcc/config/i386/i386.cc | 13 
 gcc/config/i386/i386.opt| 13 
 gcc/doc/invoke.texi | 14 +
 gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
 7 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index d6a68dc9b1d..0e580b39d14 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1686,6 +1686,12 @@ static const struct default_options 
ix86_option_optimization_table[] =
 /* The STC algorithm produces the smallest code at -Os, for x86.  */
 { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
   REORDER_BLOCKS_ALGORITHM_STC },
+{ OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 },
+{ OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
+/* Turns off -frename-registers and -fweb which are enabled by
+   funroll-loops.  */
+{ OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 },
+{ OPT_LEVELS_ALL, OPT_fweb, NULL, 0 },
 /* Turn off -fschedule-insns by default.  It tends to make the
problem with not enough registers even worse.  */
 { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index acb2291e70f..6ea347c32e1 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1819,8 +1819,43 @@ ix86_recompute_optlev_based_flags (struct gcc_options 
*opts,
 void
 ix86_override_options_after_change (void)
 {
+  /* Default align_* from the processor table.  */
   ix86_default_align (&global_options);
+
   ix86_recompute_optlev_based_flags (&global_options, &global_options_set);
+
+  /* Disable unrolling small loops when there's explicit
+ -f{,no}unroll-loop.  */
+  if ((OPTION_SET_P (flag_unroll_loops))
+ || (OPTION_SET_P (flag_unroll_all_loops)
+&& flag_unroll_all_loops))
+{
+  if (!OPTION_SET_P (ix86_unroll_only_small_loops))
+   ix86_unroll_only_small_loops = 0;
+  /* Re-enable -frename-registers and -fweb if funroll-loops
+enabled.  */
+  if (!OPTION_SET_P (flag_web))
+   flag_web = flag_unroll_loops;
+  if (!OPTION_SET_P (flag_rename_registers))
+   flag_rename_registers = flag_unroll_loops;
+  if (!OPTION_SET_P (flag_cunroll_grow_size))
+   flag_cunroll_grow_size = flag_unroll_loops
+|| flag_peel_loops
+|| optimize >= 3;
+}
+  else
+{
+  if (!OPTION_SET_P (flag_cunroll_grow_size))
+   flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
+  /* Disables loop unrolling if -mno-unroll-only-small-loops is
+explicitly set and -funroll-loops is not enabled.  */
+  if (OPTION_SET_P (ix86_unroll_only_small_loops)
+ && !ix86_unroll_only_small_loops
+ && !(OPTION_SET_P (flag_unroll_loops)
+  || OPTION_SET_P (flag_unroll_all_loops)))
+   flag_unroll_loops = flag_unroll_all_loops = 0;
+}
+
 }
 
 /* Clear stack slot assignments remembered fr

Re: [PATCH] i386: Enable small loop unrolling for O2

2022-10-26 Thread Hongyu Wang via Gcc-patches
> Does this setting benefit all targets?  IIRC, in the past all
> benchmarks also enabled -funroll-loops, so it looks to me that
> unrolling small loops by default is a good compromise.

The idea of unrolling small loops can be explained from the x86
micro-architecture. Modern x86 processors have multi-way instruction
decoders (5 uops for icelake/zen3). So for a small loop with <= 4
instructions (usually 3 uops, with a cmp/jmp pair that can be
macro-fused), the decoder would have a 2-uop bubble in each iteration
and the pipeline could not be fully utilized. Therefore we decided to
unroll the 4-insn loop once, to at least fill the decoder and
improve pipeline utilization.
We are not familiar with the micro-architecture of other targets, so we
don't know whether unrolling would benefit their instruction decoders;
the decision could be different there.

Uros Bizjak via Gcc-patches  于2022年10月26日周三 14:57写道:

>
> On Wed, Oct 26, 2022 at 7:53 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > Inspired by rs6000 and s390 port changes, this patch
> > enables loop unrolling for small size loop at O2 by default.
> > The default behavior is to unroll loop with unknown trip-count and
> > less than 4 insns by 1 time.
> >
> > This improves 548.exchange2 by 3.5% on icelake and 6% on zen3 with
> > 1.2% codesize increment. For other benchmarks the variants are minor
> > and overall codesize increased by 0.2%.
> >
> > The kernel image size increased by 0.06%, and no impact on eembc.
>
> Does this setting benefit all targets?  IIRC, in the past all
> benchmarks also enabled -funroll-loops, so it looks to me that
> unrolling small loops by default is a good compromise.
>
> The patch is technically OK, but as a tuning default, I would leave
> the final approval to HJ.
>
> Thanks,
> Uros.
>
> >
> > Bootstrapped & regtested on x86_64-pc-linux-gnu.
> >
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > * common/config/i386/i386-common.cc (ix86_optimization_table):
> > Enable loop unroll and small loop unroll at O2 by default.
> > * config/i386/i386-options.cc
> > (ix86_override_options_after_change):
> > Disable small loop unroll when funroll-loops enabled, reset
> > cunroll_grow_size when it is not explicitly enabled.
> > (ix86_option_override_internal): Call
> > ix86_override_options_after_change instead of calling
> > ix86_recompute_optlev_based_flags and ix86_default_align
> > separately.
> > * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
> > factor if -munroll-only-small-loops enabled.
> > * config/i386/i386.opt: Add -munroll-only-small-loops,
> > -param=x86-small-unroll-ninsns= for loop insn limit,
> > -param=x86-small-unroll-factor= for unroll factor.
> > * doc/invoke.texi: Document -munroll-only-small-loops,
> > x86-small-unroll-ninsns and x86-small-unroll-factor.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
> > * gcc.target/i386/pr93002.c: Likewise.
> > ---
> >  gcc/common/config/i386/i386-common.cc   |  6 
> >  gcc/config/i386/i386-options.cc | 40 ++---
> >  gcc/config/i386/i386.cc | 13 
> >  gcc/config/i386/i386.opt| 13 
> >  gcc/doc/invoke.texi | 14 +
> >  gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
> >  7 files changed, 84 insertions(+), 6 deletions(-)
> >
> > diff --git a/gcc/common/config/i386/i386-common.cc 
> > b/gcc/common/config/i386/i386-common.cc
> > index d6a68dc9b1d..0e580b39d14 100644
> > --- a/gcc/common/config/i386/i386-common.cc
> > +++ b/gcc/common/config/i386/i386-common.cc
> > @@ -1686,6 +1686,12 @@ static const struct default_options 
> > ix86_option_optimization_table[] =
> >  /* The STC algorithm produces the smallest code at -Os, for x86.  */
> >  { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
> >REORDER_BLOCKS_ALGORITHM_STC },
> > +{ OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 },
> > +{ OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 
> > },
> > +/* Turns off -frename-registers and -fweb which are enabled by
> > +   funroll-loops.  */
> > +{ OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 },
> > +{ OPT_LEVELS_ALL, OPT_fweb, NULL, 0 },
> >  /* Turn off -fschedule-insns by default.  It tends to make the
> > problem with not enough registers even worse.  */
> >  { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
> > diff --git a/gcc/config/i386/i386-options.cc 
> > b/gcc/config/i386/i386-options.cc
> > index acb2291e70f..6ea347c32e1 100644
> > --- a/gcc/config/i386/i386-options.cc
> > +++ b/gcc/config/i386/i386-options.cc
> > @@ -1819,8 +1819,43 @@ ix86_recompute_optlev_based_flags (struct 
> > gcc_options *opts,
> >  void
> > 

Re: [PATCH] i386: Enable small loop unrolling for O2

2022-10-28 Thread Hongyu Wang via Gcc-patches
> Ugh, that's all quite ugly and unmaintainable, no?
Agreed, I have the same feeling.

> I'm quite sure that if this works it's not by intention.  Doesn't this
> also disable
> register renaming and web when the user explicitly specifies -funroll-loops?
>
> Doesn't this change -funroll-loops behavior everywhere, only unrolling small
> loops?

The ugly part ensures that -funroll-loops would not be affected at all
by -munroll-only-small-loops.

>
> I'd like to see a -munroll-only-small-loops addition that doesn't have any 
> such
> effects.  Note RTL unrolling could also
> conditionally enabled on a new -funroll-small-loops which wouldn't enable
> register renaming or web.

Did you mean something like

index b9e07973dd6..b707d4afb84 100644
--- a/gcc/loop-init.cc
+++ b/gcc/loop-init.cc
@@ -567,7 +567,8 @@ public:
   /* opt_pass methods: */
   bool gate (function *) final override
 {
-  return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
+  return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
+ || flag_unroll_only_small_loops);
 }

then the backend can turn it on by default in O2?
I don't know if there is a way to turn on middle-end pass by
target-specific flags.

Richard Biener via Gcc-patches  于2022年10月28日周五 15:33写道:
>
> On Wed, Oct 26, 2022 at 7:53 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > Inspired by rs6000 and s390 port changes, this patch
> > enables loop unrolling for small size loop at O2 by default.
> > The default behavior is to unroll loop with unknown trip-count and
> > less than 4 insns by 1 time.
> >
> > This improves 548.exchange2 by 3.5% on icelake and 6% on zen3 with
> > 1.2% codesize increment. For other benchmarks the variants are minor
> > and overall codesize increased by 0.2%.
> >
> > The kernel image size increased by 0.06%, and no impact on eembc.
> >
> > Bootstrapped & regtested on x86_64-pc-linux-gnu.
> >
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > * common/config/i386/i386-common.cc (ix86_optimization_table):
> > Enable loop unroll and small loop unroll at O2 by default.
> > * config/i386/i386-options.cc
> > (ix86_override_options_after_change):
> > Disable small loop unroll when funroll-loops enabled, reset
> > cunroll_grow_size when it is not explicitly enabled.
> > (ix86_option_override_internal): Call
> > ix86_override_options_after_change instead of calling
> > ix86_recompute_optlev_based_flags and ix86_default_align
> > separately.
> > * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
> > factor if -munroll-only-small-loops enabled.
> > * config/i386/i386.opt: Add -munroll-only-small-loops,
> > -param=x86-small-unroll-ninsns= for loop insn limit,
> > -param=x86-small-unroll-factor= for unroll factor.
> > * doc/invoke.texi: Document -munroll-only-small-loops,
> > x86-small-unroll-ninsns and x86-small-unroll-factor.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
> > * gcc.target/i386/pr93002.c: Likewise.
> > ---
> >  gcc/common/config/i386/i386-common.cc   |  6 
> >  gcc/config/i386/i386-options.cc | 40 ++---
> >  gcc/config/i386/i386.cc | 13 
> >  gcc/config/i386/i386.opt| 13 
> >  gcc/doc/invoke.texi | 14 +
> >  gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
> >  7 files changed, 84 insertions(+), 6 deletions(-)
> >
> > diff --git a/gcc/common/config/i386/i386-common.cc 
> > b/gcc/common/config/i386/i386-common.cc
> > index d6a68dc9b1d..0e580b39d14 100644
> > --- a/gcc/common/config/i386/i386-common.cc
> > +++ b/gcc/common/config/i386/i386-common.cc
> > @@ -1686,6 +1686,12 @@ static const struct default_options 
> > ix86_option_optimization_table[] =
> >  /* The STC algorithm produces the smallest code at -Os, for x86.  */
> >  { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
> >REORDER_BLOCKS_ALGORITHM_STC },
> > +{ OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 },
> > +{ OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 
> > },
> > +/* Turns off -frename-registers and -fweb which are enabled by
> > +   funroll-loops.  */
> > +{ OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 },
> > +{ OPT_LEVELS_ALL, OPT_fweb, NULL, 0 },
>
> I'm quite sure that if this works it's not by intention.  Doesn't this
> also disable
> > register renaming and web when the user explicitly specifies -funroll-loops?
>
> Doesn't this change -funroll-loops behavior everywhere, only unrolling small
> loops?
>
> I'd like to see a -munroll-only-small-loops addition that doesn't have any 
> such
> effects.  Note RTL unrolling could also
> conditionally enabled on a new -funroll-small-

[PATCH V2] Enable small loop unrolling for O2

2022-11-01 Thread Hongyu Wang via Gcc-patches
Hi, this is the updated patch of
https://gcc.gnu.org/pipermail/gcc-patches/2022-October/604345.html,
which uses targetm.loop_unroll_adjust as gate to enable small loop unroll.

This patch does not change rs6000/s390 since I don't have a machine to
test them, but I suppose the default behavior is the same since they
enable flag_unroll_loops at O2.

Bootstrapped & regtested on x86_64-pc-linux-gnu.

Ok for trunk?

-- Patch content 

Modern processors have multi-way instruction decoders.
For x86, icelake/zen3 have 5-uop-wide decoders, so for a small loop with
<= 4 instructions (usually 3 uops, with a cmp/jmp pair that can be
macro-fused), the decoder would have a 2-uop bubble in each iteration
and the pipeline could not be fully utilized.

Therefore, this patch enables loop unrolling for small loops at O2
to fill the decoder as much as possible. It turns on RTL loop
unrolling when targetm.loop_unroll_adjust exists, at O2 and above when
optimizing for speed only. In the x86 backend the default behavior is
to unroll small loops with less than 4 insns once.

This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with
0.9% codesize increment. For other benchmarks the variants are minor
and overall codesize increased by 0.2%.

The kernel image size increased by 0.06%, and no impact on eembc.

gcc/ChangeLog:

* common/config/i386/i386-common.cc (ix86_optimization_table):
Enable small loop unroll at O2 by default.
* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
factor if -munroll-only-small-loops enabled and -funroll-loops/
-funroll-all-loops are disabled.
* config/i386/i386.opt: Add -munroll-only-small-loops,
-param=x86-small-unroll-ninsns= for loop insn limit,
-param=x86-small-unroll-factor= for unroll factor.
* doc/invoke.texi: Document -munroll-only-small-loops,
x86-small-unroll-ninsns and x86-small-unroll-factor.
* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
loop unrolling for -O2-speed and above if target hook
loop_unroll_adjust exists.

gcc/testsuite/ChangeLog:

* gcc.dg/guality/loop-1.c: Add additional option
  -mno-unroll-only-small-loops.
* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
* gcc.target/i386/pr93002.c: Likewise.
---
 gcc/common/config/i386/i386-common.cc   |  1 +
 gcc/config/i386/i386.cc | 18 ++
 gcc/config/i386/i386.opt| 13 +
 gcc/doc/invoke.texi | 16 
 gcc/loop-init.cc| 10 +++---
 gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 ++
 gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
 8 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index f66bdd5a2af..c6891486078 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1724,6 +1724,7 @@ static const struct default_options 
ix86_option_optimization_table[] =
 /* The STC algorithm produces the smallest code at -Os, for x86.  */
 { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
   REORDER_BLOCKS_ALGORITHM_STC },
+{ OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
 /* Turn off -fschedule-insns by default.  It tends to make the
problem with not enough registers even worse.  */
 { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index c0f37149ed0..0f94a3b609e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23827,6 +23827,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop 
*loop)
   unsigned i;
   unsigned mem_count = 0;
 
+  /* Unroll small size loop when unroll factor is not explicitly
+ specified.  */
+  if (!(flag_unroll_loops
+   || flag_unroll_all_loops
+   || loop->unroll))
+{
+  nunroll = 1;
+
+  /* Any explicit -f{no-}unroll-{all-}loops turns off
+-munroll-only-small-loops.  */
+  if (ix86_unroll_only_small_loops
+ && !OPTION_SET_P (flag_unroll_loops))
+   if (loop->ninsns <= (unsigned) ix86_small_unroll_ninsns)
+ nunroll = (unsigned) ix86_small_unroll_factor;
+
+  return nunroll;
+}
+
   if (!TARGET_ADJUST_UNROLL)
  return nunroll;
 
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 53d534f6392..6da9c8d670d 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1224,3 +1224,16 @@ mavxvnniint8
 Target Mask(ISA2_AVXVNNIINT8) Var(ix86_isa_flags2) Save
 Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and
 AVXVNNIINT8 built-in functions and code generation.
+
+munroll-only-small-loops
+Target Var(ix86_unroll_only_small_loops) Init(0) Save
+Enable conservative small loop unrolling.
+
+-param=x86-small-unroll

Re: [PATCH] [i386]Add combine splitter to transform pxor/pcmpeqb/pmovmskb/cmp 0xffff to ptest.

2022-05-06 Thread Hongyu Wang via Gcc-patches
> +(define_split
> +  [(set (reg:CCZ FLAGS_REG)
> +   (compare:CCZ (unspec:SI
> +   [(eq:VI1_AVX2
> +   (match_operand:VI1_AVX2 0 "vector_operand")
> +   (match_operand:VI1_AVX2 1 "const0_operand"))]
> +   UNSPEC_MOVMSK)
> +(match_operand 2 "const_int_operand")))]
> +  "TARGET_SSE4_1 && ix86_match_ccmode (insn, CCmode)

It looks like set_src and set_dst are all CCZmode, do we really need
ix86_match_ccmode?

> +  && (INTVAL (operands[2]) == (int) ())"

I think the (int) cast is not needed for the constant, and INTVAL
actually returns HOST_WIDE_INT.

> +#include 
> +
> +bool is_zero(__m128i x)

bool is not necessary here, we can use int and drop stdbool.

Haochen Jiang via Gcc-patches  于2022年5月6日周五 16:01写道:
>
> Hi all,
>
> This patch aims to add a combine splitter to transform 
> pxor/pcmpeqb/pmovmskb/cmp 0xffff to ptest.
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk?
>
> BRs,
> Haochen
>
> gcc/ChangeLog:
>
> PR target/104371
> * config/i386/sse.md: Add new define_mode_attr and define_split.
>
> gcc/testsuite/ChangeLog:
>
> PR target/104371
> * gcc.target/i386/pr104371-1.c: New test.
> * gcc.target/i386/pr104371-2.c: Ditto.
> ---
>  gcc/config/i386/sse.md | 19 +++
>  gcc/testsuite/gcc.target/i386/pr104371-1.c | 14 ++
>  gcc/testsuite/gcc.target/i386/pr104371-2.c | 14 ++
>  3 files changed, 47 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104371-1.c
>  create mode 100755 gcc/testsuite/gcc.target/i386/pr104371-2.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 7b791def542..71afda73c8f 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -20083,6 +20083,25 @@
> (set_attr "prefix" "maybe_vex")
> (set_attr "mode" "SI")])
>
> +;; Optimize pxor/pcmpeqb/pmovmskb/cmp 0x to ptest.
> +(define_mode_attr vi1avx2const
> +  [(V32QI "0x") (V16QI "0x")])
> +
> +(define_split
> +  [(set (reg:CCZ FLAGS_REG)
> +   (compare:CCZ (unspec:SI
> +   [(eq:VI1_AVX2
> +   (match_operand:VI1_AVX2 0 "vector_operand")
> +   (match_operand:VI1_AVX2 1 "const0_operand"))]
> +   UNSPEC_MOVMSK)
> +(match_operand 2 "const_int_operand")))]
> +  "TARGET_SSE4_1 && ix86_match_ccmode (insn, CCmode)
> +  && (INTVAL (operands[2]) == (int) ())"
> +  [(set (reg:CC FLAGS_REG)
> +   (unspec:CC [(match_dup 0)
> +   (match_dup 0)]
> +  UNSPEC_PTEST))])
> +
>  (define_expand "sse2_maskmovdqu"
>[(set (match_operand:V16QI 0 "memory_operand")
> (unspec:V16QI [(match_operand:V16QI 1 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/i386/pr104371-1.c 
> b/gcc/testsuite/gcc.target/i386/pr104371-1.c
> new file mode 100644
> index 000..df7c0b074e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104371-1.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse4" } */
> +/* { dg-final { scan-assembler "ptest\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "pxor\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "pcmpeqb\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "pmovmskb\[ \\t\]" } } */
> +
> +#include 
> +#include 
> +
> +bool is_zero(__m128i x)
> +{
> +  return _mm_movemask_epi8(_mm_cmpeq_epi8(x, _mm_setzero_si128())) == 0x;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr104371-2.c 
> b/gcc/testsuite/gcc.target/i386/pr104371-2.c
> new file mode 100755
> index 000..f0d0afd5897
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104371-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2" } */
> +/* { dg-final { scan-assembler "vptest\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "vpxor\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "vpcmpeqb\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "vpmovmskb\[ \\t\]" } } */
> +
> +#include 
> +#include 
> +
> +bool is_zero256(__m256i x)
> +{
> +  return _mm256_movemask_epi8(_mm256_cmpeq_epi8(x, _mm256_setzero_si256())) 
> == 0x;
> +}
> --
> 2.18.1
>


Re: [PATCH] Reconstruct i386 testsuite with __builtin_cpu_supports

2022-05-06 Thread Hongyu Wang via Gcc-patches
> I don't think *_os_support calls should be removed. IIRC,
> __builtin_cpu_supports function checks if the feature is supported by
> CPU, whereas *_os_supports calls check via xgetbv if OS supports
> handling of new registers.

avx_os_support is like

avx_os_support (void)
{
  unsigned int eax, edx;
  unsigned int ecx = XCR_XFEATURE_ENABLED_MASK;

  __asm__ ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx));

  return (eax & (XSTATE_SSE | XSTATE_YMM)) == (XSTATE_SSE | XSTATE_YMM);
}

While in get_available_features we have

#define XCR_AVX_ENABLED_MASK \
  (XSTATE_SSE | XSTATE_YMM)
  if ((ecx & bit_OSXSAVE))
{
  /* Check if XMM, YMM, OPMASK, upper 256 bits of ZMM0-ZMM15 and
ZMM16-ZMM31 states are supported by OSXSAVE.  */
  unsigned int xcrlow;
  unsigned int xcrhigh;
  __asm__ (".byte 0x0f, 0x01, 0xd0" /* xgetbv  */
   : "=a" (xcrlow), "=d" (xcrhigh)
   : "c" (XCR_XFEATURE_ENABLED_MASK));
  if ((xcrlow & XCR_AVX_ENABLED_MASK) == XCR_AVX_ENABLED_MASK)
{
  avx_usable = 1;

So __builtin_cpu_supports already includes the same check.

Uros Bizjak via Gcc-patches  于2022年5月6日周五 16:27写道:
>
> On Fri, May 6, 2022 at 9:57 AM Haochen Jiang  wrote:
> >
> > Hi all,
> >
> > Some check files in the i386 testsuite were written before the
> > function __builtin_cpu_supports was introduced. All of them use
> > __get_cpuid_count. This patch aims to reconstruct the i386 testsuite with
> > __builtin_cpu_supports so that we can have much clearer code.
> >
> > Regtested on x86_64-pc-linux-gnu. Ok for trunk?
>
> I don't think *_os_support calls should be removed. IIRC,
> __builtin_cpu_supports function checks if the feature is supported by
> CPU, whereas *_os_supports calls check via xgetbv if OS supports
> handling of new registers.
>
> Uros.
>
> >
> > Also, when writing this patch, I found some files in the testsuite that
> > might currently be useless. For example, the code in
> > gcc/testsuite/gcc.target/i386/sse-os-support.h always returns 1. And
> > there are also some files that will no longer be included at all with this
> > patch. Should we remove those files when we have time?
> >
> > BRs,
> > Haochen
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/adx-check.h: Change bit check to
> > __builtin_cpu_supports.
> > * gcc.target/i386/aes-avx-check.h: Ditto.
> > * gcc.target/i386/aes-check.h: Ditto.
> > * gcc.target/i386/avx-check.h: Ditto.
> > * gcc.target/i386/avx2-check.h: Ditto.
> > * gcc.target/i386/avx512-check.h: Ditto.
> > * gcc.target/i386/bmi-check.h: Ditto.
> > * gcc.target/i386/bmi2-check.h: Ditto.
> > * gcc.target/i386/f16c-check.h: Ditto.
> > * gcc.target/i386/fma-check.h: Ditto.
> > * gcc.target/i386/fma4-check.h: Ditto.
> > * gcc.target/i386/lzcnt-check.h: Ditto.
> > * gcc.target/i386/mmx-3dnow-check.h: Ditto.
> > * gcc.target/i386/mmx-check.h: Ditto.
> > * gcc.target/i386/pclmul-avx-check.h: Ditto.
> > * gcc.target/i386/pclmul-check.h: Ditto.
> > * gcc.target/i386/rtm-check.h: Ditto.
> > * gcc.target/i386/sha-check.h: Ditto.
> > * gcc.target/i386/sse-check.h: Ditto.
> > * gcc.target/i386/sse2-check.h: Ditto.
> > * gcc.target/i386/sse3-check.h: Ditto.
> > * gcc.target/i386/sse4_1-check.h: Ditto.
> > * gcc.target/i386/sse4_2-check.h: Ditto.
> > * gcc.target/i386/sse4a-check.h: Ditto.
> > * gcc.target/i386/ssse3-check.h: Ditto.
> > * gcc.target/i386/xop-check.h: Ditto.
> > ---
> >  gcc/testsuite/gcc.target/i386/adx-check.h | 10 +---
> >  gcc/testsuite/gcc.target/i386/aes-avx-check.h | 14 +
> >  gcc/testsuite/gcc.target/i386/aes-check.h | 11 +---
> >  gcc/testsuite/gcc.target/i386/avx-check.h | 12 +---
> >  gcc/testsuite/gcc.target/i386/avx2-check.h| 20 +--
> >  gcc/testsuite/gcc.target/i386/avx512-check.h  | 59 +++
> >  gcc/testsuite/gcc.target/i386/bmi-check.h | 11 +---
> >  gcc/testsuite/gcc.target/i386/bmi2-check.h| 10 +---
> >  gcc/testsuite/gcc.target/i386/f16c-check.h| 10 +---
> >  gcc/testsuite/gcc.target/i386/fma-check.h | 11 +---
> >  gcc/testsuite/gcc.target/i386/fma4-check.h| 11 +---
> >  gcc/testsuite/gcc.target/i386/lzcnt-check.h   | 11 +---
> >  .../gcc.target/i386/mmx-3dnow-check.h | 11 +---
> >  gcc/testsuite/gcc.target/i386/mmx-check.h | 11 +---
> >  .../gcc.target/i386/pclmul-avx-check.h| 14 +
> >  gcc/testsuite/gcc.target/i386/pclmul-check.h  | 11 +---
> >  gcc/testsuite/gcc.target/i386/rtm-check.h | 10 +---
> >  gcc/testsuite/gcc.target/i386/sha-check.h | 10 +---
> >  gcc/testsuite/gcc.target/i386/sse-check.h | 11 +---
> >  gcc/testsuite/gcc.target/i386/sse2-check.h| 11 +---
> >  gcc/testsuite/gcc.target/i386/sse3-check.h| 11 +---
> >  gcc/testsuite/gcc.target/i386/sse4_1-check.h  | 11 +---
> >  gcc/testsuite/gcc.target/i386/

[PATCH] i386: Add a constraint for absolute symboilc address [PR 105576]

2022-05-18 Thread Hongyu Wang via Gcc-patches
Hi,

This patch adds a constraint "Ws" to allow an absolute symbolic address for
either a function or a variable. This also works under -mcmodel=large.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}

Ok for master?

gcc/ChangeLog:

PR target/105576
* config/i386/constraints.md (Ws): New constraint.
* config/i386/i386-protos.h (ix86_symbolic_address_p):
New proto type.
* config/i386/i386.cc (ix86_symbolic_address_p):
New function to ensure a rtx is a symbolic address.

gcc/testsuite/ChangeLog:

PR target/105576
* gcc.target/i386/pr105576.c: New test.
---
 gcc/config/i386/constraints.md   |  4 
 gcc/config/i386/i386-protos.h|  1 +
 gcc/config/i386/i386.cc  |  7 +++
 gcc/testsuite/gcc.target/i386/pr105576.c | 11 +++
 4 files changed, 23 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105576.c

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 7361687632f..ec0702be368 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -348,6 +348,10 @@ (define_constraint "Z"
instructions)."
   (match_operand 0 "x86_64_zext_immediate_operand"))
 
+(define_constraint "Ws"
+ "A constraint that matches an absolute symbolic address."
+ (match_test "ix86_symbolic_address_p (op)"))
+
 ;; T prefix is used for different address constraints
 ;;   v - VSIB address
 ;;   s - address with no segment register
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 3596ce81ecf..2b8d063850f 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -337,6 +337,7 @@ extern void x86_output_aligned_bss (FILE *, tree, const 
char *,
unsigned HOST_WIDE_INT, unsigned);
 extern void x86_elf_aligned_decl_common (FILE *, tree, const char *,
 unsigned HOST_WIDE_INT, unsigned);
+extern bool ix86_symbolic_address_p (rtx x);
 
 #ifdef RTX_CODE
 extern void ix86_fp_comparison_codes (enum rtx_code code, enum rtx_code *,
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 86752a6516a..76728d10c8d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23956,6 +23956,13 @@ ix86_push_rounding (poly_int64 bytes)
   return ROUND_UP (bytes, UNITS_PER_WORD);
 }
 
+bool ix86_symbolic_address_p (rtx x)
+{
+  poly_int64 offset;
+  x = strip_offset (x, &offset);
+  return SYMBOL_REF_P (x) || LABEL_REF_P (x);
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
diff --git a/gcc/testsuite/gcc.target/i386/pr105576.c 
b/gcc/testsuite/gcc.target/i386/pr105576.c
new file mode 100644
index 000..06dd860d3f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105576.c
@@ -0,0 +1,11 @@
+/* PR target/105576 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcmodel=large" } */
+
+extern int var;
+void *addr(void) { return &var; }
+void addr_via_asm(void)
+{
+  asm (".pushsection .xxx,\"aw\"\n\t .dc.a %0\n\t .popsection" :: "Ws"(addr));
+  asm (".pushsection .xxx,\"aw\"\n\t .dc.a %0\n\t .popsection" :: "Ws"(&var));
+}
-- 
2.18.1



Re: [PATCH] i386: Add a constraint for absolute symbolic address [PR 105576]

2022-05-18 Thread Hongyu Wang via Gcc-patches
Oh, I just found that asm ("%p0" :: "i"(addr)); also works with
-mcmodel=large in this case, so please ignore this patch. Thanks.

Uros Bizjak via Gcc-patches  于2022年5月18日周三 17:46写道:
>
> On Wed, May 18, 2022 at 9:32 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > This patch adds a constraint "Ws" to allow absolute symbolic address for 
> > either
> > function or variable. This also works under -mcmodel=large.
> >
> > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}
> >
> > Ok for master?
>
> Maybe you should use:
>
>   asm ("%p0" :: "X"(addr));
>   asm ("%p0" :: "X"(&var));
>
> instead.
>
> Uros.
>
> > gcc/ChangeLog:
> >
> > PR target/105576
> > * config/i386/constraints.md (Ws): New constraint.
> > * config/i386/i386-protos.h (ix86_symbolic_address_p):
> > New proto type.
> > * config/i386/i386.cc (ix86_symbolic_address_p):
> > New function to ensure a rtx is a symbolic address.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/105576
> > * gcc.target/i386/pr105576.c: New test.
> > ---
> >  gcc/config/i386/constraints.md   |  4 
> >  gcc/config/i386/i386-protos.h|  1 +
> >  gcc/config/i386/i386.cc  |  7 +++
> >  gcc/testsuite/gcc.target/i386/pr105576.c | 11 +++
> >  4 files changed, 23 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105576.c
> >
> > diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
> > index 7361687632f..ec0702be368 100644
> > --- a/gcc/config/i386/constraints.md
> > +++ b/gcc/config/i386/constraints.md
> > @@ -348,6 +348,10 @@ (define_constraint "Z"
> > instructions)."
> >(match_operand 0 "x86_64_zext_immediate_operand"))
> >
> > +(define_constraint "Ws"
> > + "A constraint that matches an absolute symbolic address."
> > + (match_test "ix86_symbolic_address_p (op)"))
> > +
> >  ;; T prefix is used for different address constraints
> >  ;;   v - VSIB address
> >  ;;   s - address with no segment register
> > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> > index 3596ce81ecf..2b8d063850f 100644
> > --- a/gcc/config/i386/i386-protos.h
> > +++ b/gcc/config/i386/i386-protos.h
> > @@ -337,6 +337,7 @@ extern void x86_output_aligned_bss (FILE *, tree, const 
> > char *,
> > unsigned HOST_WIDE_INT, unsigned);
> >  extern void x86_elf_aligned_decl_common (FILE *, tree, const char *,
> >  unsigned HOST_WIDE_INT, unsigned);
> > +extern bool ix86_symbolic_address_p (rtx x);
> >
> >  #ifdef RTX_CODE
> >  extern void ix86_fp_comparison_codes (enum rtx_code code, enum rtx_code *,
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 86752a6516a..76728d10c8d 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -23956,6 +23956,13 @@ ix86_push_rounding (poly_int64 bytes)
> >return ROUND_UP (bytes, UNITS_PER_WORD);
> >  }
> >
> > +bool ix86_symbolic_address_p (rtx x)
> > +{
> > +  poly_int64 offset;
> > +  x = strip_offset (x, &offset);
> > +  return SYMBOL_REF_P (x) || LABEL_REF_P (x);
> > +}
> > +
> >  /* Target-specific selftests.  */
> >
> >  #if CHECKING_P
> > diff --git a/gcc/testsuite/gcc.target/i386/pr105576.c 
> > b/gcc/testsuite/gcc.target/i386/pr105576.c
> > new file mode 100644
> > index 000..06dd860d3f3
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr105576.c
> > @@ -0,0 +1,11 @@
> > +/* PR target/105576 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mcmodel=large" } */
> > +
> > +extern int var;
> > +void *addr(void) { return &var; }
> > +void addr_via_asm(void)
> > +{
> > +  asm (".pushsection .xxx,\"aw\"\n\t .dc.a %0\n\t .popsection" :: 
> > "Ws"(addr));
> > +  asm (".pushsection .xxx,\"aw\"\n\t .dc.a %0\n\t .popsection" :: 
> > "Ws"(&var));
> > +}
> > --
> > 2.18.1
> >


Re: [PATCH] i386: Add a constraint for absolute symbolic address [PR 105576]

2022-05-18 Thread Hongyu Wang via Gcc-patches
> -fpic will break compilation with "i" constraint.

Ah, yes. But "X" is effectively no constraint at all; shouldn't we provide
something similar to "S" in aarch64 and riscv?
I think it is better to constrain the operand to constant symbols
rather than allowing everything.

Uros Bizjak  于2022年5月18日周三 18:18写道:
>
> On Wed, May 18, 2022 at 12:14 PM Hongyu Wang  wrote:
> >
> > Oh, I just found that asm ("%p0" :: "i"(addr)); also works on
> > -mcmodel=large in this case, please ignore this patch. Thanks.
>
> -fpic will break compilation with "i" constraint.
>
> Uros.
>
> >
> > Uros Bizjak via Gcc-patches  于2022年5月18日周三 17:46写道:
> > >
> > > On Wed, May 18, 2022 at 9:32 AM Hongyu Wang  wrote:
> > > >
> > > > Hi,
> > > >
> > > > This patch adds a constraint "Ws" to allow absolute symbolic address 
> > > > for either
> > > > function or variable. This also works under -mcmodel=large.
> > > >
> > > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}
> > > >
> > > > Ok for master?
> > >
> > > Maybe you should use:
> > >
> > >   asm ("%p0" :: "X"(addr));
> > >   asm ("%p0" :: "X"(&var));
> > >
> > > instead.
> > >
> > > Uros.
> > >
> > > > gcc/ChangeLog:
> > > >
> > > > PR target/105576
> > > > * config/i386/constraints.md (Ws): New constraint.
> > > > * config/i386/i386-protos.h (ix86_symbolic_address_p):
> > > > New proto type.
> > > > * config/i386/i386.cc (ix86_symbolic_address_p):
> > > > New function to ensure a rtx is a symbolic address.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR target/105576
> > > > * gcc.target/i386/pr105576.c: New test.
> > > > ---
> > > >  gcc/config/i386/constraints.md   |  4 
> > > >  gcc/config/i386/i386-protos.h|  1 +
> > > >  gcc/config/i386/i386.cc  |  7 +++
> > > >  gcc/testsuite/gcc.target/i386/pr105576.c | 11 +++
> > > >  4 files changed, 23 insertions(+)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105576.c
> > > >
> > > > diff --git a/gcc/config/i386/constraints.md 
> > > > b/gcc/config/i386/constraints.md
> > > > index 7361687632f..ec0702be368 100644
> > > > --- a/gcc/config/i386/constraints.md
> > > > +++ b/gcc/config/i386/constraints.md
> > > > @@ -348,6 +348,10 @@ (define_constraint "Z"
> > > > instructions)."
> > > >(match_operand 0 "x86_64_zext_immediate_operand"))
> > > >
> > > > +(define_constraint "Ws"
> > > > + "A constraint that matches an absolute symbolic address."
> > > > + (match_test "ix86_symbolic_address_p (op)"))
> > > > +
> > > >  ;; T prefix is used for different address constraints
> > > >  ;;   v - VSIB address
> > > >  ;;   s - address with no segment register
> > > > diff --git a/gcc/config/i386/i386-protos.h 
> > > > b/gcc/config/i386/i386-protos.h
> > > > index 3596ce81ecf..2b8d063850f 100644
> > > > --- a/gcc/config/i386/i386-protos.h
> > > > +++ b/gcc/config/i386/i386-protos.h
> > > > @@ -337,6 +337,7 @@ extern void x86_output_aligned_bss (FILE *, tree, 
> > > > const char *,
> > > > unsigned HOST_WIDE_INT, unsigned);
> > > >  extern void x86_elf_aligned_decl_common (FILE *, tree, const char *,
> > > >  unsigned HOST_WIDE_INT, 
> > > > unsigned);
> > > > +extern bool ix86_symbolic_address_p (rtx x);
> > > >
> > > >  #ifdef RTX_CODE
> > > >  extern void ix86_fp_comparison_codes (enum rtx_code code, enum 
> > > > rtx_code *,
> > > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > > index 86752a6516a..76728d10c8d 100644
> > > > --- a/gcc/config/i386/i386.cc
> > > > +++ b/gcc/config/i386/i386.cc
> > > > @@ -23956,6 +23956,13 @@ ix86_push_rounding (poly_int64 bytes)
> > > >return ROUND_UP (bytes, UNITS_PER_WORD);
> > > >  }
> > > >
> > > > +bool ix86_symbolic_address_p (rtx x)
> > > > +{
> > > > +  poly_int64 offset;
> > > > +  x = strip_offset (x, &offset);
> > > > +  return SYMBOL_REF_P (x) || LABEL_REF_P (x);
> > > > +}
> > > > +
> > > >  /* Target-specific selftests.  */
> > > >
> > > >  #if CHECKING_P
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr105576.c 
> > > > b/gcc/testsuite/gcc.target/i386/pr105576.c
> > > > new file mode 100644
> > > > index 000..06dd860d3f3
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr105576.c
> > > > @@ -0,0 +1,11 @@
> > > > +/* PR target/105576 */
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mcmodel=large" } */
> > > > +
> > > > +extern int var;
> > > > +void *addr(void) { return &var; }
> > > > +void addr_via_asm(void)
> > > > +{
> > > > +  asm (".pushsection .xxx,\"aw\"\n\t .dc.a %0\n\t .popsection" :: 
> > > > "Ws"(addr));
> > > > +  asm (".pushsection .xxx,\"aw\"\n\t .dc.a %0\n\t .popsection" :: 
> > > > "Ws"(&var));
> > > > +}
> > > > --
> > > > 2.18.1
> > > >


[PATCH] x86: Adjust keylocker testcases for fail on darwin

2020-11-09 Thread Hongyu Wang via Gcc-patches
Hi

According to the discussion in
https://gcc.gnu.org/pipermail/gcc/2020-November/234096.html,
the testcases for keylocker-* are too strict for the darwin target. This
patch adjusts the regexes and adds a missing test for the aesenc256kl
instruction.

Tested by Iain Sandoe; all tests pass on the darwin target.

Ok for trunk?

gcc/testsuite/ChangeLog

* gcc.target/i386/keylocker-aesdec128kl.c: Adjust regex patterns.
* gcc.target/i386/keylocker-aesdec256kl.c: Likewise.
* gcc.target/i386/keylocker-aesdecwide128kl.c: Likewise.
* gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
* gcc.target/i386/keylocker-aesenc128kl.c: Likewise.
* gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
* gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
* gcc.target/i386/keylocker-encodekey128.c: Likewise.
* gcc.target/i386/keylocker-encodekey256.c: Likewise.
* gcc.target/i386/keylocker-aesenc256kl.c: New test.

-- 
Regards,

Hongyu, Wang
From 9009ce97099b3a80fdf61a1927c1fff9c7f5b9bf Mon Sep 17 00:00:00 2001
From: hongyuw1 
Date: Fri, 6 Nov 2020 15:08:10 +0800
Subject: [PATCH] Adjust Keylocker regex pattern for darwin, and add missing
 aesenc256kl test.

gcc/testsuite/ChangeLog

	* gcc.target/i386/keylocker-aesdec128kl.c: Adjust regex patterns.
	* gcc.target/i386/keylocker-aesdec256kl.c: Likewise.
	* gcc.target/i386/keylocker-aesdecwide128kl.c: Likewise.
	* gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
	* gcc.target/i386/keylocker-aesenc128kl.c: Likewise.
	* gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
	* gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
	* gcc.target/i386/keylocker-encodekey128.c: Likewise.
	* gcc.target/i386/keylocker-encodekey256.c: Likewise.
	* gcc.target/i386/keylocker-aesenc256kl.c: New test.
---
 .../gcc.target/i386/keylocker-aesdec128kl.c   |  8 ++---
 .../gcc.target/i386/keylocker-aesdec256kl.c   |  8 ++---
 .../i386/keylocker-aesdecwide128kl.c  | 36 +--
 .../i386/keylocker-aesdecwide256kl.c  | 36 +--
 .../gcc.target/i386/keylocker-aesenc128kl.c   |  8 ++---
 .../gcc.target/i386/keylocker-aesenc256kl.c   | 17 +
 .../i386/keylocker-aesencwide128kl.c  | 36 +--
 .../i386/keylocker-aesencwide256kl.c  | 36 +--
 .../gcc.target/i386/keylocker-encodekey128.c  | 14 
 .../gcc.target/i386/keylocker-encodekey256.c  | 18 +-
 10 files changed, 117 insertions(+), 100 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/keylocker-aesenc256kl.c

diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdec128kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdec128kl.c
index 3cdda8ed7b0..9c3c8a88b0e 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdec128kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdec128kl.c
@@ -1,9 +1,9 @@
 /* { dg-do compile } */
 /* { dg-options "-mkl -O2" } */
-/* { dg-final { scan-assembler "movdqa\[ \\t\]+\[^\n\]*k2\[^\n\r]*%xmm0" } } */
-/* { dg-final { scan-assembler "aesdec128kl\[ \\t\]+\[^\n\]*h1\[^\n\r]*%xmm0" } } */
-/* { dg-final { scan-assembler "sete" } } */
-/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\n\]*%xmm0\[^\n\r]*k1" } } */
+/* { dg-final { scan-assembler {movdqa[ \t]+[^\n\r]*, %xmm0} } } */
+/* { dg-final { scan-assembler {aesdec128kl[ \t]+[^\n\r]*, %xmm0} } } */
+/* { dg-final { scan-assembler {sete} } } */
+/* { dg-final { scan-assembler {(?:movdqu|movups)[ \t]+[^\n\r]*%xmm0,[^\n\r]*} } } */
 
 #include 
 
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdec256kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdec256kl.c
index 70b2c6357fa..6012b69e9bf 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdec256kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdec256kl.c
@@ -1,9 +1,9 @@
 /* { dg-do compile } */
 /* { dg-options "-mkl -O2" } */
-/* { dg-final { scan-assembler "movdqa\[ \\t\]+\[^\n\]*k2\[^\n\r]*%xmm0" } } */
-/* { dg-final { scan-assembler "aesdec256kl\[ \\t\]+\[^\n\]*h1\[^\n\r]*%xmm0" } } */
-/* { dg-final { scan-assembler "sete" } } */
-/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\n\]*%xmm0\[^\n\r]*k1" } } */
+/* { dg-final { scan-assembler {movdqa[ \t]+[^\n\r]*, %xmm0} } } */
+/* { dg-final { scan-assembler {aesdec256kl[ \t]+[^\n\r]*, %xmm0} } } */
+/* { dg-final { scan-assembler {sete} } } */
+/* { dg-final { scan-assembler {(?:movdqu|movups)[ \t]+[^\n\r]*%xmm0,[^\n\r]*} } } */
 
 #include 
 
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
index f2806891bff..61c294ee052 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
@@ -1,23 +1,23 @@
 /* { dg-do compile } */
 /* { dg-options "-mwidekl -O2" } */
-/* { dg-final { scan-assembler "movdqu\[ \\t\]+\[^\n\]*idata(\\(%rip\\))?\[^\n\r]*%xmm0" } } */
-/* { dg-final { scan-assembler "movdqu\[ \\t\]+\[^\n\]

Re: [PATCH] x86: Adjust keylocker testcases for fail on darwin

2020-11-09 Thread Hongyu Wang via Gcc-patches
>
> Please rewrite scan strings back to using double-quotation marks.
>

Yes, updated patch.

Uros Bizjak  于2020年11月9日周一 下午7:41写道:

>
> On Mon, Nov 9, 2020 at 11:50 AM Hongyu Wang  wrote:
> >
> > Hi
> >
> > According to the discussion in
> > https://gcc.gnu.org/pipermail/gcc/2020-November/234096.html,
> > The testcase for keylocker-* is too strict for darwin target. This
> > patch adjusted the regex, and add a missing test for aesenc256kl
> > instruction.
> >
> > Tested by Iain Sandone and all get pass in darwin target.
> >
> > Ok for trunk?
> >
> > gcc/testsuite/ChangeLog
> >
> > * gcc.target/i386/keylocker-aesdec128kl.c: Adjust regex patterns.
> > * gcc.target/i386/keylocker-aesdec256kl.c: Likewise.
> > * gcc.target/i386/keylocker-aesdecwide128kl.c: Likewise.
> > * gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
> > * gcc.target/i386/keylocker-aesenc128kl.c: Likewise.
> > * gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
> > * gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
> > * gcc.target/i386/keylocker-encodekey128.c: Likewise.
> > * gcc.target/i386/keylocker-encodekey256.c: Likewise.
> > * gcc.target/i386/keylocker-aesenc256kl.c: New test.
>
> Please rewrite scan strings back to using double-quotation marks.
>
> Uros.
>
> >
> > --
> > Regards,
> >
> > Hongyu, Wang
From 826a48e5d08b2ad6865ef92c0965f095cad3d654 Mon Sep 17 00:00:00 2001
From: hongyuw1 
Date: Fri, 6 Nov 2020 15:08:10 +0800
Subject: [PATCH] Adjust Keylocker regex pattern for darwin, and add missing
 aesenc256kl test.

gcc/testsuite/ChangeLog

	* gcc.target/i386/keylocker-aesdec128kl.c: Adjust regex patterns.
	* gcc.target/i386/keylocker-aesdec256kl.c: Likewise.
	* gcc.target/i386/keylocker-aesdecwide128kl.c: Likewise.
	* gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
	* gcc.target/i386/keylocker-aesenc128kl.c: Likewise.
	* gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
	* gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
	* gcc.target/i386/keylocker-encodekey128.c: Likewise.
	* gcc.target/i386/keylocker-encodekey256.c: Likewise.
	* gcc.target/i386/keylocker-aesenc256kl.c: New test.
---
 .../gcc.target/i386/keylocker-aesdec128kl.c   |  6 ++--
 .../gcc.target/i386/keylocker-aesdec256kl.c   |  6 ++--
 .../i386/keylocker-aesdecwide128kl.c  | 34 +--
 .../i386/keylocker-aesdecwide256kl.c  | 34 +--
 .../gcc.target/i386/keylocker-aesenc128kl.c   |  6 ++--
 .../gcc.target/i386/keylocker-aesenc256kl.c   | 17 ++
 .../i386/keylocker-aesencwide128kl.c  | 34 +--
 .../i386/keylocker-aesencwide256kl.c  | 34 +--
 .../gcc.target/i386/keylocker-encodekey128.c  | 14 
 .../gcc.target/i386/keylocker-encodekey256.c  | 18 +-
 10 files changed, 110 insertions(+), 93 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/keylocker-aesenc256kl.c

diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdec128kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdec128kl.c
index 3cdda8ed7b0..d134612beea 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdec128kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdec128kl.c
@@ -1,9 +1,9 @@
 /* { dg-do compile } */
 /* { dg-options "-mkl -O2" } */
-/* { dg-final { scan-assembler "movdqa\[ \\t\]+\[^\n\]*k2\[^\n\r]*%xmm0" } } */
-/* { dg-final { scan-assembler "aesdec128kl\[ \\t\]+\[^\n\]*h1\[^\n\r]*%xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \\t\]+\[^\\n\\r\]*, %xmm0" } } */
+/* { dg-final { scan-assembler "aesdec128kl\[ \\t\]+\[^\\n\\r\]*, %xmm0" } } */
 /* { dg-final { scan-assembler "sete" } } */
-/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\n\]*%xmm0\[^\n\r]*k1" } } */
+/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm0,\[^\\n\\r\]*" } } */
 
 #include 
 
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdec256kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdec256kl.c
index 70b2c6357fa..34736d2d61a 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdec256kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdec256kl.c
@@ -1,9 +1,9 @@
 /* { dg-do compile } */
 /* { dg-options "-mkl -O2" } */
-/* { dg-final { scan-assembler "movdqa\[ \\t\]+\[^\n\]*k2\[^\n\r]*%xmm0" } } */
-/* { dg-final { scan-assembler "aesdec256kl\[ \\t\]+\[^\n\]*h1\[^\n\r]*%xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \\t\]+\[^\\n\\r\]*, %xmm0" } } */
+/* { dg-final { scan-assembler "aesdec256kl\[ \\t\]+\[^\\n\\r\]*, %xmm0" } } */
 /* { dg-final { scan-assembler "sete" } } */
-/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\n\]*%xmm0\[^\n\r]*k1" } } */
+/* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm0,\[^\\n\\r\]*" } } */
 
 #include 
 
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
index f2806891bff..d23cf4b6517 100644
--- a/gcc/testsuite/gcc

[PATCH][PR target/97770] x86: Add missing popcount<mode>2 expander

2020-11-11 Thread Hongyu Wang via Gcc-patches
Hi,

According to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97770, the x86
backend needs a popcount<mode>2 expander so that __builtin_popcount can be
auto-vectorized for AVX512BITALG/AVX512VPOPCNTDQ targets.

For DImode the middle-end vectorizer cannot generate the expected code,
and for QI/HImode there is no corresponding IFN, so xfails are added for
these tests.

Bootstrap/regression test for x86 backend is OK.

OK for master?

gcc/ChangeLog

PR target/97770
* gcc/config/i386/sse.md (popcount<mode>2): New expander
for SI/DI vector modes.
(popcount<mode>2): Likewise for QI/HI vector modes.

gcc/testsuite/ChangeLog

PR target/97770
* gcc.target/i386/avx512bitalg-pr97770-1.c: New test.
* gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Likewise.
* gcc.target/i386/avx512vpopcntdq-pr97770-2.c: Likewise.
* gcc.target/i386/avx512vpopcntdqvl-pr97770-1.c: Likewise.

-- 
Regards,

Hongyu, Wang
From b809052b0bab5d80dd0a1b1ffbd55faa8179a416 Mon Sep 17 00:00:00 2001
From: Hongyu Wang 
Date: Wed, 11 Nov 2020 09:41:13 +0800
Subject: [PATCH] Add popcount expander to enable popcount auto
 vectorization under AVX512BITALG/AVX512VPOPCNTDQ target.

gcc/ChangeLog

	PR target/97770
	* gcc/config/i386/sse.md (popcount<mode>2): New expander
	for SI/DI vector modes.
	(popcount<mode>2): Likewise for QI/HI vector modes.

gcc/testsuite/ChangeLog

	PR target/97770
	* gcc.target/i386/avx512bitalg-pr97770-1.c: New test.
	* gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Likewise.
	* gcc.target/i386/avx512vpopcntdq-pr97770-2.c: Likewise.
	* gcc.target/i386/avx512vpopcntdqvl-pr97770-1.c: Likewise.
---
 gcc/config/i386/sse.md| 12 
 .../gcc.target/i386/avx512bitalg-pr97770-1.c  | 60 ++
 .../i386/avx512vpopcntdq-pr97770-1.c  | 63 +++
 .../i386/avx512vpopcntdq-pr97770-2.c  | 39 
 .../i386/avx512vpopcntdqvl-pr97770-1.c| 14 +
 5 files changed, 188 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vpopcntdq-pr97770-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vpopcntdqvl-pr97770-1.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8437ad27087..8566b2ccda2 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -22678,6 +22678,12 @@ (define_insn "avx5124vnniw_vp4dpwssds_maskz"
 (set_attr ("prefix") ("evex"))
 (set_attr ("mode") ("TI"))])
 
+(define_expand "popcount2"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand")
+	(popcount:VI48_AVX512VL
+	  (match_operand:VI48_AVX512VL 1 "nonimmediate_operand")))]
+  "TARGET_AVX512VPOPCNTDQ")
+
 (define_insn "vpopcount"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
 	(popcount:VI48_AVX512VL
@@ -22722,6 +22728,12 @@ (define_insn "*restore_multiple_leave_return"
   "TARGET_SSE && TARGET_64BIT"
   "jmp\t%P1")
 
+(define_expand "popcount2"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+	(popcount:VI12_AVX512VL
+	  (match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512BITALG")
+
 (define_insn "vpopcount"
   [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
 	(popcount:VI12_AVX512VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
new file mode 100644
index 000..c83a477045c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-pr97770-1.c
@@ -0,0 +1,60 @@
+/* PR target/97770 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bitalg -mavx512vl -mprefer-vector-width=512" } */
+/* Add xfail since no IFN for QI/HImode popcount */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*xmm" 1 {xfail *-*-*} } } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*ymm" 1 {xfail *-*-*} } } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \\t\]+\[^\\n\\r\]*zmm" 1 {xfail *-*-*} } } */
+
+#include 
+
+void
+__attribute__ ((noipa, optimize("-O3")))
+popcountb_128 (char * __restrict dest, char* src)
+{
+  for (int i = 0; i != 16; i++)
+dest[i] = __builtin_popcount (src[i]);
+}
+
+void
+__attribute__ ((noipa, optimize("-O3")))
+popcountw_128 (short* __restrict dest, short* src)
+{
+  for (int i = 0; i != 8; i++)
+dest[i] = __builtin_popcount (src[i]);
+}
+
+void
+__attribute__ ((noipa, optimize("-O3")))
+popcountb_256 (char * __restrict dest, char* src)
+{
+  for (int i = 0; i != 32; i++)
+dest[i] = __builtin_popcount (src[i]);
+}
+
+void
+__attribute__ ((noipa, optimize("-O3")))
+popcountw_256 (short* __re

Re: [PATCH] Remove redundant builtins for avx512f scalar instructions.

2020-11-12 Thread Hongyu Wang via Gcc-patches
Hi

Thanks for reminding me about this patch. I didn't remove any existing
intrinsics; I only removed redundant builtin functions that end users
are unlikely to use.

I'm also OK with keeping the current implementation, in case someone is
using the builtins directly.

Jeff Law  于2020年11月13日周五 下午1:43写道:
>
>
> On 12/23/19 10:31 PM, Hongyu Wang wrote:
>
> Hi:
>   For avx512f scalar instructions, current builtin function like
> __builtin_ia32_*{sd,ss}_round can be replaced by
> __builtin_ia32_*{sd,ss}_mask_round with mask parameter set to -1. This
> patch did the replacement and remove the corresponding redundant
> builtins.
>
>   Bootstrap is ok, make-check ok for i386 target.
>   Ok for trunk?
>
> Changelog
>
> gcc/
> * config/i386/avx512fintrin.h
> (_mm_add_round_sd, _mm_add_round_ss): Use
>  __builtin_ia32_adds?_mask_round builtins instead of
> __builtin_ia32_adds?_round.
> (_mm_sub_round_sd, _mm_sub_round_ss,
> _mm_mul_round_sd, _mm_mul_round_ss,
> _mm_div_round_sd, _mm_div_round_ss,
> _mm_getexp_sd, _mm_getexp_ss,
> _mm_getexp_round_sd, _mm_getexp_round_ss,
> _mm_getmant_sd, _mm_getmant_ss,
> _mm_getmant_round_sd, _mm_getmant_round_ss,
> _mm_max_round_sd, _mm_max_round_ss,
> _mm_min_round_sd, _mm_min_round_ss,
> _mm_fmadd_round_sd, _mm_fmadd_round_ss,
> _mm_fmsub_round_sd, _mm_fmsub_round_ss,
> _mm_fnmadd_round_sd, _mm_fnmadd_round_ss,
> _mm_fnmsub_round_sd, _mm_fnmsub_round_ss): Likewise.
> * config/i386/i386-builtin.def
> (__builtin_ia32_addsd_round, __builtin_ia32_addss_round,
> __builtin_ia32_subsd_round, __builtin_ia32_subss_round,
> __builtin_ia32_mulsd_round, __builtin_ia32_mulss_round,
> __builtin_ia32_divsd_round, __builtin_ia32_divss_round,
> __builtin_ia32_getexpsd128_round, __builtin_ia32_getexpss128_round,
> __builtin_ia32_getmantsd_round, __builtin_ia32_getmantss_round,
> __builtin_ia32_maxsd_round, __builtin_ia32_maxss_round,
> __builtin_ia32_minsd_round, __builtin_ia32_minss_round,
> __builtin_ia32_vfmaddsd3_round,
> __builtin_ia32_vfmaddss3_round): Remove.
> * config/i386/i386-expand.c
> (ix86_expand_round_builtin): Remove corresponding case.
>
> gcc/testsuite/
> * lib/target-supports.exp
> (check_effective_target_avx512f): Use
> __builtin_ia32_getmantsd_mask_round builtins instead of
> __builtin_ia32_getmantsd_round.
> *gcc.target/i386/avx-1.c
> (__builtin_ia32_addsd_round, __builtin_ia32_addss_round,
> __builtin_ia32_subsd_round, __builtin_ia32_subss_round,
> __builtin_ia32_mulsd_round, __builtin_ia32_mulss_round,
> __builtin_ia32_divsd_round, __builtin_ia32_divss_round,
> __builtin_ia32_getexpsd128_round, __builtin_ia32_getexpss128_round,
> __builtin_ia32_getmantsd_round, __builtin_ia32_getmantss_round,
> __builtin_ia32_maxsd_round, __builtin_ia32_maxss_round,
> __builtin_ia32_minsd_round, __builtin_ia32_minss_round,
> __builtin_ia32_vfmaddsd3_round,
> __builtin_ia32_vfmaddss3_round): Remove.
> *gcc.target/i386/sse-13.c: Ditto.
> *gcc.target/i386/sse-23.c: Ditto.
>
> So I like the idea of simplifying the implementation of some of the 
> intrinsics when we can, but ISTM that removing existing intrinsics would be a 
> mistake since end-users could be using them in their code.   I'd think we'd 
> want to keep the existing APIs, even if we change the implementation under 
> the hood.
>
>
> Thoughts?
>
>
> jeff
>
>
> Hongyu Wang
>
>
> 0001-Remove-redundant-round-builtins-for-avx512f-scalar-i.patch
>
> From 9cc4928aad5770c53ff580f5c996092cdaf2f9ba Mon Sep 17 00:00:00 2001
> From: hongyuw1 
> Date: Wed, 18 Dec 2019 14:52:54 +
> Subject: [PATCH] Remove redundant round builtins for avx512f scalar
>  instructions
>
> Changelog
>
> gcc/
> * config/i386/avx512fintrin.h
> (_mm_add_round_sd, _mm_add_round_ss): Use
> __builtin_ia32_adds?_mask_round builtins instead of
> __builtin_ia32_adds?_round.
> (_mm_sub_round_sd, _mm_sub_round_ss,
> _mm_mul_round_sd, _mm_mul_round_ss,
> _mm_div_round_sd, _mm_div_round_ss,
> _mm_getexp_sd, _mm_getexp_ss,
> _mm_getexp_round_sd, _mm_getexp_round_ss,
> _mm_getmant_sd, _mm_getmant_ss,
> _mm_getmant_round_sd, _mm_getmant_round_ss,
> _mm_max_round_sd, _mm_max_round_ss,
> _mm_min_round_sd, _mm_min_round_ss,
> _mm_fmadd_round_sd, _mm_fmadd_round_ss,
> _mm_fmsub_round_sd, _mm_fmsub_round_ss,
> _mm_fnmadd_round_sd, _mm_fnmadd_round_ss,
> _mm_fnmsub_round_sd, _mm_fnmsub_round_ss): Likewise.
> * config/i386/i386-builtin.def
> (__builtin_ia32_addsd_round, __builtin_ia32_addss_round,
> __builtin_ia32_subsd_round, __builtin_ia32_subss_round,
> __builtin_ia32_mulsd_round, __builtin_ia32_mulss_round,
> __builtin_ia32_divsd_round, __builtin_ia32_divss_round,
> __builtin_ia32_getexpsd128_round, __builtin_

[PATCH] AVX512FP16: Support cond_op for HFmode

2021-09-23 Thread Hongyu Wang via Gcc-patches
Hi,

This patch extends the expanders for cond_op to support vector HF modes.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for master?

gcc/ChangeLog:

* config/i386/sse.md (cond_<insn><mode>): Extend to support
vector HFmodes.
(cond_mul<mode>): Likewise.
(cond_div<mode>): Likewise.
(cond_<code><mode>): Likewise.
(cond_fma<mode>): Likewise.
(cond_fms<mode>): Likewise.
(cond_fnma<mode>): Likewise.
(cond_fnms<mode>): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_addsubmuldiv__Float16-1.c: New test.
* gcc.target/i386/cond_op_addsubmuldiv__Float16-2.c: Ditto.
* gcc.target/i386/cond_op_fma__Float16-1.c: Ditto.
* gcc.target/i386/cond_op_fma__Float16-2.c: Ditto.
* gcc.target/i386/cond_op_maxmin__Float16-1.c: Ditto.
* gcc.target/i386/cond_op_maxmin__Float16-2.c: Ditto.
---
 gcc/config/i386/sse.md| 112 +-
 .../i386/cond_op_addsubmuldiv__Float16-1.c|   9 ++
 .../i386/cond_op_addsubmuldiv__Float16-2.c|   7 ++
 .../gcc.target/i386/cond_op_fma__Float16-1.c  |  20 
 .../gcc.target/i386/cond_op_fma__Float16-2.c  |   7 ++
 .../i386/cond_op_maxmin__Float16-1.c  |   8 ++
 .../i386/cond_op_maxmin__Float16-2.c  |   6 +
 7 files changed, 113 insertions(+), 56 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv__Float16-1.c
 create mode 100644 
gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv__Float16-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma__Float16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma__Float16-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin__Float16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin__Float16-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1ca95984afc..c2eeb7b1517 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2118,12 +2118,12 @@
   [(set_attr "isa" "noavx,noavx,avx,avx")])
 
 (define_expand "cond_"
-  [(set (match_operand:VF 0 "register_operand")
-   (vec_merge:VF
- (plusminus:VF
-   (match_operand:VF 2 "vector_operand")
-   (match_operand:VF 3 "vector_operand"))
- (match_operand:VF 4 "nonimm_or_0_operand")
+  [(set (match_operand:VFH 0 "register_operand")
+   (vec_merge:VFH
+ (plusminus:VFH
+   (match_operand:VFH 2 "vector_operand")
+   (match_operand:VFH 3 "vector_operand"))
+ (match_operand:VFH 4 "nonimm_or_0_operand")
  (match_operand: 1 "register_operand")))]
   " == 64 || TARGET_AVX512VL"
 {
@@ -2207,12 +2207,12 @@
(set_attr "mode" "")])
 
 (define_expand "cond_mul"
-  [(set (match_operand:VF 0 "register_operand")
-   (vec_merge:VF
- (mult:VF
-   (match_operand:VF 2 "vector_operand")
-   (match_operand:VF 3 "vector_operand"))
- (match_operand:VF 4 "nonimm_or_0_operand")
+  [(set (match_operand:VFH 0 "register_operand")
+   (vec_merge:VFH
+ (mult:VFH
+   (match_operand:VFH 2 "vector_operand")
+   (match_operand:VFH 3 "vector_operand"))
+ (match_operand:VFH 4 "nonimm_or_0_operand")
  (match_operand: 1 "register_operand")))]
   " == 64 || TARGET_AVX512VL"
 {
@@ -2322,12 +2322,12 @@
 })
 
 (define_expand "cond_div"
-  [(set (match_operand:VF 0 "register_operand")
-   (vec_merge:VF
- (div:VF
-   (match_operand:VF 2 "register_operand")
-   (match_operand:VF 3 "vector_operand"))
- (match_operand:VF 4 "nonimm_or_0_operand")
+  [(set (match_operand:VFH 0 "register_operand")
+   (vec_merge:VFH
+ (div:VFH
+   (match_operand:VFH 2 "register_operand")
+   (match_operand:VFH 3 "vector_operand"))
+ (match_operand:VFH 4 "nonimm_or_0_operand")
  (match_operand: 1 "register_operand")))]
   " == 64 || TARGET_AVX512VL"
 {
@@ -2660,12 +2660,12 @@
(set_attr "mode" "HF")])
 
 (define_expand "cond_"
-  [(set (match_operand:VF 0 "register_operand")
-   (vec_merge:VF
- (smaxmin:VF
-   (match_operand:VF 2 "vector_operand")
-   (match_operand:VF 3 "vector_operand"))
- (match_operand:VF 4 "nonimm_or_0_operand")
+  [(set (match_operand:VFH 0 "register_operand")
+   (vec_merge:VFH
+ (smaxmin:VFH
+   (match_operand:VFH 2 "vector_operand")
+   (match_operand:VFH 3 "vector_operand"))
+ (match_operand:VFH 4 "nonimm_or_0_operand")
  (match_operand: 1 "register_operand")))]
   " == 64 || TARGET_AVX512VL"
 {
@@ -4785,13 +4785,13 @@
(set_attr "mode" "")])
 
 (define_expand "cond_fma"
-  [(set (match_operand:VF_AVX512VL 0 "register_operand")
-   (vec_merge:VF_AVX512VL
- (fma:VF_AVX512VL
-   (match_operand:VF_AVX512VL 2 "vector_operand")
-   (match_operand:VF_AVX512VL 3 "vector_operand")
-   (match_operand:VF_AVX512VL 4 "vector_operand"))
- (match_

Re: [PATCH] AVX512FP16: Support cond_op for HFmode

2021-09-23 Thread Hongyu Wang via Gcc-patches
> >This patch extend the expanders for cond_op to support vector HF modes.
> >bootstraped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Do runtime tests passe on sde{-m32,}?

Yes, forgot to mention this.

Liu, Hongtao via Gcc-patches  于2021年9月23日周四 下午5:31写道:

>
>
>
> >-Original Message-
> >From: Wang, Hongyu 
> >Sent: Thursday, September 23, 2021 5:16 PM
> >To: Liu, Hongtao 
> >Cc: gcc-patches@gcc.gnu.org
> >Subject: [PATCH] AVX512FP16: Support cond_op for HFmode
> >
> >Hi,
> >
> >This patch extend the expanders for cond_op to support vector HF modes.
> >bootstraped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Do runtime tests passe on sde{-m32,}?
> >Ok for master?
> >
> >gcc/ChangeLog:
> >
> >   * config/i386/sse.md (cond_): Extend to support
> >   vector HFmodes.
> >   (cond_mul): Likewise.
> >   (cond_div): Likewise.
> >   (cond_): Likewise.
> >   (cond_fma): Likewise.
> >   (cond_fms): Likewise.
> >   (cond_fnma): Likewise.
> >   (cond_fnms): Likewise.
> >
> >gcc/testsuite/ChangeLog:
> >
> >   * gcc.target/i386/cond_op_addsubmuldiv__Float16-1.c: New test.
> >   * gcc.target/i386/cond_op_addsubmuldiv__Float16-2.c: Ditto.
> >   * gcc.target/i386/cond_op_fma__Float16-1.c: Ditto.
> >   * gcc.target/i386/cond_op_fma__Float16-2.c: Ditto.
> >   * gcc.target/i386/cond_op_maxmin__Float16-1.c: Ditto.
> >   * gcc.target/i386/cond_op_maxmin__Float16-2.c: Ditto.
> >---
> > gcc/config/i386/sse.md| 112 +-
> > .../i386/cond_op_addsubmuldiv__Float16-1.c|   9 ++
> > .../i386/cond_op_addsubmuldiv__Float16-2.c|   7 ++
> > .../gcc.target/i386/cond_op_fma__Float16-1.c  |  20 
> > .../gcc.target/i386/cond_op_fma__Float16-2.c  |   7 ++
> > .../i386/cond_op_maxmin__Float16-1.c  |   8 ++
> > .../i386/cond_op_maxmin__Float16-2.c  |   6 +
> > 7 files changed, 113 insertions(+), 56 deletions(-)  create mode 100644
> >gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv__Float16-1.c
> > create mode 100644
> >gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv__Float16-2.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma__Float16-1.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma__Float16-2.c
> > create mode 100644
> >gcc/testsuite/gcc.target/i386/cond_op_maxmin__Float16-1.c
> > create mode 100644
> >gcc/testsuite/gcc.target/i386/cond_op_maxmin__Float16-2.c
> >
> >diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index
> >1ca95984afc..c2eeb7b1517 100644
> >--- a/gcc/config/i386/sse.md
> >+++ b/gcc/config/i386/sse.md
> >@@ -2118,12 +2118,12 @@
> >   [(set_attr "isa" "noavx,noavx,avx,avx")])
> >
> > (define_expand "cond_"
> >-  [(set (match_operand:VF 0 "register_operand")
> >-  (vec_merge:VF
> >-(plusminus:VF
> >-  (match_operand:VF 2 "vector_operand")
> >-  (match_operand:VF 3 "vector_operand"))
> >-(match_operand:VF 4 "nonimm_or_0_operand")
> >+  [(set (match_operand:VFH 0 "register_operand")
> >+  (vec_merge:VFH
> >+(plusminus:VFH
> >+  (match_operand:VFH 2 "vector_operand")
> >+  (match_operand:VFH 3 "vector_operand"))
> >+(match_operand:VFH 4 "nonimm_or_0_operand")
> > (match_operand: 1 "register_operand")))]
> >   " == 64 || TARGET_AVX512VL"
> > {
> >@@ -2207,12 +2207,12 @@
> >(set_attr "mode" "")])
> >
> > (define_expand "cond_mul"
> >-  [(set (match_operand:VF 0 "register_operand")
> >-  (vec_merge:VF
> >-(mult:VF
> >-  (match_operand:VF 2 "vector_operand")
> >-  (match_operand:VF 3 "vector_operand"))
> >-(match_operand:VF 4 "nonimm_or_0_operand")
> >+  [(set (match_operand:VFH 0 "register_operand")
> >+  (vec_merge:VFH
> >+(mult:VFH
> >+  (match_operand:VFH 2 "vector_operand")
> >+  (match_operand:VFH 3 "vector_operand"))
> >+(match_operand:VFH 4 "nonimm_or_0_operand")
> > (match_operand: 1 "register_operand")))]
> >   " == 64 || TARGET_AVX512VL"
> > {
> >@@ -2322,12 +2322,12 @@
> > })
> >
> > (define_expand "cond_div"
> >-  [(set (match_operand:VF 0 "register_operand")
> >-  (vec_merge:VF
> >-(div:VF
> >-  (match_operand:VF 2 "register_operand")
> >-  (match_operand:VF 3 "vector_operand"))
> >-(match_operand:VF 4 "nonimm_or_0_operand")
> >+  [(set (match_operand:VFH 0 "register_operand")
> >+  (vec_merge:VFH
> >+(div:VFH
> >+  (match_operand:VFH 2 "register_operand")
> >+  (match_operand:VFH 3 "vector_operand"))
> >+(match_operand:VFH 4 "nonimm_or_0_operand")
> > (match_operand: 1 "register_operand")))]
> >   " == 64 || TARGET_AVX512VL"
> > {
> >@@ -2660,12 +2660,12 @@
> >(set_attr "mode" "HF")])
> >
> > (define_expand "cond_"
> >-  [(set (match_operand:VF 0 "register_operand")
> >-  (vec_merge:VF
> >-(smaxmin:VF
> >-  (match_operand:VF 2 "vector_operand")
> >-  (match_operan

[PATCH] AVX512FP16:support basic 64/32bit vector type and operation.

2021-09-27 Thread Hongyu Wang via Gcc-patches
Hi Uros,

This patch intends to support V4HF/V2HF vector type and basic operations.

For 32bit target, V4HF vector is parsed same as __m64 type, V2HF
is parsed by stack and returned from GPR since it is not specified
by ABI.

We found that for 64bit vectors on ia32, when mmx is disabled there seems to
be no mov_internal, so we add a define_insn for v4hf mode. It would be very
much appreciated if you could explain why the handling of 64bit vectors looks
as it does and give some advice.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.

OK for master?

gcc/ChangeLog:

PR target/102230
* config/i386/i386.h (VALID_AVX512FP16_REG_MODE): Add
V4HF and V2HF mode check.
(VALID_SSE2_REG_VHF_MODE): Likewise.
(VALID_MMX_REG_MODE): Likewise.
(SSE_REG_MODE_P): Replace VALID_AVX512FP16_REG_MODE with
vector mode condition.
* config/i386/i386.c (classify_argument): Parse V4HF/V2HF
via sse regs.
(function_arg_32): Add V4HFmode.
(function_arg_advance_32): Likewise.
* config/i386/i386.md (mode): Add V4HF/V2HF.
(MODE_SIZE): Likewise.
* config/i386/mmx.md (MMXMODE): Add V4HF mode.
(V_32): Add V2HF mode.
(*mov_internal): Adjust sse alternatives to support
V4HF mode vector move.
(*mov_internal): Adjust sse alternatives
to support V2HF mode move.
* config/i386/sse.md (VHF_32_64): New mode iterator.
(3): New define_insn for add/sub/mul/div.
(*movv4hf_internal_sse): New define_insn for -mno-mmx and -msse.

gcc/testsuite/ChangeLog:

PR target/102230
* gcc.target/i386/avx512fp16-floatvnhf.c: Remove xfail.
* gcc.target/i386/avx512fp16-trunc-extendvnhf.c: Ditto.
* gcc.target/i386/avx512fp16-truncvnhf.c: Ditto.
* gcc.target/i386/avx512fp16-64-32-vecop-1.c: New test.
* gcc.target/i386/avx512fp16-64-32-vecop-2.c: Ditto.
* gcc.target/i386/pr102230.c: Ditto.
---
 gcc/config/i386/i386.c|  4 +
 gcc/config/i386/i386.h| 12 ++-
 gcc/config/i386/i386.md   |  5 +-
 gcc/config/i386/mmx.md| 27 ---
 gcc/config/i386/sse.md| 49 
 .../i386/avx512fp16-64-32-vecop-1.c   | 30 
 .../i386/avx512fp16-64-32-vecop-2.c   | 75 +++
 .../gcc.target/i386/avx512fp16-floatvnhf.c| 12 +--
 .../i386/avx512fp16-trunc-extendvnhf.c| 12 +--
 .../gcc.target/i386/avx512fp16-truncvnhf.c| 12 +--
 gcc/testsuite/gcc.target/i386/pr102230.c  | 38 ++
 11 files changed, 243 insertions(+), 33 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102230.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ba89e111d28..b3e4add4b9e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type,
 case E_V2SFmode:
 case E_V2SImode:
 case E_V4HImode:
+case E_V4HFmode:
+case E_V2HFmode:
 case E_V8QImode:
   classes[0] = X86_64_SSE_CLASS;
   return 1;
@@ -2902,6 +2904,7 @@ pass_in_reg:
 
 case E_V8QImode:
 case E_V4HImode:
+case E_V4HFmode:
 case E_V2SImode:
 case E_V2SFmode:
 case E_V1TImode:
@@ -3149,6 +3152,7 @@ pass_in_reg:
 
 case E_V8QImode:
 case E_V4HImode:
+case E_V4HFmode:
 case E_V2SImode:
 case E_V2SFmode:
 case E_V1TImode:
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8a4251b4926..9f3cad31f96 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const 
char **argv);
|| (MODE) == TImode)
 
 #define VALID_AVX512FP16_REG_MODE(MODE)
\
-  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode)
+  ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode\
+   || (MODE) == V4HFmode || (MODE) == V2HFmode)
 
 #define VALID_SSE2_REG_MODE(MODE)  \
   ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \
@@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const 
char **argv);
|| (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode)
 
 #define VALID_SSE2_REG_VHF_MODE(MODE)  \
-  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode)
+  (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode\
+   || (MODE) == V4HFmode || (MODE) == V2HFmode)
 
 #define VALID_SSE_REG_MODE(MODE)   \
   ((MODE) == V1TImode || (MODE) == TImode  \
@@ -1054,7 +1056,8 @@ extern const char *host_detect_local_cpu (int argc, const 
char **argv);
 #define VALID

Re: [PATCH] AVX512FP16:support basic 64/32bit vector type and operation.

2021-09-27 Thread Hongyu Wang via Gcc-patches
> ia32 ABI declares that __m64 values pass via MMX registers. Due to
> this, we are not able to fully disable MMX register usage, as is the
> case with x86_64. So, V4HFmode values will pass to functions via MMX
> registers on ia32 targets.
>
> So, there should be no additional define_insn, the addition to the
> existing MMXMODE mode iterator should be enough. V4HFmodes should be
> handled in the same way as e.g. V8QImode.
>
> This is not the case with 4-byte values, which should be passed using
> integer ABI.

Thanks for the explanation. I updated the patch by removing the extra
define_insn and dropping V4HFmode from VALID_AVX512FP16_REG_MODE. Now v4hf
behaves the same as v8qi.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.

OK for master with the updated one?

Uros Bizjak via Gcc-patches  于2021年9月27日周一 下午7:35写道:
>
> On Mon, Sep 27, 2021 at 12:42 PM Hongyu Wang  wrote:
> >
> > Hi Uros,
> >
> > This patch intends to support V4HF/V2HF vector type and basic operations.
> >
> > For 32bit target, V4HF vector is parsed same as __m64 type, V2HF
> > is parsed by stack and returned from GPR since it is not specified
> > by ABI.
> >
> > We found for 64bit vector in ia32, when mmx disabled there seems no
> > mov_internal, so we add a define_insn for v4hf mode. It would be very
> > ppreciated if you know why the handling of 64bit vector looks as is and
> > give some advice.
>
> ia32 ABI declares that __m64 values pass via MMX registers. Due to
> this, we are not able to fully disable MMX register usage, as is the
> case with x86_64. So, V4HFmode values will pass to functions via MMX
> registers on ia32 targets.
>
> So, there should be no additional define_insn, the addition to the
> existing MMXMODE mode iterator should be enough. V4HFmodes should be
> handled in the same way as e.g. V8QImode.
>
> This is not the case with 4-byte values, which should be passed using
> integer ABI.
>
> Uros.
>
> >
> > Bootstraped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > OK for master?
> >
> > gcc/ChangeLog:
> >
> > PR target/102230
> > * config/i386/i386.h (VALID_AVX512FP16_REG_MODE): Add
> > V4HF and V2HF mode check.
> > (VALID_SSE2_REG_VHF_MODE): Likewise.
> > (VALID_MMX_REG_MODE): Likewise.
> > (SSE_REG_MODE_P): Replace VALID_AVX512FP16_REG_MODE with
> > vector mode condition.
> > * config/i386/i386.c (classify_argument): Parse V4HF/V2HF
> > via sse regs.
> > (function_arg_32): Add V4HFmode.
> > (function_arg_advance_32): Likewise.
> > * config/i386/i386.md (mode): Add V4HF/V2HF.
> > (MODE_SIZE): Likewise.
> > * config/i386/mmx.md (MMXMODE): Add V4HF mode.
> > (V_32): Add V2HF mode.
> > (*mov_internal): Adjust sse alternatives to support
> > V4HF mode vector move.
> > (*mov_internal): Adjust sse alternatives
> > to support V2HF mode move.
> > * config/i386/sse.md (VHF_32_64): New mode iterator.
> > (3): New define_insn for add/sub/mul/div.
> > (*movv4hf_internal_sse): New define_insn for -mno-mmx and -msse.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/102230
> > * gcc.target/i386/avx512fp16-floatvnhf.c: Remove xfail.
> > * gcc.target/i386/avx512fp16-trunc-extendvnhf.c: Ditto.
> > * gcc.target/i386/avx512fp16-truncvnhf.c: Ditto.
> > * gcc.target/i386/avx512fp16-64-32-vecop-1.c: New test.
> > * gcc.target/i386/avx512fp16-64-32-vecop-2.c: Ditto.
> > * gcc.target/i386/pr102230.c: Ditto.
> > ---
> >  gcc/config/i386/i386.c|  4 +
> >  gcc/config/i386/i386.h| 12 ++-
> >  gcc/config/i386/i386.md   |  5 +-
> >  gcc/config/i386/mmx.md| 27 ---
> >  gcc/config/i386/sse.md| 49 
> >  .../i386/avx512fp16-64-32-vecop-1.c   | 30 
> >  .../i386/avx512fp16-64-32-vecop-2.c   | 75 +++
> >  .../gcc.target/i386/avx512fp16-floatvnhf.c| 12 +--
> >  .../i386/avx512fp16-trunc-extendvnhf.c| 12 +--
> >  .../gcc.target/i386/avx512fp16-truncvnhf.c| 12 +--
> >  gcc/testsuite/gcc.target/i386/pr102230.c  | 38 ++
> >  11 files changed, 243 insertions(+), 33 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-64-32-vecop-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102230.c
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index ba89e111d28..b3e4add4b9e 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type,
> >  case E_V2SFmode:
> >  case E_V2SImode:
> >  case E_V4HImode:
> > +case E_V4HFmode:
> > +case E_V2HFmode:
> >  case E_V8QImode:
> > 

Re: [PATCH] AVX512FP16:support basic 64/32bit vector type and operation.

2021-09-28 Thread Hongyu Wang via Gcc-patches
> I'd put this new pattern in mmx.md to keep 64bit/32bit modes in
> mmx.md, similar to e.g. FMA patterns among others.

Yes, I put it after single-float patterns. Attached the patch I'm
going to check-in.
Thanks for your review.

Uros Bizjak  于2021年9月28日周二 下午2:27写道:
>
> On Tue, Sep 28, 2021 at 6:48 AM Hongyu Wang  wrote:
> >
> > > ia32 ABI declares that __m64 values pass via MMX registers. Due to
> > > this, we are not able to fully disable MMX register usage, as is the
> > > case with x86_64. So, V4HFmode values will pass to functions via MMX
> > > registers on ia32 targets.
> > >
> > > So, there should be no additional define_insn, the addition to the
> > > existing MMXMODE mode iterator should be enough. V4HFmodes should be
> > > handled in the same way as e.g. V8QImode.
> > >
> > > This is not the case with 4-byte values, which should be passed using
> > > integer ABI.
> >
> > Thanks for the explanation, updated patch by removing the extra define_insn,
> > and drop V4HFmode from VALID_AVX512FP16_REG_MODE. Now v4hf would behave
> > same as v8qi.
> >
> > Bootsrapped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > OK for master with the updated one?
>
> I'd put this new pattern in mmx.md to keep 64bit/32bit modes in
> mmx.md, similar to e.g. FMA patterns among others.
>
> OK with the eventual above change.
>
> Thanks,
> Uros.
>
> >
> > Uros Bizjak via Gcc-patches  于2021年9月27日周一 
> > 下午7:35写道:
> > >
> > > On Mon, Sep 27, 2021 at 12:42 PM Hongyu Wang  
> > > wrote:
> > > >
> > > > Hi Uros,
> > > >
> > > > This patch intends to support V4HF/V2HF vector type and basic 
> > > > operations.
> > > >
> > > > For 32bit target, V4HF vector is parsed same as __m64 type, V2HF
> > > > is parsed by stack and returned from GPR since it is not specified
> > > > by ABI.
> > > >
> > > > We found for 64bit vector in ia32, when mmx disabled there seems no
> > > > mov_internal, so we add a define_insn for v4hf mode. It would be 
> > > > very
> > > > ppreciated if you know why the handling of 64bit vector looks as is and
> > > > give some advice.
> > >
> > > ia32 ABI declares that __m64 values pass via MMX registers. Due to
> > > this, we are not able to fully disable MMX register usage, as is the
> > > case with x86_64. So, V4HFmode values will pass to functions via MMX
> > > registers on ia32 targets.
> > >
> > > So, there should be no additional define_insn, the addition to the
> > > existing MMXMODE mode iterator should be enough. V4HFmodes should be
> > > handled in the same way as e.g. V8QImode.
> > >
> > > This is not the case with 4-byte values, which should be passed using
> > > integer ABI.
> > >
> > > Uros.
> > >
> > > >
> > > > Bootstraped and regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> > > >
> > > > OK for master?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > PR target/102230
> > > > * config/i386/i386.h (VALID_AVX512FP16_REG_MODE): Add
> > > > V4HF and V2HF mode check.
> > > > (VALID_SSE2_REG_VHF_MODE): Likewise.
> > > > (VALID_MMX_REG_MODE): Likewise.
> > > > (SSE_REG_MODE_P): Replace VALID_AVX512FP16_REG_MODE with
> > > > vector mode condition.
> > > > * config/i386/i386.c (classify_argument): Parse V4HF/V2HF
> > > > via sse regs.
> > > > (function_arg_32): Add V4HFmode.
> > > > (function_arg_advance_32): Likewise.
> > > > * config/i386/i386.md (mode): Add V4HF/V2HF.
> > > > (MODE_SIZE): Likewise.
> > > > * config/i386/mmx.md (MMXMODE): Add V4HF mode.
> > > > (V_32): Add V2HF mode.
> > > > (*mov_internal): Adjust sse alternatives to support
> > > > V4HF mode vector move.
> > > > (*mov_internal): Adjust sse alternatives
> > > > to support V2HF mode move.
> > > > * config/i386/sse.md (VHF_32_64): New mode iterator.
> > > > (3): New define_insn for add/sub/mul/div.
> > > > (*movv4hf_internal_sse): New define_insn for -mno-mmx and -msse.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR target/102230
> > > > * gcc.target/i386/avx512fp16-floatvnhf.c: Remove xfail.
> > > > * gcc.target/i386/avx512fp16-trunc-extendvnhf.c: Ditto.
> > > > * gcc.target/i386/avx512fp16-truncvnhf.c: Ditto.
> > > > * gcc.target/i386/avx512fp16-64-32-vecop-1.c: New test.
> > > > * gcc.target/i386/avx512fp16-64-32-vecop-2.c: Ditto.
> > > > * gcc.target/i386/pr102230.c: Ditto.
> > > > ---
> > > >  gcc/config/i386/i386.c|  4 +
> > > >  gcc/config/i386/i386.h| 12 ++-
> > > >  gcc/config/i386/i386.md   |  5 +-
> > > >  gcc/config/i386/mmx.md| 27 ---
> > > >  gcc/config/i386/sse.md| 49 
> > > >  .../i386/avx512fp16-64-32-vecop-1.c   | 30 
> > > >  .../i386/avx512fp16-64-32-vecop-2.c   | 75 +++
> > > >  .../gcc.target/i38

[PATCH] i386: Fix wrong result for AMX-TILE intrinsic when parsing expression.

2021-11-03 Thread Hongyu Wang via Gcc-patches
Hi,

The _tile_loadd, _tile_stored and _tile_stream_loadd intrinsics are defined
as macros, so their parameters should be wrapped in parentheses to accept
expressions.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

OK for master and backport to GCC11 branch?

gcc/ChangeLog:

* config/i386/amxtileintrin.h (_tile_loadd_internal): Add
parentheses to base and stride.
(_tile_stream_loadd_internal): Likewise.
(_tile_stored_internal): Likewise.
---
 gcc/config/i386/amxtileintrin.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
index 75d784ad160..3a0a6b44c17 100644
--- a/gcc/config/i386/amxtileintrin.h
+++ b/gcc/config/i386/amxtileintrin.h
@@ -62,7 +62,7 @@ _tile_release (void)
 #define _tile_loadd_internal(dst,base,stride)  \
   __asm__ volatile \
   ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" \
-   :: "r" ((const void*) base), "r" ((long) stride))
+   :: "r" ((const void*) (base)), "r" ((long) (stride)))
 
 #define _tile_stream_loadd(dst,base,stride)\
   _tile_stream_loadd_internal (dst, base, stride)
@@ -70,7 +70,7 @@ _tile_release (void)
 #define _tile_stream_loadd_internal(dst,base,stride)   \
   __asm__ volatile \
   ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" 
\
-   :: "r" ((const void*) base), "r" ((long) stride))
+   :: "r" ((const void*) (base)), "r" ((long) (stride)))
 
 #define _tile_stored(dst,base,stride)  \
   _tile_stored_internal (dst, base, stride)
@@ -78,7 +78,7 @@ _tile_release (void)
 #define _tile_stored_internal(src,base,stride) \
   __asm__ volatile \
   ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" \
-   :: "r" ((void*) base), "r" ((long) stride) \
+   :: "r" ((void*) (base)), "r" ((long) (stride)) \
: "memory")
 
 #define _tile_zero(dst)\
-- 
2.18.1



[PATCH] i386: Auto vectorize sdot_prod, usdot_prod with VNNI instruction.

2021-11-03 Thread Hongyu Wang via Gcc-patches
Hi,

AVX512VNNI/AVXVNNI has vpdpwssd for HImode and vpdpbusd for QImode, so
adjust the HImode sdot_prod expander and add a QImode usdot_prod expander
to enhance vectorization for dotprod.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
Ok for master?

gcc/ChangeLog:

* config/i386/sse.md (VI2_AVX512VNNIBW): New mode iterator.
(VI1_AVX512VNNI): Likewise.
(SDOT_VPDP_SUF): New mode_attr.
(VI1SI): Likewise.
(vi1si): Likewise.
(sdot_prod): Use VI2_AVX512F iterator, expand to
vpdpwssd when VNNI targets available.
(usdot_prod): New expander for vector QImode.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vnni-auto-vectorize-1.c: New test.
* gcc.target/i386/vnni-auto-vectorize-2.c: Ditto.
---

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2764a250229..22435e5d036 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -500,6 +500,9 @@
 (define_mode_iterator VI1_AVX512F
   [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI])
 
+(define_mode_iterator VI1_AVX512VNNI
+  [(V64QI "TARGET_AVX512VNNI") (V32QI "TARGET_AVX2") V16QI])
+
 (define_mode_iterator VI12_256_512_AVX512VL
   [V64QI (V32QI "TARGET_AVX512VL")
V32HI (V16HI "TARGET_AVX512VL")])
@@ -510,6 +513,10 @@
 (define_mode_iterator VI2_AVX512F
   [(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI])
 
+(define_mode_iterator VI2_AVX512VNNIBW
+  [(V32HI "TARGET_AVX512BW || TARGET_AVX512VNNI")
+   (V16HI "TARGET_AVX2") V8HI])
+
 (define_mode_iterator VI4_AVX
   [(V8SI "TARGET_AVX") V4SI])
 
@@ -14798,19 +14805,37 @@
 (define_mode_attr SDOT_PMADD_SUF
   [(V32HI "512v32hi") (V16HI "") (V8HI "")])
 
+(define_mode_attr SDOT_VPDP_SUF
+  [(V32HI "v16si") (V16HI "v8si") (V8HI "v4si")])
+
 (define_expand "sdot_prod"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI2_AVX2 1 "register_operand")
-   (match_operand:VI2_AVX2 2 "register_operand")
+   (match_operand:VI2_AVX512VNNIBW 1 "register_operand")
+   (match_operand:VI2_AVX512VNNIBW 2 "register_operand")
(match_operand: 3 "register_operand")]
   "TARGET_SSE2"
 {
-  rtx t = gen_reg_rtx (mode);
-  emit_insn (gen__pmaddwd (t, operands[1], 
operands[2]));
-  emit_insn (gen_rtx_SET (operands[0],
- gen_rtx_PLUS (mode,
-   operands[3], t)));
-  DONE;
+  /* Try with vnni instructions.  */
+  if (( == 64 && TARGET_AVX512VNNI)
+  || ( < 64
+ && ((TARGET_AVX512VNNI && TARGET_AVX512VL) || TARGET_AVXVNNI)))
+{
+  operands[1] = lowpart_subreg (mode, operands[1], 
mode);
+  operands[2] = lowpart_subreg (mode, operands[2], 
mode);
+  emit_insn (gen_rtx_SET (operands[0], operands[3]));
+  emit_insn (gen_vpdpwssd_ (operands[0], operands[3],
+  operands[1], operands[2]));
+}
+/* Otherwise use pmaddwd + paddd.  */
+else
+{
+  rtx t = gen_reg_rtx (mode);
+  emit_insn (gen__pmaddwd (t, operands[1], 
operands[2]));
+  emit_insn (gen_rtx_SET (operands[0],
+ gen_rtx_PLUS (mode,
+   operands[3], t)));
+}
+DONE;
 })
 
 ;; Normally we use widen_mul_even/odd, but combine can't quite get it all
@@ -27065,6 +27090,29 @@
[(set_attr ("prefix") ("evex"))
(set_attr "mode" "")])
 
+(define_mode_attr VI1SI
+ [(V64QI "V16SI") (V32QI "V8SI") (V16QI "V4SI")])
+
+(define_mode_attr vi1si
+ [(V64QI "v16si") (V32QI "v8si") (V16QI "v4si")])
+
+(define_expand "usdot_prod"
+  [(match_operand: 0 "register_operand")
+   (match_operand:VI1_AVX512VNNI 1 "register_operand")
+   (match_operand:VI1_AVX512VNNI 2 "register_operand")
+   (match_operand: 3 "register_operand")]
+  "( == 64
+||((TARGET_AVX512VNNI && TARGET_AVX512VL)
+   || TARGET_AVXVNNI))"
+{
+  operands[1] = lowpart_subreg (mode, operands[1], mode);
+  operands[2] = lowpart_subreg (mode, operands[2], mode);
+  emit_insn (gen_rtx_SET (operands[0], operands[3]));
+  emit_insn (gen_vpdpbusd_ (operands[0], operands[3],
+ operands[1], operands[2]));
+  DONE;
+})
+
 (define_insn "vpdpbusd_v16si"
   [(set (match_operand:V16SI 0 "register_operand" "=v")
(unspec:V16SI
diff --git a/gcc/testsuite/gcc.target/i386/vnni-auto-vectorize-1.c 
b/gcc/testsuite/gcc.target/i386/vnni-auto-vectorize-1.c
new file mode 100644
index 000..844f37ddfc1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vnni-auto-vectorize-1.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */ 
+/* { dg-options "-mavx512f -mavx512vnni -mavx512vl -O2" } */
+
+/* { dg-final { scan-assembler "vpdpwssd\t" } } */
+/* { dg-final { scan-assembler "vpdpbusd\t" } } */
+/* { dg-final { scan-assembler-not "vpmaddwd\t" } } */
+
+int __attribute__((noinline, noclone, optimize("tree-vectorize")))
+sdot_prod_hi (short * restrict a, short * restrict b,
+ int c, int n)
+{
+  int i;
+  for (i = 0; i

Re: [PATCH] i386: Fix wrong result for AMX-TILE intrinsic when parsing expression.

2021-11-03 Thread Hongyu Wang via Gcc-patches
> Could you add a testcase for that?

Yes, updated patch.

Hongtao Liu via Gcc-patches  于2021年11月4日周四 上午10:25写道:
>
> On Thu, Nov 4, 2021 at 9:19 AM Hongyu Wang via Gcc-patches
>  wrote:
> >
> > Hi,
> >
> > _tile_loadd, _tile_stored, _tile_streamloadd intrinsics are defined by
> > macro, so the parameters should be wrapped by parentheses to accept
> > expressions.
> >
> > Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > OK for master and backport to GCC11 branch?
> Could you add a testcase for that?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/amxtileintrin.h (_tile_loadd_internal): Add
> > parentheses to base and stride.
> > (_tile_stream_loadd_internal): Likewise.
> > (_tile_stored_internal): Likewise.
> > ---
> >  gcc/config/i386/amxtileintrin.h | 6 +++---
> >  1 file changed, 3 insertions(+), 3 deletions(-)
> >
> > diff --git a/gcc/config/i386/amxtileintrin.h 
> > b/gcc/config/i386/amxtileintrin.h
> > index 75d784ad160..3a0a6b44c17 100644
> > --- a/gcc/config/i386/amxtileintrin.h
> > +++ b/gcc/config/i386/amxtileintrin.h
> > @@ -62,7 +62,7 @@ _tile_release (void)
> >  #define _tile_loadd_internal(dst,base,stride)  \
> >__asm__ volatile \
> >("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" 
> > \
> > -   :: "r" ((const void*) base), "r" ((long) stride))
> > +   :: "r" ((const void*) (base)), "r" ((long) (stride)))
> >
> >  #define _tile_stream_loadd(dst,base,stride)\
> >_tile_stream_loadd_internal (dst, base, stride)
> > @@ -70,7 +70,7 @@ _tile_release (void)
> >  #define _tile_stream_loadd_internal(dst,base,stride)   \
> >__asm__ volatile \
> >("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", 
> > [%0+%1*1]}" \
> > -   :: "r" ((const void*) base), "r" ((long) stride))
> > +   :: "r" ((const void*) (base)), "r" ((long) (stride)))
> >
> >  #define _tile_stored(dst,base,stride)  \
> >_tile_stored_internal (dst, base, stride)
> > @@ -78,7 +78,7 @@ _tile_release (void)
> >  #define _tile_stored_internal(src,base,stride) \
> >__asm__ volatile \
> >("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], 
> > %%tmm"#src"}" \
> > -   :: "r" ((void*) base), "r" ((long) stride) \
> > +   :: "r" ((void*) (base)), "r" ((long) (stride)) \
> > : "memory")
> >
> >  #define _tile_zero(dst)\
> > --
> > 2.18.1
> >
>
>
> --
> BR,
> Hongtao


0001-i386-Fix-wrong-result-for-AMX-TILE-intrinsic-when-pa.patch
Description: Binary data


[PATCH] PR target/103069: Relax cmpxchg loop for x86 target

2021-11-12 Thread Hongyu Wang via Gcc-patches
Hi,

From the CPU's point of view, getting a cache line for writing is more
expensive than reading.  See Appendix A.2 Spinlock in:

https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf

The full compare and swap will grab the cache line exclusive and causes
excessive cache line bouncing.

The atomic_fetch_{or,xor,and,nand} builtins generate a cmpxchg loop under
-march=x86-64 like:

movl(%rdi), %eax
.L2:
movl%eax, %edx
movl%eax, %r8d
orl %esi, %edx
lock cmpxchgl   %edx, (%rdi)
jne .L2
movl%r8d, %eax
ret

To relax the above loop, GCC should first emit a normal load, then check and
jump to .L2 if cmpxchgl may fail. Before jumping to .L2, PAUSE should be
inserted to yield the CPU to another hyperthread and to save power, so the
code is like

movl(%rdi), %eax
.L4:
movl(%rdi), %ecx
movl%eax, %edx
orl %esi, %edx
cmpl%eax, %ecx
jne .L2
lock cmpxchgl   %edx, (%rdi)
jne .L4
.L2:
rep nop
jmp .L4

This patch adds corresponding atomic_fetch_op expanders to insert load/
compare and pause for all the atomic logic fetch builtins. Add flag
-mrelax-cmpxchg-loop to control whether to generate relaxed loop.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for master?

gcc/ChangeLog:

PR target/103069
* config/i386/i386-expand.c (ix86_expand_atomic_fetch_op_loop):
New expand function.
* config/i386/i386-options.c (ix86_target_string): Add
-mrelax-cmpxchg-loop flag.
(ix86_valid_target_attribute_inner_p): Likewise.
* config/i386/i386-protos.h (ix86_expand_atomic_fetch_op_loop):
New expand function prototype.
* config/i386/i386.opt: Add -mrelax-cmpxchg-loop.
* config/i386/sync.md (atomic_fetch_): New expander
for SI,HI,QI modes.
(atomic__fetch): Likewise.
(atomic_fetch_nand): Likewise.
(atomic_nand_fetch): Likewise.
(atomic_fetch_): New expander for DI,TI modes.
(atomic__fetch): Likewise.
(atomic_fetch_nand): Likewise.
(atomic_nand_fetch): Likewise.
* doc/invoke.texi: Document -mrelax-cmpxchg-loop.

gcc/testsuite/ChangeLog:

PR target/103069
* gcc.target/i386/pr103069-1.c: New test.
* gcc.target/i386/pr103069-2.c: Ditto.
---
 gcc/config/i386/i386-expand.c  |  77 ++
 gcc/config/i386/i386-options.c |   7 +-
 gcc/config/i386/i386-protos.h  |   2 +
 gcc/config/i386/i386.opt   |   4 +
 gcc/config/i386/sync.md| 117 +
 gcc/doc/invoke.texi|   9 +-
 gcc/testsuite/gcc.target/i386/pr103069-1.c |  35 ++
 gcc/testsuite/gcc.target/i386/pr103069-2.c |  70 
 8 files changed, 319 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-2.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 088e6af2258..f8a61835d85 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -23138,4 +23138,81 @@ ix86_expand_divmod_libfunc (rtx libfunc, machine_mode 
mode,
   *rem_p = rem;
 }
 
+void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
+  enum rtx_code code, bool after,
+  bool doubleword)
+{
+  rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
+  rtx_code_label *loop_label, *pause_label;
+  machine_mode mode = GET_MODE (target);
+
+  old_reg = gen_reg_rtx (mode);
+  new_reg = old_reg;
+  loop_label = gen_label_rtx ();
+  pause_label = gen_label_rtx ();
+  old_mem = copy_to_reg (mem);
+  emit_label (loop_label);
+  emit_move_insn (old_reg, old_mem);
+
+  /* return value for atomic_fetch_op.  */
+  if (!after)
+emit_move_insn (target, old_reg);
+
+  if (code == NOT)
+{
+  new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
+true, OPTAB_LIB_WIDEN);
+  new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
+}
+  else
+new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
+  true, OPTAB_LIB_WIDEN);
+
+  /* return value for atomic_op_fetch.  */
+  if (after)
+emit_move_insn (target, new_reg);
+
+  /* Load memory again inside loop.  */
+  new_mem = copy_to_reg (mem);
+  /* Compare mem value with expected value.  */
+
+  if (doubleword)
+{
+  machine_mode half_mode = (mode == DImode)? SImode : DImode;
+  rtx low_new_mem = gen_lowpart (half_mode, new_mem);
+  rtx low_old_mem = gen_lowpart (half_mode, old_mem);
+  rtx high_new_mem = gen_highpart (half_mode, new_mem);
+  rtx high_old_mem = gen_highpart (

[PATCH] PR libgomp/103068: Optimize gomp_mutex_lock_slow for x86 target

2021-11-13 Thread Hongyu Wang via Gcc-patches
Hi, 

From the CPU's point of view, getting a cache line for writing is more
expensive than reading.  See Appendix A.2 Spinlock in:

https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf

The full compare and swap will grab the cache line exclusive and cause
excessive cache line bouncing.

For gomp_mutex_lock_slow, it spins on __atomic_compare_exchange_n, so
add load-check to continue spin if cmpxchg may fail.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for master?

libgomp/ChangeLog:

PR libgomp/103068
* config/linux/mutex.c (gomp_mutex_lock_slow): Continue spin
loop when mutex is not 0 under x86 target.
* config/linux/x86/futex.h (TARGET_X86_AVOID_CMPXCHG): Define.
---
 libgomp/config/linux/mutex.c | 5 +
 libgomp/config/linux/x86/futex.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/libgomp/config/linux/mutex.c b/libgomp/config/linux/mutex.c
index 838264dc1f9..4e87566eb2b 100644
--- a/libgomp/config/linux/mutex.c
+++ b/libgomp/config/linux/mutex.c
@@ -49,6 +49,11 @@ gomp_mutex_lock_slow (gomp_mutex_t *mutex, int oldval)
}
   else
{
+#ifdef TARGET_X86_AVOID_CMPXCHG
+ /* For x86, omit cmpxchg when atomic load shows mutex is not 0.  */
+ if ((oldval = __atomic_load_n (mutex, MEMMODEL_RELAXED)) != 0)
+   continue;
+#endif
  /* Something changed.  If now unlocked, we're good to go.  */
  oldval = 0;
  if (__atomic_compare_exchange_n (mutex, &oldval, 1, false,
diff --git a/libgomp/config/linux/x86/futex.h b/libgomp/config/linux/x86/futex.h
index e7f53399a4e..acc1d1467d7 100644
--- a/libgomp/config/linux/x86/futex.h
+++ b/libgomp/config/linux/x86/futex.h
@@ -122,3 +122,5 @@ cpu_relax (void)
 {
   __builtin_ia32_pause ();
 }
+
+#define TARGET_X86_AVOID_CMPXCHG
-- 
2.18.1



Re: [PATCH] PR target/103069: Relax cmpxchg loop for x86 target

2021-11-15 Thread Hongyu Wang via Gcc-patches
Thanks for your review, this is the patch I'm going to check-in.

Uros Bizjak via Gcc-patches  于2021年11月15日周一 下午4:25写道:
>
> On Sat, Nov 13, 2021 at 3:34 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > From the CPU's point of view, getting a cache line for writing is more
> > expensive than reading.  See Appendix A.2 Spinlock in:
> >
> > https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
> >
> > The full compare and swap will grab the cache line exclusive and causes
> > excessive cache line bouncing.
> >
> > The atomic_fetch_{or,xor,and,nand} builtins generate a cmpxchg loop under
> > -march=x86-64 like:
> >
> > movl(%rdi), %eax
> > .L2:
> > movl%eax, %edx
> > movl%eax, %r8d
> > orl %esi, %edx
> > lock cmpxchgl   %edx, (%rdi)
> > jne .L2
> > movl%r8d, %eax
> > ret
> >
> > To relax above loop, GCC should first emit a normal load, check and jump to
> > .L2 if cmpxchgl may fail. Before jump to .L2, PAUSE should be inserted to
> > yield the CPU to another hyperthread and to save power, so the code is
> > like
> >
> > movl(%rdi), %eax
> > .L4:
> > movl(%rdi), %ecx
> > movl%eax, %edx
> > orl %esi, %edx
> > cmpl%eax, %ecx
> > jne .L2
> > lock cmpxchgl   %edx, (%rdi)
> > jne .L4
> > .L2:
> > rep nop
> > jmp .L4
> >
> > This patch adds corresponding atomic_fetch_op expanders to insert load/
> > compare and pause for all the atomic logic fetch builtins. Add flag
> > -mrelax-cmpxchg-loop to control whether to generate relaxed loop.
> >
> > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for master?
> >
> > gcc/ChangeLog:
> >
> > PR target/103069
> > * config/i386/i386-expand.c (ix86_expand_atomic_fetch_op_loop):
> > New expand function.
> > * config/i386/i386-options.c (ix86_target_string): Add
> > -mrelax-cmpxchg-loop flag.
> > (ix86_valid_target_attribute_inner_p): Likewise.
> > * config/i386/i386-protos.h (ix86_expand_atomic_fetch_op_loop):
> > New expand function prototype.
> > * config/i386/i386.opt: Add -mrelax-cmpxchg-loop.
> > * config/i386/sync.md (atomic_fetch_): New expander
> > for SI,HI,QI modes.
> > (atomic__fetch): Likewise.
> > (atomic_fetch_nand): Likewise.
> > (atomic_nand_fetch): Likewise.
> > (atomic_fetch_): New expander for DI,TI modes.
> > (atomic__fetch): Likewise.
> > (atomic_fetch_nand): Likewise.
> > (atomic_nand_fetch): Likewise.
> > * doc/invoke.texi: Document -mrelax-cmpxchg-loop.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/103069
> > * gcc.target/i386/pr103069-1.c: New test.
> > * gcc.target/i386/pr103069-2.c: Ditto.
>
> LGTM, with a couple of issues in the testsuite section.
>
> Thanks,
> Uros.
>
> > ---
> >  gcc/config/i386/i386-expand.c  |  77 ++
> >  gcc/config/i386/i386-options.c |   7 +-
> >  gcc/config/i386/i386-protos.h  |   2 +
> >  gcc/config/i386/i386.opt   |   4 +
> >  gcc/config/i386/sync.md| 117 +
> >  gcc/doc/invoke.texi|   9 +-
> >  gcc/testsuite/gcc.target/i386/pr103069-1.c |  35 ++
> >  gcc/testsuite/gcc.target/i386/pr103069-2.c |  70 
> >  8 files changed, 319 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr103069-2.c
> >
> > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> > index 088e6af2258..f8a61835d85 100644
> > --- a/gcc/config/i386/i386-expand.c
> > +++ b/gcc/config/i386/i386-expand.c
> > @@ -23138,4 +23138,81 @@ ix86_expand_divmod_libfunc (rtx libfunc, 
> > machine_mode mode,
> >*rem_p = rem;
> >  }
> >
> > +void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
> > +  enum rtx_code code, bool after,
> > +  bool doubleword)
> > +{
> > +  rtx old_reg, new_reg, old_mem, success, oldval, new_mem;
> > +  rtx_code_label *loop_label, *pause_label;
> > +  machine_mode mode = GET_MODE (target);
> > +
> > +  old_reg = gen_reg_rtx (mode);
> > +  new_reg = old_reg;
> > +  loop_label = gen_label_rtx ();
> > +  pause_label = gen_label_rtx ();
> > +  old_mem = copy_to_reg (mem);
> > +  emit_label (loop_label);
> > +  emit_move_insn (old_reg, old_mem);
> > +
> > +  /* return value for atomic_fetch_op.  */
> > +  if (!after)
> > +emit_move_insn (target, old_reg);
> > +
> > +  if (code == NOT)
> > +{
> > +  new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
> > +true, OPTAB_LIB_WIDEN);
> > +  new_reg = expa

[PATCH] AVX512FP16: Adjust builtin for mask complex fma

2021-10-13 Thread Hongyu Wang via Gcc-patches
Hi,

Current mask/mask3 implementation for complex fma contains
duplicated parameter in macro, which may cause error at -O0.
Refactor macro implementation to builtins to avoid potential
error.

For round intrinsic with NO_ROUND as input, ix86_erase_embedded_rounding
erases embedded_rounding unspec but could break other emit_insn in
expanders. Skip those expanders with multiple emit_insn for this
function and check rounding in expander with subst.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}.
OK for master?

gcc/ChangeLog:

* config/i386/avx512fp16intrin.h (_mm512_mask_fcmadd_pch):
Adjust builtin call.
(_mm512_mask3_fcmadd_pch): Likewise.
(_mm512_mask_fmadd_pch): Likewise
(_mm512_mask3_fmadd_pch): Likewise
(_mm512_mask_fcmadd_round_pch): Likewise
(_mm512_mask3_fcmadd_round_pch): Likewise
(_mm512_mask_fmadd_round_pch): Likewise
(_mm512_mask3_fmadd_round_pch): Likewise
(_mm_mask_fcmadd_sch): Likewise
(_mm_mask3_fcmadd_sch): Likewise
(_mm_mask_fmadd_sch): Likewise
(_mm_mask3_fmadd_sch): Likewise
(_mm_mask_fcmadd_round_sch): Likewise
(_mm_mask3_fcmadd_round_sch): Likewise
(_mm_mask_fmadd_round_sch): Likewise
(_mm_mask3_fmadd_round_sch): Likewise
(_mm_fcmadd_round_sch): Likewise
* config/i386/avx512fp16vlintrin.h (_mm_mask_fmadd_pch):
Adjust builtin call.
(_mm_mask3_fmadd_pch): Likewise
(_mm256_mask_fmadd_pch): Likewise
(_mm256_mask3_fmadd_pch): Likewise
(_mm_mask_fcmadd_pch): Likewise
(_mm_mask3_fcmadd_pch): Likewise
(_mm256_mask_fcmadd_pch): Likewise
(_mm256_mask3_fcmadd_pch): Likewise
* config/i386/i386-builtin.def: Add mask3 builtin for complex
fma, and adjust mask_builtin to corresponding expander.
* config/i386/i386-expand.c (ix86_expand_round_builtin):
Skip erasing embedded rounding for expanders that emit
multiple insns.
* config/i386/sse.md (complexmove): New mode_attr.
(_fmaddc__mask1): New expander.
(_fcmaddc__mask1): Likewise.
(avx512fp16_fmaddcsh_v8hf_mask1): Likewise.
(avx512fp16_fcmaddcsh_v8hf_mask1): Likewise.
(avx512fp16_fcmaddcsh_v8hf_mask3): Likewise.
(avx512fp16_fmaddcsh_v8hf_mask3): Likewise.
* config/i386/subst.md (round_embedded_complex): New subst.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new mask3 builtins.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c: Add scanning for
mask/mask3 intrinsic.
* gcc.target/i386/avx512fp16-vfmaddcsh-1a.c: Ditto.
* gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c: New test for
-mavx512vl.
* gcc.target/i386/avx512fp16-vfmaddcsh-1c.c: Ditto.
---
 gcc/config/i386/avx512fp16intrin.h| 261 ++
 gcc/config/i386/avx512fp16vlintrin.h  |  56 ++--
 gcc/config/i386/i386-builtin.def  |  24 +-
 gcc/config/i386/i386-expand.c |  22 +-
 gcc/config/i386/sse.md| 183 
 gcc/config/i386/subst.md  |   3 +
 gcc/testsuite/gcc.target/i386/avx-1.c |   4 +
 .../i386/avx512fp16-vfcmaddcsh-1a.c   |   2 +
 .../i386/avx512fp16-vfcmaddcsh-1c.c   |  13 +
 .../gcc.target/i386/avx512fp16-vfmaddcsh-1a.c |   2 +
 .../gcc.target/i386/avx512fp16-vfmaddcsh-1c.c |  13 +
 gcc/testsuite/gcc.target/i386/sse-13.c|   4 +
 gcc/testsuite/gcc.target/i386/sse-23.c|   4 +
 13 files changed, 375 insertions(+), 216 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c

diff --git a/gcc/config/i386/avx512fp16intrin.h 
b/gcc/config/i386/avx512fp16intrin.h
index 29cf6792335..5e49447a020 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -6258,13 +6258,11 @@ extern __inline __m512h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_fcmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
 {
-  return (__m512h) __builtin_ia32_movaps512_mask
-((__v16sf)
- __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
- (__v32hf) __C,
- (__v32hf) __D, __B,
- _MM_FROUND_CUR_DIRECTION),
- (__v16sf) __A, __B);
+  return (__m512h)
+__builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
+(__v32hf) __C,
+(__v32hf) __D, __B,
+_MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512h
@@ -6272,10 +6270,10 @@ __attribute__ ((_

[PATCH] AVX512FP16: Support vector shuffle builtins

2021-10-13 Thread Hongyu Wang via Gcc-patches
Hi,

This patch supports HFmode vector shuffle by creating HImode subreg when
expanding permutation expr.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}
OK for master?

gcc/ChangeLog:

* config/i386/i386-expand.c (ix86_expand_vec_perm): Convert
HFmode input operand to HImode.
(ix86_vectorize_vec_perm_const): Likewise.
(ix86_expand_vector_init): Allow HFmode for one_operand_shuffle.
* config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):
New define_insn.
(*avx512f_permvar_truncv8siv8hi_1_hf):
Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.
* gcc.target/i386/avx512fp16-pr101846.c: Ditto.
* gcc.target/i386/avx512fp16-pr94680.c: Ditto.
---
 gcc/config/i386/i386-expand.c | 29 ++-
 gcc/config/i386/sse.md| 54 +++-
 .../i386/avx512fp16-builtin_shuffle-1.c   | 86 +++
 .../gcc.target/i386/avx512fp16-pr101846.c | 56 
 .../gcc.target/i386/avx512fp16-pr94680.c  | 61 +
 5 files changed, 284 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index c0924a59efb..0f50ed3b9f8 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -4836,6 +4836,18 @@ ix86_expand_vec_perm (rtx operands[])
   e = GET_MODE_UNIT_SIZE (mode);
   gcc_assert (w <= 64);
 
+  if (GET_MODE_INNER (mode) == HFmode)
+{
+  machine_mode orig_mode = mode;
+  mode = mode_for_vector (HImode, w).require ();
+  if (target)
+   target = lowpart_subreg (mode, target, orig_mode);
+  if (op0)
+   op0 = lowpart_subreg (mode, op0, orig_mode);
+  if (op1)
+   op1 = lowpart_subreg (mode, op1, orig_mode);
+}
+
   if (TARGET_AVX512F && one_operand_shuffle)
 {
   rtx (*gen) (rtx, rtx, rtx) = NULL;
@@ -15092,7 +15104,8 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx 
vals)
  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
  if (inner_mode == QImode
  || inner_mode == HImode
- || inner_mode == TImode)
+ || inner_mode == TImode
+ || inner_mode == HFmode)
{
  unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
  scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
@@ -21099,6 +21112,20 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx 
target, rtx op0,
   unsigned int i, nelt, which;
   bool two_args;
 
+  /* For HF mode vector, convert it to HI using subreg.  */
+  if (GET_MODE_INNER (vmode) == HFmode)
+{
+  machine_mode orig_mode = vmode;
+  vmode = mode_for_vector (HImode,
+  GET_MODE_NUNITS (vmode)).require ();
+  if (target)
+   target = lowpart_subreg (vmode, target, orig_mode);
+  if (op0)
+   op0 = lowpart_subreg (vmode, op0, orig_mode);
+  if (op1)
+   op1 = lowpart_subreg (vmode, op1, orig_mode);
+}
+
   d.target = target;
   d.op0 = op0;
   d.op1 = op1;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a3c4a3f1e62..d023d8a1c2e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12573,6 +12573,33 @@
(truncate:V16HI (match_dup 1)))]
   "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")
 
+(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1_hf"
+  [(set (match_operand:V16HF 0 "nonimmediate_operand")
+   (vec_select:V16HF
+ (subreg:V32HF
+   (unspec:V32HI
+ [(match_operand:V32HI 1 "register_operand")
+  (match_operand:V32HI 2 "permvar_truncate_operand")]
+UNSPEC_VPERMVAR) 0)
+ (parallel [(const_int 0) (const_int 1)
+(const_int 2) (const_int 3)
+(const_int 4) (const_int 5)
+(const_int 6) (const_int 7)
+(const_int 8) (const_int 9)
+(const_int 10) (const_int 11)
+(const_int 12) (const_int 13)
+(const_int 14) (const_int 15)])))]
+  "TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (truncate:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode);
+  operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);
+})
+
+
 (define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"
   [(set (match_operand:V8HI 0 "nonimmediate_operand")
(vec_select:V8HI
@@ -12591,6 +12618,28 @@
(truncate:V8HI (match_dup 1)))]
   "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")
 
+(define_i

[PATCH] AVX512FP16: Fix testcase for complex intrinsic

2021-10-14 Thread Hongyu Wang via Gcc-patches
Hi,

-march=cascadelake which contains -mavx512vl produces unmatched scan
for vf[c]maddcsh test, so add -mno-avx512vl to vf[c]maddcsh-1a.c.

Also add scan for vblendmps to vf[c]maddcph tests to check correctness.

Tested on unix{-m32,} with -march=cascadelake.

Pushed to trunk as obvious fix.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-vfcmaddcph-1a.c: Add scan for
vblendmps.
* gcc.target/i386/avx512fp16-vfmaddcph-1a.c: Likewise.
* gcc.target/i386/avx512fp16vl-vfcmaddcph-1a.c: Likewise.
* gcc.target/i386/avx512fp16vl-vfmaddcph-1a.c: Likewise.
* gcc.target/i386/avx512fp16-vfmaddcsh-1a.c: Add -mno-avx512vl.
* gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcph-1a.c   | 1 +
 gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcph-1a.c| 1 +
 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c| 2 +-
 gcc/testsuite/gcc.target/i386/avx512fp16vl-vfcmaddcph-1a.c | 2 ++
 gcc/testsuite/gcc.target/i386/avx512fp16vl-vfmaddcph-1a.c  | 2 ++
 6 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcph-1a.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcph-1a.c
index 6c2c34c1731..cd39b7f99ff 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcph-1a.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcph-1a.c
@@ -6,6 +6,7 @@
 /* { dg-final { scan-assembler-times "vfcmaddcph\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[
 \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcph\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcph\[ 
\\t\]+\{rz-sae\}\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vblendmps\[ 
\\t\]+%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[
 \\t\]+#)" 2 } } */
 
 #include 
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
index 8ff2092c325..eb96588df39 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -O2" } */
+/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcph-1a.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcph-1a.c
index 4dae5f02dc6..859b215ab17 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcph-1a.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcph-1a.c
@@ -6,6 +6,7 @@
 /* { dg-final { scan-assembler-times "vfmaddcph\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[
 \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfmaddcph\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfmaddcph\[ 
\\t\]+\{rz-sae\}\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vblendmps\[ 
\\t\]+%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[
 \\t\]+#)" 2 } } */
 
 #include 
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
index 2ebe1f8ddd7..288d1c12a10 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -O2" } */
+/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[

[PATCH] AVX512FP16: Fix ICE for 2 v4hf vector concat

2021-10-14 Thread Hongyu Wang via Gcc-patches
Hi,

For V4HFmode, doing vector concat like

__builtin_shufflevector (a, b, {0, 1, 2, 3, 4, 5, 6, 7})

could trigger ICE since it is not handled in ix86_vector_init ().

Handle HFmode like HImode to avoid such ICE.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}

OK for master?

gcc/ChangeLog:

* config/i386/i386-expand.c (ix86_expand_vector_init):
For half_vector concat for HFmode, handle them like HImode.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-v4hf-concat.c: New test.
---
 gcc/config/i386/i386-expand.c|  3 ++-
 .../gcc.target/i386/avx512fp16-v4hf-concat.c | 16 
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-v4hf-concat.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 95274201f4f..1b011047251 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -15122,7 +15122,8 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx 
vals)
  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
  if (inner_mode == QImode
  || inner_mode == HImode
- || inner_mode == TImode)
+ || inner_mode == TImode
+ || inner_mode == HFmode)
{
  unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
  scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-v4hf-concat.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-v4hf-concat.c
new file mode 100644
index 000..3b8a7f39b85
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-v4hf-concat.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -O2" } */
+/* { dg-final { scan-assembler-times "vpunpcklqdq" 1 } } */
+
+typedef _Float16 v8hf __attribute__((vector_size (16)));
+typedef _Float16 v4hf __attribute__((vector_size (8)));
+
+v8hf foov (v4hf a, v4hf b)
+{
+return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+v8hf foov2 (v4hf a)
+{
+return __builtin_shufflevector (a, (v4hf){0}, 0, 1, 2, 3, 4, 5, 6, 7);
+}
-- 
2.27.1



Re: [PATCH] AVX512FP16: Support vector shuffle builtins

2021-10-14 Thread Hongyu Wang via Gcc-patches
> This part seems not related to vector shuffle.
Yes, have separated this part to another patch and checked-in.

Updated patch. Ok for this one?

Hongtao Liu via Gcc-patches  于2021年10月14日周四 下午2:33写道:
>
> On Thu, Oct 14, 2021 at 10:39 AM Hongyu Wang via Gcc-patches
>  wrote:
> >
> > Hi,
> >
> > This patch supports HFmode vector shuffle by creating HImode subreg when
> > expanding permutation expr.
> >
> > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}
> > OK for master?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386-expand.c (ix86_expand_vec_perm): Convert
> > HFmode input operand to HImode.
> > (ix86_vectorize_vec_perm_const): Likewise.
> > (ix86_expand_vector_init): Allow HFmode for one_operand_shuffle.
> > * config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):
> > New define_insn.
> > (*avx512f_permvar_truncv8siv8hi_1_hf):
> > Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.
> > * gcc.target/i386/avx512fp16-pr101846.c: Ditto.
> > * gcc.target/i386/avx512fp16-pr94680.c: Ditto.
> > ---
> >  gcc/config/i386/i386-expand.c | 29 ++-
> >  gcc/config/i386/sse.md| 54 +++-
> >  .../i386/avx512fp16-builtin_shuffle-1.c   | 86 +++
> >  .../gcc.target/i386/avx512fp16-pr101846.c | 56 
> >  .../gcc.target/i386/avx512fp16-pr94680.c  | 61 +
> >  5 files changed, 284 insertions(+), 2 deletions(-)
> >  create mode 100644 
> > gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
> >
> > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> > index c0924a59efb..0f50ed3b9f8 100644
> > --- a/gcc/config/i386/i386-expand.c
> > +++ b/gcc/config/i386/i386-expand.c
> > @@ -4836,6 +4836,18 @@ ix86_expand_vec_perm (rtx operands[])
> >e = GET_MODE_UNIT_SIZE (mode);
> >gcc_assert (w <= 64);
> >
> > +  if (GET_MODE_INNER (mode) == HFmode)
> > +{
> > +  machine_mode orig_mode = mode;
> > +  mode = mode_for_vector (HImode, w).require ();
> > +  if (target)
> > +   target = lowpart_subreg (mode, target, orig_mode);
> > +  if (op0)
> > +   op0 = lowpart_subreg (mode, op0, orig_mode);
> > +  if (op1)
> > +   op1 = lowpart_subreg (mode, op1, orig_mode);
> > +}
> > +
> >if (TARGET_AVX512F && one_operand_shuffle)
> >  {
> >rtx (*gen) (rtx, rtx, rtx) = NULL;
> > @@ -15092,7 +15104,8 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, 
> > rtx vals)
> >   rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
> >   if (inner_mode == QImode
> >   || inner_mode == HImode
> > - || inner_mode == TImode)
> > + || inner_mode == TImode
> > + || inner_mode == HFmode)
> This part seems not related to vector shuffle.
> > {
> >   unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
> >   scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
> > @@ -21099,6 +21112,20 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, 
> > rtx target, rtx op0,
> >unsigned int i, nelt, which;
> >bool two_args;
> >
> > +  /* For HF mode vector, convert it to HI using subreg.  */
> > +  if (GET_MODE_INNER (vmode) == HFmode)
> > +{
> > +  machine_mode orig_mode = vmode;
> > +  vmode = mode_for_vector (HImode,
> > +  GET_MODE_NUNITS (vmode)).require ();
> > +  if (target)
> > +   target = lowpart_subreg (vmode, target, orig_mode);
> > +  if (op0)
> > +   op0 = lowpart_subreg (vmode, op0, orig_mode);
> > +  if (op1)
> > +   op1 = lowpart_subreg (vmode, op1, orig_mode);
> > +}
> > +
> >d.target = target;
> >d.op0 = op0;
> >d.op1 = op1;
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index a3c4a3f1e62..d023d8a1c2e 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -12573,6 +12573,33 @@
> > (truncate:V16HI (match_dup 1)))]
> >"operands[1] = lowpart_subreg (V16SImod

Re: [PATCH] AVX512FP16: Support vector shuffle builtins

2021-10-14 Thread Hongyu Wang via Gcc-patches
> ix86_expand_vec_perm is only called by (define_expand "vec_perm"
> which means target, op0 and op1 must exist, and you can drop
> if(target/op0/op1) stuff.

Yes, dropped.

> Those checks for NULL seem reasonable according to the documents,
> op0, op1, target may be NULL.
Thanks for pointing it out, didn't realize the difference between
these 2 functions.

Updated patch.

Hongtao Liu  于2021年10月15日周五 下午1:54写道:
>
> On Fri, Oct 15, 2021 at 1:37 PM Hongyu Wang  wrote:
> >
> > > This part seems not related to vector shuffle.
> > Yes, have separated this part to another patch and checked-in.
> >
> > Updated patch. Ok for this one?
> >
> > Hongtao Liu via Gcc-patches  于2021年10月14日周四 
> > 下午2:33写道:
> > >
> > > On Thu, Oct 14, 2021 at 10:39 AM Hongyu Wang via Gcc-patches
> > >  wrote:
> > > >
> > > > Hi,
> > > >
> > > > This patch supports HFmode vector shuffle by creating HImode subreg when
> > > > expanding permutation expr.
> > > >
> > > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde{-m32,}
> > > > OK for master?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > * config/i386/i386-expand.c (ix86_expand_vec_perm): Convert
> > > > HFmode input operand to HImode.
> > > > (ix86_vectorize_vec_perm_const): Likewise.
> > > > (ix86_expand_vector_init): Allow HFmode for one_operand_shuffle.
> > > > * config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1_hf):
> > > > New define_insn.
> > > > (*avx512f_permvar_truncv8siv8hi_1_hf):
> > > > Likewise.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > * gcc.target/i386/avx512fp16-builtin_shuffle-1.c: New test.
> > > > * gcc.target/i386/avx512fp16-pr101846.c: Ditto.
> > > > * gcc.target/i386/avx512fp16-pr94680.c: Ditto.
> > > > ---
> > > >  gcc/config/i386/i386-expand.c | 29 ++-
> > > >  gcc/config/i386/sse.md| 54 +++-
> > > >  .../i386/avx512fp16-builtin_shuffle-1.c   | 86 +++
> > > >  .../gcc.target/i386/avx512fp16-pr101846.c | 56 
> > > >  .../gcc.target/i386/avx512fp16-pr94680.c  | 61 +
> > > >  5 files changed, 284 insertions(+), 2 deletions(-)
> > > >  create mode 100644 
> > > > gcc/testsuite/gcc.target/i386/avx512fp16-builtin_shuffle-1.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr101846.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
> > > >
> > > > diff --git a/gcc/config/i386/i386-expand.c 
> > > > b/gcc/config/i386/i386-expand.c
> > > > index c0924a59efb..0f50ed3b9f8 100644
> > > > --- a/gcc/config/i386/i386-expand.c
> > > > +++ b/gcc/config/i386/i386-expand.c
> > > > @@ -4836,6 +4836,18 @@ ix86_expand_vec_perm (rtx operands[])
> > > >e = GET_MODE_UNIT_SIZE (mode);
> > > >gcc_assert (w <= 64);
> > > >
> > > > +  if (GET_MODE_INNER (mode) == HFmode)
> > > > +{
> > > > +  machine_mode orig_mode = mode;
> > > > +  mode = mode_for_vector (HImode, w).require ();
> > > > +  if (target)
> > > > +   target = lowpart_subreg (mode, target, orig_mode);
> > > > +  if (op0)
> > > > +   op0 = lowpart_subreg (mode, op0, orig_mode);
> > > > +  if (op1)
> > > > +   op1 = lowpart_subreg (mode, op1, orig_mode);
> > > > +}
> > > > +
> ix86_expand_vec_perm is only called by (define_expand "vec_perm"
> which means target, op0 and op1 must exist, and you can drop
> if(target/op0/op1) stuff.
> > > >if (TARGET_AVX512F && one_operand_shuffle)
> > > >  {
> > > >rtx (*gen) (rtx, rtx, rtx) = NULL;
> > > > @@ -15092,7 +15104,8 @@ ix86_expand_vector_init (bool mmx_ok, rtx 
> > > > target, rtx vals)
> > > >   rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
> > > >   if (inner_mode == QImode
> > > >   || inner_mode == HImode
> > > > - || inner_mode == TImode)
> > > > + || inner_mode == TImode
> > > > + || inner_mode == HFmode)
> > &

[PATCH] i386: Fix wrong codegen for V8HF move without TARGET_AVX512F

2021-10-19 Thread Hongyu Wang via Gcc-patches
Since _Float16 type is enabled under sse2 target, returning
V8HFmode vector without AVX512F target would generate wrong
vmovdqa64 instruction. Adjust ix86_get_ssemov to avoid this.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

OK for master?

gcc/ChangeLog:
PR target/102812
* config/i386/i386.c (ix86_get_ssemov): Adjust HFmode vector
move without AVX512F target.

gcc/testsuite/ChangeLog:
PR target/102812
* gcc.target/i386/pr102812.c: New test.
---
 gcc/config/i386/i386.c   |  9 ++---
 gcc/testsuite/gcc.target/i386/pr102812.c | 12 
 2 files changed, 18 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102812.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 9cc903e826b..1d79180da9a 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -5399,9 +5399,12 @@ ix86_get_ssemov (rtx *operands, unsigned size,
   switch (scalar_mode)
{
case E_HFmode:
- opcode = (misaligned_p
-   ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
-   : "vmovdqa64");
+ if (!TARGET_AVX512F)
+   opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+ else
+   opcode = (misaligned_p
+ ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
+ : "vmovdqa64");
  break;
case E_SFmode:
  opcode = misaligned_p ? "%vmovups" : "%vmovaps";
diff --git a/gcc/testsuite/gcc.target/i386/pr102812.c 
b/gcc/testsuite/gcc.target/i386/pr102812.c
new file mode 100644
index 000..bad4fa9394e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102812.c
@@ -0,0 +1,12 @@
+/* PR target/102812 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4 -mno-avx" } */
+/* { dg-final { scan-assembler-not "vmovdqa64\t" } } */
+/* { dg-final { scan-assembler "movdqa\t" } } */
+
+typedef _Float16 v8hf __attribute__((__vector_size__ (16)));
+
+v8hf t (_Float16 a)
+{
+return (v8hf) {a, 0, 0, 0, 0, 0, 0, 0};
+}
-- 
2.18.1



Re: [PATCH] i386: Fix wrong codegen for V8HF move without TARGET_AVX512F

2021-10-20 Thread Hongyu Wang via Gcc-patches
Yes, updated patch.

gcc/ChangeLog:
PR target/102812
* config/i386/i386.c (ix86_get_ssemov): Adjust HFmode vector
move to use the same logic as HImode.

gcc/testsuite/ChangeLog:
PR target/102812
* gcc.target/i386/pr102812.c: New test.
---
 gcc/config/i386/i386.c   | 15 ---
 gcc/testsuite/gcc.target/i386/pr102812.c | 12 
 2 files changed, 24 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102812.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 9cc903e826b..159684ce549 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -5399,9 +5399,18 @@ ix86_get_ssemov (rtx *operands, unsigned size,
   switch (scalar_mode)
  {
  case E_HFmode:
-   opcode = (misaligned_p
- ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
- : "vmovdqa64");
+   if (evex_reg_p)
+ opcode = (misaligned_p
+   ? (TARGET_AVX512BW
+ ? "vmovdqu16"
+ : "vmovdqu64")
+   : "vmovdqa64");
+   else
+ opcode = (misaligned_p
+   ? (TARGET_AVX512BW
+ ? "vmovdqu16"
+ : "%vmovdqu")
+   : "%vmovdqa");
break;
  case E_SFmode:
opcode = misaligned_p ? "%vmovups" : "%vmovaps";
diff --git a/gcc/testsuite/gcc.target/i386/pr102812.c
b/gcc/testsuite/gcc.target/i386/pr102812.c
new file mode 100644
index 000..bad4fa9394e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102812.c
@@ -0,0 +1,12 @@
+/* PR target/102812 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4 -mno-avx" } */
+/* { dg-final { scan-assembler-not "vmovdqa64\t" } } */
+/* { dg-final { scan-assembler "movdqa\t" } } */
+
+typedef _Float16 v8hf __attribute__((__vector_size__ (16)));
+
+v8hf t (_Float16 a)
+{
+return (v8hf) {a, 0, 0, 0, 0, 0, 0, 0};
+}
-- 
2.18.1

Hongtao Liu via Gcc-patches  于2021年10月21日周四 下午1:24写道:
>
> On Wed, Oct 20, 2021 at 1:31 PM Hongyu Wang via Gcc-patches
>  wrote:
> >
> > Since _Float16 type is enabled under sse2 target, returning
> > V8HFmode vector without AVX512F target would generate wrong
> > vmovdqa64 instruction. Adjust ix86_get_ssemov to avoid this.
> >
> > Bootstraped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > OK for master?
> >
> > gcc/ChangeLog:
> > PR target/102812
> > * config/i386/i386.c (ix86_get_ssemov): Adjust HFmode vector
> > move without AVX512F target.
> >
> > gcc/testsuite/ChangeLog:
> > PR target/102812
> > * gcc.target/i386/pr102812.c: New test.
> > ---
> >  gcc/config/i386/i386.c   |  9 ++---
> >  gcc/testsuite/gcc.target/i386/pr102812.c | 12 
> >  2 files changed, 18 insertions(+), 3 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102812.c
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index 9cc903e826b..1d79180da9a 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -5399,9 +5399,12 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >switch (scalar_mode)
> > {
> > case E_HFmode:
> > - opcode = (misaligned_p
> > -   ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> > -   : "vmovdqa64");
> > + if (!TARGET_AVX512F)
> > +   opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > + else
> > +   opcode = (misaligned_p
> > + ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> > + : "vmovdqa64");
> >   break;
> Could we just use similar logic as HI?
>
> case E_HImode:
>   if (evex_reg_p)
> opcode = (need_unaligned_p
>   ? (TARGET_AVX512BW
>  ? "vmovdqu16"
>  : "vmovdqu64")
>   : "vmovdqa64");
>   else
> opcode = (need_unaligned_p
>   ? (TARGET_AVX512BW
>  ? "vmovdqu16"
>  : "%vmovdqu")
>   : "%vmovdqa");
>   break;
>
> > case E_SFmode:
> >   opcode = misaligned_p ? "%vmovups" : "%vmovaps";
> > diff --git a/gcc/testsuite/gcc.target/i386/pr102812.c 
> > b/gcc/testsuite/gcc.target/i386/pr102812.c
> > new file mode 100644
> > index 000..bad4fa9394e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr102812.c
> > @@ -0,0 +1,12 @@
> > +/* PR target/102812 */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -msse4 -mno-avx" } */
> > +/* { dg-final { scan-assembler-not "vmovdqa64\t" } } */
> > +/* { dg-final { scan-assembler "movdqa\t" } } */
> > +
> > +typedef _Float16 v8hf __attribute__((__vector_size__ (16)));
> > +
> > +v8hf t (_Float16 a)
> > +{
> > +return (v8hf) {a, 0, 0, 0, 0, 0, 0, 0};
> > +}
> > --
> > 2.18.1
> >
>
>
> --
> BR,
> Hongtao


Re: [PATCH] i386: Fix wrong codegen for V8HF move without TARGET_AVX512F

2021-10-21 Thread Hongyu Wang via Gcc-patches
Thanks for the reminder; I will adjust the testcase, since the output
for 128/256-bit HFmode loads has changed.

Martin Liška  于2021年10月21日周四 下午8:49写道:
>
> On 10/21/21 07:47, Hongyu Wang via Gcc-patches wrote:
> > |Yes, updated patch.|
>
> Note the patch caused the following test failure:
>
> FAIL: gcc.target/i386/avx512fp16-13.c scan-assembler-times vmovdqa64[ 
> \\t]+[^{\n]*%ymm[0-9]+[^\n]*\\) 1
> FAIL: gcc.target/i386/avx512fp16-13.c scan-assembler-times vmovdqa64[ 
> \\t]+[^{\n]*%xmm[0-9]+[^\n]*\\) 1
> FAIL: gcc.target/i386/avx512fp16-13.c scan-assembler-times vmovdqa64[ 
> \\t]+[^{\n]*%ymm[0-9]+[^\n]*\\) 1
> FAIL: gcc.target/i386/avx512fp16-13.c scan-assembler-times vmovdqa64[ 
> \\t]+[^{\n]*%xmm[0-9]+[^\n]*\\) 1
>
> Martin


[PATCH] Adjust testcase for 128/256 bit HF vector load/store

2021-10-21 Thread Hongyu Wang via Gcc-patches
Hi,

The HF vector move has been updated to align with the HI vector move,
so adjust the corresponding testcase for _Float16 vector load and store.

Tested on x86_64-pc-linux-gnu{-m32,}, pushed as obvious fix.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-13.c: Adjust scan-assembler for
xmm/ymm load/store.
---
 gcc/testsuite/gcc.target/i386/avx512fp16-13.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-13.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-13.c
index c3bae65da67..b73a8f44e1a 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-13.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-13.c
@@ -18,7 +18,7 @@ store256_ph (void *p, __m256h a)
   _mm256_store_ph (p, a);
 }
 
-/* { dg-final { scan-assembler-times "vmovdqa64\[ 
\\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ 
\\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)" 1 } } */
 
 void
 __attribute__ ((noinline, noclone))
@@ -27,7 +27,7 @@ store_ph (void *p, __m128h a)
   _mm_store_ph (p, a);
 }
 
-/* { dg-final { scan-assembler-times "vmovdqa64\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)" 1 } } */
 
 __m512h
 __attribute__ ((noinline, noclone))
@@ -45,7 +45,7 @@ load256_ph (void const *p)
   return _mm256_load_ph (p);
 }
 
-/* { dg-final { scan-assembler-times "vmovdqa64\[ 
\\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ 
\\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)" 1 } } */
 
 __m128h
 __attribute__ ((noinline, noclone))
@@ -53,7 +53,7 @@ load_ph (void const *p)
 {
   return _mm_load_ph (p);
 }
-/* { dg-final { scan-assembler-times "vmovdqa64\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)" 1 } } */
 
 __m512h
 __attribute__ ((noinline, noclone))
-- 
2.18.1



Re: [PATCH] testsuite: i386: Fix gcc.target/i386/avx512fp16-trunchf.c on Solaris [PR102835]

2021-10-25 Thread Hongyu Wang via Gcc-patches
I think this can be put in as an obvious fix.

Thanks for the patch.

Rainer Orth  于2021年10月25日周一 下午9:53写道:
>
> The gcc.target/i386/avx512fp16-trunchf.c test FAILs on 32-bit Solaris/x86:
>
> FAIL: gcc.target/i386/avx512fp16-trunchf.c scan-assembler-times vcvttsh2si[ 
> t]+[^{\\n]*(?:%xmm[0-9]|(%esp))+, %eax(?:\\n|[ t]+#) 3
> FAIL: gcc.target/i386/avx512fp16-trunchf.c scan-assembler-times vcvttsh2usi[ 
> t]+[^{\\n]*(?:%xmm[0-9]|(%esp))+, %eax(?:\\n|[ t]+#) 2
>
> This happens because Solaris defaults to -fno-omit-frame-pointer, so it
> uses %ebp instead of the expected %esp.  As Hongyu Wang suggested in the
> PR, this can be fixed by accepting both forms, which this patch does.
>
> Tested on i386-pc-solaris2.11 and x86_64-pc-linux-gnu.
>
> Ok for master?
>
> Rainer
>
> --
> -
> Rainer Orth, Center for Biotechnology, Bielefeld University
>
>
> 2021-10-20  Rainer Orth  
>
> gcc/testsuite:
> * gcc.target/i386/avx512fp16-trunchf.c: Allow for %ebp instead of
> %esp.
>


[PATCH] AVX512FP16: Optimize _Float16 reciprocal for div and sqrt

2021-10-26 Thread Hongyu Wang via Gcc-patches
Hi,

For the _Float16 type, add insns and expanders to optimize x / y to
x * rcp (y), and x / sqrt (y) to x * rsqrt (y).
As half-precision float has only a minor precision difference between div
and mul * rcp, there is no need for a Newton-Raphson approximation.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
Ok for master?

gcc/ChangeLog:

* config/i386/i386.c (use_rsqrt_p): Add mode parameter, enable
  HFmode rsqrt without TARGET_SSE_MATH.
(ix86_optab_supported_p): Refactor rint, adjust floor, ceil,
btrunc condition to be restricted by -ftrapping-math, adjust
use_rsqrt_p function call.
* config/i386/i386.md (rcphf2): New define_insn.
(rsqrthf2): Likewise.
* config/i386/sse.md (div3): Change VF2H to VF2.
(div3): New expander for HF mode.
(rsqrt2): Likewise.
(*avx512fp16_vmrcpv8hf2): New define_insn for rpad pass.
(*avx512fp16_vmrsqrtv8hf2): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-recip-1.c: New test.
* gcc.target/i386/avx512fp16-recip-2.c: Ditto.
* gcc.target/i386/pr102464.c: Add -fno-trapping-math.
---
 gcc/config/i386/i386.c| 29 +++---
 gcc/config/i386/i386.md   | 44 -
 gcc/config/i386/sse.md| 63 +++-
 .../gcc.target/i386/avx512fp16-recip-1.c  | 43 
 .../gcc.target/i386/avx512fp16-recip-2.c  | 97 +++
 gcc/testsuite/gcc.target/i386/pr102464.c  |  2 +-
 6 files changed, 258 insertions(+), 20 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-recip-2.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 299e1ab2621..c5789365d3b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18905,9 +18905,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
1.0/sqrt.  */
 
 static bool
-use_rsqrt_p ()
+use_rsqrt_p (machine_mode mode)
 {
-  return (TARGET_SSE && TARGET_SSE_MATH
+  return ((mode == HFmode
+  || (TARGET_SSE && TARGET_SSE_MATH))
  && flag_finite_math_only
  && !flag_trapping_math
  && flag_unsafe_math_optimizations);
@@ -23603,29 +23604,27 @@ ix86_optab_supported_p (int op, machine_mode mode1, 
machine_mode,
   return opt_type == OPTIMIZE_FOR_SPEED;
 
 case rint_optab:
-  if (mode1 == HFmode)
-   return true;
-  else if (SSE_FLOAT_MODE_P (mode1)
-  && TARGET_SSE_MATH
-  && !flag_trapping_math
-  && !TARGET_SSE4_1)
+  if (SSE_FLOAT_MODE_P (mode1)
+ && TARGET_SSE_MATH
+ && !flag_trapping_math
+ && !TARGET_SSE4_1
+ && mode1 != HFmode)
return opt_type == OPTIMIZE_FOR_SPEED;
   return true;
 
 case floor_optab:
 case ceil_optab:
 case btrunc_optab:
-  if (mode1 == HFmode)
-   return true;
-  else if (SSE_FLOAT_MODE_P (mode1)
-  && TARGET_SSE_MATH
-  && !flag_trapping_math
-  && TARGET_SSE4_1)
+  if (((SSE_FLOAT_MODE_P (mode1)
+   && TARGET_SSE_MATH
+   && TARGET_SSE4_1)
+  || mode1 == HFmode)
+ && !flag_trapping_math)
return true;
   return opt_type == OPTIMIZE_FOR_SPEED;
 
 case rsqrt_optab:
-  return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+  return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
 
 default:
   return true;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e733a40fc90..11535df5425 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -8417,11 +8417,27 @@
(match_operand:XF 2 "register_operand")))]
   "TARGET_80387")
 
+/* There is no more precision loss than Newton-Rhapson approximation
+  when using HFmode rcp/rsqrt, so do the transformation directly under
+  TARGET_RECIP_DIV and fast-math.  */
 (define_expand "divhf3"
   [(set (match_operand:HF 0 "register_operand")
(div:HF (match_operand:HF 1 "register_operand")
   (match_operand:HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16")
+  "TARGET_AVX512FP16"
+{
+  if (TARGET_RECIP_DIV
+  && optimize_insn_for_speed_p ()
+  && flag_finite_math_only && !flag_trapping_math
+  && flag_unsafe_math_optimizations)
+{
+  rtx op = gen_reg_rtx (HFmode);
+  operands[2] = force_reg (HFmode, operands[2]);
+  emit_insn (gen_rcphf2 (op, operands[2]));
+  emit_insn (gen_mulhf3 (operands[0], operands[1], op));
+  DONE;
+}
+})
 
 (define_expand "div3"
   [(set (match_operand:MODEF 0 "register_operand")
@@ -16973,6 +16989,19 @@
]
(symbol_ref "true")))])
 
+(define_insn "rcphf2"
+  [(set (match_operand:HF 0 "register_operand" "=v,v")
+   (unspec:HF [(match_operand:HF 1 "nonimmediate_operand" "v,m")]
+  UNSPEC_RCP))]
+  "TARGET_AVX512FP16"

Re: [PATCH] i386: Avoid fma_chain for -march=alderlake and sapphirerapids.

2022-12-14 Thread Hongyu Wang via Gcc-patches
If there is no objection, I'm going to backport the m_SAPPHIRERAPIDS
and m_ALDERLAKE change to GCC 12.

Uros Bizjak via Gcc-patches  于2022年12月7日周三 15:11写道:
>
> On Wed, Dec 7, 2022 at 7:36 AM Hongyu Wang  wrote:
> >
> > For Alderlake there is an issue similar to PR 81616; enabling
> > avoid_fma256_chain will also benefit Intel's latest platforms,
> > Alderlake and Sapphire Rapids.
> >
> > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.
> >
> > Ok for master?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Add
> > m_SAPPHIRERAPIDS, m_ALDERLAKE and m_CORE_ATOM.
>
> OK.
>
> Thanks,
> Uros.
>
> > ---
> >  gcc/config/i386/x86-tune.def | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> > index cd66f335113..db85de20bae 100644
> > --- a/gcc/config/i386/x86-tune.def
> > +++ b/gcc/config/i386/x86-tune.def
> > @@ -499,7 +499,8 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, 
> > "avoid_fma_chains", m_ZNVER)
> >
> >  /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
> > smaller FMA chain.  */
> > -DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
> > m_ZNVER3)
> > +DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
> > m_ZNVER3
> > + | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM)
> >
> >  /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
> > for v2df vector reduction.  */
> > --
> > 2.18.1
> >


[PATCH] Fix avx512ne2ps2bf16 wrong code [PR 111127]

2023-08-24 Thread Hongyu Wang via Gcc-patches
Hi, 

For PR111127, the wrong code was caused by a wrong expander for maskz.
Correct the parameter order for the avx512ne2ps2bf16_maskz expander.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}.
OK for master and backport to GCC13?

gcc/ChangeLog:

PR target/111127
* config/i386/sse.md (avx512f_cvtne2ps2bf16__maskz):
Adjust parameter order.

gcc/testsuite/ChangeLog:

PR target/111127
* gcc.target/i386/pr27.c: New test.
---
 gcc/config/i386/sse.md   |  4 ++--
 gcc/testsuite/gcc.target/i386/pr27.c | 24 
 2 files changed, 26 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr27.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index da85223a9b4..194dab9a9d0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30006,8 +30006,8 @@ (define_expand "avx512f_cvtne2ps2bf16__maskz"
(match_operand: 3 "register_operand")]
   "TARGET_AVX512BF16"
 {
-  emit_insn (gen_avx512f_cvtne2ps2bf16__mask(operands[0], operands[2],
-operands[1], CONST0_RTX(mode), operands[3]));
+  emit_insn (gen_avx512f_cvtne2ps2bf16__mask(operands[0], operands[1],
+operands[2], CONST0_RTX(mode), operands[3]));
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/pr27.c 
b/gcc/testsuite/gcc.target/i386/pr27.c
new file mode 100644
index 000..c124bc18bc4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr27.c
@@ -0,0 +1,24 @@
+/* PR target/27 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bf16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%zmm1, 
%zmm0, %zmm0\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%ymm1, 
%ymm0, %ymm0\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%xmm1, 
%xmm0, %xmm0\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include 
+
+__m512bh cvttest(__mmask32 k, __m512 a, __m512 b)
+{
+  return _mm512_maskz_cvtne2ps_pbh (k,a,b);
+}
+
+__m256bh cvttest2(__mmask16 k, __m256 a, __m256 b)
+{
+  return _mm256_maskz_cvtne2ps_pbh (k,a,b);
+}
+
+__m128bh cvttest3(__mmask8 k, __m128 a, __m128 b)
+{
+  return _mm_maskz_cvtne2ps_pbh (k,a,b);
+}
+
-- 
2.31.1



[PATCH 02/13] [APX EGPR] middle-end: Add index_reg_class with insn argument.

2023-08-31 Thread Hongyu Wang via Gcc-patches
Like base_reg_class, INDEX_REG_CLASS also does not support backend insn.
Add index_reg_class with insn argument for lra/reload usage.

gcc/ChangeLog:

* addresses.h (index_reg_class): New wrapper function like
base_reg_class.
* doc/tm.texi: Document INSN_INDEX_REG_CLASS.
* doc/tm.texi.in: Ditto.
* lra-constraints.cc (index_part_to_reg): Pass index_class.
(process_address_1): Call index_reg_class with curr_insn and
replace INDEX_REG_CLASS with its return value index_cl.
* reload.cc (find_reloads_address): Likewise.
(find_reloads_address_1): Likewise.
---
 gcc/addresses.h| 10 ++
 gcc/doc/tm.texi|  9 +
 gcc/doc/tm.texi.in |  9 +
 gcc/lra-constraints.cc | 17 +
 gcc/reload.cc  |  4 ++--
 5 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/gcc/addresses.h b/gcc/addresses.h
index 08b100cfe6d..4bd96a3fc83 100644
--- a/gcc/addresses.h
+++ b/gcc/addresses.h
@@ -47,6 +47,16 @@ base_reg_class (machine_mode mode ATTRIBUTE_UNUSED,
 #endif
 }
 
+inline enum reg_class
+index_reg_class (rtx_insn *insn ATTRIBUTE_UNUSED = NULL)
+{
+#ifdef INSN_INDEX_REG_CLASS
+  return INSN_INDEX_REG_CLASS (insn);
+#else
+  return INDEX_REG_CLASS;
+#endif
+}
+
 /* Wrapper function to unify target macros REGNO_MODE_CODE_OK_FOR_BASE_P,
REGNO_MODE_OK_FOR_REG_BASE_P, REGNO_MODE_OK_FOR_BASE_P and
REGNO_OK_FOR_BASE_P.
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index a4239e3de10..5a50f5cf7f3 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -2553,6 +2553,15 @@ address where its value is either multiplied by a scale 
factor or
 added to another register (as well as added to a displacement).
 @end defmac
 
+@defmac INSN_INDEX_REG_CLASS (@var{insn})
+A C expression whose value is the register class to which a valid
+index register must belong. An index register is one used in an
+address where its value is either multiplied by a scale factor or
+added to another register (as well as added to a displacement).
+@code{insn} indicates insn specific index register class should be
+subset of the original index register class.
+@end defmac
+
 @defmac REGNO_OK_FOR_BASE_P (@var{num})
 A C expression which is nonzero if register number @var{num} is
 suitable for use as a base register in operand addresses.
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 72898f3adba..65748e19ccd 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -2148,6 +2148,15 @@ address where its value is either multiplied by a scale 
factor or
 added to another register (as well as added to a displacement).
 @end defmac
 
+@defmac INSN_INDEX_REG_CLASS (@var{insn})
+A C expression whose value is the register class to which a valid
+index register must belong. An index register is one used in an
+address where its value is either multiplied by a scale factor or
+added to another register (as well as added to a displacement).
+@code{insn} indicates insn specific index register class should be
+subset of the original index register class.
+@end defmac
+
 @defmac REGNO_OK_FOR_BASE_P (@var{num})
 A C expression which is nonzero if register number @var{num} is
 suitable for use as a base register in operand addresses.
diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 9e7915ce934..161b67d8b73 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -3390,12 +3390,12 @@ base_plus_disp_to_reg (struct address_info *ad, rtx 
disp)
 /* Make reload of index part of address AD.  Return the new
pseudo.  */
 static rtx
-index_part_to_reg (struct address_info *ad)
+index_part_to_reg (struct address_info *ad, enum reg_class index_class)
 {
   rtx new_reg;
 
   new_reg = lra_create_new_reg (GET_MODE (*ad->index), NULL_RTX,
-   INDEX_REG_CLASS, NULL, "index term");
+   index_class, NULL, "index term");
   expand_mult (GET_MODE (*ad->index), *ad->index_term,
   GEN_INT (get_index_scale (ad)), new_reg, 1);
   return new_reg;
@@ -3650,13 +3650,14 @@ process_address_1 (int nop, bool check_only_p,
   /* If INDEX_REG_CLASS is assigned to base_term already and isn't to
  index_term, swap them so to avoid assigning INDEX_REG_CLASS to both
  when INDEX_REG_CLASS is a single register class.  */
+  enum reg_class index_cl = index_reg_class (curr_insn);
   if (ad.base_term != NULL
   && ad.index_term != NULL
-  && ira_class_hard_regs_num[INDEX_REG_CLASS] == 1
+  && ira_class_hard_regs_num[index_cl] == 1
   && REG_P (*ad.base_term)
   && REG_P (*ad.index_term)
-  && in_class_p (*ad.base_term, INDEX_REG_CLASS, NULL)
-  && ! in_class_p (*ad.index_term, INDEX_REG_CLASS, NULL))
+  && in_class_p (*ad.base_term, index_cl, NULL)
+  && ! in_class_p (*ad.index_term, index_cl, NULL))
 {
   std::swap (ad.base, ad.index);
   std::swap (ad.base_term, ad.index_term);
@@ -3680,7 +3681,7 @@ process_ad

[PATCH 00/13] [RFC] Support Intel APX EGPR

2023-08-31 Thread Hongyu Wang via Gcc-patches
Intel Advanced Performance Extensions (APX) has been released in [1].
It contains several extensions such as 16 extended general purpose registers
(EGPRs), push2/pop2, new data destination (NDD), and conditional compare
(CCMP/CTEST), combined with suppress-flags-write versions of common
instructions (NF). This RFC focuses on the EGPR implementation in GCC.

APX introduces a REX2 prefix to help represent EGPR for several legacy/SSE
instructions. For the remaining ones, it promotes some of them using evex
prefix for EGPR.  The main issue in APX is that not all legacy/sse/vex
instructions support EGPR. For example, instructions in legacy opcode map2/3
cannot use REX2 prefix since there is only 1bit in REX2 to indicate map0/1
instructions, e.g., pinsrd. Also, for most vector extensions, EGPR is supported
in their evex forms but not vex forms, which means the mnemonics with no evex
forms also cannot use EGPR, e.g., vphaddw. 

Such limitations bring some challenges with the current GCC infrastructure.
Generally, we use constraints to guide register allocation behavior. For a
register operand, it is easy to add a new constraint to a certain insn and limit
it to legacy or REX registers. But for a memory operand, if we only use a
constraint to limit the base/index register choice, reload has no backoff when
process_address allocates any EGPRs to the base/index reg, and then any
post-reload pass would get an ICE from the constraint.

Here is what we did to address the issue: 

Middle-end: 
-   Add rtx_insn parameter to base_reg_class, reuse the
MODE_CODE_BASE_REG_CLASS macro with rtx_insn parameter.
-   Add index_reg_class like base_reg_class, calls new INSN_INDEX_REG_CLASS
macro with rtx_insn parameter.
-   In process_address_1, add rtx_insn parameter to call sites of
base_reg_class, replace usage of INDEX_REG_CLASS to index_reg_class with
rtx_insn parameter.  

Back-end:
-   Extend GENERAL_REG_CLASS, INDEX_REG_CLASS and their supersets with
corresponding regno checks for EGPRs.
-   Add GENERAL_GPR16/INDEX_GPR16 class for old 16 GPRs.
-   Whole component is controlled under -mapxf/TARGET_APX_EGPR. If it is
not enabled, clear r16-r31 in accessible_reg_set.
-   New register_constraint “h” and memory_constraint “Bt” that disallows
EGPRs in operand.
-   New asm_gpr32 flag option to enable/disable gpr32 for inline asm,
  disabled by default.
-   If asm_gpr32 is disabled, replace constraints “r” to “h”, and
“m/memory” to “Bt”.
-   Extra insn attribute gpr32, value 0 indicates the alternative cannot
use EGPRs.
-   Add target functions for base_reg_class and index_reg_class, calls a
helper function to verify if insn can use EGPR in its memory_operand. 
-   In the helper function, the verification process works as follows:
1. Returns true if APX_EGPR disabled or insn is null. 
2. If the insn is inline asm, returns asm_gpr32 flag. 
3. Returns false for unrecognizable insn. 
4. Save recog_data and which_alternative, extract the insn, and restore them
before return. 
5. Loop through all enabled alternatives, if one of the enabled alternatives
have attr_gpr32 0, returns false, otherwise returns true.
-   For insn alternatives that cannot use gpr32 in register_operand, use h
constraint instead of r.
-   For insn alternatives that cannot use gpr32 in memory operand, use Bt
constraint instead of m, and set corresponding attr_gpr32 to 0.
-   Split output template with %v if the sse version of mnemonic cannot use
gpr32. 
-   For insn alternatives that cannot use gpr32 in memory operand, classify
the isa attribute and split alternatives to noavx, avx_noavx512f and etc., so
the helper function can properly loop through the available enabled mask.

Specifically for inline asm, we currently just map “r/m/memory” constraints as
an example. Eventually we will support entire mapping of all common constraints
if the mapping method was accepted.

Also, for vex instructions, we currently assume EGPR is supported if they have
an evex counterpart, since any APX-enabled machine will have AVX10 support for
all the evex encodings. We just disabled those mnemonics that don’t support
EGPR. So EGPR will be allowed under -mavx2 -mapxf for many vex mnemonics.

We haven’t disabled EGPR for 3DNOW/XOP/LWP/FMA4/TBM instructions, as they will
be co-operated with -mapxf. We can disable EGPR for them if the AMD maintainers
require it.

For testing, we have so far run the GCC testsuite and spec2017 with -mapxf on
the SDE simulator, with no more errors. Also, we inverted the register
allocation order to force r31 to be allocated first, and saw no more errors
except for those AMD-only instructions. We will conduct further tests, like
changing all do-compile to do-assemble, and add more to gcc/testsuite in the
future.

The RFC intends to describe our approach for the APX implementation of the EGPR
component. It may still have potential issues or bugs and requires further
optimization. Any comments are much appreciated.

[1]. 
https://www.intel.com/content

[PATCH 06/13] [APX EGPR] Map reg/mem constraints in inline asm to non-EGPR constraint.

2023-08-31 Thread Hongyu Wang via Gcc-patches
From: Kong Lingling 

In inline asm, we do not know if the insn can use EGPR, so disable EGPR
usage by default by mapping the common reg/mem constraints to non-EGPR
constraints. Use the flag -mapx-inline-asm-use-gpr32 to enable EGPR usage
for inline asm.

gcc/ChangeLog:

* config/i386/i386.cc (INCLUDE_STRING): Add include for
ix86_md_asm_adjust.
(ix86_md_asm_adjust): When APX EGPR enabled without specifying the
target option, map reg/mem constraints to non-EGPR constraints.
* config/i386/i386.opt: Add option mapx-inline-asm-use-gpr32.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-inline-gpr-norex2.c: New test.
---
 gcc/config/i386/i386.cc   |  44 +++
 gcc/config/i386/i386.opt  |   5 +
 .../gcc.target/i386/apx-inline-gpr-norex2.c   | 107 ++
 3 files changed, 156 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-inline-gpr-norex2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d26d9ab0d9d..9460ebbfda4 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -17,6 +17,7 @@ You should have received a copy of the GNU General Public 
License
 along with GCC; see the file COPYING3.  If not see
 .  */
 
+#define INCLUDE_STRING
 #define IN_TARGET_CODE 1
 
 #include "config.h"
@@ -23077,6 +23078,49 @@ ix86_md_asm_adjust (vec &outputs, vec & 
/*inputs*/,
   bool saw_asm_flag = false;
 
   start_sequence ();
+  /* TODO: Here we just mapped the general r/m constraints to non-EGPR
+   constraints, will eventually map all the usable constraints in the future. 
*/
+  if (TARGET_APX_EGPR && !ix86_apx_inline_asm_use_gpr32)
+{
+  /* Map "r" constraint in inline asm to "h" that disallows r16-r31
+and replace only r, exclude Br and Yr.  */
+  for (unsigned i = 0; i < constraints.length (); i++)
+   {
+ std::string *s = new std::string (constraints[i]);
+ size_t pos = s->find ('r');
+ while (pos != std::string::npos)
+   {
+ if (pos > 0
+ && (s->at (pos - 1) == 'Y' || s->at (pos - 1) == 'B'))
+   pos = s->find ('r', pos + 1);
+ else
+   {
+ s->replace (pos, 1, "h");
+ constraints[i] = (const char*) s->c_str ();
+ break;
+   }
+   }
+   }
+  /* Also map "m/memory/Bm" constraint that may use GPR32, replace them 
with
+"Bt/Bt/BT".  */
+  for (unsigned i = 0; i < constraints.length (); i++)
+   {
+ std::string *s = new std::string (constraints[i]);
+ size_t pos = s->find ("m");
+ size_t pos2 = s->find ("memory");
+ if (pos != std::string::npos)
+   {
+ if (pos > 0 && (s->at (pos - 1) == 'B'))
+ s->replace (pos - 1, 2, "BT");
+ else if (pos2 != std::string::npos)
+ s->replace (pos, 6, "Bt");
+ else
+ s->replace (pos, 1, "Bt");
+ constraints[i] = (const char*) s->c_str ();
+   }
+   }
+ }
+
   for (unsigned i = 0, n = outputs.length (); i < n; ++i)
 {
   const char *con = constraints[i];
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 1ee4d90186e..5c8d3a207e3 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1335,3 +1335,8 @@ Enum(apx_features) String(ndd) Value(apx_ndd) Set(4)
 
 EnumValue
 Enum(apx_features) String(all) Value(apx_all) Set(1)
+
+mapx-inline-asm-use-gpr32
+Target Var(ix86_apx_inline_asm_use_gpr32) Init(0)
+Enable GPR32 in inline asm when APX_EGPR enabled, do not
+hook reg or mem constraint in inline asm to GPR16.
diff --git a/gcc/testsuite/gcc.target/i386/apx-inline-gpr-norex2.c 
b/gcc/testsuite/gcc.target/i386/apx-inline-gpr-norex2.c
new file mode 100644
index 000..21534450045
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-inline-gpr-norex2.c
@@ -0,0 +1,107 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mapxf -m64 -march=skylake-avx512 -DDTYPE32" } */
+
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+#ifdef DTYPE32
+typedef u32 DTYPE;
+#define byteswap byteswapu32
+#endif
+
+#define R(x,n) ( (x >> n) | (x << (32 - n)))
+
+#define S0(x) (R(x, 2) ^ R(x,13) ^ R(x,22))
+#define S1(x) (R(x, 6) ^ R(x,11) ^ R(x,25))
+
+#define TT(a,b,c,d,e,f,g,h,x,K) \
+{\
+tmp1 = h + S1(e) + (g ^ (e & (f ^ g))) + K + x;\
+tmp2 = S0(a) + ((a & b) | (c & (a | b)));   \
+h  = tmp1 + tmp2;\
+d += tmp1;   \
+}
+
+static inline u32 byteswapu32(u32 x)
+{
+  x = (x & 0x) << 16 | (x & 0x) >> 16;
+  x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;  
+  return x;
+}
+
+void foo (DTYPE in[16], DTYPE

[PATCH 03/13] [APX_EGPR] Initial support for APX_F

2023-08-31 Thread Hongyu Wang via Gcc-patches
From: Kong Lingling 

Add -mapx-features= enumeration to separate subfeatures of APX_F.
-mapxf is treated same as previous ISA flag, while it sets
-mapx-features=apx_all that enables all subfeatures.

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (XSTATE_APX_F): New macro.
(XCR_APX_F_ENABLED_MASK): Likewise.
(get_available_features): Detect APX_F under
* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_APX_F_SET): New.
(OPTION_MASK_ISA2_APX_F_UNSET): Likewise.
(ix86_handle_option): Handle -mapxf.
* common/config/i386/i386-cpuinfo.h (FEATURE_APX_F): New.
* common/config/i386/i386-isas.h: Add entry for APX_F.
* config/i386/cpuid.h (bit_APX_F): New.
* config/i386/i386.h (bit_APX_F): (TARGET_APX_EGPR,
TARGET_APX_PUSH2POP2, TARGET_APX_NDD): New define.
* config/i386/i386-opts.h (enum apx_features): New enum.
* config/i386/i386-isa.def (APX_F): New DEF_PTA.
* config/i386/i386-options.cc (ix86_function_specific_save):
Save ix86_apx_features.
(ix86_function_specific_restore): Restore it.
(ix86_valid_target_attribute_inner_p): Add mapxf.
(ix86_option_override_internal): Set ix86_apx_features for PTA
and TARGET_APX_F. Also reports error when APX_F is set but not
having TARGET_64BIT.
* config/i386/i386.opt: (-mapxf): New ISA flag option.
(-mapx-features=): New enumeration option.
(apx_features): New enum type.
(apx_none): New enum value.
(apx_egpr): Likewise.
(apx_push2pop2): Likewise.
(apx_ndd): Likewise.
(apx_all): Likewise.
* doc/invoke.texi: Document mapxf.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-1.c: New test.
---
 gcc/common/config/i386/cpuinfo.h  | 12 +++-
 gcc/common/config/i386/i386-common.cc | 17 +
 gcc/common/config/i386/i386-cpuinfo.h |  1 +
 gcc/common/config/i386/i386-isas.h|  1 +
 gcc/config/i386/cpuid.h   |  1 +
 gcc/config/i386/i386-isa.def  |  1 +
 gcc/config/i386/i386-options.cc   | 15 +++
 gcc/config/i386/i386-opts.h   |  8 
 gcc/config/i386/i386.h|  4 
 gcc/config/i386/i386.opt  | 25 +
 gcc/doc/invoke.texi   | 11 +++
 gcc/testsuite/gcc.target/i386/apx-1.c |  8 
 12 files changed, 99 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-1.c

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 24ae0dbf0ac..141d3743316 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -678,6 +678,7 @@ get_available_features (struct __processor_model *cpu_model,
 #define XSTATE_HI_ZMM  0x80
 #define XSTATE_TILECFG 0x2
 #define XSTATE_TILEDATA0x4
+#define XSTATE_APX_F   0x8
 
 #define XCR_AVX_ENABLED_MASK \
   (XSTATE_SSE | XSTATE_YMM)
@@ -685,11 +686,13 @@ get_available_features (struct __processor_model 
*cpu_model,
   (XSTATE_SSE | XSTATE_YMM | XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM)
 #define XCR_AMX_ENABLED_MASK \
   (XSTATE_TILECFG | XSTATE_TILEDATA)
+#define XCR_APX_F_ENABLED_MASK XSTATE_APX_F
 
-  /* Check if AVX and AVX512 are usable.  */
+  /* Check if AVX, AVX512 and APX are usable.  */
   int avx_usable = 0;
   int avx512_usable = 0;
   int amx_usable = 0;
+  int apx_usable = 0;
   /* Check if KL is usable.  */
   int has_kl = 0;
   if ((ecx & bit_OSXSAVE))
@@ -709,6 +712,8 @@ get_available_features (struct __processor_model *cpu_model,
}
   amx_usable = ((xcrlow & XCR_AMX_ENABLED_MASK)
== XCR_AMX_ENABLED_MASK);
+  apx_usable = ((xcrlow & XCR_APX_F_ENABLED_MASK)
+   == XCR_APX_F_ENABLED_MASK);
 }
 
 #define set_feature(f) \
@@ -922,6 +927,11 @@ get_available_features (struct __processor_model 
*cpu_model,
  if (edx & bit_AMX_COMPLEX)
set_feature (FEATURE_AMX_COMPLEX);
}
+ if (apx_usable)
+   {
+ if (edx & bit_APX_F)
+   set_feature (FEATURE_APX_F);
+   }
}
 }
 
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 95468b7c405..86596e96ad1 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -123,6 +123,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA2_SM3_SET OPTION_MASK_ISA2_SM3
 #define OPTION_MASK_ISA2_SHA512_SET OPTION_MASK_ISA2_SHA512
 #define OPTION_MASK_ISA2_SM4_SET OPTION_MASK_ISA2_SM4
+#define OPTION_MASK_ISA2_APX_F_SET OPTION_MASK_ISA2_APX_F
 
 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2.  */
@@ -309,6 +310,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA2_SM3_UNSET OP

[PATCH 08/13] [APX EGPR] Handle GPR16 only vector move insns

2023-08-31 Thread Hongyu Wang via Gcc-patches
For vector move insns like vmovdqa/vmovdqu, their evex counterparts
require an explicit suffix 64/32/16/8. The usage of these instructions
is prohibited under AVX10_1 or AVX512F, so for AVX2+APX_F we select
vmovaps/vmovups for vector load/store insns that contain EGPR.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_get_ssemov): Check if egpr is used,
adjust mnemonic for vmovdqu/vmovdqa.
* config/i386/sse.md 
(*_vinsert_0):
Check if egpr is used, adjust mnemonic for vmovdqu/vmovdqa.
(avx_vec_concat): Likewise, and separate alternative 0 to
avx_noavx512f.
---
 gcc/config/i386/i386.cc | 31 ++-
 gcc/config/i386/sse.md  | 34 --
 2 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 412f3aefc43..f5d642948bc 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -5469,6 +5469,11 @@ ix86_get_ssemov (rtx *operands, unsigned size,
   bool evex_reg_p = (size == 64
 || EXT_REX_SSE_REG_P (operands[0])
 || EXT_REX_SSE_REG_P (operands[1]));
+
+  bool egpr_p = (TARGET_APX_EGPR
+&& (x86_extended_rex2reg_mentioned_p (operands[0])
+|| x86_extended_rex2reg_mentioned_p (operands[1])));
+
   machine_mode scalar_mode;
 
   const char *opcode = NULL;
@@ -5547,6 +5552,12 @@ ix86_get_ssemov (rtx *operands, unsigned size,
 ? "vmovdqu16"
 : "vmovdqu64")
  : "vmovdqa64");
+ else if (egpr_p)
+   opcode = (misaligned_p
+ ? (TARGET_AVX512BW
+? "vmovdqu16"
+: "%vmovups")
+ : "%vmovaps");
  else
opcode = (misaligned_p
  ? (TARGET_AVX512BW
@@ -5563,6 +5574,8 @@ ix86_get_ssemov (rtx *operands, unsigned size,
case E_TFmode:
  if (evex_reg_p)
opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+ else if (egpr_p)
+   opcode = misaligned_p ? "%vmovups" : "%vmovaps";
  else
opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
  break;
@@ -5581,6 +5594,12 @@ ix86_get_ssemov (rtx *operands, unsigned size,
 ? "vmovdqu8"
 : "vmovdqu64")
  : "vmovdqa64");
+ else if (egpr_p)
+   opcode = (misaligned_p
+ ? (TARGET_AVX512BW
+? "vmovdqu8"
+: "%vmovups")
+ : "%vmovaps");
  else
opcode = (misaligned_p
  ? (TARGET_AVX512BW
@@ -5589,12 +5608,18 @@ ix86_get_ssemov (rtx *operands, unsigned size,
  : "%vmovdqa");
  break;
case E_HImode:
- if (evex_reg_p)
+ if (evex_reg_p || egpr_p)
opcode = (misaligned_p
  ? (TARGET_AVX512BW
 ? "vmovdqu16"
 : "vmovdqu64")
  : "vmovdqa64");
+ else if (egpr_p)
+   opcode = (misaligned_p
+ ? (TARGET_AVX512BW
+? "vmovdqu16"
+: "%vmovups")
+ : "%vmovaps");
  else
opcode = (misaligned_p
  ? (TARGET_AVX512BW
@@ -5605,6 +5630,8 @@ ix86_get_ssemov (rtx *operands, unsigned size,
case E_SImode:
  if (evex_reg_p)
opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+ else if (egpr_p)
+   opcode = misaligned_p ? "%vmovups" : "%vmovaps";
  else
opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
  break;
@@ -5613,6 +5640,8 @@ ix86_get_ssemov (rtx *operands, unsigned size,
case E_OImode:
  if (evex_reg_p)
opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+ else if (egpr_p)
+   opcode = misaligned_p ? "%vmovups" : "%vmovaps";
  else
opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
  break;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 192e746fda3..bd6674d34f9 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -18918,6 +18918,12 @@ (define_insn 
"*_vinsert_0"
 {
   if (which_alternative == 0)
 return "vinsert\t{$0, %2, %1, %0|%0, %1, %2, 0}";
+  bool egpr_used = (TARGET_APX_EGPR
+   && x86_extended_rex2reg_mentioned_p (operands[2]));
+  const char *align_templ = egpr_used ? "vmovdqa\t{%2, %x0|%x0, %2}"
+ : "vmovaps\t{%2, %x0|%x0, %2}";
+  const char *unalign_templ = egpr_used ? "vmovdqu\t{%2, %x0|%x0, %2}"
+   : "vmovups\t{%2, %x0|%x0, %2}";
   switch (mode)
 {
 case E_V8DFmode:
@@ -18933,17 +18939,17 @@ (define_insn 
"*_vinsert_0"
 case E

[PATCH 01/13] [APX EGPR] middle-end: Add insn argument to base_reg_class

2023-08-31 Thread Hongyu Wang via Gcc-patches
From: Kong Lingling 

Current reload infrastructure does not support selective base_reg_class
for backend insn. Add insn argument to base_reg_class for
lra/reload usage.

gcc/ChangeLog:

* addresses.h (base_reg_class):  Add insn argument.
Pass to MODE_CODE_BASE_REG_CLASS.
(ok_for_base_p_1): Add insn argument.
Pass to REGNO_MODE_CODE_OK_FOR_BASE_P.
(regno_ok_for_base_p): Add insn argument and pass it to ok_for_base_p_1.
* config/avr/avr.h (MODE_CODE_BASE_REG_CLASS): Add insn argument.
(REGNO_MODE_CODE_OK_FOR_BASE_P): Ditto.
* config/gcn/gcn.h (MODE_CODE_BASE_REG_CLASS): Ditto.
(REGNO_MODE_CODE_OK_FOR_BASE_P): Ditto.
* config/rl78/rl78.h (REGNO_MODE_CODE_OK_FOR_BASE_P): Ditto.
(MODE_CODE_BASE_REG_CLASS): Ditto.
* doc/tm.texi: Add insn argument for MODE_CODE_BASE_REG_CLASS
and REGNO_MODE_CODE_OK_FOR_BASE_P.
* doc/tm.texi.in: Ditto.
* lra-constraints.cc (process_address_1): Pass insn to
base_reg_class.
(curr_insn_transform): Ditto.
* reload.cc (find_reloads): Ditto.
(find_reloads_address): Ditto.
(find_reloads_address_1): Ditto.
(find_reloads_subreg_address): Ditto.
* reload1.cc (maybe_fix_stack_asms): Ditto.
---
 gcc/addresses.h| 15 +--
 gcc/config/avr/avr.h   |  5 +++--
 gcc/config/gcn/gcn.h   |  4 ++--
 gcc/config/rl78/rl78.h |  6 --
 gcc/doc/tm.texi|  8 ++--
 gcc/doc/tm.texi.in |  8 ++--
 gcc/lra-constraints.cc | 15 +--
 gcc/reload.cc  | 30 ++
 gcc/reload1.cc |  2 +-
 9 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/gcc/addresses.h b/gcc/addresses.h
index 3519c241c6d..08b100cfe6d 100644
--- a/gcc/addresses.h
+++ b/gcc/addresses.h
@@ -28,11 +28,12 @@ inline enum reg_class
 base_reg_class (machine_mode mode ATTRIBUTE_UNUSED,
addr_space_t as ATTRIBUTE_UNUSED,
enum rtx_code outer_code ATTRIBUTE_UNUSED,
-   enum rtx_code index_code ATTRIBUTE_UNUSED)
+   enum rtx_code index_code ATTRIBUTE_UNUSED,
+   rtx_insn *insn ATTRIBUTE_UNUSED = NULL)
 {
 #ifdef MODE_CODE_BASE_REG_CLASS
   return MODE_CODE_BASE_REG_CLASS (MACRO_MODE (mode), as, outer_code,
-  index_code);
+  index_code, insn);
 #else
 #ifdef MODE_BASE_REG_REG_CLASS
   if (index_code == REG)
@@ -56,11 +57,12 @@ ok_for_base_p_1 (unsigned regno ATTRIBUTE_UNUSED,
 machine_mode mode ATTRIBUTE_UNUSED,
 addr_space_t as ATTRIBUTE_UNUSED,
 enum rtx_code outer_code ATTRIBUTE_UNUSED,
-enum rtx_code index_code ATTRIBUTE_UNUSED)
+enum rtx_code index_code ATTRIBUTE_UNUSED,
+rtx_insn* insn ATTRIBUTE_UNUSED = NULL)
 {
 #ifdef REGNO_MODE_CODE_OK_FOR_BASE_P
   return REGNO_MODE_CODE_OK_FOR_BASE_P (regno, MACRO_MODE (mode), as,
-   outer_code, index_code);
+   outer_code, index_code, insn);
 #else
 #ifdef REGNO_MODE_OK_FOR_REG_BASE_P
   if (index_code == REG)
@@ -79,12 +81,13 @@ ok_for_base_p_1 (unsigned regno ATTRIBUTE_UNUSED,
 
 inline bool
 regno_ok_for_base_p (unsigned regno, machine_mode mode, addr_space_t as,
-enum rtx_code outer_code, enum rtx_code index_code)
+enum rtx_code outer_code, enum rtx_code index_code,
+rtx_insn* insn = NULL)
 {
   if (regno >= FIRST_PSEUDO_REGISTER && reg_renumber[regno] >= 0)
 regno = reg_renumber[regno];
 
-  return ok_for_base_p_1 (regno, mode, as, outer_code, index_code);
+  return ok_for_base_p_1 (regno, mode, as, outer_code, index_code, insn);
 }
 
 #endif /* GCC_ADDRESSES_H */
diff --git a/gcc/config/avr/avr.h b/gcc/config/avr/avr.h
index 8e7e00db13b..1d090fe0838 100644
--- a/gcc/config/avr/avr.h
+++ b/gcc/config/avr/avr.h
@@ -280,12 +280,13 @@ enum reg_class {
 
 #define REGNO_REG_CLASS(R) avr_regno_reg_class(R)
 
-#define MODE_CODE_BASE_REG_CLASS(mode, as, outer_code, index_code)   \
+#define MODE_CODE_BASE_REG_CLASS(mode, as, outer_code, index_code, insn)   \
   avr_mode_code_base_reg_class (mode, as, outer_code, index_code)
 
 #define INDEX_REG_CLASS NO_REGS
 
-#define REGNO_MODE_CODE_OK_FOR_BASE_P(num, mode, as, outer_code, index_code) \
+#define REGNO_MODE_CODE_OK_FOR_BASE_P(num, mode, as, outer_code, \
+ index_code, insn)   \
   avr_regno_mode_code_ok_for_base_p (num, mode, as, outer_code, index_code)
 
 #define REGNO_OK_FOR_INDEX_P(NUM) 0
diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h
index 4ff9a5d4d12..b56702a77fd 100644
--- a/gcc/config/gcn/gcn.h
+++ b/gcc/config/gcn/gcn.h
@@ -437,9 +437,9 @@ enum reg_class
  0x, 0x, 0x, 0x, 0x, 0 }}
 
 #define REGNO_REG_CLASS(REGNO

[PATCH 05/13] [APX EGPR] Add register and memory constraints that disallow EGPR

2023-08-31 Thread Hongyu Wang via Gcc-patches
From: Kong Lingling 

For APX, as we extended the GENERAL_REG_CLASS, new constraints are
needed to restrict insns that cannot adopt EGPR either in its reg or
memory operands.

gcc/ChangeLog:

* config/i386/constraints.md (h): New register constraint
for GENERAL_GPR16.
(Bt): New non-EGPR memory constraint.
(BT): Likewise for Bm constraint.
* config/i386/i386.h (enum reg_class): Add new reg class
GENERAL_GPR16.
---
 gcc/config/i386/constraints.md | 19 ++-
 gcc/config/i386/i386.h |  4 
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index fd490f39110..f487bf2e5a3 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -19,7 +19,7 @@
 
 ;;; Unused letters:
 ;;;   H
-;;;   h j   z
+;;;   j   z
 
 ;; Integer register constraints.
 ;; It is not necessary to define 'r' here.
@@ -165,6 +165,8 @@ (define_register_constraint "YW"
 ;;  k  TLS address that allows insn using non-integer registers
 ;;  n  Memory operand without REX prefix
 ;;  r  Broadcast memory operand
+;;  t  Memory operand without EGPR
+;;  T  Vector memory operand without EGPR
 ;;  s  Sibcall memory operand, not valid for TARGET_X32
 ;;  w  Call memory operand, not valid for TARGET_X32
 ;;  z  Constant call address operand.
@@ -201,6 +203,18 @@ (define_special_memory_constraint "Bn"
   "@internal Memory operand without REX prefix."
   (match_operand 0 "norex_memory_operand"))
 
+(define_memory_constraint "Bt"
+  "@internal Memory operand without GPR32."
+  (and (match_operand 0 "memory_operand")
+   (not (and (match_test "TARGET_APX_EGPR")
+(match_test "x86_extended_rex2reg_mentioned_p (op)")
+
+(define_special_memory_constraint "BT"
+  "@internal vector memory operand without GPR32."
+  (and (match_operand 0 "vector_memory_operand")
+   (not (and (match_test "TARGET_APX_EGPR")
+(match_test "x86_extended_rex2reg_mentioned_p (op)")
+
 (define_special_memory_constraint "Br"
   "@internal bcst memory operand."
   (match_operand 0 "bcst_mem_operand"))
@@ -371,3 +385,6 @@ (define_address_constraint "Tv"
 (define_address_constraint "Ts"
   "Address operand without segment register"
   (match_operand 0 "address_no_seg_operand"))
+
+(define_register_constraint  "h"
+ "TARGET_APX_EGPR ? GENERAL_GPR16 : GENERAL_REGS")
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 1ab291177f5..7ec3086641c 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1295,6 +1295,8 @@ enum reg_class
   %r8 %r9 %r10 %r11 %r12 %r13 %r14 %r15
   %r16 %r17 %r18 %r19 %r20 %r21 %r22 %r23
   %r24 %r25 %r26 %r27 %r28 %r29 %r30 %r31 */
+  GENERAL_GPR16,   /* %eax %ebx %ecx %edx %esi %edi %ebp %esp
+  %r8 %r9 %r10 %r11 %r12 %r13 %r14 %r15 */
   FP_TOP_REG, FP_SECOND_REG,   /* %st(0) %st(1) */
   FLOAT_REGS,
   SSE_FIRST_REG,
@@ -1357,6 +1359,7 @@ enum reg_class
"INDEX_REGS",   \
"LEGACY_REGS",  \
"GENERAL_REGS", \
+   "GENERAL_GPR16",\
"FP_TOP_REG", "FP_SECOND_REG",  \
"FLOAT_REGS",   \
"SSE_FIRST_REG",\
@@ -1395,6 +1398,7 @@ enum reg_class
   { 0x7f,  0xff0,   0x0 }, /* INDEX_REGS */\
{ 0x900ff,0x0,   0x0 }, /* LEGACY_REGS */   \
{ 0x900ff,  0xff0,   0x000 },   /* GENERAL_REGS */  
\
+   { 0x900ff,  0xff0,   0x0 }, /* GENERAL_GPR16 */ \
  { 0x100,0x0,   0x0 }, /* FP_TOP_REG */\
  { 0x200,0x0,   0x0 }, /* FP_SECOND_REG */ \
 { 0xff00,0x0,   0x0 }, /* FLOAT_REGS */\
-- 
2.31.1



[PATCH 07/13] [APX EGPR] Add backend hook for base_reg_class/index_reg_class.

2023-08-31 Thread Hongyu Wang via Gcc-patches
From: Kong Lingling 

Add backend helper functions to verify if a rtx_insn can adopt EGPR to
its base/index reg of memory operand. The verification rule goes like
  1. For asm insn, enable/disable EGPR by ix86_apx_inline_asm_use_gpr32.
  2. Disable EGPR for unrecognized insn.
  3. If which_alternative is not decided, loop through enabled alternatives
  and check their attr_gpr32. Only enable EGPR when all enabled
  alternatives have attr_gpr32 = 1.
  4. If which_alternative is decided, enable/disable EGPR by its corresponding
  attr_gpr32.

gcc/ChangeLog:

* config/i386/i386-protos.h (ix86_mode_code_base_reg_class): New
prototype.
(ix86_regno_mode_code_ok_for_base_p): Likewise.
(ix86_insn_index_reg_class): Likewise.
* config/i386/i386.cc (ix86_memory_address_use_extended_reg_class_p):
New helper function to scan the insn.
(ix86_mode_code_base_reg_class): New function to choose BASE_REG_CLASS.
(ix86_regno_mode_code_ok_for_base_p): Likewise for base regno.
(ix86_insn_index_reg_class): Likewise for INDEX_REG_CLASS.
* config/i386/i386.h (MODE_CODE_BASE_REG_CLASS): Define.
(REGNO_MODE_CODE_OK_FOR_BASE_P): Likewise.
(INSN_INDEX_REG_CLASS): Likewise.
(enum reg_class): Add INDEX_GPR16.
(GENERAL_GPR16_REGNO_P): Define.
* config/i386/i386.md (gpr32): New attribute.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-inline-gpr-norex2.c: Adjust.
---
 gcc/config/i386/i386-protos.h |  7 ++
 gcc/config/i386/i386.cc   | 98 +++
 gcc/config/i386/i386.h| 16 ++-
 gcc/config/i386/i386.md   |  3 +
 .../gcc.target/i386/apx-inline-gpr-norex2.c   |  7 +-
 5 files changed, 127 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index bd4782800c4..78eb3e0f584 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -79,6 +79,13 @@ extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, 
rtx, rtx,
   rtx, rtx, rtx, rtx, bool);
 extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool);
 
+extern enum reg_class ix86_mode_code_base_reg_class (machine_mode, 
addr_space_t,
+RTX_CODE, RTX_CODE,
+rtx_insn *);
+extern bool ix86_regno_mode_code_ok_for_base_p (int, machine_mode, 
addr_space_t,
+   RTX_CODE, RTX_CODE,
+   rtx_insn *);
+extern enum reg_class ix86_insn_index_reg_class (rtx_insn *);
 extern bool constant_address_p (rtx);
 extern bool legitimate_pic_operand_p (rtx);
 extern bool legitimate_pic_address_disp_p (rtx);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 9460ebbfda4..412f3aefc43 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11054,6 +11054,104 @@ ix86_validate_address_register (rtx op)
   return NULL_RTX;
 }
 
+/* Return true if insn memory address can use any available reg
+   in BASE_REG_CLASS or INDEX_REG_CLASS, otherwise false.
+   For APX, some instruction can't be encoded with gpr32
+   which is BASE_REG_CLASS or INDEX_REG_CLASS, for that case
+   returns false.  */
+static bool
+ix86_memory_address_use_extended_reg_class_p (rtx_insn* insn)
+{
+  /* LRA will do some initialization with insn == NULL,
+ return the maximum reg class for that.
+ For other cases, real insn will be passed and checked.  */
+  bool ret = true;
+  if (TARGET_APX_EGPR && insn)
+{
+  if (asm_noperands (PATTERN (insn)) >= 0
+ || GET_CODE (PATTERN (insn)) == ASM_INPUT)
+   return ix86_apx_inline_asm_use_gpr32;
+
+  if (INSN_CODE (insn) < 0)
+   return false;
+
+  /* Try recog the insn before calling get_attr_gpr32. Save
+the current recog_data first.  */
+  /* Also save which_alternative for current recog.  */
+
+  struct recog_data_d recog_data_save = recog_data;
+  int which_alternative_saved = which_alternative;
+
+  /* Update the recog_data for alternative check. */
+  if (recog_data.insn != insn)
+   extract_insn_cached (insn);
+
+  /* If alternative is not set, loop through each alternative
+of insn and get gpr32 attr for all enabled alternatives.
+If any enabled alternatives has 0 value for gpr32, disallow
+gpr32 for addressing.  */
+  if (which_alternative_saved == -1)
+   {
+ alternative_mask enabled = get_enabled_alternatives (insn);
+ bool curr_insn_gpr32 = false;
+ for (int i = 0; i < recog_data.n_alternatives; i++)
+   {
+ if (!TEST_BIT (enabled, i))
+   continue;
+ which_alternative = i;
+ curr_insn_gpr32 = get_attr_gpr32 (insn);
+ if (!curr_insn_gpr32)
+  

[PATCH 11/13] [APX EGPR] Handle legacy insns that only support GPR16 (3/5)

2023-08-31 Thread Hongyu Wang via Gcc-patches
From: Kong Lingling 

Disable EGPR usage for below legacy insns in opcode map2/3 that have vex
but no evex counterpart.

insn list:
1. phminposuw/vphminposuw
2. ptest/vptest
3. roundps/vroundps, roundpd/vroundpd,
   roundss/vroundss, roundsd/vroundsd
4. pcmpestri/vpcmpestri, pcmpestrm/vpcmpestrm
5. pcmpistri/vpcmpistri, pcmpistrm/vpcmpistrm
6. aesimc/vaesimc, aeskeygenassist/vaeskeygenassist

gcc/ChangeLog:

* config/i386/i386-protos.h (x86_evex_reg_mentioned_p): New
prototype.
* config/i386/i386.cc (x86_evex_reg_mentioned_p): New
function.
* config/i386/i386.md (sse4_1_round2): Set attr gpr32 0
and constraint Bt/BT to all non-evex alternatives, adjust
alternative outputs if evex reg is mentioned.
* config/i386/sse.md (_ptest): Set attr gpr32 0
and constraint Bt/BT to all non-evex alternatives.
(ptesttf2): Likewise.
(_round): Likewise.
(sse4_2_pcmpestri): Likewise.
(sse4_2_pcmpestrm): Likewise.
(sse4_2_pcmpestr_cconly): Likewise.
(sse4_2_pcmpistr): Likewise.
(sse4_2_pcmpistri): Likewise.
(sse4_2_pcmpistrm): Likewise.
(sse4_2_pcmpistr_cconly): Likewise.
(aesimc): Likewise.
(aeskeygenassist): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-legacy-insn-check-norex2.c: Add intrinsic
tests.
---
 gcc/config/i386/i386-protos.h |  1 +
 gcc/config/i386/i386.cc   | 13 +++
 gcc/config/i386/i386.md   |  3 +-
 gcc/config/i386/sse.md| 93 +--
 .../i386/apx-legacy-insn-check-norex2.c   | 55 ++-
 5 files changed, 132 insertions(+), 33 deletions(-)

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 78eb3e0f584..bbb219e3039 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -65,6 +65,7 @@ extern bool extended_reg_mentioned_p (rtx);
 extern bool x86_extended_QIreg_mentioned_p (rtx_insn *);
 extern bool x86_extended_reg_mentioned_p (rtx);
 extern bool x86_extended_rex2reg_mentioned_p (rtx);
+extern bool x86_evex_reg_mentioned_p (rtx [], int);
 extern bool x86_maybe_negate_const_int (rtx *, machine_mode);
 extern machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
 
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f5d642948bc..ec93c5bab97 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22936,6 +22936,19 @@ x86_extended_rex2reg_mentioned_p (rtx insn)
   return false;
 }
 
+/* Return true when rtx operands mentions register that must be encoded using
+   evex prefix.  */
+bool
+x86_evex_reg_mentioned_p (rtx operands[], int nops)
+{
+  int i;
+  for (i = 0; i < nops; i++)
+if (EXT_REX_SSE_REG_P (operands[i])
+   || x86_extended_rex2reg_mentioned_p (operands[i]))
+  return true;
+  return false;
+}
+
 /* If profitable, negate (without causing overflow) integer constant
of mode MODE at location LOC.  Return true in this case.  */
 bool
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 83ad01b43c1..4c305e72389 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -21603,7 +21603,7 @@ (define_expand "significand2"
 (define_insn "sse4_1_round2"
   [(set (match_operand:MODEFH 0 "register_operand" "=x,x,x,v,v")
(unspec:MODEFH
- [(match_operand:MODEFH 1 "nonimmediate_operand" "0,x,m,v,m")
+ [(match_operand:MODEFH 1 "nonimmediate_operand" "0,x,Bt,v,m")
   (match_operand:SI 2 "const_0_to_15_operand")]
  UNSPEC_ROUND))]
   "TARGET_SSE4_1"
@@ -21616,6 +21616,7 @@ (define_insn "sse4_1_round2"
   [(set_attr "type" "ssecvt")
(set_attr "prefix_extra" "1,1,1,*,*")
(set_attr "length_immediate" "1")
+   (set_attr "gpr32" "1,1,0,1,1")
(set_attr "prefix" "maybe_vex,maybe_vex,maybe_vex,evex,evex")
(set_attr "isa" "noavx512f,noavx512f,noavx512f,avx512f,avx512f")
(set_attr "avx_partial_xmm_update" "false,false,true,false,true")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 05963de9219..456713b991a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -22617,11 +22617,12 @@ (define_insn "avx2_pblendd"
 
 (define_insn "sse4_1_phminposuw"
   [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x")
-   (unspec:V8HI [(match_operand:V8HI 1 "vector_operand" "YrBm,*xBm,xm")]
+   (unspec:V8HI [(match_operand:V8HI 1 "vector_operand" "YrBT,*xBT,xBt")]
 UNSPEC_PHMINPOSUW))]
   "TARGET_SSE4_1"
   "%vphminposuw\t{%1, %0|%0, %1}"
   [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "gpr32" "0")
(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "orig,orig,vex")
@@ -23810,12 +23811,13 @@ (define_insn "avx_vtest"
 (define_insn "*_ptest"
   [(set (reg FLAGS_REG)
(unspec [(match_operand:V_AVX 0 "register_operand" "Yr, *x, x")
-(match_operand:V_AV

[PATCH 04/13] [APX EGPR] Add 16 new integer general purpose registers

2023-08-31 Thread Hongyu Wang via Gcc-patches
From: Kong Lingling 

Extend GENERAL_REGS with extra r16-r31 registers like REX registers,
named as REX2 registers. They will only be enabled under
TARGET_APX_EGPR.

gcc/ChangeLog:

* config/i386/i386-protos.h (x86_extended_rex2reg_mentioned_p):
New function prototype.
* config/i386/i386.cc (regclass_map): Add mapping for 16 new
general registers.
(debugger64_register_map): Likewise.
(ix86_conditional_register_usage): Clear REX2 register when APX
disabled.
(ix86_code_end): Add handling for REX2 reg.
(print_reg): Likewise.
(ix86_output_jmp_thunk_or_indirect): Likewise.
(ix86_output_indirect_branch_via_reg): Likewise.
(ix86_attr_length_vex_default): Likewise.
(ix86_emit_save_regs): Adjust to allow saving r31.
(ix86_register_priority): Set REX2 reg priority same as REX.
(x86_extended_reg_mentioned_p): Add check for REX2 regs.
(x86_extended_rex2reg_mentioned_p): New function.
* config/i386/i386.h (CALL_USED_REGISTERS): Add new extended
registers.
(REG_ALLOC_ORDER): Likewise.
(FIRST_REX2_INT_REG): Define.
(LAST_REX2_INT_REG): Ditto.
(GENERAL_REGS): Add 16 new registers.
(INT_SSE_REGS): Likewise.
(FLOAT_INT_REGS): Likewise.
(FLOAT_INT_SSE_REGS): Likewise.
(INT_MASK_REGS): Likewise.
(ALL_REGS):Likewise.
(REX2_INT_REG_P): Define.
(REX2_INT_REGNO_P): Ditto.
(GENERAL_REGNO_P): Add REX2_INT_REGNO_P.
(REGNO_OK_FOR_INDEX_P): Ditto.
(REG_OK_FOR_INDEX_NONSTRICT_P): Add new extended registers.
* config/i386/i386.md: Add 16 new integer general
registers.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-egprs-names.c: New test.
* gcc.target/i386/apx-spill_to_egprs-1.c: Likewise.
* gcc.target/i386/apx-interrupt-1.c: Likewise.
---
 gcc/config/i386/i386-protos.h |   1 +
 gcc/config/i386/i386.cc   |  67 ++--
 gcc/config/i386/i386.h|  47 +---
 gcc/config/i386/i386.md   |  18 +++-
 .../gcc.target/i386/apx-egprs-names.c |  17 +++
 .../gcc.target/i386/apx-interrupt-1.c | 102 ++
 .../gcc.target/i386/apx-spill_to_egprs-1.c|  25 +
 7 files changed, 253 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-egprs-names.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-interrupt-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-spill_to_egprs-1.c

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 9ffb125fc2b..bd4782800c4 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -64,6 +64,7 @@ extern bool symbolic_reference_mentioned_p (rtx);
 extern bool extended_reg_mentioned_p (rtx);
 extern bool x86_extended_QIreg_mentioned_p (rtx_insn *);
 extern bool x86_extended_reg_mentioned_p (rtx);
+extern bool x86_extended_rex2reg_mentioned_p (rtx);
 extern bool x86_maybe_negate_const_int (rtx *, machine_mode);
 extern machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
 
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1bc3f11ff07..d26d9ab0d9d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -169,7 +169,12 @@ enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
   ALL_SSE_REGS, ALL_SSE_REGS, ALL_SSE_REGS, ALL_SSE_REGS,
   /* Mask registers.  */
   ALL_MASK_REGS, MASK_REGS, MASK_REGS, MASK_REGS,
-  MASK_REGS, MASK_REGS, MASK_REGS, MASK_REGS
+  MASK_REGS, MASK_REGS, MASK_REGS, MASK_REGS,
+  /* REX2 registers */
+  GENERAL_REGS, GENERAL_REGS, GENERAL_REGS, GENERAL_REGS,
+  GENERAL_REGS, GENERAL_REGS, GENERAL_REGS, GENERAL_REGS,
+  GENERAL_REGS, GENERAL_REGS, GENERAL_REGS, GENERAL_REGS,
+  GENERAL_REGS, GENERAL_REGS, GENERAL_REGS, GENERAL_REGS,
 };
 
 /* The "default" register map used in 32bit mode.  */
@@ -227,7 +232,10 @@ int const debugger64_register_map[FIRST_PSEUDO_REGISTER] =
   /* AVX-512 registers 24-31 */
   75, 76, 77, 78, 79, 80, 81, 82,
   /* Mask registers */
-  118, 119, 120, 121, 122, 123, 124, 125
+  118, 119, 120, 121, 122, 123, 124, 125,
+  /* rex2 extended integer registers */
+  130, 131, 132, 133, 134, 135, 136, 137,
+  138, 139, 140, 141, 142, 143, 144, 145
 };
 
 /* Define the register numbers to be used in Dwarf debugging information.
@@ -521,6 +529,13 @@ ix86_conditional_register_usage (void)
 
   accessible_reg_set &= ~reg_class_contents[ALL_MASK_REGS];
 }
+
+  /* If APX is disabled, disable the registers.  */
+  if (! (TARGET_APX_EGPR && TARGET_64BIT))
+{
+  for (i = FIRST_REX2_INT_REG; i <= LAST_REX2_INT_REG; i++)
+   CLEAR_HARD_REG_BIT (accessible_reg_set, i);
+}
 }
 
 /* Canonicalize a comparison from one we don't have to one we do have.  */
@@ -6179,6 +6194,13 @@ ix86_code_end (void)
regno

[PATCH 09/13] [APX EGPR] Handle legacy insn that only support GPR16 (1/5)

2023-08-31 Thread Hongyu Wang via Gcc-patches
From: Kong Lingling 

These legacy insns in opcode map0/1 only support GPR16
and do not have vex/evex counterparts, so directly adjust constraints and
add gpr32 attr to patterns.

insn list:
1. xsave/xsave64, xrstor/xrstor64
2. xsaves/xsaves64, xrstors/xrstors64
3. xsavec/xsavec64
4. xsaveopt/xsaveopt64
5. fxsave64/fxrstor64

gcc/ChangeLog:

* config/i386/i386.md (): Set attr gpr32 0 and constraint
Bt.
(_rex64): Likewise.
(_rex64): Likewise.
(64): Likewise.
(fxsave64): Likewise.
(fxrstor64): Likewise.

gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Add apxf check.
* gcc.target/i386/apx-legacy-insn-check-norex2.c: New test.
* gcc.target/i386/apx-legacy-insn-check-norex2-asm.c: New assembler 
test.
---
 gcc/config/i386/i386.md   | 18 +++
 .../i386/apx-legacy-insn-check-norex2-asm.c   |  5 
 .../i386/apx-legacy-insn-check-norex2.c   | 30 +++
 gcc/testsuite/lib/target-supports.exp | 10 +++
 4 files changed, 57 insertions(+), 6 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/i386/apx-legacy-insn-check-norex2-asm.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-legacy-insn-check-norex2.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b9eaea78f00..83ad01b43c1 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -25626,11 +25626,12 @@ (define_insn "fxsave"
 (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
 
 (define_insn "fxsave64"
-  [(set (match_operand:BLK 0 "memory_operand" "=m")
+  [(set (match_operand:BLK 0 "memory_operand" "=Bt")
(unspec_volatile:BLK [(const_int 0)] UNSPECV_FXSAVE64))]
   "TARGET_64BIT && TARGET_FXSR"
   "fxsave64\t%0"
   [(set_attr "type" "other")
+   (set_attr "gpr32" "0")
(set_attr "memory" "store")
(set (attr "length")
 (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
@@ -25646,11 +25647,12 @@ (define_insn "fxrstor"
 (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
 
 (define_insn "fxrstor64"
-  [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "m")]
+  [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "Bt")]
UNSPECV_FXRSTOR64)]
   "TARGET_64BIT && TARGET_FXSR"
   "fxrstor64\t%0"
   [(set_attr "type" "other")
+   (set_attr "gpr32" "0")
(set_attr "memory" "load")
(set (attr "length")
 (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
@@ -25704,7 +25706,7 @@ (define_insn ""
 (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
 
 (define_insn "_rex64"
-  [(set (match_operand:BLK 0 "memory_operand" "=m")
+  [(set (match_operand:BLK 0 "memory_operand" "=Bt")
(unspec_volatile:BLK
 [(match_operand:SI 1 "register_operand" "a")
  (match_operand:SI 2 "register_operand" "d")]
@@ -25713,11 +25715,12 @@ (define_insn "_rex64"
   "\t%0"
   [(set_attr "type" "other")
(set_attr "memory" "store")
+   (set_attr "gpr32" "0")
(set (attr "length")
 (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
 
 (define_insn ""
-  [(set (match_operand:BLK 0 "memory_operand" "=m")
+  [(set (match_operand:BLK 0 "memory_operand" "=Bt")
(unspec_volatile:BLK
 [(match_operand:SI 1 "register_operand" "a")
  (match_operand:SI 2 "register_operand" "d")]
@@ -25726,6 +25729,7 @@ (define_insn ""
   "\t%0"
   [(set_attr "type" "other")
(set_attr "memory" "store")
+   (set_attr "gpr32" "0")
(set (attr "length")
 (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
 
@@ -25743,7 +25747,7 @@ (define_insn ""
 
 (define_insn "_rex64"
[(unspec_volatile:BLK
- [(match_operand:BLK 0 "memory_operand" "m")
+ [(match_operand:BLK 0 "memory_operand" "Bt")
   (match_operand:SI 1 "register_operand" "a")
   (match_operand:SI 2 "register_operand" "d")]
  ANY_XRSTOR)]
@@ -25751,12 +25755,13 @@ (define_insn "_rex64"
   "\t%0"
   [(set_attr "type" "other")
(set_attr "memory" "load")
+   (set_attr "gpr32" "0")
(set (attr "length")
 (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
 
 (define_insn "64"
[(unspec_volatile:BLK
- [(match_operand:BLK 0 "memory_operand" "m")
+ [(match_operand:BLK 0 "memory_operand" "Bt")
   (match_operand:SI 1 "register_operand" "a")
   (match_operand:SI 2 "register_operand" "d")]
  ANY_XRSTOR64)]
@@ -25764,6 +25769,7 @@ (define_insn "64"
   "64\t%0"
   [(set_attr "type" "other")
(set_attr "memory" "load")
+   (set_attr "gpr32" "0")
(set (attr "length")
 (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
 
diff --git a/gcc/testsuite/gcc.target/i386/apx-legacy-insn-check-norex2-asm.c 
b/gcc/testsuite/gcc.target/i386/apx-legacy-insn-check-norex2-asm.c
new file mode 100644
index 000..7ecc861435f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-legacy-insn-check-

[PATCH 10/13] [APX EGPR] Handle legacy insns that only support GPR16 (2/5)

2023-08-31 Thread Hongyu Wang via Gcc-patches
From: Kong Lingling 

These legacy insns in opcode map2/3 have a vex but no evex
counterpart; disable EGPR for them by adjusting alternatives and
attr_gpr32.

insn list:
1. phaddw/vphaddw, phaddd/vphaddd, phaddsw/vphaddsw
2. phsubw/vphsubw, phsubd/vphsubd, phsubsw/vphsubsw
3. psignb/vpsignb, psignw/vpsignw, psignd/vpsignd
4. blendps/vblendps, blendpd/vblendpd
5. blendvps/vblendvps, blendvpd/vblendvpd
6. pblendvb/vpblendvb, pblendw/vpblendw
7. mpsadbw/vmpsadbw
8. dpps/vdpps, dppd/vdppd
9. pcmpeqq/vpcmpeqq, pcmpgtq/vpcmpgtq

gcc/ChangeLog:

* config/i386/sse.md (avx2_phwv16hi3): Set
attr gpr32 0 and constraint Bt/BM to all mem alternatives.
(ssse3_phwv8hi3): Likewise.
(ssse3_phwv4hi3): Likewise.
(avx2_phdv8si3): Likewise.
(ssse3_phdv4si3): Likewise.
(ssse3_phdv2si3): Likewise.
(_psign3): Likewise.
(ssse3_psign3): Likewise.
(_blend_blendv_blendv_lt): Likewise.
(*_blendv_not_ltint): Likewise.
(_dp): Likewise.
(_mpsadbw): Likewise.
(_pblendvb): Likewise.
(*_pblendvb_lt): Likewise.
(sse4_1_pblend): Likewise.
(*avx2_pblend): Likewise.
(avx2_permv2ti): Likewise.
(*avx_vperm2f128_nozero): Likewise.
(*avx2_eq3): Likewise.
(*sse4_1_eqv2di3): Likewise.
(sse4_2_gtv2di3): Likewise.
(avx2_gt3): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-legacy-insn-check-norex2.c: Add
sse/vex intrinsic tests.
---
 gcc/config/i386/sse.md|  80 -
 .../i386/apx-legacy-insn-check-norex2.c   | 106 ++
 2 files changed, 159 insertions(+), 27 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bd6674d34f9..05963de9219 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16837,7 +16837,7 @@ (define_insn "*avx2_eq3"
   [(set (match_operand:VI_256 0 "register_operand" "=x")
(eq:VI_256
  (match_operand:VI_256 1 "nonimmediate_operand" "%x")
- (match_operand:VI_256 2 "nonimmediate_operand" "xm")))]
+ (match_operand:VI_256 2 "nonimmediate_operand" "xBt")))]
   "TARGET_AVX2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
   "vpcmpeq\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "ssecmp")
@@ -16845,6 +16845,7 @@ (define_insn "*avx2_eq3"
  (if_then_else (eq (const_string "mode") (const_string "V4DImode"))
   (const_string "1")
   (const_string "*")))
+   (set_attr "gpr32" "0")
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
 
@@ -17027,7 +17028,7 @@ (define_insn "*sse4_1_eqv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,x")
(eq:V2DI
  (match_operand:V2DI 1 "vector_operand" "%0,0,x")
- (match_operand:V2DI 2 "vector_operand" "YrBm,*xBm,xm")))]
+ (match_operand:V2DI 2 "vector_operand" "YrBT,*xBT,xBt")))]
   "TARGET_SSE4_1 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
   "@
pcmpeqq\t{%2, %0|%0, %2}
@@ -17035,6 +17036,7 @@ (define_insn "*sse4_1_eqv2di3"
vpcmpeqq\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "noavx,noavx,avx")
(set_attr "type" "ssecmp")
+   (set_attr "gpr32" "0")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "orig,orig,vex")
(set_attr "mode" "TI")])
@@ -17043,7 +17045,7 @@ (define_insn "*sse2_eq3"
   [(set (match_operand:VI124_128 0 "register_operand" "=x,x")
(eq:VI124_128
  (match_operand:VI124_128 1 "vector_operand" "%0,x")
- (match_operand:VI124_128 2 "vector_operand" "xBm,xm")))]
+ (match_operand:VI124_128 2 "vector_operand" "xBm,xBt")))]
   "TARGET_SSE2
&& !(MEM_P (operands[1]) && MEM_P (operands[2]))"
   "@
@@ -17058,7 +17060,7 @@ (define_insn "sse4_2_gtv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,x")
(gt:V2DI
  (match_operand:V2DI 1 "register_operand" "0,0,x")
- (match_operand:V2DI 2 "vector_operand" "YrBm,*xBm,xm")))]
+ (match_operand:V2DI 2 "vector_operand" "YrBT,*xBT,xBt")))]
   "TARGET_SSE4_2"
   "@
pcmpgtq\t{%2, %0|%0, %2}
@@ -17066,6 +17068,7 @@ (define_insn "sse4_2_gtv2di3"
vpcmpgtq\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "noavx,noavx,avx")
(set_attr "type" "ssecmp")
+   (set_attr "gpr32" "0")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "orig,orig,vex")
(set_attr "mode" "TI")])
@@ -17074,7 +17077,7 @@ (define_insn "avx2_gt3"
   [(set (match_operand:VI_256 0 "register_operand" "=x")
(gt:VI_256
  (match_operand:VI_256 1 "register_operand" "x")
- (match_operand:VI_256 2 "nonimmediate_operand" "xm")))]
+ (match_operand:VI_256 2 "nonimmediate_operand" "xBt")))]
   "TARGET_AVX2"
   "vpcmpgt\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "ssecmp")
@@ -17082,6 +17085,7 @@ (define_insn "avx2_gt3"
  (if_then_else (eq (const_string "mode") (const_string "V4DImode"))
   (const_string "1")

  1   2   >