[gcc r15-167] Update libbid according to the latest Intel Decimal Floating-Point Math Library.

2024-05-05 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:affd77d3fe7bfb525b3fb23316d164e847ed02d1

commit r15-167-gaffd77d3fe7bfb525b3fb23316d164e847ed02d1
Author: liuhongt 
Date:   Wed Mar 27 08:20:13 2024 +0800

Update libbid according to the latest Intel Decimal Floating-Point Math 
Library.

The Intel Decimal Floating-Point Math Library is available as open-source 
on Netlib[1].

[1] https://www.netlib.org/misc/intel/.

libgcc/config/libbid/ChangeLog:

* bid128_fma.c (add_and_round): Fix bug: the result
of (+5E+368)*(+10E-34)+(-10E+369) was returning
-99E+336 instead of expected
result -10E+337.
(bid128_ext_fma): Ditto.
(bid64qqq_fma): Ditto.
* bid128_noncomp.c: Change return type of bid128_class from
int to class_t.
* bid128_round_integral.c: Add default case to avoid compiler
warning.
* bid128_string.c (bid128_to_string): Replace 0x30 with '0'
for zero digit.
(bid128_from_string): Ditto.
* bid32_to_bid128.c (bid128_to_bid32): Fix Bug. In addition
to the INEXACT flag, the UNDERFLOW flag needs to be set (and
was not) when converting an input such as
+6931674235302037148946035460357709E+1857 to +100E-101
* bid32_to_bid64.c (bid64_to_bid32): Fix bug. In addition to
the INEXACT flag, the UNDERFLOW flag needs to be set (and was
not) when converting an input such as +9991E-111
to +100E-101. Furthermore, significant bits of NaNs are
set correctly now. For example, 0x7c3b9aca was
returning 0x7c02 instead of 0x7c000100.
* bid64_noncomp.c: Change return type of bid64_class from int
to class_t.
* bid64_round_integral.c (bid64_round_integral_exact): Add
default case to avoid compiler warning.
* bid64_string.c (bid64_from_string): Fix bug for rounding
up. The input string "1" was returning
+1001E+1 instead of +1000E+1.
* bid64_to_bid128.c (bid128_to_bid64): Fix bug, in addition to
the INEXACT flag, the UNDERFLOW flag needs to be set (and was
not) when converting an input such as
+99E-417 to
+1000E-398.
* bid_binarydecimal.c (bid32_to_binary64): Fix bug for
conversion between binary and bid types. For example,
0x7c0F4240 was returning 0x7FFFA120 instead of
expected double precision 0x7FF8.
(binary64_to_bid32): Ditto.
(binary80_to_bid32): Ditto.
(binary128_to_bid32): Ditto.
(binary80_to_bid64): Ditto.
(binary128_to_bid64): Ditto.
* bid_conf.h (BID_HIGH_128W): New macro.
(BID_LOW_128W): Ditto.
* bid_functions.h (__ENABLE_BINARY80__): Ditto.
(ALIGN): Ditto.
* bid_inline_add.h (get_add128): Add default case to avoid compiler
warning.
* bid_internal.h (get_BID64): Ditto.
(fast_get_BID64_check_OF): Ditto.
(ALIGN): New macro.

Co-authored-by: Anderson, Cristina S 
Co-authored-by: Akkas, Ahmet 
Co-authored-by: Cornea, Marius 

Diff:
---
 libgcc/config/libbid/bid128_fma.c| 188 ++-
 libgcc/config/libbid/bid128_noncomp.c|   2 +-
 libgcc/config/libbid/bid128_round_integral.c |   2 +
 libgcc/config/libbid/bid128_string.c |   7 +-
 libgcc/config/libbid/bid32_to_bid128.c   |   3 -
 libgcc/config/libbid/bid32_to_bid64.c|  11 +-
 libgcc/config/libbid/bid64_noncomp.c |   2 +-
 libgcc/config/libbid/bid64_round_integral.c  |   2 +
 libgcc/config/libbid/bid64_string.c  |  21 ++-
 libgcc/config/libbid/bid64_to_bid128.c   |   3 -
 libgcc/config/libbid/bid_binarydecimal.c | 167 
 libgcc/config/libbid/bid_conf.h  |   8 ++
 libgcc/config/libbid/bid_functions.h |  23 +++-
 libgcc/config/libbid/bid_inline_add.h|   2 +
 libgcc/config/libbid/bid_internal.h  |  17 +--
 15 files changed, 220 insertions(+), 238 deletions(-)

diff --git a/libgcc/config/libbid/bid128_fma.c 
b/libgcc/config/libbid/bid128_fma.c
index 67233193a42..cbcf225546f 100644
--- a/libgcc/config/libbid/bid128_fma.c
+++ b/libgcc/config/libbid/bid128_fma.c
@@ -417,13 +417,12 @@ add_and_round (int q3,
   R128.w[1] = R256.w[1];
   R128.w[0] = R256.w[0];
 }
+if (e4 + x0 < expmin) { // for all rounding modes
+  is_tiny = 1;
+}
 // the rounded result has p34 = 34 digits
 e4 = e4 + x0 + incr_exp;
-if (rnd_mode == ROUNDING_TO_NEAREST) {
-  if (e

[gcc r15-235] Support dot_prod optabs for 64-bit vector.

2024-05-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:fa911365490a7ca308878517a4af6189ffba7ed6

commit r15-235-gfa911365490a7ca308878517a4af6189ffba7ed6
Author: liuhongt 
Date:   Wed Dec 20 11:43:25 2023 +0800

Support dot_prod optabs for 64-bit vector.

gcc/ChangeLog:

PR target/113079
* config/i386/mmx.md (usdot_prodv8qi): New expander.
(sdot_prodv8qi): Ditto.
(udot_prodv8qi): Ditto.
(usdot_prodv4hi): Ditto.
(udot_prodv4hi): Ditto.
(sdot_prodv4hi): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr113079.c: New test.
* gcc.target/i386/pr113079-2.c: New test.
* gcc.target/i386/sse4-pr113079-2.c: New test.

Diff:
---
 gcc/config/i386/mmx.md  | 195 
 gcc/testsuite/gcc.target/i386/pr113079-2.c  | 161 +++
 gcc/testsuite/gcc.target/i386/pr113079.c|  57 +++
 gcc/testsuite/gcc.target/i386/sse4-pr113079-2.c | 158 +++
 4 files changed, 571 insertions(+)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 9a8d6030d8b..5f342497885 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -6342,6 +6342,201 @@
   DONE;
 })
 
+(define_expand "usdot_prodv8qi"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V8QI 1 "register_operand")
+   (match_operand:V8QI 2 "register_operand")
+   (match_operand:V2SI 3 "register_operand")]
+  "TARGET_MMX_WITH_SSE && TARGET_SSE4_1"
+{
+  operands[1] = force_reg (V8QImode, operands[1]);
+  operands[2] = force_reg (V8QImode, operands[2]);
+  operands[3] = force_reg (V2SImode, operands[3]);
+
+  if ((TARGET_AVX512VNNI && TARGET_AVX512VL)
+ || TARGET_AVXVNNI)
+{
+  rtx op1 = lowpart_subreg (V16QImode, operands[1], V8QImode);
+  rtx op2 = lowpart_subreg (V16QImode, operands[2], V8QImode);
+  rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_usdot_prodv16qi (op0, op1, op2, op3));
+  emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode));
+ }
+   else
+ {
+  rtx op1 = gen_reg_rtx (V8HImode);
+  rtx op2 = gen_reg_rtx (V8HImode);
+  rtx op3 = gen_reg_rtx (V4SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+  rtx op0_1 = gen_reg_rtx (V4SImode);
+
+  emit_move_insn (op3, CONST0_RTX (V4SImode));
+  emit_insn (gen_zero_extendv8qiv8hi2 (op1, operands[1]));
+  emit_insn (gen_extendv8qiv8hi2 (op2, operands[2]));
+  emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3));
+
+  /* vec_perm (op0, 2, 3, 0, 1);  */
+  emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78)));
+  emit_insn (gen_addv4si3 (op0, op0, op0_1));
+  emit_insn (gen_addv2si3 (operands[0], operands[3],
+  lowpart_subreg (V2SImode, op0, V4SImode)));
+ }
+DONE;
+})
+
+(define_expand "sdot_prodv8qi"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V8QI 1 "register_operand")
+   (match_operand:V8QI 2 "register_operand")
+   (match_operand:V2SI 3 "register_operand")]
+  "TARGET_MMX_WITH_SSE && TARGET_SSE4_1"
+{
+  operands[1] = force_reg (V8QImode, operands[1]);
+  operands[2] = force_reg (V8QImode, operands[2]);
+  operands[3] = force_reg (V2SImode, operands[3]);
+
+  if (TARGET_AVXVNNIINT8)
+{
+  rtx op1 = lowpart_subreg (V16QImode, operands[1], V8QImode);
+  rtx op2 = lowpart_subreg (V16QImode, operands[2], V8QImode);
+  rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_sdot_prodv16qi (op0, op1, op2, op3));
+  emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode));
+}
+  else
+{
+  rtx op1 = gen_reg_rtx (V8HImode);
+  rtx op2 = gen_reg_rtx (V8HImode);
+  rtx op3 = gen_reg_rtx (V4SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+  rtx op0_1 = gen_reg_rtx (V4SImode);
+
+  emit_move_insn (op3, CONST0_RTX (V4SImode));
+  emit_insn (gen_extendv8qiv8hi2 (op1, operands[1]));
+  emit_insn (gen_extendv8qiv8hi2 (op2, operands[2]));
+  emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3));
+
+  /* vec_perm (op0, 2, 3, 0, 1);  */
+  emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78)));
+  emit_insn (gen_addv4si3 (op0, op0, op0_1));
+  emit_insn (gen_addv2si3 (operands[0], operands[3],
+  lowpart_subreg (V2SImode, op0, V4SImode)));
+}
+  DONE;
+
+})
+
+(define_expand "udot_prodv8qi"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V8QI 1 "register_operand")
+   (match_operand:V8QI 2 "register_operand")
+   (match_operand:V2SI 3 "register_operand")]
+  "TARGET_MMX_WITH_SSE && TARGET_SSE4_1"
+{
+  operands[1] = force_reg (V8QImode, operands[1]);
+  operands[2] = force_reg (V8QImode, operands[2]);
+  operands[3] = force_reg (V2SImode, operands[3]);
+
+  if (TARGET_AVXVNNIINT8

[gcc r15-236] Extend usdot_prodv*qi with vpmaddwd when AVXVNNI/AVX512VNNI is not available.

2024-05-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:8b974f54393ab2d2d16a0051a68c155455a92aad

commit r15-236-g8b974f54393ab2d2d16a0051a68c155455a92aad
Author: liuhongt 
Date:   Mon Jan 8 15:13:41 2024 +0800

Extend usdot_prodv*qi with vpmaddwd when AVXVNNI/AVX512VNNI is not 
available.

gcc/ChangeLog:

* config/i386/sse.md (usdot_prodv*qi): Extend to VI1_AVX512
with vpmaddwd when avxvnni/avx512vnni is not available.

Diff:
---
 gcc/config/i386/sse.md | 55 +-
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1bf50726e83..f57f36ae380 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -29955,21 +29955,48 @@
 
 (define_expand "usdot_prod"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI1_AVX512VNNI 1 "register_operand")
-   (match_operand:VI1_AVX512VNNI 2 "register_operand")
+   (match_operand:VI1_AVX512 1 "register_operand")
+   (match_operand:VI1_AVX512 2 "register_operand")
(match_operand: 3 "register_operand")]
-  "(( == 64 && TARGET_EVEX512)
-|| ((TARGET_AVX512VNNI && TARGET_AVX512VL)
-   || TARGET_AVXVNNI))"
-{
-  operands[1] = lowpart_subreg (mode,
-   force_reg (mode, operands[1]),
-   mode);
-  operands[2] = lowpart_subreg (mode,
-   force_reg (mode, operands[2]),
-   mode);
-  emit_insn (gen_vpdpbusd_ (operands[0], operands[3],
- operands[1], operands[2]));
+  "TARGET_SSE2"
+{
+  if ( == 64
+ ? TARGET_AVX512VNNI
+ : ((TARGET_AVX512VNNI && TARGET_AVX512VL) || TARGET_AVXVNNI))
+{
+  operands[1] = lowpart_subreg (mode,
+   force_reg (mode, operands[1]),
+   mode);
+  operands[2] = lowpart_subreg (mode,
+   force_reg (mode, operands[2]),
+   mode);
+  emit_insn (gen_vpdpbusd_ (operands[0], operands[3],
+ operands[1], operands[2]));
+}
+  else
+{
+  /* Emulate with vpdpwssd.  */
+  rtx op1_lo = gen_reg_rtx (mode);
+  rtx op1_hi = gen_reg_rtx (mode);
+  rtx op2_lo = gen_reg_rtx (mode);
+  rtx op2_hi = gen_reg_rtx (mode);
+
+  emit_insn (gen_vec_unpacku_lo_ (op1_lo, operands[1]));
+  emit_insn (gen_vec_unpacks_lo_ (op2_lo, operands[2]));
+  emit_insn (gen_vec_unpacku_hi_ (op1_hi, operands[1]));
+  emit_insn (gen_vec_unpacks_hi_ (op2_hi, operands[2]));
+
+  rtx res1 = gen_reg_rtx (mode);
+  rtx res2 = gen_reg_rtx (mode);
+  rtx sum = gen_reg_rtx (mode);
+
+  emit_move_insn (sum, CONST0_RTX (mode));
+  emit_insn (gen_sdot_prod (res1, op1_lo,
+   op2_lo, sum));
+  emit_insn (gen_sdot_prod (res2, op1_hi,
+   op2_hi, operands[3]));
+  emit_insn (gen_add3 (operands[0], res1, res2));
+}
   DONE;
 })


[gcc r15-234] Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf.

2024-05-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:a9f642783853b60bb0a59562b8ab3ed10ec01641

commit r15-234-ga9f642783853b60bb0a59562b8ab3ed10ec01641
Author: liuhongt 
Date:   Wed Dec 20 11:54:43 2023 +0800

Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf.

gcc/ChangeLog:

PR target/113090
* config/i386/i386-expand.cc
(expand_vec_perm_punpckldq_pshuf): New function.
(ix86_expand_vec_perm_const_1): Try
expand_vec_perm_punpckldq_pshuf for sequence of 2
instructions.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr113090.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc   | 71 
 gcc/testsuite/gcc.target/i386/pr113090.c | 25 +++
 2 files changed, 96 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a6132911e6a..2f27bfb484c 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -21173,6 +21173,74 @@ expand_vec_perm_pshuflw_pshufhw (struct 
expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle.  */
+static bool
+expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
+{
+  if (GET_MODE_BITSIZE (d->vmode) != 64
+  || !TARGET_MMX_WITH_SSE
+  || d->one_operand_p)
+return false;
+
+  machine_mode widen_vmode;
+  switch (d->vmode)
+{
+/* pshufd.  */
+case E_V2SImode:
+  widen_vmode = V4SImode;
+  break;
+
+/* pshufd.  */
+case E_V2SFmode:
+  widen_vmode = V4SFmode;
+  break;
+
+case E_V4HImode:
+  widen_vmode = V8HImode;
+  /* pshufb.  */
+  if (!TARGET_SSSE3)
+   return false;
+  break;
+
+case E_V8QImode:
+  /* pshufb.  */
+  widen_vmode = V16QImode;
+  if (!TARGET_SSSE3)
+   return false;
+  break;
+
+default:
+  return false;
+}
+
+  if (d->testing_p)
+return true;
+
+  struct expand_vec_perm_d dperm;
+  dperm.target = gen_reg_rtx (widen_vmode);
+  rtx op0 = gen_reg_rtx (widen_vmode);
+  emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
+  dperm.op0 = op0;
+  dperm.op1 = op0;
+  dperm.vmode = widen_vmode;
+  unsigned nelt = GET_MODE_NUNITS (widen_vmode);
+  dperm.nelt = nelt;
+  dperm.one_operand_p = true;
+  dperm.testing_p = false;
+
+  for (unsigned i = 0; i != nelt / 2; i++)
+{
+  dperm.perm[i] = d->perm[i];
+  dperm.perm[i + nelt / 2] = d->perm[i];
+}
+
+  gcc_assert (expand_vec_perm_1 (&dperm));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode,
+dperm.target,
+dperm.vmode));
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
the permutation using the SSSE3 palignr instruction.  This succeeds
when all of the elements in PERM fit within one vector and we merely
@@ -23685,6 +23753,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
   if (expand_vec_perm_shufps_shufps (d))
 return true;
 
+  if (expand_vec_perm_punpckldq_pshuf (d))
+return true;
+
   /* Try sequences of three instructions.  */
 
   if (expand_vec_perm_even_odd_pack (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr113090.c 
b/gcc/testsuite/gcc.target/i386/pr113090.c
new file mode 100644
index 000..0f0b7cc0084
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113090.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse4.1" } */
+/* { dg-final { scan-assembler-times "pshufd" 3 } } */
+
+typedef int v2si __attribute__((vector_size(8)));
+typedef short v4hi __attribute__((vector_size(8)));
+typedef char v8qi __attribute__((vector_size(8)));
+
+v2si
+foo (v2si a, v2si b)
+{
+return __builtin_shufflevector (a, b, 1, 2);
+}
+
+v4hi
+foo1 (v4hi a, v4hi b)
+{
+  return __builtin_shufflevector (a, b, 2, 3, 4, 5);
+}
+
+v8qi
+foo2 (v8qi a, v8qi b)
+{
+  return __builtin_shufflevector (a, b, 4, 5, 6, 7, 8, 9, 10, 11);
+}


[gcc r15-499] x86: Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]

2024-05-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:a71f90c5a7ae2942083921033cb23dcd63e70525

commit r15-499-ga71f90c5a7ae2942083921033cb23dcd63e70525
Author: Levy Hsu 
Date:   Thu May 9 16:50:56 2024 +0800

x86: Add 3-instruction subroutine vector shift for V16QI in 
ix86_expand_vec_perm_const_1 [PR107563]

Hi All

We've introduced a new subroutine in ix86_expand_vec_perm_const_1
to optimize vector shifting for the V16QI type on x86.
This patch uses a three-instruction sequence psrlw, psllw, and por
to handle specific vector shuffle operations more efficiently.
The change aims to improve assembly code generation for configurations
supporting SSE2.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

Best
Levy

gcc/ChangeLog:

PR target/107563
* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
subroutine.
(ix86_expand_vec_perm_const_1): Call 
expand_vec_perm_psrlw_psllw_por.

gcc/testsuite/ChangeLog:

PR target/107563
* g++.target/i386/pr107563-a.C: New test.
* g++.target/i386/pr107563-b.C: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc | 64 ++
 gcc/testsuite/g++.target/i386/pr107563-a.C | 13 ++
 gcc/testsuite/g++.target/i386/pr107563-b.C | 12 ++
 3 files changed, 89 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1ab22fe79736..e846a946de07 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct 
expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.
+   Implement a permutation with psrlw, psllw and por.
+   It handles case:
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  unsigned i;
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+return false;
+
+  switch (d->vmode)
+{
+case E_V8QImode:
+  if (!TARGET_MMX_WITH_SSE)
+   return false;
+  mode = V4HImode;
+  gen_shr = gen_ashrv4hi3;
+  gen_shl = gen_ashlv4hi3;
+  gen_or = gen_iorv4hi3;
+  break;
+case E_V16QImode:
+  mode = V8HImode;
+  gen_shr = gen_vlshrv8hi3;
+  gen_shl = gen_vashlv8hi3;
+  gen_or = gen_iorv8hi3;
+  break;
+default: return false;
+}
+
+  if (!rtx_equal_p (d->op0, d->op1))
+return false;
+
+  for (i = 0; i < d->nelt; i += 2)
+if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+  return false;
+
+  if (d->testing_p)
+return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together.  */
@@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
   if (expand_vec_perm_2perm_pblendv (d, false))
 return true;
 
+  if (expand_vec_perm_psrlw_psllw_por (d))
+return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C 
b/gcc/testsuite/g++.target/i386/pr107563-a.C
new file mode 100755
index ..605c1bdf814b
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
@@ -0,0 +1,13 @@
+/* PR target/107563.C */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psraw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
+
+void foo2(temp_vec_type2& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6);
+}
diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C 
b/gcc/testsuite/g++.target/i386/pr107563-b.C
new file mode 100755
index ..0ce3e8263bb5
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
@@ -0,0 +1,12 @@
+/* PR target/107563.C */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* {

[gcc r15-529] Optimize ashift >> 7 to vpcmpgtb for vector int8.

2024-05-15 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:0cc0956b3bb8bcbc9196075b9073a227d799e042

commit r15-529-g0cc0956b3bb8bcbc9196075b9073a227d799e042
Author: liuhongt 
Date:   Tue May 14 18:39:54 2024 +0800

Optimize ashift >> 7 to vpcmpgtb for vector int8.

Since there is no corresponding instruction, the shift operation for
vector int8 is implemented using the instructions for vector int16,
but for some special shift counts, it can be transformed into vpcmpgtb.

gcc/ChangeLog:

PR target/114514
* config/i386/i386-expand.cc
(ix86_expand_vec_shift_qihi_constant): Optimize ashift >> 7 to
vpcmpgtb.
(ix86_expand_vecop_qihi_partial): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114514-shift.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc | 32 +
 gcc/testsuite/gcc.target/i386/pr114514-shift.c | 49 ++
 2 files changed, 81 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index e846a946de07..4c47cfe468ef 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24246,6 +24246,28 @@ ix86_expand_vec_shift_qihi_constant (enum rtx_code 
code,
 return false;
 
   gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+
+
+  if (shift_amount == 7
+  && code == ASHIFTRT)
+{
+  if (qimode == V16QImode
+ || qimode == V32QImode)
+   {
+ rtx zero = gen_reg_rtx (qimode);
+ emit_move_insn (zero, CONST0_RTX (qimode));
+ emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
+   }
+  else
+   {
+ gcc_assert (qimode == V64QImode);
+ rtx kmask = gen_reg_rtx (DImode);
+ emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1));
+ emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask));
+   }
+  return true;
+}
+
   /* Record sign bit.  */
   xor_constant = 1 << (8 - shift_amount - 1);
 
@@ -24356,6 +24378,16 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, 
rtx dest, rtx op1, rtx op2)
   return;
 }
 
+  if (CONST_INT_P (op2)
+  && code == ASHIFTRT
+  && INTVAL (op2) == 7)
+{
+  rtx zero = gen_reg_rtx (qimode);
+  emit_move_insn (zero, CONST0_RTX (qimode));
+  emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
+  return;
+}
+
   switch (code)
 {
 case MULT:
diff --git a/gcc/testsuite/gcc.target/i386/pr114514-shift.c 
b/gcc/testsuite/gcc.target/i386/pr114514-shift.c
new file mode 100644
index ..cf8b32b3b1d2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114514-shift.c
@@ -0,0 +1,49 @@
+/* { dg-do compile  } */
+/* { dg-options "-mavx512vl -mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "vpxor" 4 } } */
+/* { dg-final { scan-assembler-times "vpcmpgtb" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpcmpgtb" 5 { target  ia32 } } } */
+/* { dg-final { scan-assembler-times "vpmovb2m" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovm2b" 1 } } */
+
+
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v8qi __attribute__((vector_size(8)));
+typedef char v4qi __attribute__((vector_size(4)));
+
+v4qi
+__attribute__((noipa))
+foo1 (v4qi a)
+{
+  return a >> 7;
+}
+
+v8qi
+__attribute__((noipa))
+foo2 (v8qi a)
+{
+  return a >> 7;
+}
+
+v16qi
+__attribute__((noipa))
+foo3 (v16qi a)
+{
+  return a >> 7;
+}
+
+v32qi
+__attribute__((noipa))
+foo4 (v32qi a)
+{
+  return a >> 7;
+}
+
+v64qi
+__attribute__((noipa))
+foo5 (v64qi a)
+{
+  return a >> 7;
+}


[gcc r15-530] Set d.one_operand_p to true when TARGET_SSSE3 in ix86_expand_vecop_qihi_partial.

2024-05-15 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:090714e6cf8029f4ff8883dce687200024adbaeb

commit r15-530-g090714e6cf8029f4ff8883dce687200024adbaeb
Author: liuhongt 
Date:   Wed May 15 10:56:24 2024 +0800

Set d.one_operand_p to true when TARGET_SSSE3 in 
ix86_expand_vecop_qihi_partial.

pshufb is available under TARGET_SSSE3, so
ix86_expand_vec_perm_const_1 must return true when TARGET_SSSE3.

With the patch under -march=x86-64-v2

v8qi
foo (v8qi a)
{
  return a >> 5;
}

<   pmovsxbw%xmm0, %xmm0
<   psraw   $5, %xmm0
<   pshufb  .LC0(%rip), %xmm0

vs.

>   movdqa  %xmm0, %xmm1
>   pcmpeqd %xmm0, %xmm0
>   pmovsxbw%xmm1, %xmm1
>   psrlw   $8, %xmm0
>   psraw   $5, %xmm1
>   pand%xmm1, %xmm0
>   packuswb%xmm0, %xmm0

Although there's a memory load from the constant pool, it should be
better when it's inside a loop, because the load from the constant pool
can be hoisted out. Then it's 1 instruction vs. 4 instructions.

<   pshufb  .LC0(%rip), %xmm0

vs.

>   pcmpeqd %xmm0, %xmm0
>   psrlw   $8, %xmm0
>   pand%xmm1, %xmm0
>   packuswb%xmm0, %xmm0

gcc/ChangeLog:

PR target/114514
* config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial):
Set d.one_operand_p to true when TARGET_SSSE3.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114514-shufb.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc |  2 +-
 gcc/testsuite/gcc.target/i386/pr114514-shufb.c | 35 ++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4c47cfe468ef..4e16aedc5c13 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24458,7 +24458,7 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx 
dest, rtx op1, rtx op2)
   d.op0 = d.op1 = qres;
   d.vmode = V16QImode;
   d.nelt = 16;
-  d.one_operand_p = false;
+  d.one_operand_p = TARGET_SSSE3;
   d.testing_p = false;
 
   for (i = 0; i < d.nelt; ++i)
diff --git a/gcc/testsuite/gcc.target/i386/pr114514-shufb.c 
b/gcc/testsuite/gcc.target/i386/pr114514-shufb.c
new file mode 100644
index ..71fdc9d8daf1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114514-shufb.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "packuswb" } }  */
+/* { dg-final { scan-assembler-times "pshufb" 4 { target { ! ia32 } } } }  */
+/* { dg-final { scan-assembler-times "pshufb" 6 { target  ia32 } } }  */
+
+typedef unsigned char v8uqi __attribute__((vector_size(8)));
+typedef  char v8qi __attribute__((vector_size(8)));
+typedef unsigned char v4uqi __attribute__((vector_size(4)));
+typedef  char v4qi __attribute__((vector_size(4)));
+
+v8qi
+foo (v8qi a)
+{
+  return a >> 5;
+}
+
+v8uqi
+foo1 (v8uqi a)
+{
+  return a >> 5;
+}
+
+v4qi
+foo2 (v4qi a)
+{
+  return a >> 5;
+}
+
+v4uqi
+foo3 (v4uqi a)
+{
+  return a >> 5;
+}
+


[gcc r15-717] Use pblendw instead of pand to clear upper 16 bits.

2024-05-20 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:0ebaffccb294d90184ad78367de66b6307de3ac0

commit r15-717-g0ebaffccb294d90184ad78367de66b6307de3ac0
Author: liuhongt 
Date:   Fri Mar 22 14:40:00 2024 +0800

Use pblendw instead of pand to clear upper 16 bits.

For vec_pack_truncv8si/v4si w/o AVX512,
(const_vector:v4si (const_int 0xffff) x4) is used as a mask to clear
the upper 16 bits, but vpblendw with zero_vector can also be used, and
a zero vector is cheaper than (const_vector:v4si (const_int 0xffff) x4).

gcc/ChangeLog:
PR target/114427
* config/i386/i386-expand.cc (expand_vec_perm_even_odd_pack):
Use pblendw instead of pand to clear upper bits.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pr114427.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc   | 34 
 gcc/testsuite/gcc.target/i386/pr114427.c | 18 +
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 100fb2afb3a..7142c0a9d77 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22587,6 +22587,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
 {
   rtx op, dop0, dop1, t;
   unsigned i, odd, c, s, nelt = d->nelt;
+  int pblendw_i = 0;
   bool end_perm = false;
   machine_mode half_mode;
   rtx (*gen_and) (rtx, rtx, rtx);
@@ -22608,6 +22609,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
   gen_and = gen_andv2si3;
   gen_pack = gen_mmx_packusdw;
   gen_shift = gen_lshrv2si3;
+  pblendw_i = 0x5;
   break;
 case E_V8HImode:
   /* Required for "pack".  */
@@ -22619,6 +22621,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
   gen_and = gen_andv4si3;
   gen_pack = gen_sse4_1_packusdw;
   gen_shift = gen_lshrv4si3;
+  pblendw_i = 0x55;
   break;
 case E_V8QImode:
   /* No check as all instructions are SSE2.  */
@@ -22647,6 +22650,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
   gen_and = gen_andv8si3;
   gen_pack = gen_avx2_packusdw;
   gen_shift = gen_lshrv8si3;
+  pblendw_i = 0x;
   end_perm = true;
   break;
 case E_V32QImode:
@@ -22682,10 +22686,32 @@ expand_vec_perm_even_odd_pack (struct 
expand_vec_perm_d *d)
   dop1 = gen_reg_rtx (half_mode);
   if (odd == 0)
 {
-  t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
-  t = force_reg (half_mode, t);
-  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
-  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+  /* Use pblendw since const_vector 0 should be cheaper than
+const_vector 0x.  */
+  if (d->vmode == V4HImode
+ || d->vmode == E_V8HImode
+ || d->vmode == E_V16HImode)
+   {
+ rtx dop0_t = gen_reg_rtx (d->vmode);
+ rtx dop1_t = gen_reg_rtx (d->vmode);
+ t = gen_reg_rtx (d->vmode);
+ emit_move_insn (t, CONST0_RTX (d->vmode));
+
+ emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
+GEN_INT (pblendw_i)));
+ emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
+GEN_INT (pblendw_i)));
+
+ emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
+ emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
+   }
+  else
+   {
+ t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
+ t = force_reg (half_mode, t);
+ emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+ emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+   }
 }
   else
 {
diff --git a/gcc/testsuite/gcc.target/i386/pr114427.c 
b/gcc/testsuite/gcc.target/i386/pr114427.c
new file mode 100644
index 000..58b66db7fff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114427.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "vpand" } } */
+/* { dg-final { scan-assembler-not "65535" } } */
+
+void
+foo (int* a, short* __restrict b, int* c)
+{
+for (int i = 0; i != 16; i++)
+  b[i] = c[i] + a[i];
+}
+
+void
+foo1 (int* a, short* __restrict b, int* c)
+{
+for (int i = 0; i != 8; i++)
+  b[i] = c[i] + a[i];
+}


[gcc r15-3058] Align predicates for operands[1] between mov and *mov_internal.

2024-08-20 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:bb42c551905024ea23095a0eb7b58fdbcfbcaef6

commit r15-3058-gbb42c551905024ea23095a0eb7b58fdbcfbcaef6
Author: liuhongt 
Date:   Tue Aug 20 14:41:00 2024 +0800

Align predicates for operands[1] between mov and *mov_internal.

 > It's not obvious to me why movv16qi requires a nonimmediate_operand
> > source, especially since ix86_expand_vector_mode does have code to
> > cope with constant operand[1]s.  emit_move_insn_1 doesn't check the
> > predicates anyway, so the predicate will have little effect.
> >
> > A workaround would be to check legitimate_constant_p instead of the
> > predicate, but I'm not sure that that should be necessary.
> >
> > Has this already been discussed?  If not, we should loop in the x86
> > maintainers (but I didn't do that here in case it would be a repeat).
>
> I also noticed it. Not sure why movv16qi requires a
> nonimmediate_operand, while ix86_expand_vector_mode could deal with
> constant op. Looking forward to Hongtao's comments.
The code has been there since 2005, before I was involved.
It looks to me as if, at the beginning, both mov<mode> and
*mov<mode>_internal only supported nonimmediate_operand for
operands[1].
Then r0-75606-g5656a184e83983 adjusted the nonimmediate_operand to
nonimmediate_or_sse_const_operand for *mov<mode>_internal, but not for
mov<mode>. I think we can align the predicate between mov<mode>
and *mov<mode>_internal.

gcc/ChangeLog:

* config/i386/sse.md (mov): Align predicates for
operands[1] between mov and *mov_internal.
* config/i386/mmx.md (mov): Ditto.

Diff:
---
 gcc/config/i386/mmx.md | 2 +-
 gcc/config/i386/sse.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 94d3a6e56922..cb2697537a81 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -169,7 +169,7 @@
 
 (define_expand "mov"
   [(set (match_operand:MMXMODE 0 "nonimmediate_operand")
-   (match_operand:MMXMODE 1 "nonimmediate_operand"))]
+   (match_operand:MMXMODE 1 "nonimm_or_0_operand"))]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
 {
   ix86_expand_vector_move (mode, operands);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8f34c9300d03..e67d25f960e2 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1387,7 +1387,7 @@
 
 (define_expand "mov"
   [(set (match_operand:VMOVE 0 "nonimmediate_operand")
-   (match_operand:VMOVE 1 "nonimmediate_operand"))]
+   (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"))]
   "TARGET_SSE"
 {
   ix86_expand_vector_move (mode, operands);


[gcc r15-3078] Align ix86_{move_max,store_max} with vectorizer.

2024-08-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:6ea25c041964bf63014fcf7bb68fb1f5a0a4e123

commit r15-3078-g6ea25c041964bf63014fcf7bb68fb1f5a0a4e123
Author: liuhongt 
Date:   Thu Aug 15 12:54:07 2024 +0800

Align ix86_{move_max,store_max} with vectorizer.

When none of -mprefer-vector-width, avx256_optimal/avx128_optimal,
avx256_store_by_pieces/avx512_store_by_pieces is specified, GCC sets
ix86_{move_max,store_max} to the maximum available vector length,
except when only AVX is available:

  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
  else
opts->x_ix86_move_max = PVW_AVX128;

So for -mavx2, the vectorizer will choose 256-bit vectors, but
128-bit moves are used for struct copies; this width mismatch could
cause a potential store-to-load forwarding (STLF) issue.

The patch fixes that.

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_option_override_internal):
set ix86_{move_max,store_max} to PVW_AVX256 when TARGET_AVX
instead of PVW_AVX128.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pieces-memcpy-10.c: Add -mprefer-vector-width=128.
* gcc.target/i386/pieces-memcpy-6.c: Ditto.
* gcc.target/i386/pieces-memset-38.c: Ditto.
* gcc.target/i386/pieces-memset-40.c: Ditto.
* gcc.target/i386/pieces-memset-41.c: Ditto.
* gcc.target/i386/pieces-memset-42.c: Ditto.
* gcc.target/i386/pieces-memset-43.c: Ditto.
* gcc.target/i386/pieces-strcpy-2.c: Ditto.
* gcc.target/i386/pieces-memcpy-22.c: New test.
* gcc.target/i386/pieces-memset-51.c: New test.
* gcc.target/i386/pieces-strcpy-3.c: New test.

Diff:
---
 gcc/config/i386/i386-options.cc  |  6 ++
 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c | 12 
 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-38.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-40.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-41.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-42.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-43.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-51.c | 12 
 gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c  | 15 +++
 12 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index f423455b3638..f79257cc7641 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -3023,6 +3023,9 @@ ix86_option_override_internal (bool main_args_p,
  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
+ /* Align with vectorizer to avoid potential STLF issue.  */
+ else if (TARGET_AVX_P (opts->x_ix86_isa_flags))
+   opts->x_ix86_move_max = PVW_AVX256;
  else
opts->x_ix86_move_max = PVW_AVX128;
}
@@ -3047,6 +3050,9 @@ ix86_option_override_internal (bool main_args_p,
  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_store_max = PVW_AVX512;
+ /* Align with vectorizer to avoid potential STLF issue.  */
+ else if (TARGET_AVX_P (opts->x_ix86_isa_flags))
+   opts->x_ix86_store_max = PVW_AVX256;
  else
opts->x_ix86_store_max = PVW_AVX128;
}
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
index 5faee21f9b99..53ad0b3be443 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
new file mode 100644
index ..605b3623ffc5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c 
b/gcc/testsuite/gcc.target

[gcc r14-10608] Align ix86_{move_max,store_max} with vectorizer.

2024-08-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:27dc1533b6dfc49f3912c524db51d6c372a5ac3d

commit r14-10608-g27dc1533b6dfc49f3912c524db51d6c372a5ac3d
Author: liuhongt 
Date:   Thu Aug 15 12:54:07 2024 +0800

Align ix86_{move_max,store_max} with vectorizer.

When none of -mprefer-vector-width, avx256_optimal/avx128_optimal,
avx256_store_by_pieces/avx512_store_by_pieces is specified, GCC sets
ix86_{move_max,store_max} to the maximum available vector length,
except when only AVX is available:

  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
  else
opts->x_ix86_move_max = PVW_AVX128;

So for -mavx2, the vectorizer will choose 256-bit vectors, but
128-bit moves are used for struct copies; this width mismatch could
cause a potential store-to-load forwarding (STLF) issue.

The patch fixes that.

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_option_override_internal):
set ix86_{move_max,store_max} to PVW_AVX256 when TARGET_AVX
instead of PVW_AVX128.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pieces-memcpy-10.c: Add -mprefer-vector-width=128.
* gcc.target/i386/pieces-memcpy-6.c: Ditto.
* gcc.target/i386/pieces-memset-38.c: Ditto.
* gcc.target/i386/pieces-memset-40.c: Ditto.
* gcc.target/i386/pieces-memset-41.c: Ditto.
* gcc.target/i386/pieces-memset-42.c: Ditto.
* gcc.target/i386/pieces-memset-43.c: Ditto.
* gcc.target/i386/pieces-strcpy-2.c: Ditto.
* gcc.target/i386/pieces-memcpy-22.c: New test.
* gcc.target/i386/pieces-memset-51.c: New test.
* gcc.target/i386/pieces-strcpy-3.c: New test.

(cherry picked from commit 6ea25c041964bf63014fcf7bb68fb1f5a0a4e123)

Diff:
---
 gcc/config/i386/i386-options.cc  |  6 ++
 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c | 12 
 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-38.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-40.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-41.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-42.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-43.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-51.c | 12 
 gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c  | 15 +++
 12 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 6c212a8edeb9..f6c450cc871c 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -3062,6 +3062,9 @@ ix86_option_override_internal (bool main_args_p,
  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
+ /* Align with vectorizer to avoid potential STLF issue.  */
+ else if (TARGET_AVX_P (opts->x_ix86_isa_flags))
+   opts->x_ix86_move_max = PVW_AVX256;
  else
opts->x_ix86_move_max = PVW_AVX128;
}
@@ -3086,6 +3089,9 @@ ix86_option_override_internal (bool main_args_p,
  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_store_max = PVW_AVX512;
+ /* Align with vectorizer to avoid potential STLF issue.  */
+ else if (TARGET_AVX_P (opts->x_ix86_isa_flags))
+   opts->x_ix86_store_max = PVW_AVX256;
  else
opts->x_ix86_store_max = PVW_AVX128;
}
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
index 5faee21f9b99..53ad0b3be443 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
new file mode 100644
index ..605b3623ffc5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
diff --gi

[gcc r13-8987] Align ix86_{move_max,store_max} with vectorizer.

2024-08-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:aea374238cec1a1e53fb79575d2f998e16926999

commit r13-8987-gaea374238cec1a1e53fb79575d2f998e16926999
Author: liuhongt 
Date:   Thu Aug 15 12:54:07 2024 +0800

Align ix86_{move_max,store_max} with vectorizer.

When none of -mprefer-vector-width, avx256_optimal/avx128_optimal,
avx256_store_by_pieces/avx512_store_by_pieces is specified, GCC sets
ix86_{move_max,store_max} to the maximum available vector length,
except when only AVX is available:

  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
  else
opts->x_ix86_move_max = PVW_AVX128;

So for -mavx2, the vectorizer will choose 256-bit vectors, but
128-bit moves are used for struct copies; this width mismatch could
cause a potential store-to-load forwarding (STLF) issue.

The patch fixes that.

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_option_override_internal):
set ix86_{move_max,store_max} to PVW_AVX256 when TARGET_AVX
instead of PVW_AVX128.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pieces-memcpy-10.c: Add -mprefer-vector-width=128.
* gcc.target/i386/pieces-memcpy-6.c: Ditto.
* gcc.target/i386/pieces-memset-38.c: Ditto.
* gcc.target/i386/pieces-memset-40.c: Ditto.
* gcc.target/i386/pieces-memset-41.c: Ditto.
* gcc.target/i386/pieces-memset-42.c: Ditto.
* gcc.target/i386/pieces-memset-43.c: Ditto.
* gcc.target/i386/pieces-strcpy-2.c: Ditto.
* gcc.target/i386/pieces-memcpy-22.c: New test.
* gcc.target/i386/pieces-memset-51.c: New test.
* gcc.target/i386/pieces-strcpy-3.c: New test.

(cherry picked from commit 6ea25c041964bf63014fcf7bb68fb1f5a0a4e123)

Diff:
---
 gcc/config/i386/i386-options.cc  |  6 ++
 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c | 12 
 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-38.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-40.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-41.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-42.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-43.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-51.c | 12 
 gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c  | 15 +++
 12 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index a4cff4e615f0..1b3856a630a5 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2840,6 +2840,9 @@ ix86_option_override_internal (bool main_args_p,
{
  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
opts->x_ix86_move_max = PVW_AVX512;
+ /* Align with vectorizer to avoid potential STLF issue.  */
+ else if (TARGET_AVX_P (opts->x_ix86_isa_flags))
+   opts->x_ix86_move_max = PVW_AVX256;
  else
opts->x_ix86_move_max = PVW_AVX128;
}
@@ -2861,6 +2864,9 @@ ix86_option_override_internal (bool main_args_p,
{
  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
opts->x_ix86_store_max = PVW_AVX512;
+ /* Align with vectorizer to avoid potential STLF issue.  */
+ else if (TARGET_AVX_P (opts->x_ix86_isa_flags))
+   opts->x_ix86_store_max = PVW_AVX256;
  else
opts->x_ix86_store_max = PVW_AVX128;
}
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
index 5faee21f9b99..53ad0b3be443 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
new file mode 100644
index ..605b3623ffc5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.

[gcc r12-10682] Align ix86_{move_max,store_max} with vectorizer.

2024-08-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:b4bc34db3f2948e37ad55a09870635e88c54c7d3

commit r12-10682-gb4bc34db3f2948e37ad55a09870635e88c54c7d3
Author: liuhongt 
Date:   Thu Aug 15 12:54:07 2024 +0800

Align ix86_{move_max,store_max} with vectorizer.

When none of -mprefer-vector-width, avx256_optimal/avx128_optimal,
avx256_store_by_pieces/avx512_store_by_pieces is specified, GCC sets
ix86_{move_max,store_max} to the maximum available vector length,
except when only AVX is available:

  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
  else
opts->x_ix86_move_max = PVW_AVX128;

So for -mavx2, the vectorizer will choose 256-bit vectors, but
128-bit moves are used for struct copies; this width mismatch could
cause a potential store-to-load forwarding (STLF) issue.

The patch fixes that.

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_option_override_internal):
set ix86_{move_max,store_max} to PVW_AVX256 when TARGET_AVX
instead of PVW_AVX128.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pieces-memcpy-10.c: Add -mprefer-vector-width=128.
* gcc.target/i386/pieces-memcpy-6.c: Ditto.
* gcc.target/i386/pieces-memset-38.c: Ditto.
* gcc.target/i386/pieces-memset-40.c: Ditto.
* gcc.target/i386/pieces-memset-41.c: Ditto.
* gcc.target/i386/pieces-memset-42.c: Ditto.
* gcc.target/i386/pieces-memset-43.c: Ditto.
* gcc.target/i386/pieces-strcpy-2.c: Ditto.
* gcc.target/i386/pieces-memcpy-22.c: New test.
* gcc.target/i386/pieces-memset-51.c: New test.
* gcc.target/i386/pieces-strcpy-3.c: New test.

(cherry picked from commit aea374238cec1a1e53fb79575d2f998e16926999)

Diff:
---
 gcc/config/i386/i386-options.cc  |  6 ++
 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c | 12 
 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-38.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-40.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-41.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-42.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-43.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-51.c | 12 
 gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c  | 15 +++
 12 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 318f6c614551..ad496ea5a8eb 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2766,6 +2766,9 @@ ix86_option_override_internal (bool main_args_p,
{
  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
opts->x_ix86_move_max = PVW_AVX512;
+ /* Align with vectorizer to avoid potential STLF issue.  */
+ else if (TARGET_AVX_P (opts->x_ix86_isa_flags))
+   opts->x_ix86_move_max = PVW_AVX256;
  else
opts->x_ix86_move_max = PVW_AVX128;
}
@@ -2787,6 +2790,9 @@ ix86_option_override_internal (bool main_args_p,
{
  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
opts->x_ix86_store_max = PVW_AVX512;
+ /* Align with vectorizer to avoid potential STLF issue.  */
+ else if (TARGET_AVX_P (opts->x_ix86_isa_flags))
+   opts->x_ix86_store_max = PVW_AVX256;
  else
opts->x_ix86_store_max = PVW_AVX128;
}
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
index 5faee21f9b99..53ad0b3be443 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
new file mode 100644
index ..605b3623ffc5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6

[gcc r13-8988] Fix testcase failure.

2024-08-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:ea9c508927ec032c6d67a24df59ffa429e4d3d95

commit r13-8988-gea9c508927ec032c6d67a24df59ffa429e4d3d95
Author: liuhongt 
Date:   Thu Aug 22 14:31:40 2024 +0800

Fix testcase failure.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pieces-memcpy-10.c: Use -mmove-max=128 and
-mstore-max=128.
* gcc.target/i386/pieces-memcpy-6.c: Ditto.
* gcc.target/i386/pieces-memset-38.c: Ditto.
* gcc.target/i386/pieces-memset-40.c: Ditto.
* gcc.target/i386/pieces-memset-41.c: Ditto.
* gcc.target/i386/pieces-memset-42.c: Ditto.
* gcc.target/i386/pieces-memset-43.c: Ditto.
* gcc.target/i386/pieces-strcpy-2.c: Ditto.

Diff:
---
 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-38.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-40.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-41.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-42.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-43.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c  | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
index 53ad0b3be443..78f92ac5197d 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
index cfd2a86cf33b..57b74ae4b230 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
index ddd194debd57..d9443678735d 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
index 9c206465d465..8ad6ad7e494a 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
index b0756182e355..08fd6e9a9278 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge -mno-stackrealign" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge -mno-stackrealign" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
index 103da699ae52..6b73bb256af6 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
index f1494e176105..c6c7ff234dab 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c 
b/

[gcc r12-10683] Fix testcase failure.

2024-08-22 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:141d8aa375ea32c05f0d437828e6a76f1a3ea4af

commit r12-10683-g141d8aa375ea32c05f0d437828e6a76f1a3ea4af
Author: liuhongt 
Date:   Thu Aug 22 14:31:40 2024 +0800

Fix testcase failure.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pieces-memcpy-10.c: Use -mmove-max=128 and
-mstore-max=128.
* gcc.target/i386/pieces-memcpy-6.c: Ditto.
* gcc.target/i386/pieces-memset-38.c: Ditto.
* gcc.target/i386/pieces-memset-40.c: Ditto.
* gcc.target/i386/pieces-memset-41.c: Ditto.
* gcc.target/i386/pieces-memset-42.c: Ditto.
* gcc.target/i386/pieces-memset-43.c: Ditto.
* gcc.target/i386/pieces-strcpy-2.c: Ditto.

(cherry picked from commit ea9c508927ec032c6d67a24df59ffa429e4d3d95)

Diff:
---
 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-38.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-40.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-41.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-42.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-43.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c  | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
index 53ad0b3be443..78f92ac5197d 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
index cfd2a86cf33b..57b74ae4b230 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
index ddd194debd57..d9443678735d 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
index 9c206465d465..8ad6ad7e494a 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
index b0756182e355..08fd6e9a9278 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge -mno-stackrealign" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge -mno-stackrealign" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
index 103da699ae52..6b73bb256af6 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
index f1494e176105..c6c7ff234dab 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 exter

[gcc r15-3314] Check avx upper register for parallel.

2024-08-29 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:ab214ef734bfc3dcffcf79ff9e1dd651c2b40566

commit r15-3314-gab214ef734bfc3dcffcf79ff9e1dd651c2b40566
Author: liuhongt 
Date:   Thu Aug 29 11:39:20 2024 +0800

Check avx upper register for parallel.

For function arguments and return values in BLKmode, the value is put in
a parallel with an expr_list, and the expr_list contains the real mode
and registers.
The current ix86_check_avx_upper_register only checked SSE_REG_P and
failed to handle that case. The patch extends the check to each subrtx.

gcc/ChangeLog:

PR target/116512
* config/i386/i386.cc (ix86_check_avx_upper_register): Iterate
subrtx to scan for avx upper register.
(ix86_check_avx_upper_stores): Inline old
ix86_check_avx_upper_register.
(ix86_avx_u128_mode_needed): Ditto, and replace
FOR_EACH_SUBRTX with call to new
ix86_check_avx_upper_register.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116512.c: New test.

Diff:
---
 gcc/config/i386/i386.cc  | 36 
 gcc/testsuite/gcc.target/i386/pr116512.c | 26 +++
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index a1f65d41fdd5..546c964d2a47 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14882,9 +14882,19 @@ ix86_dirflag_mode_needed (rtx_insn *insn)
 static bool
 ix86_check_avx_upper_register (const_rtx exp)
 {
-  return (SSE_REG_P (exp)
- && !EXT_REX_SSE_REG_P (exp)
- && GET_MODE_BITSIZE (GET_MODE (exp)) > 128);
+  /* construct_container may return a parallel with expr_list
+ which contains the real reg and mode  */
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, exp, NONCONST)
+{
+  const_rtx x = *iter;
+  if (SSE_REG_P (x)
+ && !EXT_REX_SSE_REG_P (x)
+ && GET_MODE_BITSIZE (GET_MODE (x)) > 128)
+   return true;
+}
+
+  return false;
 }
 
 /* Check if a 256bit or 512bit AVX register is referenced in stores.   */
@@ -14892,7 +14902,9 @@ ix86_check_avx_upper_register (const_rtx exp)
 static void
 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
 {
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+  && !EXT_REX_SSE_REG_P (dest)
+  && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
 {
   bool *used = (bool *) data;
   *used = true;
@@ -14951,14 +14963,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
   return AVX_U128_CLEAN;
 }
 
-  subrtx_iterator::array_type array;
-
   rtx set = single_set (insn);
   if (set)
 {
   rtx dest = SET_DEST (set);
   rtx src = SET_SRC (set);
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+ && !EXT_REX_SSE_REG_P (dest)
+ && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
{
  /* This is an YMM/ZMM load.  Return AVX_U128_DIRTY if the
 source isn't zero.  */
@@ -14969,9 +14981,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
}
   else
{
- FOR_EACH_SUBRTX (iter, array, src, NONCONST)
-   if (ix86_check_avx_upper_register (*iter))
- return AVX_U128_DIRTY;
+ if (ix86_check_avx_upper_register (src))
+   return AVX_U128_DIRTY;
}
 
   /* This isn't YMM/ZMM load/store.  */
@@ -14982,9 +14993,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
  Hardware changes state only when a 256bit register is written to,
  but we need to prevent the compiler from moving optimal insertion
  point above eventual read from 256bit or 512 bit register.  */
-  FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
-if (ix86_check_avx_upper_register (*iter))
-  return AVX_U128_DIRTY;
+  if (ix86_check_avx_upper_register (PATTERN (insn)))
+return AVX_U128_DIRTY;
 
   return AVX_U128_ANY;
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c 
b/gcc/testsuite/gcc.target/i386/pr116512.c
new file mode 100644
index ..c2bc6c91b648
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116512.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+
+#include 
+
+struct B {
+  union {
+__m512 f;
+__m512i s;
+  };
+};
+
+struct B foo(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res;
+}
+
+__m512i bar(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res.s;
+}


[gcc r14-10625] Check avx upper register for parallel.

2024-09-01 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:ba9a3f105ea552a22d08f2d54dfdbef16af7c99e

commit r14-10625-gba9a3f105ea552a22d08f2d54dfdbef16af7c99e
Author: liuhongt 
Date:   Thu Aug 29 11:39:20 2024 +0800

Check avx upper register for parallel.

For function arguments/return, when it's BLK mode, it's put in a
parallel with an expr_list, and the expr_list contains the real mode
and registers.
The current ix86_check_avx_upper_register only checked for SSE_REG_P, and
failed to handle that. The patch extends the check to each subrtx.

gcc/ChangeLog:

PR target/116512
* config/i386/i386.cc (ix86_check_avx_upper_register): Iterate
subrtx to scan for avx upper register.
(ix86_check_avx_upper_stores): Inline old
ix86_check_avx_upper_register.
(ix86_avx_u128_mode_needed): Ditto, and replace
FOR_EACH_SUBRTX with call to new
ix86_check_avx_upper_register.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116512.c: New test.

(cherry picked from commit ab214ef734bfc3dcffcf79ff9e1dd651c2b40566)

Diff:
---
 gcc/config/i386/i386.cc  | 36 
 gcc/testsuite/gcc.target/i386/pr116512.c | 26 +++
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 288c69467d62..feefbe322dec 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -15027,9 +15027,19 @@ ix86_dirflag_mode_needed (rtx_insn *insn)
 static bool
 ix86_check_avx_upper_register (const_rtx exp)
 {
-  return (SSE_REG_P (exp)
- && !EXT_REX_SSE_REG_P (exp)
- && GET_MODE_BITSIZE (GET_MODE (exp)) > 128);
+  /* construct_container may return a parallel with expr_list
+ which contains the real reg and mode  */
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, exp, NONCONST)
+{
+  const_rtx x = *iter;
+  if (SSE_REG_P (x)
+ && !EXT_REX_SSE_REG_P (x)
+ && GET_MODE_BITSIZE (GET_MODE (x)) > 128)
+   return true;
+}
+
+  return false;
 }
 
 /* Check if a 256bit or 512bit AVX register is referenced in stores.   */
@@ -15037,7 +15047,9 @@ ix86_check_avx_upper_register (const_rtx exp)
 static void
 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
 {
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+  && !EXT_REX_SSE_REG_P (dest)
+  && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
 {
   bool *used = (bool *) data;
   *used = true;
@@ -15096,14 +15108,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
   return AVX_U128_CLEAN;
 }
 
-  subrtx_iterator::array_type array;
-
   rtx set = single_set (insn);
   if (set)
 {
   rtx dest = SET_DEST (set);
   rtx src = SET_SRC (set);
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+ && !EXT_REX_SSE_REG_P (dest)
+ && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
{
  /* This is an YMM/ZMM load.  Return AVX_U128_DIRTY if the
 source isn't zero.  */
@@ -15114,9 +15126,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
}
   else
{
- FOR_EACH_SUBRTX (iter, array, src, NONCONST)
-   if (ix86_check_avx_upper_register (*iter))
- return AVX_U128_DIRTY;
+ if (ix86_check_avx_upper_register (src))
+   return AVX_U128_DIRTY;
}
 
   /* This isn't YMM/ZMM load/store.  */
@@ -15127,9 +15138,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
  Hardware changes state only when a 256bit register is written to,
  but we need to prevent the compiler from moving optimal insertion
  point above eventual read from 256bit or 512 bit register.  */
-  FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
-if (ix86_check_avx_upper_register (*iter))
-  return AVX_U128_DIRTY;
+  if (ix86_check_avx_upper_register (PATTERN (insn)))
+return AVX_U128_DIRTY;
 
   return AVX_U128_ANY;
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c 
b/gcc/testsuite/gcc.target/i386/pr116512.c
new file mode 100644
index ..c2bc6c91b648
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116512.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+
+#include 
+
+struct B {
+  union {
+__m512 f;
+__m512i s;
+  };
+};
+
+struct B foo(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res;
+}
+
+__m512i bar(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res.s;
+}


[gcc r13-8999] Check avx upper register for parallel.

2024-09-01 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:5e049ada87842947adaca5c607516396889f64d6

commit r13-8999-g5e049ada87842947adaca5c607516396889f64d6
Author: liuhongt 
Date:   Thu Aug 29 11:39:20 2024 +0800

Check avx upper register for parallel.

For function arguments/return, when it's BLK mode, it's put in a
parallel with an expr_list, and the expr_list contains the real mode
and registers.
The current ix86_check_avx_upper_register only checked for SSE_REG_P, and
failed to handle that. The patch extends the check to each subrtx.

gcc/ChangeLog:

PR target/116512
* config/i386/i386.cc (ix86_check_avx_upper_register): Iterate
subrtx to scan for avx upper register.
(ix86_check_avx_upper_stores): Inline old
ix86_check_avx_upper_register.
(ix86_avx_u128_mode_needed): Ditto, and replace
FOR_EACH_SUBRTX with call to new
ix86_check_avx_upper_register.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116512.c: New test.

(cherry picked from commit ab214ef734bfc3dcffcf79ff9e1dd651c2b40566)

Diff:
---
 gcc/config/i386/i386.cc  | 36 
 gcc/testsuite/gcc.target/i386/pr116512.c | 26 +++
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 499184166ff2..a90351ca9c2c 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14432,9 +14432,19 @@ ix86_dirflag_mode_needed (rtx_insn *insn)
 static bool
 ix86_check_avx_upper_register (const_rtx exp)
 {
-  return (SSE_REG_P (exp)
- && !EXT_REX_SSE_REG_P (exp)
- && GET_MODE_BITSIZE (GET_MODE (exp)) > 128);
+  /* construct_container may return a parallel with expr_list
+ which contains the real reg and mode  */
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, exp, NONCONST)
+{
+  const_rtx x = *iter;
+  if (SSE_REG_P (x)
+ && !EXT_REX_SSE_REG_P (x)
+ && GET_MODE_BITSIZE (GET_MODE (x)) > 128)
+   return true;
+}
+
+  return false;
 }
 
 /* Check if a 256bit or 512bit AVX register is referenced in stores.   */
@@ -14442,7 +14452,9 @@ ix86_check_avx_upper_register (const_rtx exp)
 static void
 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
 {
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+  && !EXT_REX_SSE_REG_P (dest)
+  && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
 {
   bool *used = (bool *) data;
   *used = true;
@@ -14500,14 +14512,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
   return AVX_U128_CLEAN;
 }
 
-  subrtx_iterator::array_type array;
-
   rtx set = single_set (insn);
   if (set)
 {
   rtx dest = SET_DEST (set);
   rtx src = SET_SRC (set);
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+ && !EXT_REX_SSE_REG_P (dest)
+ && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
{
  /* This is an YMM/ZMM load.  Return AVX_U128_DIRTY if the
 source isn't zero.  */
@@ -14518,9 +14530,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
}
   else
{
- FOR_EACH_SUBRTX (iter, array, src, NONCONST)
-   if (ix86_check_avx_upper_register (*iter))
- return AVX_U128_DIRTY;
+ if (ix86_check_avx_upper_register (src))
+   return AVX_U128_DIRTY;
}
 
   /* This isn't YMM/ZMM load/store.  */
@@ -14531,9 +14542,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
  Hardware changes state only when a 256bit register is written to,
  but we need to prevent the compiler from moving optimal insertion
  point above eventual read from 256bit or 512 bit register.  */
-  FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
-if (ix86_check_avx_upper_register (*iter))
-  return AVX_U128_DIRTY;
+  if (ix86_check_avx_upper_register (PATTERN (insn)))
+return AVX_U128_DIRTY;
 
   return AVX_U128_ANY;
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c 
b/gcc/testsuite/gcc.target/i386/pr116512.c
new file mode 100644
index ..c2bc6c91b648
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116512.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+
+#include 
+
+struct B {
+  union {
+__m512 f;
+__m512i s;
+  };
+};
+
+struct B foo(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res;
+}
+
+__m512i bar(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res.s;
+}


[gcc r12-10694] Check avx upper register for parallel.

2024-09-01 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:6585b06303d8fd9da907f443fc0da9faed303712

commit r12-10694-g6585b06303d8fd9da907f443fc0da9faed303712
Author: liuhongt 
Date:   Thu Aug 29 11:39:20 2024 +0800

Check avx upper register for parallel.

For function arguments/return, when it's BLK mode, it's put in a
parallel with an expr_list, and the expr_list contains the real mode
and registers.
The current ix86_check_avx_upper_register only checked for SSE_REG_P, and
failed to handle that. The patch extends the check to each subrtx.

gcc/ChangeLog:

PR target/116512
* config/i386/i386.cc (ix86_check_avx_upper_register): Iterate
subrtx to scan for avx upper register.
(ix86_check_avx_upper_stores): Inline old
ix86_check_avx_upper_register.
(ix86_avx_u128_mode_needed): Ditto, and replace
FOR_EACH_SUBRTX with call to new
ix86_check_avx_upper_register.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116512.c: New test.

(cherry picked from commit ab214ef734bfc3dcffcf79ff9e1dd651c2b40566)

Diff:
---
 gcc/config/i386/i386.cc  | 36 
 gcc/testsuite/gcc.target/i386/pr116512.c | 26 +++
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index af42e4b9739e..2d272bdaf1a4 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14360,9 +14360,19 @@ ix86_dirflag_mode_needed (rtx_insn *insn)
 static bool
 ix86_check_avx_upper_register (const_rtx exp)
 {
-  return (SSE_REG_P (exp)
- && !EXT_REX_SSE_REG_P (exp)
- && GET_MODE_BITSIZE (GET_MODE (exp)) > 128);
+  /* construct_container may return a parallel with expr_list
+ which contains the real reg and mode  */
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, exp, NONCONST)
+{
+  const_rtx x = *iter;
+  if (SSE_REG_P (x)
+ && !EXT_REX_SSE_REG_P (x)
+ && GET_MODE_BITSIZE (GET_MODE (x)) > 128)
+   return true;
+}
+
+  return false;
 }
 
 /* Check if a 256bit or 512bit AVX register is referenced in stores.   */
@@ -14370,7 +14380,9 @@ ix86_check_avx_upper_register (const_rtx exp)
 static void
 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
 {
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+  && !EXT_REX_SSE_REG_P (dest)
+  && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
 {
   bool *used = (bool *) data;
   *used = true;
@@ -14428,14 +14440,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
   return AVX_U128_CLEAN;
 }
 
-  subrtx_iterator::array_type array;
-
   rtx set = single_set (insn);
   if (set)
 {
   rtx dest = SET_DEST (set);
   rtx src = SET_SRC (set);
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+ && !EXT_REX_SSE_REG_P (dest)
+ && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
{
  /* This is an YMM/ZMM load.  Return AVX_U128_DIRTY if the
 source isn't zero.  */
@@ -14446,9 +14458,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
}
   else
{
- FOR_EACH_SUBRTX (iter, array, src, NONCONST)
-   if (ix86_check_avx_upper_register (*iter))
- return AVX_U128_DIRTY;
+ if (ix86_check_avx_upper_register (src))
+   return AVX_U128_DIRTY;
}
 
   /* This isn't YMM/ZMM load/store.  */
@@ -14459,9 +14470,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
  Hardware changes state only when a 256bit register is written to,
  but we need to prevent the compiler from moving optimal insertion
  point above eventual read from 256bit or 512 bit register.  */
-  FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
-if (ix86_check_avx_upper_register (*iter))
-  return AVX_U128_DIRTY;
+  if (ix86_check_avx_upper_register (PATTERN (insn)))
+return AVX_U128_DIRTY;
 
   return AVX_U128_ANY;
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c 
b/gcc/testsuite/gcc.target/i386/pr116512.c
new file mode 100644
index ..c2bc6c91b648
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116512.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+
+#include 
+
+struct B {
+  union {
+__m512 f;
+__m512i s;
+  };
+};
+
+struct B foo(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res;
+}
+
+__m512i bar(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res.s;
+}


[gcc r15-3498] Handle const0_operand for *avx2_pcmp3_1.

2024-09-05 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:a51f2fc0d80869ab079a93cc3858f24a1fd28237

commit r15-3498-ga51f2fc0d80869ab079a93cc3858f24a1fd28237
Author: liuhongt 
Date:   Wed Sep 4 15:39:17 2024 +0800

Handle const0_operand for *avx2_pcmp3_1.

*_eq3_1 supports
nonimm_or_0_operand for op1 and op2, but pass_combine would fail to lower
an avx512 comparison back to an avx2 one when op1/op2 is const0_rtx,
because the splitter only supports nonimmediate_operand.

Failed to match this instruction:
(set (reg/i:V16QI 20 xmm0)
(vec_merge:V16QI (const_vector:V16QI [
(const_int -1 [0xffffffffffffffff]) repeated x16
])
(const_vector:V16QI [
(const_int 0 [0]) repeated x16
])
(unspec:HI [
(reg:V16QI 105 [ a ])
(const_vector:V16QI [
(const_int 0 [0]) repeated x16
])
(const_int 0 [0])
] UNSPEC_PCMP)))

The patch extends the predicates of the splitter to handle that.

gcc/ChangeLog:

PR target/115517
* config/i386/sse.md (*avx2_pcmp3_1): Change predicate
of operands[1] and operands[2] from nonimmediate_operand to
nonimm_or_0_operand.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115517.c: New test.

Diff:
---
 gcc/config/i386/sse.md   |  9 ++--
 gcc/testsuite/gcc.target/i386/pr115517.c | 38 
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3bf95f0b0e5..1946d3513be 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17908,8 +17908,8 @@
  (match_operand:VI_128_256 1 "vector_all_ones_operand")
  (match_operand:VI_128_256 2 "const0_operand")
  (unspec:
-   [(match_operand:VI_128_256 3 "nonimmediate_operand")
-(match_operand:VI_128_256 4 "nonimmediate_operand")
+   [(match_operand:VI_128_256 3 "nonimm_or_0_operand")
+(match_operand:VI_128_256 4 "nonimm_or_0_operand")
 (match_operand:SI 5 "const_0_to_7_operand")]
 UNSPEC_PCMP)))]
   "TARGET_AVX512VL && ix86_pre_reload_split ()
@@ -17928,6 +17928,11 @@
 {
   if (INTVAL (operands[5]) == 1)
 std::swap (operands[3], operands[4]);
+
+  operands[3] = force_reg (mode, operands[3]);
+  if (operands[4] == CONST0_RTX (mode))
+operands[4] = force_reg (mode, operands[4]);
+
   enum rtx_code code = INTVAL (operands[5]) ? GT : EQ;
   emit_move_insn (operands[0], gen_rtx_fmt_ee (code, mode,
   operands[3], operands[4]));
diff --git a/gcc/testsuite/gcc.target/i386/pr115517.c 
b/gcc/testsuite/gcc.target/i386/pr115517.c
new file mode 100644
index 000..e91d2c23a6b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115517.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 4 } } */
+/* { dg-final { scan-assembler-not {(?n)%k[0-9]} } } */
+
+typedef char v16qi __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+
+v16qi
+foo (v16qi a)
+{
+  v16qi b = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  return a == b;
+}
+
+v8hi
+foo2 (v8hi a)
+{
+  v8hi b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return a == b;
+}
+
+v4si
+foo3 (v4si a)
+{
+  v4si b = {0, 0, 0, 0};
+  return a == b;
+}
+
+v2di
+foo4 (v2di a)
+{
+  v2di b = {0, 0};
+  return a == b;
+}
+


[gcc r15-3558] Don't force_reg operands[3] when it's not const0_rtx.

2024-09-09 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:c726a6643125a59e2ba6f992924a2d0098104578

commit r15-3558-gc726a6643125a59e2ba6f992924a2d0098104578
Author: liuhongt 
Date:   Fri Sep 6 15:03:16 2024 +0800

Don't force_reg operands[3] when it's not const0_rtx.

It fixes the regression by

a51f2fc0d80869ab079a93cc3858f24a1fd28237 is the first bad commit
commit a51f2fc0d80869ab079a93cc3858f24a1fd28237
Author: liuhongt 
Date:   Wed Sep 4 15:39:17 2024 +0800

Handle const0_operand for *avx2_pcmp3_1.

caused

FAIL: gcc.target/i386/pr59539-1.c scan-assembler-times vmovdqu|vmovups 1

To reproduce:

$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/pr59539-1.c --target_board='unix{-m32\ 
-march=cascadelake}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/pr59539-1.c --target_board='unix{-m64\ 
-march=cascadelake}'"

gcc/ChangeLog:

* config/i386/sse.md (*avx2_pcmp3_1): Don't force_reg
operands[3] when it's not const0_rtx.

Diff:
---
 gcc/config/i386/sse.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1946d3513be1..1ae61182d0cc 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17929,7 +17929,8 @@
   if (INTVAL (operands[5]) == 1)
 std::swap (operands[3], operands[4]);
 
-  operands[3] = force_reg (mode, operands[3]);
+  if (operands[3] == CONST0_RTX (mode))
+operands[3] = force_reg (mode, operands[3]);
   if (operands[4] == CONST0_RTX (mode))
 operands[4] = force_reg (mode, operands[4]);


[gcc r15-3579] Enable tune fuse_move_and_alu for GNR.

2024-09-10 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:f80e4ba94e41410219bdcdb1a0f204ea3f148666

commit r15-3579-gf80e4ba94e41410219bdcdb1a0f204ea3f148666
Author: liuhongt 
Date:   Tue Sep 10 15:04:58 2024 +0800

Enable tune fuse_move_and_alu for GNR.

According to Intel Software Optimization Manual[1], the Redwood cove
microarchitecture supports LD+OP and MOV+OP macro fusions.

The patch enables MOV+OP tune for GNR.

[1] 
https://www.intel.com/content/www/us/en/content-details/814198/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_FUSE_MOV_AND_ALU): Enable
for GNR and GNR-D.

Diff:
---
 gcc/config/i386/x86-tune.def | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index d7e2ad7fd250..3d123da95f0c 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -153,7 +153,8 @@ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, 
"fuse_alu_and_branch",
 /* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov
and the destination is used by alu.  alu must be one of
ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR.  */
-DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5)
+DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu",
+m_ZNVER5 | m_GRANITERAPIDS | m_GRANITERAPIDS_D)
 
 /*/
 /* Function prologue, epilogue and function calling sequences.   */


[gcc r15-1638] Optimize a < 0 ? -1 : 0 to (signed)a >> 31.

2024-06-25 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:aac00d09859cc5934bd0f7493d537b8430337773

commit r15-1638-gaac00d09859cc5934bd0f7493d537b8430337773
Author: liuhongt 
Date:   Thu Jun 20 12:41:13 2024 +0800

Optimize a < 0 ? -1 : 0 to (signed)a >> 31.

Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
and x < 0 ? 1 : 0 into (unsigned) x >> 31.

Move the optimization done in ix86_expand_int_vcond to match.pd.

gcc/ChangeLog:

PR target/114189
* match.pd: Simplify a < 0 ? -1 : 0 to (signed) a >> 31 and
a < 0 ? 1 : 0 to (unsigned) a >> 31 for vector integer type.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-pr115517.c: New test.
* gcc.target/i386/avx512-pr115517.c: New test.
* g++.target/i386/avx2-pr115517.C: New test.
* g++.target/i386/avx512-pr115517.C: New test.
* g++.dg/tree-ssa/pr88152-1.C: Adjust testcase.

Diff:
---
 gcc/match.pd| 30 +++
 gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C   |  2 +-
 gcc/testsuite/g++.target/i386/avx2-pr115517.C   | 60 +
 gcc/testsuite/g++.target/i386/avx512-pr115517.C | 70 +
 gcc/testsuite/gcc.target/i386/avx2-pr115517.c   | 33 
 gcc/testsuite/gcc.target/i386/avx512-pr115517.c | 70 +
 6 files changed, 264 insertions(+), 1 deletion(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 3d0689c9312..cf8a399a744 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5927,6 +5927,36 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(if (VECTOR_INTEGER_TYPE_P (type)
&& target_supports_op_p (type, MINMAX, optab_vector))
 (minmax @0 @1
+
+/* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
+   and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
+(simplify
+  (vec_cond (lt @0 integer_zerop) integer_all_onesp integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
+   && !TYPE_UNSIGNED (TREE_TYPE (@0))
+   && tree_nop_conversion_p (type, TREE_TYPE (@0))
+   && target_supports_op_p (TREE_TYPE (@0), RSHIFT_EXPR, optab_scalar))
+(with
+  {
+   unsigned int prec = element_precision (TREE_TYPE (@0));
+  }
+(view_convert
+  (rshift @0 { build_int_cst (integer_type_node, prec - 1);})
+
+(simplify
+  (vec_cond (lt @0 integer_zerop) integer_onep integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
+   && !TYPE_UNSIGNED (TREE_TYPE (@0))
+   && tree_nop_conversion_p (type, TREE_TYPE (@0)))
+(with
+ {
+   unsigned int prec = element_precision (TREE_TYPE (@0));
+   tree utype = unsigned_type_for (TREE_TYPE (@0));
+ }
+ (if (target_supports_op_p (utype, RSHIFT_EXPR, optab_scalar))
+  (view_convert
+   (rshift (view_convert:utype @0)
+   { build_int_cst (integer_type_node, prec - 1);}))
 #endif
 
 (for cnd (cond vec_cond)
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
index 423ec897c1d..21299b886f0 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
@@ -1,7 +1,7 @@
 // PR target/88152
 // { dg-do compile }
 // { dg-options "-O2 -std=c++14 -fdump-tree-forwprop1" }
-// { dg-final { scan-tree-dump-times " (?:<|>=) \{ 0\[, ]" 120 "forwprop1" } }
+// { dg-final { scan-tree-dump-times " (?:(?:<|>=) \{ 0\[, \]|>> 
(?:7|15|31|63))" 120 "forwprop1" } }
 
 template 
 using V [[gnu::vector_size (sizeof (T) * N)]] = T;
diff --git a/gcc/testsuite/g++.target/i386/avx2-pr115517.C 
b/gcc/testsuite/g++.target/i386/avx2-pr115517.C
new file mode 100644
index 000..ec000c57542
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/avx2-pr115517.C
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vpsrlq" 2 } } */
+/* { dg-final { scan-assembler-times "vpsrld" 2 } } */
+/* { dg-final { scan-assembler-times "vpsrlw" 2 } } */
+
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+
+v8hi
+foo (v8hi a)
+{
+  v8hi const1_op = __extension__(v8hi){1,1,1,1,1,1,1,1};
+  v8hi const0_op = __extension__(v8hi){0,0,0,0,0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v16hi
+foo2 (v16hi a)
+{
+  v16hi const1_op = __extension__(v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
+  v16hi const0_op = __extension__(v16hi){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v4si
+foo3 (v4si a)
+{
+  v4si const1_op = __extension__(v4si){1,1,1,1};
+  v4si const0_op = __extension__(v4si){0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v8si
+foo4 (v8si a)
+{
+  v8si const1_op = __extension__(v8

[gcc r15-1673] Fix wrong cost of MEM when addr is a lea.

2024-06-26 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:b8153b5417bed02f47354a14ad36100785dfdc47

commit r15-1673-gb8153b5417bed02f47354a14ad36100785dfdc47
Author: liuhongt 
Date:   Mon Jun 24 17:53:22 2024 +0800

Fix wrong cost of MEM when addr is a lea.

416.gamess regressed 4-6% on x86_64 since my r15-882-g1d6199e5f8c1c0.
That commit adjusted the rtx_cost of MEM to reduce the cost of (add op0 disp).
But the cost of ADDR could be cheaper than XEXP (addr, 0) when it's a lea.
That is the case in the PR. The patch adjusts rtx_cost to only handle reg
+ disp; other forms are basically all LEA, which doesn't have the
additional cost of ADD.

gcc/ChangeLog:

PR target/115462
* config/i386/i386.cc (ix86_rtx_costs): Make cost of MEM (reg +
disp) just a little bit more than MEM (reg).

gcc/testsuite/ChangeLog:
* gcc.target/i386/pr115462.c: New test.

Diff:
---
 gcc/config/i386/i386.cc  |  5 -
 gcc/testsuite/gcc.target/i386/pr115462.c | 22 ++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1f71ed04be6..92e3c67112e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22154,7 +22154,10 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 address_cost should be used, but it reduce cost too much.
 So current solution is make constant disp as cheap as possible.  */
  if (GET_CODE (addr) == PLUS
- && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
+ && x86_64_immediate_operand (XEXP (addr, 1), Pmode)
+ /* Only hanlde (reg + disp) since other forms of addr are mostly 
LEA,
+there's no additional cost for the plus of disp.  */
+ && register_operand (XEXP (addr, 0), Pmode))
{
  *total += 1;
  *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
diff --git a/gcc/testsuite/gcc.target/i386/pr115462.c 
b/gcc/testsuite/gcc.target/i386/pr115462.c
new file mode 100644
index 000..ad50a6382bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115462.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -fno-tree-vectorize -fno-pic" } */
+/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 3 } 
} */
+
+int
+foo (long indx, long indx2, long indx3, long indx4, long indx5, long indx6, 
long n, int* q)
+{
+  static int p1[1];
+  int* p2 = p1 + 1000;
+  int* p3 = p1 + 4000;
+  int* p4 = p1 + 8000;
+
+  for (long i = 0; i != n; i++)
+{
+  /* scan for  movl%edi, p1.0+3996(,%rax,4),
+p1.0+3996 should be propagted into the loop.  */
+  p2[indx++] = q[indx++];
+  p3[indx2++] = q[indx2++];
+  p4[indx3++] = q[indx3++];
+}
+  return p1[indx6] + p1[indx5];
+}


[gcc r15-1733] Define mask as extern instead of uninitialized local variables.

2024-06-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:5e1a9f4ccff390ae79a9b9d0d39b325f2b4ea925

commit r15-1733-g5e1a9f4ccff390ae79a9b9d0d39b325f2b4ea925
Author: liuhongt 
Date:   Wed Jun 26 11:17:46 2024 +0800

Define mask as extern instead of uninitialized local variables.

The testcases are supposed to scan for vpopcnt{b,w,d,q} operations
with a k mask, but the mask is defined as an uninitialized local variable,
which is set to 0 in the RTL expand phase.
It is then simplified away by late_combine, which causes the
scan-assembler failure.
Move the definition of the mask outside the function to make the testcases more stable.

gcc/testsuite/ChangeLog:

PR target/115610
* gcc.target/i386/avx512bitalg-vpopcntb.c: Define mask as
extern instead of uninitialized local variables.
* gcc.target/i386/avx512bitalg-vpopcntbvl.c: Ditto.
* gcc.target/i386/avx512bitalg-vpopcntw.c: Ditto.
* gcc.target/i386/avx512bitalg-vpopcntwvl.c: Ditto.
* gcc.target/i386/avx512vpopcntdq-vpopcntd.c: Ditto.
* gcc.target/i386/avx512vpopcntdq-vpopcntq.c: Ditto.

Diff:
---
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c| 3 +--
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c  | 4 ++--
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c| 2 +-
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c  | 4 ++--
 gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c | 5 +++--
 gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c
index 44b82c0519d..66d24107c26 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c
@@ -7,10 +7,9 @@
 #include 
 
 extern __m512i z, z1;
-
+extern __mmask16 msk;
 int foo ()
 {
-  __mmask16 msk;
   __m512i c = _mm512_popcnt_epi8 (z);
   asm volatile ("" : "+v" (c));
   c = _mm512_mask_popcnt_epi8 (z1, msk, z);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c
index 8c2dfaba9c6..8ab05653f7c 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c
@@ -11,11 +11,11 @@
 
 extern __m256i y, y_1;
 extern __m128i x, x_1;
+extern __mmask32 msk32;
+extern __mmask16 msk16;
 
 int foo ()
 {
-  __mmask32 msk32;
-  __mmask16 msk16;
   __m256i c256 = _mm256_popcnt_epi8 (y);
   asm volatile ("" : "+v" (c256));
   c256 = _mm256_mask_popcnt_epi8 (y_1, msk32, y);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c
index 2ef8589f6c1..c741bf48a51 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c
@@ -7,10 +7,10 @@
 #include 
 
 extern __m512i z, z1;
+extern __mmask16 msk;
 
 int foo ()
 {
-  __mmask16 msk;
   __m512i c = _mm512_popcnt_epi16 (z);
   asm volatile ("" : "+v" (c));
   c = _mm512_mask_popcnt_epi16 (z1, msk, z);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c
index c976461b12e..79bb3c31e85 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c
@@ -11,11 +11,11 @@
 
 extern __m256i y, y_1;
 extern __m128i x, x_1;
+extern __mmask16 msk16;
+extern __mmask8 msk8;
 
 int foo ()
 {
-  __mmask16 msk16;
-  __mmask8 msk8;
   __m256i c256 = _mm256_popcnt_epi16 (y);
   asm volatile ("" : "+v" (c256));
   c256 = _mm256_mask_popcnt_epi16 (y_1, msk16, y);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c 
b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c
index b4d82f97032..776a4753d8e 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c
@@ -15,11 +15,12 @@
 extern __m128i x, x_1;
 extern __m256i y, y_1;
 extern __m512i z, z_1;
+extern  __mmask16 msk;
+extern  __mmask8 msk8;
+
 
 int foo ()
 {
-  __mmask16 msk;
-  __mmask8 msk8;
   __m128i a = _mm_popcnt_epi32 (x);
   asm volatile ("" : "+v" (a));
   a = _mm_mask_popcnt_epi32 (x_1, msk8, x);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c 
b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c
index e87d6c999b6..c6314ac5deb 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c
@@ -15,10 +15,10 @@
 extern __m128i x, x_1;
 extern __m256i y, y_1;
 extern __m512i z, z_1;
+extern __mmask8 msk; 
 
 int foo ()
 {
-  __mmask8 msk; 
   __m128i a = _mm_popcnt_epi64 (x);
   asm volatile ("" : "+v" (a));
   a = _mm_mask_popcnt_epi64 (x_1, msk, x);


[gcc r15-1734] Extend lshifrtsi3_1_zext to ?k alternative.

2024-06-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:8e1fa107a63b2e160b6bf69de4fe163dd3cebd80

commit r15-1734-g8e1fa107a63b2e160b6bf69de4fe163dd3cebd80
Author: liuhongt 
Date:   Wed Jun 26 13:07:31 2024 +0800

Extend lshifrtsi3_1_zext to ?k alternative.

late_combine will combine lshift + zero into *lshifrtsi3_1_zext, which
causes an extra mov between gpr and kmask; add ?k to the pattern.

gcc/ChangeLog:

PR target/115610
* config/i386/i386.md (*<insn>si3_1_zext): Add alternative ?k,
enable it only for lshiftrt and under avx512bw.
* config/i386/sse.md (*klshrsi3_1_zext): New define_insn, and
add corresponding define_split after it.

Diff:
---
 gcc/config/i386/i386.md | 19 +--
 gcc/config/i386/sse.md  | 28 
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b6ccb1e798d..59a889da304 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16836,10 +16836,10 @@
(set_attr "mode" "SI")])
 
 (define_insn "*si3_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r,?k")
(zero_extend:DI
- (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "cI,r,cI"
+ (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" 
"0,rm,rm,k")
+ (match_operand:QI 2 "nonmemory_operand" 
"cI,r,cI,I"
(clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT
&& ix86_binary_operator_ok (, SImode, operands, TARGET_APX_NDD)"
@@ -16850,6 +16850,8 @@
 case TYPE_ISHIFTX:
   return "#";
 
+case TYPE_MSKLOG:
+  return "#";
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
@@ -16860,8 +16862,8 @@
   : "{l}\t{%2, %k0|%k0, %2}";
 }
 }
-  [(set_attr "isa" "*,bmi2,apx_ndd")
-   (set_attr "type" "ishift,ishiftx,ishift")
+  [(set_attr "isa" "*,bmi2,apx_ndd,avx512bw")
+   (set_attr "type" "ishift,ishiftx,ishift,msklog")
(set (attr "length_immediate")
  (if_then_else
(and (match_operand 2 "const1_operand")
@@ -16869,7 +16871,12 @@
 (match_test "optimize_function_for_size_p (cfun)")))
(const_string "0")
(const_string "*")))
-   (set_attr "mode" "SI")])
+   (set_attr "mode" "SI")
+   (set (attr "enabled")
+   (if_then_else
+ (eq_attr "alternative" "3")
+ (symbol_ref " == LSHIFTRT && TARGET_AVX512BW")
+ (const_string "*")))])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
 (define_split
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a94ec3c441f..3db4f374b9b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2183,6 +2183,34 @@
 (match_dup 2)))
   (unspec [(const_int 0)] UNSPEC_MASKOP)])])
 
+(define_insn "*klshrsi3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+   (zero_extend:DI
+ (lshiftrt:SI (match_operand:SI 1 "register_operand" "k")
+  (match_operand 2 "const_0_to_31_operand" "I"
+  (unspec [(const_int 0)] UNSPEC_MASKOP)]
+  "TARGET_AVX512BW"
+  "kshiftrd\t{%2, %1, %0|%0, %1, %2}"
+[(set_attr "type" "msklog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "SI")])
+
+(define_split
+  [(set (match_operand:DI 0 "mask_reg_operand")
+   (zero_extend:DI
+ (lshiftrt:SI
+   (match_operand:SI 1 "mask_reg_operand")
+   (match_operand 2 "const_0_to_31_operand"
+(clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW && reload_completed"
+  [(parallel
+ [(set (match_dup 0)
+  (zero_extend:DI
+(lshiftrt:SI
+  (match_dup 1)
+  (match_dup 2
+  (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "ktest"
   [(set (reg:CC FLAGS_REG)
(unspec:CC


[gcc r15-1735] Enable flate-combine.

2024-06-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:e62ea4fb8ffcab06ddd02f26db91b29b7270743f

commit r15-1735-ge62ea4fb8ffcab06ddd02f26db91b29b7270743f
Author: liuhongt 
Date:   Wed Jun 26 13:52:24 2024 +0800

Enable flate-combine.

Move pass_stv2 and pass_rpad after pre_reload pass_late_combine, also
define target_insn_cost to prevent post_reload pass_late_combine from
reverting the optimization done in pass_rpad.

Adjust testcases since pass_late_combine generates better code but
breaks scan assembly.

i.e.
Under 32-bit target, gcc used to generate broadcast from stack and
then do the real operation.
After flate_combine, they're combined into embedded broadcast
operations.

gcc/ChangeLog:

* config/i386/i386-features.cc (ix86_rpad_gate): New function.
* config/i386/i386-options.cc (ix86_override_options_after_change):
Don't disable flate_combine.
* config/i386/i386-passes.def: Move pass_stv2 and pass_rpad
after pre_reload pass_late_combine.
* config/i386/i386-protos.h (ix86_rpad_gate): New declare.
* config/i386/i386.cc (ix86_insn_cost): New function.
(TARGET_INSN_COST): Define.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512f-broadcast-pr87767-1.c: Adjust
testcase.
* gcc.target/i386/avx512f-broadcast-pr87767-5.c: Ditto.
* gcc.target/i386/avx512f-fmadd-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512f-fmsub-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Ditto.
* gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Ditto.
* gcc.target/i386/pr91333.c: Ditto.
* gcc.target/i386/vect-strided-4.c: Ditto.

Diff:
---
 gcc/config/i386/i386-features.cc   | 16 +++-
 gcc/config/i386/i386-options.cc|  4 
 gcc/config/i386/i386-passes.def|  4 ++--
 gcc/config/i386/i386-protos.h  |  1 +
 gcc/config/i386/i386.cc| 18 ++
 .../gcc.target/i386/avx512f-broadcast-pr87767-1.c  |  4 ++--
 .../gcc.target/i386/avx512f-broadcast-pr87767-5.c  |  1 -
 gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c |  2 +-
 gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c |  2 +-
 .../gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c  |  2 +-
 .../gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c  |  2 +-
 .../gcc.target/i386/avx512vl-broadcast-pr87767-1.c |  4 ++--
 .../gcc.target/i386/avx512vl-broadcast-pr87767-5.c |  2 --
 gcc/testsuite/gcc.target/i386/pr91333.c|  2 +-
 gcc/testsuite/gcc.target/i386/vect-strided-4.c |  2 +-
 15 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 607d1991460..fc224ed06b0 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -2995,6 +2995,16 @@ make_pass_insert_endbr_and_patchable_area (gcc::context 
*ctxt)
   return new pass_insert_endbr_and_patchable_area (ctxt);
 }
 
+bool
+ix86_rpad_gate ()
+{
+  return (TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ && TARGET_SSE_MATH
+ && optimize
+ && optimize_function_for_speed_p (cfun));
+}
+
 /* At entry of the nearest common dominator for basic blocks with
conversions/rcp/sqrt/rsqrt/round, generate a single
vxorps %xmmN, %xmmN, %xmmN
@@ -3232,11 +3242,7 @@ public:
   /* opt_pass methods: */
   bool gate (function *) final override
 {
-  return (TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
- && TARGET_SSE_MATH
- && optimize
- && optimize_function_for_speed_p (cfun));
+  return ix86_rpad_gate ();
 }
 
   unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 9c12d498928..1ef2c71a7a2 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1944,10 +1944,6 @@ ix86_override_options_after_change (void)
flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
 }
 
-  /* Late combine tends to undo some of the effects of STV and RPAD,
- by combining instructions back to their original form.  */
-  if (!OPTION_SET_P (flag_late_combine_instructions))
-flag_late_combine_instructions = 0;
 }
 
 /* Clear stack slot assignments remembered from previous functions.
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 7d96766f7b9..2d29f65da88 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -25,11 +25,11 @@ along with GCC; see the file COPYING3.  If not see
  */
 
   INSERT_P

[gcc r15-1736] Add more splitters to match (unspec [op1 op2 (gt op3 constm1_operand)] UNSPEC_BLENDV)

2024-06-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:2e2dfa0095c3326a0a5fc2ff175918b42eeb044f

commit r15-1736-g2e2dfa0095c3326a0a5fc2ff175918b42eeb044f
Author: liuhongt 
Date:   Mon Jun 17 17:16:46 2024 +0800

Add more splitters to match (unspec [op1 op2 (gt op3 constm1_operand)] 
UNSPEC_BLENDV)

These define_insn_and_split are needed after vcond{,u,eq} is obsolete.

gcc/ChangeLog:

PR target/115517
* config/i386/sse.md
(*_blendv_gt): New
define_insn_and_split.
(*_blendv_gtint):
Ditto.
(*_blendv_not_gtint):
Ditto.
(*_pblendvb_gt): Ditto.
(*_pblendvb_gt_subreg_not): Ditto.

Diff:
---
 gcc/config/i386/sse.md | 130 +
 1 file changed, 130 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3db4f374b9b..423f13d3982 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -23079,6 +23079,32 @@
(set_attr "btver2_decode" "vector,vector,vector") 
(set_attr "mode" "")])
 
+(define_insn_and_split "*_blendv_gt"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
+   (unspec:VF_128_256
+ [(match_operand:VF_128_256 1 "vector_operand" "Yrja,*xja,xjm")
+  (match_operand:VF_128_256 2 "register_operand" "0,0,x")
+  (gt:VF_128_256
+(match_operand: 3 "register_operand" "Yz,Yz,x")
+(match_operand: 4 "vector_all_ones_operand"))]
+ UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (unspec:VF_128_256
+[(match_dup 2) (match_dup 1) (match_dup 3)] UNSPEC_BLENDV))]
+  "operands[3] = gen_lowpart (mode, operands[3]);"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "addr" "gpr16")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "btver2_decode" "vector,vector,vector")
+   (set_attr "mode" "")])
+
 (define_mode_attr ssefltmodesuffix
   [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")
(V2DF "pd") (V4DF "pd") (V4SF "ps") (V8SF "ps")])
@@ -23118,6 +23144,38 @@
(set_attr "btver2_decode" "vector,vector,vector") 
(set_attr "mode" "")])
 
+(define_insn_and_split 
"*_blendv_gtint"
+  [(set (match_operand: 0 "register_operand" "=Yr,*x,x")
+   (unspec:
+ [(match_operand: 1 "vector_operand" "Yrja,*xja,xjm")
+  (match_operand: 2 "register_operand" "0,0,x")
+  (subreg:
+(gt:VI48_AVX
+  (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x")
+  (match_operand:VI48_AVX 4 "vector_all_ones_operand")) 0)]
+ UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (unspec:
+[(match_dup 2) (match_dup 1) (match_dup 3)] UNSPEC_BLENDV))]
+{
+  operands[0] = gen_lowpart (mode, operands[0]);
+  operands[1] = gen_lowpart (mode, operands[1]);
+  operands[2] = gen_lowpart (mode, operands[2]);
+  operands[3] = gen_lowpart (mode, operands[3]);
+}
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "addr" "gpr16")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "btver2_decode" "vector,vector,vector")
+   (set_attr "mode" "")])
+
 ;; PR target/100738: Transform vpcmpeqd + vpxor + vblendvps to vblendvps for 
inverted mask;
 (define_insn_and_split 
"*_blendv_not_ltint"
   [(set (match_operand: 0 "register_operand")
@@ -23145,6 +23203,32 @@
   operands[3] = gen_lowpart (mode, operands[3]);
 })
 
+(define_insn_and_split 
"*_blendv_not_gtint"
+  [(set (match_operand: 0 "register_operand")
+   (unspec:
+ [(match_operand: 1 "vector_operand")
+  (match_operand: 2 "register_operand")
+  (subreg:
+(gt:VI48_AVX
+  (subreg:VI48_AVX
+  (not:
+(match_operand: 3 "register_operand")) 0)
+  (match_operand:VI48_AVX 4 "vector_all_ones_operand")) 0)]
+ UNSPEC_BLENDV))]
+  "TARGET_SSE4_1 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:
+[(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))]
+{
+  operands[0] = gen_lowpart (mode, operands[0]);
+  operands[2] = gen_lowpart (mode, operands[2]);
+  operands[1] = force_reg (mode,
+  gen_lowpart (mode, operands[1]));
+  operands[3] = gen_lowpart (mode, operands[3]);
+})
+
 (define_insn "_dp"
   [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
(unspec:VF_128_256
@@ -23299,6 +23383,30 @@
(set_attr "btver2_decode" "vector,vector,vector")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_pblendvb_gt"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x")
+   (unspe

[gcc r15-1737] Lower AVX512 kmask comparison back to AVX2 comparison when op_{true, false} is vector -1/0.

2024-06-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:b06a108f0fbffe12493b527224f6e4131a72beac

commit r15-1737-gb06a108f0fbffe12493b527224f6e4131a72beac
Author: liuhongt 
Date:   Tue Jun 18 14:03:42 2024 +0800

Lower AVX512 kmask comparison back to AVX2 comparison when op_{true,false} 
is vector -1/0.

gcc/ChangeLog
PR target/115517
* config/i386/sse.md
(*_cvtmask2_not): New pre_reload
splitter.
(*_cvtmask2_not): Ditto.
(*avx2_pcmp3_6): Ditto.
(*avx2_pcmp3_7): Ditto.

Diff:
---
 gcc/config/i386/sse.md | 97 ++
 1 file changed, 97 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 423f13d3982..3d790af3a2c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -10008,6 +10008,24 @@
   [(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_cvtmask2_not"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
+   (vec_merge:VI12_AVX512VL
+ (match_operand:VI12_AVX512VL 2 "const0_operand")
+ (match_operand:VI12_AVX512VL 3 "vector_all_ones_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 4)
+   (not: (match_dup 1)))
+   (set (match_dup 0)
+   (vec_merge:VI12_AVX512VL
+ (match_dup 3)
+ (match_dup 2)
+ (match_dup 4)))]
+  "operands[4] = gen_reg_rtx (mode);")
+
 (define_expand "_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand")
(vec_merge:VI48_AVX512VL
@@ -10046,6 +10064,24 @@
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_cvtmask2_not"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand")
+   (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "const0_operand")
+ (match_operand:VI48_AVX512VL 3 "vector_all_ones_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 4)
+   (not: (match_dup 1)))
+   (set (match_dup 0)
+   (vec_merge:VI48_AVX512VL
+ (match_dup 3)
+ (match_dup 2)
+ (match_dup 4)))]
+  "operands[4] = gen_reg_rtx (mode);")
+
 (define_insn "*_cvtmask2_pternlog_false_dep"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
(vec_merge:VI48_AVX512VL
@@ -17738,6 +17774,67 @@
 std::swap (operands[1], operands[2]);
 })
 
+(define_int_attr pcmp_usmin
+  [(UNSPEC_PCMP "smin") (UNSPEC_UNSIGNED_PCMP "umin")])
+
+(define_insn_and_split "*avx2_pcmp3_6"
+ [(set (match_operand:VI_128_256  0 "register_operand")
+   (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "vector_all_ones_operand")
+ (match_operand:VI_128_256 2 "const0_operand")
+ (unspec:
+   [(match_operand:VI_128_256 3 "nonimmediate_operand")
+(match_operand:VI_128_256 4 "nonimmediate_operand")
+(match_operand:SI 5 "const_0_to_7_operand")]
+UNSPEC_PCMP_ITER)))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()
+   && (INTVAL (operands[5]) == 2 || INTVAL (operands[5]) == 5)"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx dst_min = gen_reg_rtx (mode);
+
+  if (MEM_P (operands[3]) && MEM_P (operands[4]))
+operands[3] = force_reg (mode, operands[3]);
+  emit_insn (gen_3 (dst_min, operands[3], operands[4]));
+  rtx eq_op = INTVAL (operands[5]) == 2 ? operands[3] : operands[4];
+  emit_move_insn (operands[0], gen_rtx_EQ (mode, eq_op, dst_min));
+  DONE;
+})
+
+(define_insn_and_split "*avx2_pcmp3_7"
+ [(set (match_operand:VI_128_256  0 "register_operand")
+   (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "const0_operand")
+ (match_operand:VI_128_256 2 "vector_all_ones_operand")
+ (unspec:
+   [(match_operand:VI_128_256 3 "nonimmediate_operand")
+(match_operand:VI_128_256 4 "nonimmediate_operand")
+(match_operand:SI 5 "const_0_to_7_operand")]
+UNSPEC_PCMP_ITER)))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()
+ /* NE is commutative.  */
+   && (INTVAL (operands[5]) == 4
+ /* LE, 3 must be register.  */
+   || INTVAL (operands[5]) == 2
+ /* NLT aka GE, 4 must be register and we swap operands.  */
+   || INTVAL (operands[5]) == 5)"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (INTVAL (operands[5]) == 5)
+std::swap (operands[3], operands[4]);
+
+  if (MEM_P (operands[3]))
+operands[3] = force_reg (mode, operands[3]);
+  enum rtx_code code = INTVAL (operands[5]) != 4 ? GT : EQ;
+  emit_move_insn (operands[0], gen_rtx_fmt_ee (code, mode,
+  operands[3], operands[4]));
+  DONE;
+})
+
 (define_expand "_eq3"
   [(set (match_operand: 0 "register_operand")
(unspec:


[gcc r15-1739] Add more splitter for mskmov with avx512 comparison.

2024-06-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:3cb204046c0db899750aee9480af4f1953a40ac3

commit r15-1739-g3cb204046c0db899750aee9480af4f1953a40ac3
Author: liuhongt 
Date:   Wed Jun 19 13:12:00 2024 +0800

Add more splitters for movmsk with avx512 comparison.

gcc/ChangeLog:

PR target/115517
* config/i386/sse.md
(*_movmsk_lt_avx512): New
define_insn_and_split.
(*_movmsk_ext_lt_avx512):
Ditto.
(*_pmovmskb_lt_avx512): Ditto.
(*_pmovmskb_zext_lt_avx512): Ditto.
(*sse2_pmovmskb_ext_lt_avx512): Ditto.
(*pmovsk_kmask_v16qi_avx512): Ditto.
(*pmovsk_mask_v32qi_avx512): Ditto.
(*pmovsk_mask_cmp__avx512): Ditto.
(*pmovsk_ptest__avx512): Ditto.

Diff:
---
 gcc/config/i386/sse.md | 232 -
 1 file changed, 209 insertions(+), 23 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 694b4b8f07c..3ffa1881c83 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -10071,24 +10071,6 @@
   [(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
-(define_insn_and_split "*_cvtmask2_not"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
-   (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 2 "const0_operand")
- (match_operand:VI12_AVX512VL 3 "vector_all_ones_operand")
- (match_operand: 1 "register_operand")))]
-  "TARGET_AVX512BW && ix86_pre_reload_split ()"
-  "#"
-  "&& 1"
-  [(set (match_dup 4)
-   (not: (match_dup 1)))
-   (set (match_dup 0)
-   (vec_merge:VI12_AVX512VL
- (match_dup 3)
- (match_dup 2)
- (match_dup 4)))]
-  "operands[4] = gen_reg_rtx (mode);")
-
 (define_expand "_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand")
(vec_merge:VI48_AVX512VL
@@ -10128,10 +10110,10 @@
(set_attr "mode" "")])
 
 (define_insn_and_split "*_cvtmask2_not"
-  [(set (match_operand:VI48_AVX512VL 0 "register_operand")
-   (vec_merge:VI48_AVX512VL
- (match_operand:VI48_AVX512VL 2 "const0_operand")
- (match_operand:VI48_AVX512VL 3 "vector_all_ones_operand")
+  [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand")
+   (vec_merge:VI1248_AVX512VLBW
+ (match_operand:VI1248_AVX512VLBW 2 "const0_operand")
+ (match_operand:VI1248_AVX512VLBW 3 "vector_all_ones_operand")
  (match_operand: 1 "register_operand")))]
   "TARGET_AVX512F && ix86_pre_reload_split ()"
   "#"
@@ -10139,7 +10121,7 @@
   [(set (match_dup 4)
(not: (match_dup 1)))
(set (match_dup 0)
-   (vec_merge:VI48_AVX512VL
+   (vec_merge:VI1248_AVX512VLBW
  (match_dup 3)
  (match_dup 2)
  (match_dup 4)))]
@@ -21816,6 +21798,30 @@
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_movmsk_lt_avx512"
+  [(set (match_operand:SI 0 "register_operand" "=r,jr")
+   (unspec:SI
+ [(subreg:VF_128_256
+   (vec_merge:
+(match_operand: 3 "vector_all_ones_operand")
+(match_operand: 4 "const0_operand")
+(unspec:
+ [(match_operand: 1 "register_operand" "x,x")
+  (match_operand: 2 "const0_operand")
+  (const_int 1)]
+ UNSPEC_PCMP)) 0)]
+ UNSPEC_MOVMSK))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))]
+  "operands[1] = gen_lowpart (mode, operands[1]);"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "*_movmsk_ext_lt"
   [(set (match_operand:DI 0 "register_operand" "=r,jr")
(any_extend:DI
@@ -21835,6 +21841,31 @@
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "")])
 
+(define_insn_and_split 
"*_movmsk_ext_lt_avx512"
+  [(set (match_operand:DI 0 "register_operand" "=r,jr")
+   (any_extend:DI
+ (unspec:SI
+   [(subreg:VF_128_256
+ (vec_merge:
+  (match_operand: 3 "vector_all_ones_operand")
+  (match_operand: 4 "const0_operand")
+  (unspec:
+   [(match_operand: 1 "register_operand" "x,x")
+(match_operand: 2 "const0_operand")
+(const_int 1)]
+   UNSPEC_PCMP)) 0)]
+   UNSPEC_MOVMSK)))]
+  "TARGET_64BIT && TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (any_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
+  "operands[1] = gen_lowpart (mode, operands[1]);"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "*_movmsk_shift"
   [(set (match_operand:SI 0 "register_operand" "=r,jr")
(unspec:SI
@@ -22024,6 +22055,34 @@
(set_attr "prefix" "maybe_vex")
(set_attr "mode"

[gcc r15-1740] Adjust testcase for the regressed testcases after obsolete of vcond{, u, eq}.

2024-06-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:e94e6ee495d95f29355bbc017214228a5e367638

commit r15-1740-ge94e6ee495d95f29355bbc017214228a5e367638
Author: liuhongt 
Date:   Wed Jun 19 16:05:58 2024 +0800

Adjust the regressed testcases after vcond{,u,eq} became obsolete.

> Richard suggests that we implement the "obvious" transforms like
> inversion in the middle-end but if for example unsigned compares
> are not supported the us_minus + eq + negative trick isn't on
> that list.
>
> The main reason to restrict vec_cmp would be to avoid
> a <= b ? c : d going with an unsupported vec_cmp but instead
> do a > b ? d : c - the alternative is trying to fix this
> on the RTL side via combine.  I understand the non-native

Yes, I have a patch which can fix most regressions via pattern match
in combine.
Still there is a situation that is difficult to deal with, mainly the
optimization w/o sse4.1. Because pblendvb/blendvps/blendvpd only
exist under sse4.1, w/o sse4.1, it takes 3
instructions (pand,pandn,por) to simulate the vcond_mask, and the
combine matches up to 4 instructions, which makes it currently
impossible to use the combine to recover those optimizations in the
vcond{,u,eq}, i.e. min/max.

In the case of sse 4.1 and above, there is basically no regression anymore.

the regression testcases w/o sse4.1

FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++14  scan-assembler-times 
pcmpeqb 2
FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++17  scan-assembler-times 
pcmpeqb 2
FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++20  scan-assembler-times 
pcmpeqb 2
FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++98  scan-assembler-times 
pcmpeqb 2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++14  scan-assembler-times 
pcmpeqw 2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++17  scan-assembler-times 
pcmpeqw 2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++20  scan-assembler-times 
pcmpeqw 2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++98  scan-assembler-times 
pcmpeqw 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++14  scan-assembler-times 
pcmpeqb 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++17  scan-assembler-times 
pcmpeqb 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++20  scan-assembler-times 
pcmpeqb 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++98  scan-assembler-times 
pcmpeqb 2
FAIL: gcc.target/i386/pr88540.c scan-assembler minpd

gcc/testsuite/ChangeLog:

PR target/115517
* g++.target/i386/pr100637-1b.C: Add xfail and -mno-sse4.1.
* g++.target/i386/pr100637-1w.C: Ditto.
* g++.target/i386/pr103861-1.C: Ditto.
* gcc.target/i386/pr88540.c: Ditto.
* gcc.target/i386/pr103941-2.c: Add -mno-avx512f.
* g++.target/i386/sse4_1-pr100637-1b.C: New test.
* g++.target/i386/sse4_1-pr100637-1w.C: New test.
* g++.target/i386/sse4_1-pr103861-1.C: New test.
* gcc.target/i386/sse4_1-pr88540.c: New test.

Diff:
---
 gcc/testsuite/g++.target/i386/pr100637-1b.C|  4 ++--
 gcc/testsuite/g++.target/i386/pr100637-1w.C|  4 ++--
 gcc/testsuite/g++.target/i386/pr103861-1.C |  4 ++--
 gcc/testsuite/g++.target/i386/sse4_1-pr100637-1b.C | 17 +
 gcc/testsuite/g++.target/i386/sse4_1-pr100637-1w.C | 17 +
 gcc/testsuite/g++.target/i386/sse4_1-pr103861-1.C  | 17 +
 gcc/testsuite/gcc.target/i386/pr103941-2.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr88540.c|  4 ++--
 gcc/testsuite/gcc.target/i386/sse4_1-pr88540.c | 10 ++
 9 files changed, 70 insertions(+), 9 deletions(-)

diff --git a/gcc/testsuite/g++.target/i386/pr100637-1b.C 
b/gcc/testsuite/g++.target/i386/pr100637-1b.C
index 35b5df7c9dd..dccb8f5e712 100644
--- a/gcc/testsuite/g++.target/i386/pr100637-1b.C
+++ b/gcc/testsuite/g++.target/i386/pr100637-1b.C
@@ -1,6 +1,6 @@
 /* PR target/100637 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2" } */
+/* { dg-options "-O2 -msse2 -mno-sse4.1" } */
 
 typedef unsigned char __attribute__((__vector_size__ (4))) __v4qu;
 typedef char __attribute__((__vector_size__ (4))) __v4qi;
@@ -13,5 +13,5 @@ __v4qu us (__v4qi a, __v4qi b) { return (a > b) ? au : bu; }
 __v4qi su (__v4qu a, __v4qu b) { return (a > b) ? as : bs; }
 __v4qi ss (__v4qi a, __v4qi b) { return (a > b) ? as : bs; }
 
-/* { dg-final { scan-assembler-times "pcmpeqb" 2 } } */
+/* { dg-final { scan-assembler-times "pcmpeqb" 2 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times "pcmpgtb" 2 } } */
diff --git a/gcc/testsuite/g++.target/i386/pr100637-1w.C 
b/gcc/testsuite/g++.target/i386/pr100637-1w.C
index a3ed06fddee..a0aab62db33 100644
--- a/gcc/testsuite/g++.target/i386/pr100637-1w.C
+++ b/gcc/testsuite/g++.target/i386/pr100637-1w.C
@@ -1,6 +1,6 @@
 /* PR target/100637 */
 /* { dg-do compile } */
-/* { dg-opti

[gcc r15-1738] Match IEEE min/max with UNSPEC_IEEE_{MIN,MAX}.

2024-06-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:09737d9605521df9232d9990006c44955064f44e

commit r15-1738-g09737d9605521df9232d9990006c44955064f44e
Author: liuhongt 
Date:   Tue Jun 18 15:52:02 2024 +0800

Match IEEE min/max with UNSPEC_IEEE_{MIN,MAX}.

These versions of the min/max patterns implement exactly the operations
   min = (op1 < op2 ? op1 : op2)
   max = (!(op1 < op2) ? op1 : op2)

gcc/ChangeLog:
PR target/115517
* config/i386/sse.md (*minmax3_1): New pre_reload
define_insn_and_split.
(*minmax3_2): Ditto.

Diff:
---
 gcc/config/i386/sse.md | 63 ++
 1 file changed, 63 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3d790af3a2c..694b4b8f07c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3096,6 +3096,69 @@
(set_attr "prefix" "")
(set_attr "mode" "")])
 
+(define_insn_and_split "*minmax3_1"
+  [(set (match_operand:VFH 0 "register_operand")
+   (vec_merge:VFH
+ (match_operand:VFH 1 "nonimmediate_operand")
+ (match_operand:VFH 2 "nonimmediate_operand")
+ (unspec:
+   [(match_operand:VFH 3 "nonimmediate_operand")
+(match_operand:VFH 4 "nonimmediate_operand")
+(match_operand:SI 5 "const_0_to_31_operand")]
+UNSPEC_PCMP)))]
+  "TARGET_SSE && ix86_pre_reload_split ()
+   && ((rtx_equal_p (operands[1], operands[3])
+   && rtx_equal_p (operands[2], operands[4]))
+   || (rtx_equal_p (operands[1], operands[4])
+  && rtx_equal_p (operands[2], operands[3])))
+   && (INTVAL (operands[5]) == 1 || INTVAL (operands[5]) == 14)"
+   "#"
+   "&& 1"
+   [(const_int 0)]
+ {
+   int u = UNSPEC_IEEE_MIN;
+   if ((INTVAL (operands[5]) == 1 && rtx_equal_p (operands[1], operands[4]))
+   || (INTVAL (operands[5]) == 14 && rtx_equal_p (operands[1], 
operands[3])))
+ u = UNSPEC_IEEE_MAX;
+
+   if (MEM_P (operands[1]))
+ operands[1] = force_reg (mode, operands[1]);
+   rtvec v = gen_rtvec (2, operands[1], operands[2]);
+   rtx tmp = gen_rtx_UNSPEC (mode, v, u);
+   emit_move_insn (operands[0], tmp);
+   DONE;
+ })
+
+(define_insn_and_split "*minmax3_2"
+  [(set (match_operand:VF_128_256 0 "register_operand")
+   (unspec:VF_128_256
+ [(match_operand:VF_128_256 1 "nonimmediate_operand")
+  (match_operand:VF_128_256 2 "nonimmediate_operand")
+  (lt:VF_128_256
+(match_operand:VF_128_256 3 "nonimmediate_operand")
+(match_operand:VF_128_256 4 "nonimmediate_operand"))]
+UNSPEC_BLENDV))]
+  "TARGET_SSE && ix86_pre_reload_split ()
+   && ((rtx_equal_p (operands[1], operands[3])
+   && rtx_equal_p (operands[2], operands[4]))
+   || (rtx_equal_p (operands[1], operands[4])
+  && rtx_equal_p (operands[2], operands[3])))"
+   "#"
+   "&& 1"
+   [(const_int 0)]
+ {
+   int u = UNSPEC_IEEE_MIN;
+   if (rtx_equal_p (operands[1], operands[3]))
+ u = UNSPEC_IEEE_MAX;
+
+   if (MEM_P (operands[2]))
+ force_reg (mode, operands[2]);
+   rtvec v = gen_rtvec (2, operands[2], operands[1]);
+   rtx tmp = gen_rtx_UNSPEC (mode, v, u);
+   emit_move_insn (operands[0], tmp);
+   DONE;
+ })
+
 ;; These versions of the min/max patterns implement exactly the operations
 ;;   min = (op1 < op2 ? op1 : op2)
 ;;   max = (!(op1 < op2) ? op1 : op2)


[gcc r15-1741] Optimize a < 0 ? -1 : 0 to (signed)a >> 31.

2024-06-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:2ccdd0f22312a14ac64bf944fdc4f8e7532eb0eb

commit r15-1741-g2ccdd0f22312a14ac64bf944fdc4f8e7532eb0eb
Author: liuhongt 
Date:   Thu Jun 20 12:41:13 2024 +0800

Optimize a < 0 ? -1 : 0 to (signed)a >> 31.

Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
and x < 0 ? 1 : 0 into (unsigned) x >> 31.

Add define_insn_and_split for the optimization done in
ix86_expand_int_vcond.

gcc/ChangeLog:

PR target/115517
* config/i386/sse.md ("*ashr3_1"): New
define_insn_and_split.
(*avx512_ashr3_1): Ditto.
(*avx2_lshr3_1): Ditto.
(*avx2_lshr3_2): Ditto and add 2 combine splitters after
it.
* config/i386/mmx.md (mmxscalarsize): New mode attribute.
(*mmx_ashr3_1): New define_insn_and_split.
("mmx_3"): Add a combine splitter after it.
(*mmx_ashrv2hi3_1): New define_insn_and_split, also add a
combine splitter after it.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr111023-2.c: Adjust testcase.
* gcc.target/i386/vect-div-1.c: Ditto.

Diff:
---
 gcc/config/i386/mmx.md | 52 +++
 gcc/config/i386/sse.md | 83 ++
 gcc/testsuite/gcc.target/i386/pr111023-2.c |  4 +-
 gcc/testsuite/gcc.target/i386/vect-div-1.c |  2 +-
 4 files changed, 138 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 359dc90628d..fca28df99a1 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -147,6 +147,14 @@
(V4HI "hi") (V2HI "hi")
(V8QI "qi")])
 
+(define_mode_attr mmxscalarsize
+  [(V1DI "64")
+   (V2SI "32") (V2SF "32")
+   (V4HF "16") (V4BF "16")
+   (V2HF "16") (V2BF "16")
+   (V4HI "16") (V2HI "16")
+   (V8QI "8")])
+
 (define_mode_attr Yv_Yw
   [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")])
 
@@ -3620,6 +3628,17 @@
(const_string "0")))
(set_attr "mode" "DI,TI,TI")])
 
+(define_insn_and_split "*mmx_ashr3_1"
+  [(set (match_operand:MMXMODE24 0 "register_operand")
+   (lt:MMXMODE24
+ (match_operand:MMXMODE24 1 "register_operand")
+ (match_operand:MMXMODE24 2 "const0_operand")))]
+  "TARGET_MMX_WITH_SSE && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (ashiftrt:MMXMODE24 (match_dup 1) (match_dup 3)))]
+  "operands[3] = gen_int_mode ( - 1, DImode);")
+
 (define_expand "ashr3"
   [(set (match_operand:MMXMODE24 0 "register_operand")
 (ashiftrt:MMXMODE24
@@ -3646,6 +3665,17 @@
(const_string "0")))
(set_attr "mode" "DI,TI,TI")])
 
+(define_split
+  [(set (match_operand:MMXMODE248 0 "register_operand")
+   (and:MMXMODE248
+ (lt:MMXMODE248
+   (match_operand:MMXMODE248 1 "register_operand")
+   (match_operand:MMXMODE248 2 "const0_operand"))
+ (match_operand:MMXMODE248 3 "const1_operand")))]
+  "TARGET_MMX_WITH_SSE && ix86_pre_reload_split ()"
+  [(set (match_dup 0) (lshiftrt:MMXMODE248 (match_dup 1) (match_dup 4)))]
+  "operands[4] = gen_int_mode ( - 1, DImode);")
+
 (define_expand "3"
   [(set (match_operand:MMXMODE24 0 "register_operand")
 (any_lshift:MMXMODE24
@@ -3687,6 +3717,28 @@
(const_string "0")))
(set_attr "mode" "TI")])
 
+(define_insn_and_split "*mmx_ashrv2hi3_1"
+  [(set (match_operand:V2HI 0 "register_operand")
+   (lt:V2HI
+ (match_operand:V2HI 1 "register_operand")
+ (match_operand:V2HI 2 "const0_operand")))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (ashiftrt:V2HI (match_dup 1) (match_dup 3)))]
+  "operands[3] = gen_int_mode (15, DImode);")
+
+(define_split
+  [(set (match_operand:V2HI 0 "register_operand")
+   (and:V2HI
+ (lt:V2HI
+   (match_operand:V2HI 1 "register_operand")
+   (match_operand:V2HI 2 "const0_operand"))
+ (match_operand:V2HI 3 "const1_operand")))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  [(set (match_dup 0) (lshiftrt:V2HI (match_dup 1) (match_dup 4)))]
+  "operands[4] = gen_int_mode (15, DImode);")
+
 (define_expand "v8qi3"
   [(set (match_operand:V8QI 0 "register_operand")
(any_shift:V8QI (match_operand:V8QI 1 "register_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3ffa1881c83..1169e93453e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16923,6 +16923,17 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*ashr3_1"
+  [(set (match_operand:VI24_AVX2 0 "register_operand")
+   (lt:VI24_AVX2
+ (match_operand:VI24_AVX2 1 "register_operand")
+ (match_operand:VI24_AVX2 2 "const0_operand")))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (ashiftrt:VI24_AVX2 (match_dup 1) (match_dup 3)))]
+  "operands[3] = gen_int_mode ( - 1, DImode);")
+
 (define_

[gcc r15-1742] Remove vcond{, u, eq} expanders since they will be obsolete.

2024-06-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:55f80c690c5fa59836646565a9dee2a3f68374a0

commit r15-1742-g55f80c690c5fa59836646565a9dee2a3f68374a0
Author: liuhongt 
Date:   Mon Jun 24 09:19:01 2024 +0800

Remove vcond{,u,eq} expanders since they will be obsolete.

gcc/ChangeLog:

PR target/115517
* config/i386/mmx.md (vcondv2sf): Removed.
(vcond): Ditto.
(vcond): Ditto.
(vcondu): Ditto.
(vcondu): Ditto.
* config/i386/sse.md (vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcondv2di): Ditto.
(vcondu): Ditto.
(vcondu): Ditto.
(vcondu): Ditto.
(vconduv2di): Ditto.
(vcondeqv2di): Ditto.

Diff:
---
 gcc/config/i386/mmx.md |  97 --
 gcc/config/i386/sse.md | 213 -
 2 files changed, 310 deletions(-)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index fca28df99a1..94d3a6e5692 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1180,39 +1180,6 @@
   DONE;
 })
 
-(define_expand "vcondv2sf"
-  [(set (match_operand:V2FI 0 "register_operand")
-   (if_then_else:V2FI
- (match_operator 3 ""
-   [(match_operand:V2SF 4 "nonimmediate_operand")
-(match_operand:V2SF 5 "nonimmediate_operand")])
- (match_operand:V2FI 1 "general_operand")
- (match_operand:V2FI 2 "general_operand")))]
-  "TARGET_MMX_WITH_SSE && ix86_partial_vec_fp_math"
-{
-  rtx ops[6];
-  ops[5] = gen_reg_rtx (V4SFmode);
-  ops[4] = gen_reg_rtx (V4SFmode);
-  ops[3] = gen_rtx_fmt_ee (GET_CODE (operands[3]), VOIDmode, ops[4], ops[5]);
-  ops[2] = lowpart_subreg (mode,
-  force_reg (mode, operands[2]),
-  mode);
-  ops[1] = lowpart_subreg (mode,
-  force_reg (mode, operands[1]),
-  mode);
-  ops[0] = gen_reg_rtx (mode);
-
-  emit_insn (gen_movq_v2sf_to_sse (ops[5], operands[5]));
-  emit_insn (gen_movq_v2sf_to_sse (ops[4], operands[4]));
-
-  bool ok = ix86_expand_fp_vcond (ops);
-  gcc_assert (ok);
-
-  emit_move_insn (operands[0], lowpart_subreg (mode, ops[0],
-  mode));
-  DONE;
-})
-
 (define_insn "@sse4_1_insertps_"
   [(set (match_operand:V2FI 0 "register_operand" "=Yr,*x,v")
(unspec:V2FI
@@ -4041,70 +4008,6 @@
   DONE;
 })
 
-(define_expand "vcond"
-  [(set (match_operand:MMXMODE124 0 "register_operand")
-   (if_then_else:MMXMODE124
- (match_operator 3 ""
-   [(match_operand:MMXMODEI 4 "register_operand")
-(match_operand:MMXMODEI 5 "register_operand")])
- (match_operand:MMXMODE124 1)
- (match_operand:MMXMODE124 2)))]
-  "TARGET_MMX_WITH_SSE
-   && (GET_MODE_NUNITS (mode)
-   == GET_MODE_NUNITS (mode))"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "vcond"
-  [(set (match_operand:VI_16_32 0 "register_operand")
-   (if_then_else:VI_16_32
- (match_operator 3 ""
-   [(match_operand:VI_16_32 4 "register_operand")
-(match_operand:VI_16_32 5 "register_operand")])
- (match_operand:VI_16_32 1)
- (match_operand:VI_16_32 2)))]
-  "TARGET_SSE2"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "vcondu"
-  [(set (match_operand:MMXMODE124 0 "register_operand")
-   (if_then_else:MMXMODE124
- (match_operator 3 ""
-   [(match_operand:MMXMODEI 4 "register_operand")
-(match_operand:MMXMODEI 5 "register_operand")])
- (match_operand:MMXMODE124 1)
- (match_operand:MMXMODE124 2)))]
-  "TARGET_MMX_WITH_SSE
-   && (GET_MODE_NUNITS (mode)
-   == GET_MODE_NUNITS (mode))"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "vcondu"
-  [(set (match_operand:VI_16_32 0 "register_operand")
-   (if_then_else:VI_16_32
- (match_operator 3 ""
-   [(match_operand:VI_16_32 4 "register_operand")
-(match_operand:VI_16_32 5 "register_operand")])
- (match_operand:VI_16_32 1)
- (match_operand:VI_16_32 2)))]
-  "TARGET_SSE2"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
 (define_expand "vcond_mask_"
   [(set (match_operand:MMXMODE124 0 "register_operand")
(vec_merge:MMXMODE124
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1169e93453e..d71b0f2567e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4848,72 +4848,6 @@
   DONE;
 })
 
-(define_expand "vcond"
-  [(set (match_operand:V_512 0 "register_operand")
-   (if_then_else:V_512
- (match_operator 3 ""
-   [(match_operand:VF_512 4 "

[gcc r15-1806] Move runtime check into a separate function and guard it with target ("no-avx")

2024-07-03 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:239ad907b1fc08874042f8bea5f61eaf3ba2877d

commit r15-1806-g239ad907b1fc08874042f8bea5f61eaf3ba2877d
Author: liuhongt 
Date:   Wed Jul 3 14:47:33 2024 +0800

Move runtime check into a separate function and guard it with target 
("no-avx")

The patch avoids a SIGILL on non-AVX512 machines, which was caused by
kmovd being generated in the dynamic check.

gcc/testsuite/ChangeLog:

PR target/115748
* gcc.target/i386/avx512-check.h: Move runtime check into a
separate function and guard it with target ("no-avx").

Diff:
---
 gcc/testsuite/gcc.target/i386/avx512-check.h | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h 
b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0ad9064f637..71858a33dac 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -34,8 +34,9 @@ check_osxsave (void)
   return (ecx & bit_OSXSAVE) != 0;
 }
 
+__attribute__((noipa,target("no-avx")))
 int
-main ()
+avx512_runtime_support_p ()
 {
   unsigned int eax, ebx, ecx, edx;
 
@@ -100,6 +101,17 @@ main ()
   && (edx & bit_AVX512VP2INTERSECT)
 #endif
   && avx512f_os_support ())
+{
+  return 1;
+}
+
+  return 0;
+}
+
+int
+main ()
+{
+  if (avx512_runtime_support_p ())
 {
   DO_TEST ();
 #ifdef DEBUG


[gcc r15-1836] Use __builtin_cpu_supports instead of __get_cpuid_count.

2024-07-03 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:699087a16591adfdf21228876b6c48dbcd353faa

commit r15-1836-g699087a16591adfdf21228876b6c48dbcd353faa
Author: liuhongt 
Date:   Thu Jul 4 13:57:32 2024 +0800

Use __builtin_cpu_supports instead of __get_cpuid_count.

gcc/testsuite/ChangeLog:

PR target/115748
* gcc.target/i386/avx512-check.h: Use __builtin_cpu_supports
instead of __get_cpuid_count.

Diff:
---
 gcc/testsuite/gcc.target/i386/avx512-check.h | 46 
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h 
b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 71858a33dac..8ec1a7ccbae 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -38,69 +38,63 @@ __attribute__((noipa,target("no-avx")))
 int
 avx512_runtime_support_p ()
 {
-  unsigned int eax, ebx, ecx, edx;
-
-  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
-return 0;
-
   /* Run AVX512 test only if host has ISA support.  */
-  if (check_osxsave ()
-  && (ebx & bit_AVX512F)
+  if (__builtin_cpu_supports ("avx512f")
 #ifdef AVX512VL
-  && (ebx & bit_AVX512VL)
+  && __builtin_cpu_supports ("avx512vl")
 #endif
 #ifdef AVX512ER
-  && (ebx & bit_AVX512ER)
+  && __builtin_cpu_supports ("avx512er")
 #endif
 #ifdef AVX512CD
-  && (ebx & bit_AVX512CD)
+  && __builtin_cpu_supports ("avx512cd")
 #endif
 #ifdef AVX512DQ
-  && (ebx & bit_AVX512DQ)
+  && __builtin_cpu_supports ("avx512dq")
 #endif
 #ifdef AVX512BW
-  && (ebx & bit_AVX512BW)
+  && __builtin_cpu_supports ("avx512bw")
 #endif
 #ifdef AVX512IFMA
-  && (ebx & bit_AVX512IFMA)
+  && __builtin_cpu_supports ("avx512ifma")
 #endif
 #ifdef AVX512VBMI
-  && (ecx & bit_AVX512VBMI)
+  && __builtin_cpu_supports ("avx512vbmi")
 #endif
 #ifdef AVX5124FMAPS
-  && (edx & bit_AVX5124FMAPS)
+  && __builtin_cpu_supports ("avx5124fmaps")
 #endif
 #ifdef AVX5124VNNIW
-  && (edx & bit_AVX5124VNNIW)
+  && __builtin_cpu_supports ("avx5124vnniw")
 #endif
 #ifdef AVX512VPOPCNTDQ
-  && (ecx & bit_AVX512VPOPCNTDQ)
+  && __builtin_cpu_supports ("avx512vpopcntdq")
 #endif
 #ifdef AVX512BITALG
-  && (ecx & bit_AVX512BITALG)
+  && __builtin_cpu_supports ("avx512bitalg")
 #endif
 #ifdef GFNI
-  && (ecx & bit_GFNI)
+  && __builtin_cpu_supports ("gfni")
 #endif
 #ifdef AVX512VBMI2
-  && (ecx & bit_AVX512VBMI2)
+  && __builtin_cpu_supports ("avx512vbmi2")
 #endif
 #ifdef AVX512VNNI
-  && (ecx & bit_AVX512VNNI)
+  && __builtin_cpu_supports ("avx512vnni")
 #endif
 #ifdef AVX512FP16
-  && (edx & bit_AVX512FP16)
+  && __builtin_cpu_supports ("avx512fp16")
 #endif
 #ifdef VAES
-  && (ecx & bit_VAES)
+  && __builtin_cpu_supports ("vaes")
 #endif
 #ifdef VPCLMULQDQ
-  && (ecx & bit_VPCLMULQDQ)
+  && __builtin_cpu_supports ("vpclmulqdq")
 #endif
 #ifdef AVX512VP2INTERSECT
-  && (edx & bit_AVX512VP2INTERSECT)
+  && __builtin_cpu_supports ("avx512vp2intersect")
 #endif
-  && avx512f_os_support ())
+  )
 {
   return 1;
 }


[gcc r15-1888] x86: Update branch hint for Redwood Cove.

2024-07-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:a910c30c7c27cd0f6d2d2694544a09fb11d611b9

commit r15-1888-ga910c30c7c27cd0f6d2d2694544a09fb11d611b9
Author: H.J. Lu 
Date:   Tue Apr 26 11:08:55 2022 -0700

x86: Update branch hint for Redwood Cove.

According to Intel® 64 and IA-32 Architectures Optimization Reference
Manual[1], Branch Hint is updated for Redwood Cove.

cut from [1]-
Starting with the Redwood Cove microarchitecture, if the predictor has
no stored information about a branch, the branch has the Intel® SSE2
branch taken hint (i.e., instruction prefix 3EH), When the codec
decodes the branch, it flips the branch’s prediction from not-taken to
taken. It then flushes the pipeline in front of it and steers this
pipeline to fetch the taken path of the branch.
cut end -

Split tune branch_prediction_hints into branch_prediction_hints_taken
and branch_prediction_hints_not_taken, always generate branch hint for
conditional branches, both tunes are disabled by default.

[1] 
https://www.intel.com/content/www/us/en/content-details/821612/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

gcc/

* config/i386/i386.cc (ix86_print_operand): Always generate
branch hint for conditional branches.
* config/i386/i386.h (TARGET_BRANCH_PREDICTION_HINTS): Split
into ..
(TARGET_BRANCH_PREDICTION_HINTS_TAKEN): .. this, and ..
(TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN): .. this.
* config/i386/x86-tune.def (X86_TUNE_BRANCH_PREDICTION_HINTS):
Split into ..
(X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN): .. this, and ..
(X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN): .. this.

Diff:
---
 gcc/config/i386/i386.cc  | 29 +
 gcc/config/i386/i386.h   |  6 --
 gcc/config/i386/x86-tune.def | 13 +++--
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f75250f79de4..17d23bbcbc27 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14057,7 +14057,8 @@ ix86_print_operand (FILE *file, rtx x, int code)
 
if (!optimize
|| optimize_function_for_size_p (cfun)
-   || !TARGET_BRANCH_PREDICTION_HINTS)
+   || (!TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN
+   && !TARGET_BRANCH_PREDICTION_HINTS_TAKEN))
  return;
 
x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
@@ -14066,25 +14067,13 @@ ix86_print_operand (FILE *file, rtx x, int code)
int pred_val = profile_probability::from_reg_br_prob_note
 (XINT (x, 0)).to_reg_br_prob_base ();
 
-   if (pred_val < REG_BR_PROB_BASE * 45 / 100
-   || pred_val > REG_BR_PROB_BASE * 55 / 100)
- {
-   bool taken = pred_val > REG_BR_PROB_BASE / 2;
-   bool cputaken
- = final_forward_branch_p (current_output_insn) == 0;
-
-   /* Emit hints only in the case default branch prediction
-  heuristics would fail.  */
-   if (taken != cputaken)
- {
-   /* We use 3e (DS) prefix for taken branches and
-  2e (CS) prefix for not taken branches.  */
-   if (taken)
- fputs ("ds ; ", file);
-   else
- fputs ("cs ; ", file);
- }
- }
+   bool taken = pred_val > REG_BR_PROB_BASE / 2;
+   /* We use 3e (DS) prefix for taken branches and
+  2e (CS) prefix for not taken branches.  */
+   if (taken && TARGET_BRANCH_PREDICTION_HINTS_TAKEN)
+ fputs ("ds ; ", file);
+   else if (!taken && TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN)
+ fputs ("cs ; ", file);
  }
return;
  }
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 0c5292e1d646..eabb3248ea00 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -309,8 +309,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_ZERO_EXTEND_WITH_AND \
ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
 #define TARGET_UNROLL_STRLEN   ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
-#define TARGET_BRANCH_PREDICTION_HINTS \
-   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
+#define TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN \
+   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN]
+#define TARGET_BRANCH_PREDICTION_HINTS_TAKEN \
+   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN]
 #define TARGET_DOUBLE_WITH_ADD ix86_tune

[gcc r15-1905] Rename __{float, double}_u to __x86_{float, double}_u to avoid polluting the namespace.

2024-07-08 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:23ab7f632f4f5bae67fb53cf7b18fea7ba7242c4

commit r15-1905-g23ab7f632f4f5bae67fb53cf7b18fea7ba7242c4
Author: liuhongt 
Date:   Mon Jul 8 10:35:35 2024 +0800

Rename __{float,double}_u to __x86_{float,double}_u to avoid polluting the
namespace.

I have a build failure on NetBSD as the namespace pollution avoidance causes
a direct hit with the system /usr/include/math.h
===

In file included from /usr/src/local/gcc/obj/gcc/include/emmintrin.h:31,
 from 
/usr/src/local/gcc/obj/x86_64-unknown-netbsd10.99/libstdc++-v3/include/ext/random:45,
 from 
/usr/src/local/gcc/libstdc++-v3/include/precompiled/extc++.h:65:
/usr/src/local/gcc/obj/gcc/include/xmmintrin.h:75:15: error: conflicting 
declaration 'typedef float __float_u'
   75 | typedef float __float_u __attribute__ ((__may_alias__, __aligned__ 
(1)));
  |   ^
In file included from 
/usr/src/local/gcc/obj/x86_64-unknown-netbsd10.99/libstdc++-v3/include/cmath:47,
 from 
/usr/src/local/gcc/obj/x86_64-unknown-netbsd10.99/libstdc++-v3/include/x86_64-unknown-netbsd10.99/bits/stdc++.h:114,
 from 
/usr/src/local/gcc/libstdc++-v3/include/precompiled/extc++.h:32:
/usr/src/local/gcc/obj/gcc/include-fixed/math.h:49:7: note: previous 
declaration as 'union __float_u'
   49 | union __float_u {

gcc/ChangeLog:

PR target/115796
* config/i386/emmintrin.h (__double_u): Rename to ..
(__x86_double_u): .. this.
(_mm_load_sd): Ditto.
(_mm_store_sd): Ditto.
(_mm_loadh_pd): Ditto.
(_mm_loadl_pd): Ditto.
* config/i386/xmmintrin.h (__float_u): Rename to ..
(__x86_float_u): .. this.
(_mm_load_ss): Ditto.
(_mm_store_ss): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115796.c: New test.

Diff:
---
 gcc/config/i386/emmintrin.h  | 10 +-
 gcc/config/i386/xmmintrin.h  |  6 +++---
 gcc/testsuite/gcc.target/i386/pr115796.c | 24 
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index d58030e5c4fe..a3fcd7a869cf 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -56,7 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), 
__may_alias__));
 /* Unaligned version of the same types.  */
 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), 
__may_alias__, __aligned__ (1)));
 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef double __double_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef double __x86_double_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
@@ -146,7 +146,7 @@ _mm_load1_pd (double const *__P)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_load_sd (double const *__P)
 {
-  return __extension__ (__m128d) { *(__double_u *)__P, 0.0 };
+  return __extension__ (__m128d) { *(__x86_double_u *)__P, 0.0 };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -181,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *(__double_u *)__P = ((__v2df)__A)[0] ;
+  *(__x86_double_u *)__P = ((__v2df)__A)[0] ;
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -974,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { ((__v2df)__A)[0], *(__double_u*)__B };
+  return __extension__ (__m128d) { ((__v2df)__A)[0], *(__x86_double_u*)__B };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { *(__double_u*)__B, ((__v2df)__A)[1] };
+  return __extension__ (__m128d) { *(__x86_double_u*)__B, ((__v2df)__A)[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 37e5a94cf101..7f10f96d72ce 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -72,7 +72,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), 
__may_alias__));
 
 /* Unaligned version of the same type.  */
 typedef float __m128_u __attribute__ ((__vector_

[gcc r12-10617] Fix SSA_NAME leak due to def_stmt is removed before use_stmt.

2024-07-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:e1427b39d28f382d21e7a0ea1714b3250e0a6e5d

commit r12-10617-ge1427b39d28f382d21e7a0ea1714b3250e0a6e5d
Author: liuhongt 
Date:   Fri Jul 12 09:39:23 2024 +0800

Fix SSA_NAME leak due to def_stmt is removed before use_stmt.

-  _5 = __atomic_fetch_or_8 (&set_work_pending_p, 1, 0);
-  # DEBUG old => (long int) _5
+  _6 = .ATOMIC_BIT_TEST_AND_SET (&set_work_pending_p, 0, 1, 0, 
__atomic_fetch_or_8);
+  # DEBUG old => NULL
   # DEBUG BEGIN_STMT
-  # DEBUG D#2 => _5 & 1
+  # DEBUG D#2 => NULL
...
-  _10 = ~_5;
-  _8 = (_Bool) _10;
-  # DEBUG ret => _8
+  _8 = _6 == 0;
+  # DEBUG ret => (_Bool) _10

confirmed.  convert_atomic_bit_not does this, it checks for single_use
and removes the def, failing to release the name (which would fix this up
IIRC).

Note the function removes stmts in "wrong" order (before uses of LHS
are removed), so it requires larger surgery.  And it leaks SSA names.

gcc/ChangeLog:

PR target/115872
* tree-ssa-ccp.cc (convert_atomic_bit_not): Remove use_stmt after 
use_nop_stmt is removed.
(optimize_atomic_bit_test_and): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115872.c: New test.

(cherry picked from commit a8209237dc46dc4db7d9d8e3807e6c93734c64b5)

Diff:
---
 gcc/testsuite/gcc.target/i386/pr115872.c | 16 
 gcc/tree-ssa-ccp.cc  | 12 
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr115872.c 
b/gcc/testsuite/gcc.target/i386/pr115872.c
new file mode 100644
index 000000000000..937004456d37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115872.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -g" } */
+
+long set_work_pending_p;
+_Bool set_work_pending() {
+  _Bool __trans_tmp_1;
+  long mask = 1, old = __atomic_fetch_or(&set_work_pending_p, mask, 0);
+  __trans_tmp_1 = old & mask;
+  return !__trans_tmp_1;
+}
+void __queue_work() {
+  _Bool ret = set_work_pending();
+  if (ret)
+__queue_work();
+}
+
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 42a02dccaeb1..3c63f2dd8a3b 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -3306,9 +3306,10 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
 return nullptr;
 
   gimple_stmt_iterator gsi;
-  gsi = gsi_for_stmt (use_stmt);
-  gsi_remove (&gsi, true);
   tree var = make_ssa_name (TREE_TYPE (lhs));
+  /* use_stmt need to be removed after use_nop_stmt,
+ so use_lhs can be released.  */
+  gimple *use_stmt_removal = use_stmt;
   use_stmt = gimple_build_assign (var, BIT_AND_EXPR, lhs, and_mask);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_insert_before (&gsi, use_stmt, GSI_NEW_STMT);
@@ -3318,6 +3319,8 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
   gsi_insert_after (&gsi, g, GSI_NEW_STMT);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_remove (&gsi, true);
+  gsi = gsi_for_stmt (use_stmt_removal);
+  gsi_remove (&gsi, true);
   return use_stmt;
 }
 
@@ -3569,8 +3572,7 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
   */
}
  var = make_ssa_name (TREE_TYPE (use_rhs));
- gsi = gsi_for_stmt (use_stmt);
- gsi_remove (&gsi, true);
+ gimple* use_stmt_removal = use_stmt;
  g = gimple_build_assign (var, BIT_AND_EXPR, use_rhs,
   and_mask);
  gsi = gsi_for_stmt (use_nop_stmt);
@@ -3584,6 +3586,8 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
  gsi = gsi_for_stmt (use_nop_stmt);
  gsi_remove (&gsi, true);
+ gsi = gsi_for_stmt (use_stmt_removal);
+ gsi_remove (&gsi, true);
}
}
  else


[gcc r13-8913] Fix SSA_NAME leak due to def_stmt is removed before use_stmt.

2024-07-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:9a1cdaa5e8441394d613f5f3401e7aab21efe8f0

commit r13-8913-g9a1cdaa5e8441394d613f5f3401e7aab21efe8f0
Author: liuhongt 
Date:   Fri Jul 12 09:39:23 2024 +0800

Fix SSA_NAME leak due to def_stmt is removed before use_stmt.

-  _5 = __atomic_fetch_or_8 (&set_work_pending_p, 1, 0);
-  # DEBUG old => (long int) _5
+  _6 = .ATOMIC_BIT_TEST_AND_SET (&set_work_pending_p, 0, 1, 0, 
__atomic_fetch_or_8);
+  # DEBUG old => NULL
   # DEBUG BEGIN_STMT
-  # DEBUG D#2 => _5 & 1
+  # DEBUG D#2 => NULL
...
-  _10 = ~_5;
-  _8 = (_Bool) _10;
-  # DEBUG ret => _8
+  _8 = _6 == 0;
+  # DEBUG ret => (_Bool) _10

confirmed.  convert_atomic_bit_not does this, it checks for single_use
and removes the def, failing to release the name (which would fix this up
IIRC).

Note the function removes stmts in "wrong" order (before uses of LHS
are removed), so it requires larger surgery.  And it leaks SSA names.

gcc/ChangeLog:

PR target/115872
* tree-ssa-ccp.cc (convert_atomic_bit_not): Remove use_stmt after 
use_nop_stmt is removed.
(optimize_atomic_bit_test_and): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115872.c: New test.

(cherry picked from commit a8209237dc46dc4db7d9d8e3807e6c93734c64b5)

Diff:
---
 gcc/testsuite/gcc.target/i386/pr115872.c | 16 
 gcc/tree-ssa-ccp.cc  | 12 
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr115872.c 
b/gcc/testsuite/gcc.target/i386/pr115872.c
new file mode 100644
index 000000000000..937004456d37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115872.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -g" } */
+
+long set_work_pending_p;
+_Bool set_work_pending() {
+  _Bool __trans_tmp_1;
+  long mask = 1, old = __atomic_fetch_or(&set_work_pending_p, mask, 0);
+  __trans_tmp_1 = old & mask;
+  return !__trans_tmp_1;
+}
+void __queue_work() {
+  _Bool ret = set_work_pending();
+  if (ret)
+__queue_work();
+}
+
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 2e552b330b74..6c9da603ef95 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -3321,9 +3321,10 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
 return nullptr;
 
   gimple_stmt_iterator gsi;
-  gsi = gsi_for_stmt (use_stmt);
-  gsi_remove (&gsi, true);
   tree var = make_ssa_name (TREE_TYPE (lhs));
+  /* use_stmt need to be removed after use_nop_stmt,
+ so use_lhs can be released.  */
+  gimple *use_stmt_removal = use_stmt;
   use_stmt = gimple_build_assign (var, BIT_AND_EXPR, lhs, and_mask);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_insert_before (&gsi, use_stmt, GSI_NEW_STMT);
@@ -3333,6 +3334,8 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
   gsi_insert_after (&gsi, g, GSI_NEW_STMT);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_remove (&gsi, true);
+  gsi = gsi_for_stmt (use_stmt_removal);
+  gsi_remove (&gsi, true);
   return use_stmt;
 }
 
@@ -3635,8 +3638,7 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
   */
}
  var = make_ssa_name (TREE_TYPE (use_rhs));
- gsi = gsi_for_stmt (use_stmt);
- gsi_remove (&gsi, true);
+ gimple* use_stmt_removal = use_stmt;
  g = gimple_build_assign (var, BIT_AND_EXPR, use_rhs,
   and_mask);
  gsi = gsi_for_stmt (use_nop_stmt);
@@ -3653,6 +3655,8 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
  gsi = gsi_for_stmt (use_nop_stmt);
  gsi_remove (&gsi, true);
+ gsi = gsi_for_stmt (use_stmt_removal);
+ gsi_remove (&gsi, true);
}
}
  else


[gcc r14-10422] Fix SSA_NAME leak due to def_stmt is removed before use_stmt.

2024-07-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:13bfc385b0baebd22aeabb0d90915f2e9b18febe

commit r14-10422-g13bfc385b0baebd22aeabb0d90915f2e9b18febe
Author: liuhongt 
Date:   Fri Jul 12 09:39:23 2024 +0800

Fix SSA_NAME leak due to def_stmt is removed before use_stmt.

-  _5 = __atomic_fetch_or_8 (&set_work_pending_p, 1, 0);
-  # DEBUG old => (long int) _5
+  _6 = .ATOMIC_BIT_TEST_AND_SET (&set_work_pending_p, 0, 1, 0, 
__atomic_fetch_or_8);
+  # DEBUG old => NULL
   # DEBUG BEGIN_STMT
-  # DEBUG D#2 => _5 & 1
+  # DEBUG D#2 => NULL
...
-  _10 = ~_5;
-  _8 = (_Bool) _10;
-  # DEBUG ret => _8
+  _8 = _6 == 0;
+  # DEBUG ret => (_Bool) _10

confirmed.  convert_atomic_bit_not does this, it checks for single_use
and removes the def, failing to release the name (which would fix this up
IIRC).

Note the function removes stmts in "wrong" order (before uses of LHS
are removed), so it requires larger surgery.  And it leaks SSA names.

gcc/ChangeLog:

PR target/115872
* tree-ssa-ccp.cc (convert_atomic_bit_not): Remove use_stmt after 
use_nop_stmt is removed.
(optimize_atomic_bit_test_and): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115872.c: New test.

(cherry picked from commit a8209237dc46dc4db7d9d8e3807e6c93734c64b5)

Diff:
---
 gcc/testsuite/gcc.target/i386/pr115872.c | 16 
 gcc/tree-ssa-ccp.cc  | 12 
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr115872.c 
b/gcc/testsuite/gcc.target/i386/pr115872.c
new file mode 100644
index 000000000000..937004456d37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115872.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -g" } */
+
+long set_work_pending_p;
+_Bool set_work_pending() {
+  _Bool __trans_tmp_1;
+  long mask = 1, old = __atomic_fetch_or(&set_work_pending_p, mask, 0);
+  __trans_tmp_1 = old & mask;
+  return !__trans_tmp_1;
+}
+void __queue_work() {
+  _Bool ret = set_work_pending();
+  if (ret)
+__queue_work();
+}
+
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index f6a5cd0ee6e0..cc78ff20bb81 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -3331,9 +3331,10 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
 return nullptr;
 
   gimple_stmt_iterator gsi;
-  gsi = gsi_for_stmt (use_stmt);
-  gsi_remove (&gsi, true);
   tree var = make_ssa_name (TREE_TYPE (lhs));
+  /* use_stmt need to be removed after use_nop_stmt,
+ so use_lhs can be released.  */
+  gimple *use_stmt_removal = use_stmt;
   use_stmt = gimple_build_assign (var, BIT_AND_EXPR, lhs, and_mask);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_insert_before (&gsi, use_stmt, GSI_NEW_STMT);
@@ -3343,6 +3344,8 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
   gsi_insert_after (&gsi, g, GSI_NEW_STMT);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_remove (&gsi, true);
+  gsi = gsi_for_stmt (use_stmt_removal);
+  gsi_remove (&gsi, true);
   return use_stmt;
 }
 
@@ -3645,8 +3648,7 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
   */
}
  var = make_ssa_name (TREE_TYPE (use_rhs));
- gsi = gsi_for_stmt (use_stmt);
- gsi_remove (&gsi, true);
+ gimple* use_stmt_removal = use_stmt;
  g = gimple_build_assign (var, BIT_AND_EXPR, use_rhs,
   and_mask);
  gsi = gsi_for_stmt (use_nop_stmt);
@@ -3663,6 +3665,8 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
  gsi = gsi_for_stmt (use_nop_stmt);
  gsi_remove (&gsi, true);
+ gsi = gsi_for_stmt (use_stmt_removal);
+ gsi_remove (&gsi, true);
}
}
  else


[gcc r15-2038] Fix SSA_NAME leak due to def_stmt is removed before use_stmt.

2024-07-15 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:f27bf48e0204524ead795fe618cd8b1224f72fd4

commit r15-2038-gf27bf48e0204524ead795fe618cd8b1224f72fd4
Author: liuhongt 
Date:   Fri Jul 12 09:39:23 2024 +0800

Fix SSA_NAME leak due to def_stmt is removed before use_stmt.

-  _5 = __atomic_fetch_or_8 (&set_work_pending_p, 1, 0);
-  # DEBUG old => (long int) _5
+  _6 = .ATOMIC_BIT_TEST_AND_SET (&set_work_pending_p, 0, 1, 0, 
__atomic_fetch_or_8);
+  # DEBUG old => NULL
   # DEBUG BEGIN_STMT
-  # DEBUG D#2 => _5 & 1
+  # DEBUG D#2 => NULL
...
-  _10 = ~_5;
-  _8 = (_Bool) _10;
-  # DEBUG ret => _8
+  _8 = _6 == 0;
+  # DEBUG ret => (_Bool) _10

confirmed.  convert_atomic_bit_not does this, it checks for single_use
and removes the def, failing to release the name (which would fix this up
IIRC).

Note the function removes stmts in "wrong" order (before uses of LHS
are removed), so it requires larger surgery.  And it leaks SSA names.

gcc/ChangeLog:

PR target/115872
* tree-ssa-ccp.cc (convert_atomic_bit_not): Remove use_stmt after 
use_nop_stmt is removed.
(optimize_atomic_bit_test_and): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115872.c: New test.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr115872.c | 16 
 gcc/tree-ssa-ccp.cc  | 12 
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr115872.c 
b/gcc/testsuite/gcc.target/i386/pr115872.c
new file mode 100644
index ..937004456d37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115872.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -g" } */
+
+long set_work_pending_p;
+_Bool set_work_pending() {
+  _Bool __trans_tmp_1;
+  long mask = 1, old = __atomic_fetch_or(&set_work_pending_p, mask, 0);
+  __trans_tmp_1 = old & mask;
+  return !__trans_tmp_1;
+}
+void __queue_work() {
+  _Bool ret = set_work_pending();
+  if (ret)
+__queue_work();
+}
+
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 3749126b5f7c..de83d26d311a 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -3332,9 +3332,10 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
 return nullptr;
 
   gimple_stmt_iterator gsi;
-  gsi = gsi_for_stmt (use_stmt);
-  gsi_remove (&gsi, true);
   tree var = make_ssa_name (TREE_TYPE (lhs));
+  /* use_stmt need to be removed after use_nop_stmt,
+ so use_lhs can be released.  */
+  gimple *use_stmt_removal = use_stmt;
   use_stmt = gimple_build_assign (var, BIT_AND_EXPR, lhs, and_mask);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_insert_before (&gsi, use_stmt, GSI_NEW_STMT);
@@ -3344,6 +3345,8 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
   gsi_insert_after (&gsi, g, GSI_NEW_STMT);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_remove (&gsi, true);
+  gsi = gsi_for_stmt (use_stmt_removal);
+  gsi_remove (&gsi, true);
   return use_stmt;
 }
 
@@ -3646,8 +3649,7 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
   */
}
  var = make_ssa_name (TREE_TYPE (use_rhs));
- gsi = gsi_for_stmt (use_stmt);
- gsi_remove (&gsi, true);
+ gimple* use_stmt_removal = use_stmt;
  g = gimple_build_assign (var, BIT_AND_EXPR, use_rhs,
   and_mask);
  gsi = gsi_for_stmt (use_nop_stmt);
@@ -3664,6 +3666,8 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
  gsi = gsi_for_stmt (use_nop_stmt);
  gsi_remove (&gsi, true);
+ gsi = gsi_for_stmt (use_stmt_removal);
+ gsi_remove (&gsi, true);
}
}
  else


[gcc r14-10425] x86: Update branch hint for Redwood Cove.

2024-07-15 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:1fff665a51e221a578a92631fc8ea62dd79fa3b6

commit r14-10425-g1fff665a51e221a578a92631fc8ea62dd79fa3b6
Author: H.J. Lu 
Date:   Tue Apr 26 11:08:55 2022 -0700

x86: Update branch hint for Redwood Cove.

According to Intel® 64 and IA-32 Architectures Optimization Reference
Manual[1], Branch Hint is updated for Redwood Cove.

cut from [1]-
Starting with the Redwood Cove microarchitecture, if the predictor has
no stored information about a branch, the branch has the Intel® SSE2
branch taken hint (i.e., instruction prefix 3EH), When the codec
decodes the branch, it flips the branch’s prediction from not-taken to
taken. It then flushes the pipeline in front of it and steers this
pipeline to fetch the taken path of the branch.
cut end -

Split tune branch_prediction_hints into branch_prediction_hints_taken
and branch_prediction_hints_not_taken, always generate branch hint for
conditional branches, both tunes are disabled by default.

[1] 
https://www.intel.com/content/www/us/en/content-details/821612/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

gcc/

* config/i386/i386.cc (ix86_print_operand): Always generate
branch hint for conditional branches.
* config/i386/i386.h (TARGET_BRANCH_PREDICTION_HINTS): Split
into ..
(TARGET_BRANCH_PREDICTION_HINTS_TAKEN): .. this, and ..
(TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN): .. this.
* config/i386/x86-tune.def (X86_TUNE_BRANCH_PREDICTION_HINTS):
Split into ..
(X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN): .. this, and ..
(X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN): .. this.

(cherry picked from commit a910c30c7c27cd0f6d2d2694544a09fb11d611b9)

Diff:
---
 gcc/config/i386/i386.cc  | 29 +
 gcc/config/i386/i386.h   |  6 --
 gcc/config/i386/x86-tune.def | 13 +++--
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 984ba37beeb9..3827e2b61fe4 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14203,7 +14203,8 @@ ix86_print_operand (FILE *file, rtx x, int code)
 
if (!optimize
|| optimize_function_for_size_p (cfun)
-   || !TARGET_BRANCH_PREDICTION_HINTS)
+   || (!TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN
+   && !TARGET_BRANCH_PREDICTION_HINTS_TAKEN))
  return;
 
x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
@@ -14212,25 +14213,13 @@ ix86_print_operand (FILE *file, rtx x, int code)
int pred_val = profile_probability::from_reg_br_prob_note
 (XINT (x, 0)).to_reg_br_prob_base ();
 
-   if (pred_val < REG_BR_PROB_BASE * 45 / 100
-   || pred_val > REG_BR_PROB_BASE * 55 / 100)
- {
-   bool taken = pred_val > REG_BR_PROB_BASE / 2;
-   bool cputaken
- = final_forward_branch_p (current_output_insn) == 0;
-
-   /* Emit hints only in the case default branch prediction
-  heuristics would fail.  */
-   if (taken != cputaken)
- {
-   /* We use 3e (DS) prefix for taken branches and
-  2e (CS) prefix for not taken branches.  */
-   if (taken)
- fputs ("ds ; ", file);
-   else
- fputs ("cs ; ", file);
- }
- }
+   bool taken = pred_val > REG_BR_PROB_BASE / 2;
+   /* We use 3e (DS) prefix for taken branches and
+  2e (CS) prefix for not taken branches.  */
+   if (taken && TARGET_BRANCH_PREDICTION_HINTS_TAKEN)
+ fputs ("ds ; ", file);
+   else if (!taken && TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN)
+ fputs ("cs ; ", file);
  }
return;
  }
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 529edff93a41..26e15d2677fb 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -306,8 +306,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_ZERO_EXTEND_WITH_AND \
ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
 #define TARGET_UNROLL_STRLEN   ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
-#define TARGET_BRANCH_PREDICTION_HINTS \
-   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
+#define TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN \
+   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN]
+#define TARGET_BRANCH_PREDICTION_HINTS_TAKEN \
+   ix86_tune_features[X8

[gcc r15-2127] Optimize maskstore when mask is 0 or -1 in UNSPEC_MASKMOV

2024-07-17 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:228972b2b7bf50f4776f8ccae0d7c2950827d0f1

commit r15-2127-g228972b2b7bf50f4776f8ccae0d7c2950827d0f1
Author: liuhongt 
Date:   Tue Jul 16 15:29:01 2024 +0800

Optimize maskstore when mask is 0 or -1 in UNSPEC_MASKMOV

gcc/ChangeLog:

PR target/115843
* config/i386/predicates.md (const0_or_m1_operand): New
predicate.
* config/i386/sse.md (*_store_mask_1): New
pre_reload define_insn_and_split.
(V): Add V32BF,V16BF,V8BF.
(V4SF_V8HF): Rename to ..
(V24F_128): .. this.
(*vec_concat): Adjust with V24F_128.
(*vec_concat_0): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115843.c: New test.

Diff:
---
 gcc/config/i386/predicates.md|  5 +
 gcc/config/i386/sse.md   | 33 ++-
 gcc/testsuite/gcc.target/i386/pr115843.c | 38 
 3 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 5d0bb1e0f54a..680594871de0 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -825,6 +825,11 @@
   (and (match_code "const_int")
(match_test "op == constm1_rtx")))
 
+;; Match 0 or -1.
+(define_predicate "const0_or_m1_operand"
+  (ior (match_operand 0 "const0_operand")
+   (match_operand 0 "constm1_operand")))
+
 ;; Match exactly eight.
 (define_predicate "const8_operand"
   (and (match_code "const_int")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e44822f705b4..f54e966bdbb2 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -294,6 +294,7 @@
(V16SI "TARGET_AVX512F && TARGET_EVEX512") (V8SI "TARGET_AVX") V4SI
(V8DI "TARGET_AVX512F && TARGET_EVEX512")  (V4DI "TARGET_AVX") V2DI
(V32HF "TARGET_AVX512F && TARGET_EVEX512") (V16HF "TARGET_AVX") V8HF
+   (V32BF "TARGET_AVX512F && TARGET_EVEX512") (V16BF "TARGET_AVX") V8BF
(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F && TARGET_EVEX512")  (V4DF "TARGET_AVX") (V2DF 
"TARGET_SSE2")])
 
@@ -430,8 +431,8 @@
(V16SF "TARGET_EVEX512")
(V8DF "TARGET_EVEX512")])
 
-(define_mode_iterator V4SF_V8HF
-  [V4SF V8HF])
+(define_mode_iterator V24F_128
+  [V4SF V8HF V8BF])
 
 (define_mode_iterator VI48_AVX512VL
   [(V16SI "TARGET_EVEX512") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
@@ -11543,8 +11544,8 @@
(set_attr "mode" "V4SF,SF,DI,DI")])
 
 (define_insn "*vec_concat"
-  [(set (match_operand:V4SF_V8HF 0 "register_operand"   "=x,v,x,v")
-   (vec_concat:V4SF_V8HF
+  [(set (match_operand:V24F_128 0 "register_operand"   "=x,v,x,v")
+   (vec_concat:V24F_128
  (match_operand: 1 "register_operand" " 0,v,0,v")
  (match_operand: 2 "nonimmediate_operand" " 
x,v,m,m")))]
   "TARGET_SSE"
@@ -11559,8 +11560,8 @@
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
 
 (define_insn "*vec_concat_0"
-  [(set (match_operand:V4SF_V8HF 0 "register_operand"   "=v")
-   (vec_concat:V4SF_V8HF
+  [(set (match_operand:V24F_128 0 "register_operand"   "=v")
+   (vec_concat:V24F_128
  (match_operand: 1 "nonimmediate_operand" "vm")
  (match_operand: 2 "const0_operand")))]
   "TARGET_SSE2"
@@ -28574,6 +28575,26 @@
(set_attr "memory" "store")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_store_mask_1"
+  [(set (match_operand:V 0 "memory_operand")
+   (unspec:V
+ [(match_operand:V 1 "register_operand")
+  (match_dup 0)
+  (match_operand: 2 "const0_or_m1_operand")]
+ UNSPEC_MASKMOV))]
+  "TARGET_AVX512F && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (constm1_operand (operands[2], mode))
+emit_move_insn (operands[0], operands[1]);
+  else
+emit_note (NOTE_INSN_DELETED);
+
+  DONE;
+})
+
 (define_expand "cbranch4"
   [(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:VI_AVX_AVX512F 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr115843.c 
b/gcc/testsuite/gcc.target/i386/pr115843.c
new file mode 100644
index ..00d8605757a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115843.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl --param vect-partial-vector-usage=2 
-mtune=znver5 -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-not "kxor\[bw]" } } */
+
+typedef unsigned long long BITBOARD;
+BITBOARD KingPressureMask1[64], KingSafetyMask1[64];
+
+void __attribute__((noinline))
+foo()
+{
+  int i;
+
+  for (i = 0; i < 64; i++) {
+if ((i & 7) == 0) {
+  KingPressureMask1[i] = KingSafetyMask1[i + 1];
+} else if ((i & 7) == 7) {
+  KingPressureMask1[i] = KingSafetyMask1[i - 1];
+} else {
+  KingPressureMask1[i] = KingSafetyMask1[i];
+}
+  }
+}
+
+BITBOARD verify[64] = {1, 1, 2, 3, 4, 5, 6, 6, 9, 9, 1

[gcc r15-2217] Relax ix86_hardreg_mov_ok after split1.

2024-07-22 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:a3f03891065cb9691f6e9cebce4d4542deb92a35

commit r15-2217-ga3f03891065cb9691f6e9cebce4d4542deb92a35
Author: liuhongt 
Date:   Mon Jul 22 11:36:59 2024 +0800

Relax ix86_hardreg_mov_ok after split1.

ix86_hardreg_mov_ok is added by r11-5066-gbe39636d9f68c4

>The solution proposed here is to have the x86 backend/recog prevent
>early RTL passes composing instructions (that set likely_spilled hard
>registers) that they (combine) can't simplify, until after reload.
>We allow sets from pseudo registers, immediate constants and memory
>accesses, but anything more complicated is performed via a temporary
>pseudo.  Not only does this simplify things for the register allocator,
>but any remaining register-to-register moves are easily cleaned up
>by the late optimization passes after reload, such as peephole2 and
>cprop_hardreg.

The restriction is mainly for rtl optimization passes before pass_combine.

But split1 splits

```
(insn 17 13 18 2 (set (reg/i:V4SI 20 xmm0)
(vec_merge:V4SI (const_vector:V4SI [
(const_int -1 [0xffffffffffffffff]) repeated x4
])
(const_vector:V4SI [
(const_int 0 [0]) repeated x4
])
(unspec:QI [
(reg:V4SF 106)
(reg:V4SF 102)
(const_int 0 [0])
] UNSPEC_PCMP))) "/app/example.cpp":20:1 2929 
{*avx_cmpv4sf3_1}
 (expr_list:REG_DEAD (reg:V4SF 102)
(expr_list:REG_DEAD (reg:V4SF 106)
(nil))))
```

into:
```
(insn 23 13 24 2 (set (reg:V4SF 107)
(unspec:V4SF [
(reg:V4SF 106)
(reg:V4SF 102)
(const_int 0 [0])
] UNSPEC_PCMP)) "/app/example.cpp":20:1 -1
 (nil))
(insn 24 23 18 2 (set (reg/i:V4SI 20 xmm0)
(subreg:V4SI (reg:V4SF 107) 0)) "/app/example.cpp":20:1 -1
 (nil))
```

There are many splitters that generate MOV insns with SUBREG and would
have the same problem.
Instead of changing those splitters one by one, the patch relaxes
ix86_hardreg_mov_ok to allow mov subreg to hard register after
split1. ix86_pre_reload_split () is used to replace
!reload_completed && !lra_in_progress.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_hardreg_mov_ok): Relax mov subreg
to hard register after split1.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr115982.C: New test.

Diff:
---
 gcc/config/i386/i386.cc  |  5 ++---
 gcc/testsuite/g++.target/i386/pr115982.C | 11 +++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 9c2ebe74fc92..77c441893b40 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20212,7 +20212,7 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
 }
 
 /* Return true if a set of DST by the expression SRC should be allowed.
-   This prevents complex sets of likely_spilled hard regs before reload.  */
+   This prevents complex sets of likely_spilled hard regs before split1.  */
 
 bool
 ix86_hardreg_mov_ok (rtx dst, rtx src)
@@ -20224,8 +20224,7 @@ ix86_hardreg_mov_ok (rtx dst, rtx src)
   ? standard_sse_constant_p (src, GET_MODE (dst))
   : x86_64_immediate_operand (src, GET_MODE (dst)))
   && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))
-  && !reload_completed
-  && !lra_in_progress)
+  && ix86_pre_reload_split ())
 return false;
   return true;
 }
diff --git a/gcc/testsuite/g++.target/i386/pr115982.C 
b/gcc/testsuite/g++.target/i386/pr115982.C
new file mode 100644
index ..4b91618405d5
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr115982.C
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -O2" } */
+
+typedef float VF __attribute__((__vector_size__(16)));
+typedef int VI __attribute__((__vector_size__(16)));
+
+VI
+foo (VF x)
+{
+  return !x;
+}


[gcc r14-9459] i386[stv]: Handle REG_EH_REGION note

2024-03-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:618e34d56cc38e9c3ae95a413228068e53ed76bb

commit r14-9459-g618e34d56cc38e9c3ae95a413228068e53ed76bb
Author: liuhongt 
Date:   Wed Mar 13 10:40:01 2024 +0800

i386[stv]: Handle REG_EH_REGION note

When we split
(insn 37 36 38 10 (set (reg:DI 104 [ _18 ])
(mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct 
SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 84 
{*movdi_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfffffffffffffff5])

into

(insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0)
(vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) 
[6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])
(const_int 0 [0]))) "test.C":22:42 -1
(nil)))
(insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0)
(subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfffffffffffffff5])
(nil)))

we must copy the REG_EH_REGION note to the first insn and split the block
after the newly added insn.  The REG_EH_REGION on the second insn will be
removed later since it no longer traps.

gcc/ChangeLog:

* config/i386/i386-features.cc
(general_scalar_chain::convert_op): Handle REG_EH_REGION note.
(convert_scalars_to_vector): Ditto.
* config/i386/i386-features.h (class scalar_chain): New
member control_flow_insns.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr111822.C: New test.

Diff:
---
 gcc/config/i386/i386-features.cc | 50 
 gcc/config/i386/i386-features.h  |  1 +
 gcc/testsuite/g++.target/i386/pr111822.C | 45 
 3 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 1de2a07ed75..c7d7a965901 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -998,20 +998,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 }
   else if (MEM_P (*op))
 {
+  rtx_insn* eh_insn, *movabs = NULL;
   rtx tmp = gen_reg_rtx (GET_MODE (*op));
 
-  /* Handle movabs.  */
+  /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
   if (!memory_operand (*op, GET_MODE (*op)))
{
  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
+ movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
 
- emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
  *op = tmp2;
}
 
-  emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
-gen_gpr_to_xmm_move_src (vmode, *op)),
-   insn);
+  eh_insn
+   = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
+gen_gpr_to_xmm_move_src (vmode, *op)),
+   insn);
+
+  if (cfun->can_throw_non_call_exceptions)
+   {
+ /* Handle REG_EH_REGION note.  */
+ rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
+ if (note)
+   {
+ if (movabs)
+   eh_insn = movabs;
+ control_flow_insns.safe_push (eh_insn);
+ add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
+   }
+   }
+
   *op = gen_rtx_SUBREG (vmode, tmp, 0);
 
   if (dump_file)
@@ -2494,6 +2510,7 @@ convert_scalars_to_vector (bool timode_p)
 {
   basic_block bb;
   int converted_insns = 0;
+  auto_vec control_flow_insns;
 
   bitmap_obstack_initialize (NULL);
   const machine_mode cand_mode[3] = { SImode, DImode, TImode };
@@ -2575,6 +2592,11 @@ convert_scalars_to_vector (bool timode_p)
 chain->chain_id);
}
 
+ rtx_insn* iter_insn;
+ unsigned int ii;
+ FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
+   control_flow_insns.safe_push (iter_insn);
+
  delete chain;
}
 }
@@ -2643,6 +2665,24 @@ convert_scalars_to_vector (bool timode_p)
  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
  }
  }
+
+  if (!control_flow_insns.is_empty ())
+   {
+ free_dominance_info (CDI_DOMINATORS);
+
+ unsigned int i;
+ rtx_insn* insn;
+ FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
+   if (control_flow_insn_p (insn))
+ {
+   /* Split the block after insn.  There will be a fallthru
+  edge, which is OK so we keep it.  We have to create
+  the exception edges ourselves.  */
+   bb = BLOCK_FOR_INSN (insn);
+   split_block (bb, insn);
+   rtl_make_eh_edge (NULL, bb, BB_END (bb));
+ }
+  

[gcc r13-8438] i386[stv]: Handle REG_EH_REGION note

2024-03-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:bdbcfbfcf591381f0faf95c881e3772b56d0a404

commit r13-8438-gbdbcfbfcf591381f0faf95c881e3772b56d0a404
Author: liuhongt 
Date:   Wed Mar 13 10:40:01 2024 +0800

i386[stv]: Handle REG_EH_REGION note

When we split
(insn 37 36 38 10 (set (reg:DI 104 [ _18 ])
(mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct 
SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 84 
{*movdi_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfffffffffffffff5])

into

(insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0)
(vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) 
[6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])
(const_int 0 [0]))) "test.C":22:42 -1
(nil)))
(insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0)
(subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfffffffffffffff5])
(nil)))

we must copy the REG_EH_REGION note to the first insn and split the block
after the newly added insn.  The REG_EH_REGION on the second insn will be
removed later since it no longer traps.

gcc/ChangeLog:

* config/i386/i386-features.cc
(general_scalar_chain::convert_op): Handle REG_EH_REGION note.
(convert_scalars_to_vector): Ditto.
* config/i386/i386-features.h (class scalar_chain): New
member control_flow_insns.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr111822.C: New test.

(cherry picked from commit 618e34d56cc38e9c3ae95a413228068e53ed76bb)

Diff:
---
 gcc/config/i386/i386-features.cc | 50 
 gcc/config/i386/i386-features.h  |  1 +
 gcc/testsuite/g++.target/i386/pr111822.C | 45 
 3 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 74ee14a584a..ed3055b43f8 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -913,20 +913,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 }
   else if (MEM_P (*op))
 {
+  rtx_insn* eh_insn, *movabs = NULL;
   rtx tmp = gen_reg_rtx (GET_MODE (*op));
 
-  /* Handle movabs.  */
+  /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
   if (!memory_operand (*op, GET_MODE (*op)))
{
  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
+ movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
 
- emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
  *op = tmp2;
}
 
-  emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
-gen_gpr_to_xmm_move_src (vmode, *op)),
-   insn);
+  eh_insn
+   = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
+gen_gpr_to_xmm_move_src (vmode, *op)),
+   insn);
+
+  if (cfun->can_throw_non_call_exceptions)
+   {
+ /* Handle REG_EH_REGION note.  */
+ rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
+ if (note)
+   {
+ if (movabs)
+   eh_insn = movabs;
+ control_flow_insns.safe_push (eh_insn);
+ add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
+   }
+   }
+
   *op = gen_rtx_SUBREG (vmode, tmp, 0);
 
   if (dump_file)
@@ -2215,6 +2231,7 @@ convert_scalars_to_vector (bool timode_p)
 {
   basic_block bb;
   int converted_insns = 0;
+  auto_vec control_flow_insns;
 
   bitmap_obstack_initialize (NULL);
   const machine_mode cand_mode[3] = { SImode, DImode, TImode };
@@ -2296,6 +2313,11 @@ convert_scalars_to_vector (bool timode_p)
 chain->chain_id);
}
 
+ rtx_insn* iter_insn;
+ unsigned int ii;
+ FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
+   control_flow_insns.safe_push (iter_insn);
+
  delete chain;
}
 }
@@ -2364,6 +2386,24 @@ convert_scalars_to_vector (bool timode_p)
  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
  }
  }
+
+  if (!control_flow_insns.is_empty ())
+   {
+ free_dominance_info (CDI_DOMINATORS);
+
+ unsigned int i;
+ rtx_insn* insn;
+ FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
+   if (control_flow_insn_p (insn))
+ {
+   /* Split the block after insn.  There will be a fallthru
+  edge, which is OK so we keep it.  We have to create
+  the exception edges ourselves.  */
+   bb = BLOCK_FOR_INSN (insn);
+   split_block (bb, insn);

[gcc r12-10214] i386[stv]: Handle REG_EH_REGION note

2024-03-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:a861f940efffae2782c559cd04df2d2740cd28bd

commit r12-10214-ga861f940efffae2782c559cd04df2d2740cd28bd
Author: liuhongt 
Date:   Wed Mar 13 10:40:01 2024 +0800

i386[stv]: Handle REG_EH_REGION note

When we split
(insn 37 36 38 10 (set (reg:DI 104 [ _18 ])
(mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct 
SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 84 
{*movdi_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfffffffffffffff5])

into

(insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0)
(vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) 
[6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])
(const_int 0 [0]))) "test.C":22:42 -1
(nil)))
(insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0)
(subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfffffffffffffff5])
(nil)))

we must copy the REG_EH_REGION note to the first insn and split the block
after the newly added insn.  The REG_EH_REGION on the second insn will be
removed later since it no longer traps.

gcc/ChangeLog:

* config/i386/i386-features.cc
(general_scalar_chain::convert_op): Handle REG_EH_REGION note.
(convert_scalars_to_vector): Ditto.
* config/i386/i386-features.h (class scalar_chain): New
member control_flow_insns.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr111822.C: New test.

(cherry picked from commit 618e34d56cc38e9c3ae95a413228068e53ed76bb)

Diff:
---
 gcc/config/i386/i386-features.cc | 50 
 gcc/config/i386/i386-features.h  |  1 +
 gcc/testsuite/g++.target/i386/pr111822.C | 45 
 3 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 6a2444eb6b6..37f22ba3733 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -871,20 +871,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 }
   else if (MEM_P (*op))
 {
+  rtx_insn* eh_insn, *movabs = NULL;
   rtx tmp = gen_reg_rtx (GET_MODE (*op));
 
-  /* Handle movabs.  */
+  /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
   if (!memory_operand (*op, GET_MODE (*op)))
{
  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
+ movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
 
- emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
  *op = tmp2;
}
 
-  emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
-gen_gpr_to_xmm_move_src (vmode, *op)),
-   insn);
+  eh_insn
+   = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
+gen_gpr_to_xmm_move_src (vmode, *op)),
+   insn);
+
+  if (cfun->can_throw_non_call_exceptions)
+   {
+ /* Handle REG_EH_REGION note.  */
+ rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
+ if (note)
+   {
+ if (movabs)
+   eh_insn = movabs;
+ control_flow_insns.safe_push (eh_insn);
+ add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
+   }
+   }
+
   *op = gen_rtx_SUBREG (vmode, tmp, 0);
 
   if (dump_file)
@@ -1681,6 +1697,7 @@ convert_scalars_to_vector (bool timode_p)
 {
   basic_block bb;
   int converted_insns = 0;
+  auto_vec control_flow_insns;
 
   bitmap_obstack_initialize (NULL);
   const machine_mode cand_mode[3] = { SImode, DImode, TImode };
@@ -1759,6 +1776,11 @@ convert_scalars_to_vector (bool timode_p)
fprintf (dump_file, "Chain #%d conversion is not profitable\n",
 chain->chain_id);
 
+   rtx_insn* iter_insn;
+   unsigned int ii;
+   FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
+ control_flow_insns.safe_push (iter_insn);
+
delete chain;
   }
 
@@ -1826,6 +1848,24 @@ convert_scalars_to_vector (bool timode_p)
  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
  }
  }
+
+  if (!control_flow_insns.is_empty ())
+   {
+ free_dominance_info (CDI_DOMINATORS);
+
+ unsigned int i;
+ rtx_insn* insn;
+ FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
+   if (control_flow_insn_p (insn))
+ {
+   /* Split the block after insn.  There will be a fallthru
+  edge, which is OK so we keep it.  We have to create
+  the exception edges ourselves.  */
+   bb = BLOCK_FOR_INSN (ins

[gcc r14-9512] Add missing hf/bf patterns.

2024-03-17 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:942d470a5a4fb1baeff943127a81b441dffaa543

commit r14-9512-g942d470a5a4fb1baeff943127a81b441dffaa543
Author: liuhongt 
Date:   Fri Mar 15 10:59:10 2024 +0800

Add missing hf/bf patterns.

It will be used by copysignm3/xorsignm3/lroundmn2 expanders.

gcc/ChangeLog:

PR target/114334
* config/i386/i386.md (mode): Add new number V8BF,V16BF,V32BF.
(MODEF248): New mode iterator.
(ssevecmodesuffix): Handle BF and HF.
* config/i386/sse.md (andnot3): Extend to HF/BF.
(3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114334.c: New test.

Diff:
---
 gcc/config/i386/i386.md  | 13 +
 gcc/config/i386/sse.md   | 22 +++---
 gcc/testsuite/gcc.target/i386/pr114334.c |  8 
 3 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index df97a2d6270..11fdc6af3fa 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -543,8 +543,9 @@
 
 ;; Main data type used by the insn
 (define_attr "mode"
-  "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,BF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
-   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V4BF,V2HF,V2BF"
+  "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,BF,SF,DF,XF,TF,
+   V32HF,V16HF,V8HF,V4HF,V2HF,V32BF,V16BF,V8BF,V4BF,V2BF,
+   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
   (const_string "unknown"))
 
 ;; The CPU unit operations uses.
@@ -1323,6 +1324,8 @@
 ;; SSE and x87 SFmode and DFmode floating point modes
 (define_mode_iterator MODEF [SF DF])
 
+(define_mode_iterator MODEF248 [BF HF SF (DF "TARGET_SSE2")])
+
 ;; SSE floating point modes
 (define_mode_iterator MODEFH [(HF "TARGET_AVX512FP16") SF DF])
 
@@ -1347,7 +1350,8 @@
(V64QI "b") (V32HI "w") (V16SI "d") (V8DI "q")])
 
 ;; SSE vector suffix for floating point modes
-(define_mode_attr ssevecmodesuffix [(SF "ps") (DF "pd")])
+;; BF HF use same suffix as SF for logic operations.
+(define_mode_attr ssevecmodesuffix [(BF "ps") (HF "ps") (SF "ps") (DF "pd")])
 
 ;; SSE vector mode corresponding to a scalar mode
 (define_mode_attr ssevecmode
@@ -1357,7 +1361,8 @@
 
 ;; AVX512F vector mode corresponding to a scalar mode
 (define_mode_attr avx512fvecmode
-  [(QI "V64QI") (HI "V32HI") (SI "V16SI") (DI "V8DI") (SF "V16SF") (DF 
"V8DF")])
+  [(QI "V64QI") (HI "V32HI") (SI "V16SI") (DI "V8DI")
+   (HF "V32HF") (BF "V32BF") (SF "V16SF") (DF "V8DF")])
 
 ;; Instruction suffix for REX 64bit operators.
 (define_mode_attr rex64suffix [(SI "{l}") (DI "{q}")])
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1bc614ab702..3286d3a4fac 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5125,12 +5125,12 @@
 ;; because the native instructions read the full 128-bits.
 
 (define_insn "*andnot3"
-  [(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v")
-   (and:MODEF
- (not:MODEF
-   (match_operand:MODEF 1 "register_operand" "0,x,v,v"))
-   (match_operand:MODEF 2 "register_operand" "x,x,v,v")))]
-  "SSE_FLOAT_MODE_P (mode)"
+  [(set (match_operand:MODEF248 0 "register_operand" "=x,x,v,v")
+   (and:MODEF248
+ (not:MODEF248
+   (match_operand:MODEF248 1 "register_operand" "0,x,v,v"))
+   (match_operand:MODEF248 2 "register_operand" "x,x,v,v")))]
+  "TARGET_SSE"
 {
   char buf[128];
   const char *ops;
@@ -5257,11 +5257,11 @@
  (const_string "TI")))])
 
 (define_insn "3"
-  [(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v")
-   (any_logic:MODEF
- (match_operand:MODEF 1 "register_operand" "%0,x,v,v")
- (match_operand:MODEF 2 "register_operand" "x,x,v,v")))]
-  "SSE_FLOAT_MODE_P (mode)"
+  [(set (match_operand:MODEF248 0 "register_operand" "=x,x,v,v")
+   (any_logic:MODEF248
+ (match_operand:MODEF248 1 "register_operand" "%0,x,v,v")
+ (match_operand:MODEF248 2 "register_operand" "x,x,v,v")))]
+  "TARGET_SSE"
 {
   char buf[128];
   const char *ops;
diff --git a/gcc/testsuite/gcc.target/i386/pr114334.c 
b/gcc/testsuite/gcc.target/i386/pr114334.c
new file mode 100644
index 000..8e38e24cd16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114334.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512fp16" } */
+
+long
+foo(_Float16 f)
+{
+  return __builtin_lroundf16(f);
+}


[gcc r14-9588] Document -fexcess-precision=16.

2024-03-20 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:415091f09096a0ebba1fdcd4af8c2fda24cfd411

commit r14-9588-g415091f09096a0ebba1fdcd4af8c2fda24cfd411
Author: liuhongt 
Date:   Mon Mar 18 18:53:59 2024 +0800

Document -fexcess-precision=16.

gcc/ChangeLog:

PR middle-end/114347
* doc/invoke.texi: Document -fexcess-precision=16.

Diff:
---
 gcc/doc/invoke.texi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b446b2905c7..e0950ca5dc2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14931,6 +14931,9 @@ assignments).  This option is enabled by default for C 
or C++ if a strict
 conformance option such as @option{-std=c99} or @option{-std=c++17} is used.
 @option{-ffast-math} enables @option{-fexcess-precision=fast} by default
 regardless of whether a strict conformance option is used.
+If @option{-fexcess-precision=16} is specified, constants and the
+results of expressions with types @code{_Float16} and @code{__bf16}
+are computed without excess precision.
 
 @opindex mfpmath
 @option{-fexcess-precision=standard} is not implemented for languages


[gcc r14-9591] Fix runtime error for nonlinear iv vectorization(step_mult).

2024-03-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:ac2f8c2a367151fc0410f904339c475a953cffc8

commit r14-9591-gac2f8c2a367151fc0410f904339c475a953cffc8
Author: liuhongt 
Date:   Thu Mar 21 13:15:23 2024 +0800

Fix runtime error for nonlinear iv vectorization(step_mult).

wi::from_mpz doesn't take a sign argument; we want the result to wrap
instead of saturating, so pass utype and true to it, which fixes the
bug.

gcc/ChangeLog:

PR tree-optimization/114396
* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Pass utype
and true to wi::from_mpz.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114396.c: New test.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr114396.c | 105 +++
 gcc/tree-vect-loop.cc|   2 +-
 2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c 
b/gcc/testsuite/gcc.target/i386/pr114396.c
new file mode 100644
index 000..4c4015f871f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114396.c
@@ -0,0 +1,105 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */
+
+short a = 0xF;
+short b[16];
+unsigned short ua = 0xF;
+unsigned short ub[16];
+
+short
+__attribute__((noipa))
+foo (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa))
+foo1 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa))
+foou (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa))
+foou1 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa,optimize("O3")))
+foo_o3 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa,optimize("O3")))
+foo1_o3 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa,optimize("O3")))
+foou_o3 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa,optimize("O3")))
+foou1_o3 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= 5;
+  return a;
+}
+
+int main() {
+  unsigned short uexp, ures;
+  short exp, res;
+  exp = foo (a);
+  res = foo_o3 (a);
+  if (exp != res)
+__builtin_abort ();
+
+  exp = foo1 (a);
+  res = foo1_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  uexp = foou (a);
+  ures = foou_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  uexp = foou1 (a);
+  ures = foou1_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 4375ebdcb49..2921a9e6aa1 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -9454,7 +9454,7 @@ vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree 
init_expr,
wi::to_mpz (skipn, exp, UNSIGNED);
mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
mpz_powm (res, base, exp, mod);
-   begin = wi::from_mpz (type, res, TYPE_SIGN (type));
+   begin = wi::from_mpz (utype, res, true);
tree mult_expr = wide_int_to_tree (utype, begin);
init_expr = gimple_build (stmts, MULT_EXPR, utype,
  init_expr, mult_expr);


[gcc r13-8475] Fix runtime error for nonlinear iv vectorization(step_mult).

2024-03-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:199b021a38f30b681e0dbecd2d0296beabd50b13

commit r13-8475-g199b021a38f30b681e0dbecd2d0296beabd50b13
Author: liuhongt 
Date:   Thu Mar 21 13:15:23 2024 +0800

Fix runtime error for nonlinear iv vectorization(step_mult).

wi::from_mpz doesn't take a sign argument; we want the result to wrap
instead of saturating, so pass utype and true to it, which fixes the
bug.

gcc/ChangeLog:

PR tree-optimization/114396
* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Pass utype
and true to wi::from_mpz.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114396.c: New test.

(cherry picked from commit ac2f8c2a367151fc0410f904339c475a953cffc8)

Diff:
---
 gcc/testsuite/gcc.target/i386/pr114396.c | 105 +++
 gcc/tree-vect-loop.cc|   2 +-
 2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c 
b/gcc/testsuite/gcc.target/i386/pr114396.c
new file mode 100644
index 000..4c4015f871f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114396.c
@@ -0,0 +1,105 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */
+
+short a = 0xF;
+short b[16];
+unsigned short ua = 0xF;
+unsigned short ub[16];
+
+short
+__attribute__((noipa))
+foo (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa))
+foo1 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa))
+foou (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa))
+foou1 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa,optimize("O3")))
+foo_o3 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa,optimize("O3")))
+foo1_o3 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa,optimize("O3")))
+foou_o3 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa,optimize("O3")))
+foou1_o3 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= 5;
+  return a;
+}
+
+int main() {
+  unsigned short uexp, ures;
+  short exp, res;
+  exp = foo (a);
+  res = foo_o3 (a);
+  if (exp != res)
+__builtin_abort ();
+
+  exp = foo1 (a);
+  res = foo1_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  uexp = foou (a);
+  ures = foou_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  uexp = foou1 (a);
+  ures = foou1_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index d08d4996771..9615161ad37 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8730,7 +8730,7 @@ vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree 
init_expr,
wi::to_mpz (skipn, exp, UNSIGNED);
mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
mpz_powm (res, base, exp, mod);
-   begin = wi::from_mpz (type, res, TYPE_SIGN (type));
+   begin = wi::from_mpz (utype, res, true);
tree mult_expr = wide_int_to_tree (utype, begin);
init_expr = gimple_build (stmts, MULT_EXPR, utype,
  init_expr, mult_expr);


[gcc r14-9603] Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.

2024-03-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:9a6c7aa1b011b77fcd9b19f7b8d7ff0fc823cdb2

commit r14-9603-g9a6c7aa1b011b77fcd9b19f7b8d7ff0fc823cdb2
Author: liuhongt 
Date:   Fri Mar 22 10:09:43 2024 +0800

Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.

Also fixed a typo in the testcase.

gcc/testsuite/ChangeLog:

PR tree-optimization/114396
* gcc.target/i386/pr114396.c: Move to...
* gcc.c-torture/execute/pr114396.c: ...here.

Diff:
---
 gcc/testsuite/{gcc.target/i386 => gcc.c-torture/execute}/pr114396.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c 
b/gcc/testsuite/gcc.c-torture/execute/pr114396.c
similarity index 92%
rename from gcc/testsuite/gcc.target/i386/pr114396.c
rename to gcc/testsuite/gcc.c-torture/execute/pr114396.c
index 4c4015f871f..baf90eafabf 100644
--- a/gcc/testsuite/gcc.target/i386/pr114396.c
+++ b/gcc/testsuite/gcc.c-torture/execute/pr114396.c
@@ -1,5 +1,5 @@
-/* { dg-do run } */
-/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */
+/* PR tree-optimization/114396 */
+/* { dg-additional-options "-fwrapv -fno-vect-cost-model" } */
 
 short a = 0xF;
 short b[16];
@@ -88,7 +88,7 @@ int main() {
 
   exp = foo1 (a);
   res = foo1_o3 (a);
-  if (uexp != ures)
+  if (exp != res)
 __builtin_abort ();
 
   uexp = foou (a);


[gcc r13-8488] Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.

2024-03-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:e6a3d1f5bcfd954b614155d96c97bde8ac230e2e

commit r13-8488-ge6a3d1f5bcfd954b614155d96c97bde8ac230e2e
Author: liuhongt 
Date:   Fri Mar 22 10:09:43 2024 +0800

Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.

Also fixed a typo in the testcase.

gcc/testsuite/ChangeLog:

PR tree-optimization/114396
* gcc.target/i386/pr114396.c: Move to...
* gcc.c-torture/execute/pr114396.c: ...here.

(cherry picked from commit 9a6c7aa1b011b77fcd9b19f7b8d7ff0fc823cdb2)

Diff:
---
 gcc/testsuite/{gcc.target/i386 => gcc.c-torture/execute}/pr114396.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c 
b/gcc/testsuite/gcc.c-torture/execute/pr114396.c
similarity index 92%
rename from gcc/testsuite/gcc.target/i386/pr114396.c
rename to gcc/testsuite/gcc.c-torture/execute/pr114396.c
index 4c4015f871f..baf90eafabf 100644
--- a/gcc/testsuite/gcc.target/i386/pr114396.c
+++ b/gcc/testsuite/gcc.c-torture/execute/pr114396.c
@@ -1,5 +1,5 @@
-/* { dg-do run } */
-/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */
+/* PR tree-optimization/114396 */
+/* { dg-additional-options "-fwrapv -fno-vect-cost-model" } */
 
 short a = 0xF;
 short b[16];
@@ -88,7 +88,7 @@ int main() {
 
   exp = foo1 (a);
   res = foo1_o3 (a);
-  if (uexp != ures)
+  if (exp != res)
 __builtin_abort ();
 
   uexp = foou (a);


[gcc r15-22] Adjust alternative *k to ?k for avx512 mask in zero_extend patterns

2024-04-28 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:c19a674d03847b900919b97d0957c8ae5164f8f1

commit r15-22-gc19a674d03847b900919b97d0957c8ae5164f8f1
Author: liuhongt 
Date:   Tue Apr 16 08:37:22 2024 +0800

Adjust alternative *k to ?k for avx512 mask in zero_extend patterns

So when both the source and destination operands require avx512 MASK_REGS,
RA can allocate a MASK_REGS register instead of a GPR, avoiding a reload
from GPR to MASK_REGS.

gcc/ChangeLog:

* config/i386/i386.md: (zero_extendsidi2): Adjust
alternative *k to ?k.
(zero_extenddi2): Ditto.
(*zero_extendsi2): Ditto.
(*zero_extendqihi2): Ditto.

Diff:
---
 gcc/config/i386/i386.md  | 16 -
 gcc/testsuite/gcc.target/i386/zero_extendkmask.c | 43 
 2 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 80e64c603eb..764bfe20ff2 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4569,10 +4569,10 @@
 
 (define_insn "*zero_extendsidi2"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-   "=r,?r,?o,r   ,o,?*y,?!*y,$r,$v,$x,*x,*v,*r,*k")
+   "=r,?r,?o,r   ,o,?*y,?!*y,$r,$v,$x,*x,*v,?r,?k")
(zero_extend:DI
 (match_operand:SI 1 "x86_64_zext_operand"
-   "0 ,rm,r ,rmWz,0,r  ,m   ,v ,r ,m ,*x,*v,*k,*km")))]
+   "0 ,rm,r ,rmWz,0,r  ,m   ,v ,r ,m ,*x,*v,?k,?km")))]
   ""
 {
   switch (get_attr_type (insn))
@@ -4705,9 +4705,9 @@
   [(QI "avx512dq") (HI "avx512f") (SI "avx512bw") (DI "avx512bw")])
 
 (define_insn "zero_extenddi2"
-  [(set (match_operand:DI 0 "register_operand" "=r,*r,*k")
+  [(set (match_operand:DI 0 "register_operand" "=r,?r,?k")
(zero_extend:DI
-(match_operand:SWI12 1 "nonimmediate_operand" "m,*k,*km")))]
+(match_operand:SWI12 1 "nonimmediate_operand" "m,?k,?km")))]
   "TARGET_64BIT"
   "@
movz{l|x}\t{%1, %k0|%k0, %1}
@@ -4760,9 +4760,9 @@
(set_attr "mode" "SI")])
 
 (define_insn "*zero_extendsi2"
-  [(set (match_operand:SI 0 "register_operand" "=r,*r,*k")
+  [(set (match_operand:SI 0 "register_operand" "=r,?r,?k")
(zero_extend:SI
- (match_operand:SWI12 1 "nonimmediate_operand" "m,*k,*km")))]
+ (match_operand:SWI12 1 "nonimmediate_operand" "m,?k,?km")))]
   "!(TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))"
   "@
movz{l|x}\t{%1, %0|%0, %1}
@@ -4815,8 +4815,8 @@
 
 ; zero extend to SImode to avoid partial register stalls
 (define_insn "*zero_extendqihi2"
-  [(set (match_operand:HI 0 "register_operand" "=r,*r,*k")
-   (zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" 
"qm,*k,*km")))]
+  [(set (match_operand:HI 0 "register_operand" "=r,?r,?k")
+   (zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" 
"qm,?k,?km")))]
   "!(TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))"
   "@
movz{bl|x}\t{%1, %k0|%k0, %1}
diff --git a/gcc/testsuite/gcc.target/i386/zero_extendkmask.c 
b/gcc/testsuite/gcc.target/i386/zero_extendkmask.c
new file mode 100644
index 000..6b18980bbd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/zero_extendkmask.c
@@ -0,0 +1,43 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not {(?n)shr[bwl]} } } */
+/* { dg-final { scan-assembler-not {(?n)movz[bw]} } } */
+
+#include<immintrin.h>
+
+__m512
+foo (__m512d a, __m512d b, __m512 c, __m512 d)
+{
+  return _mm512_mask_mov_ps (c, (__mmask16) (_mm512_cmpeq_pd_mask (a, b) >> 
1), d);
+}
+
+
+__m512i
+foo1 (__m512d a, __m512d b, __m512i c, __m512i d)
+{
+  return _mm512_mask_mov_epi16 (c, (__mmask32) (_mm512_cmpeq_pd_mask (a, b) >> 
1), d);
+}
+
+__m512i
+foo2 (__m512d a, __m512d b, __m512i c, __m512i d)
+{
+  return _mm512_mask_mov_epi8 (c, (__mmask64) (_mm512_cmpeq_pd_mask (a, b) >> 
1), d);
+}
+
+__m512i
+foo3 (__m512 a, __m512 b, __m512i c, __m512i d)
+{
+  return _mm512_mask_mov_epi16 (c, (__mmask32) (_mm512_cmpeq_ps_mask (a, b) >> 
1), d);
+}
+
+__m512i
+foo4 (__m512 a, __m512 b, __m512i c, __m512i d)
+{
+  return _mm512_mask_mov_epi8 (c, (__mmask64) (_mm512_cmpeq_ps_mask (a, b) >> 
1), d);
+}
+
+__m512i
+foo5 (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  return _mm512_mask_mov_epi8 (c, (__mmask64) (_mm512_cmp_epi16_mask (a, b, 5) 
>> 1), d);
+}


[gcc r15-2395] Refine constraint "Bk" to define_special_memory_constraint.

2024-07-29 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:bc1fda00d5f20e2f3e77a50b2822562b6e0040b2

commit r15-2395-gbc1fda00d5f20e2f3e77a50b2822562b6e0040b2
Author: liuhongt 
Date:   Wed Jul 24 11:29:23 2024 +0800

Refine constraint "Bk" to define_special_memory_constraint.

For the pattern below, RA may still allocate r162 as a v/k register and try
to reload the address with leaq __libc_tsd_CTYPE_B@gottpoff(%rip), %rsi,
which results in a linker error.

(set (reg:DI 162)
 (mem/u/c:DI
   (const:DI (unspec:DI
 [(symbol_ref:DI ("a") [flags 0x60]  )]
 UNSPEC_GOTNTPOFF))

Quote from H.J. on why the linker issues an error:
>What do these do:
>
>leaq    __libc_tsd_CTYPE_B@gottpoff(%rip), %rax
>vmovq   (%rax), %xmm0
>
>From x86-64 TLS psABI:
>
>The assembler generates for the x@gottpoff(%rip) expressions a
>R_X86_64_GOTTPOFF relocation for the symbol x which requests the linker to
>generate a GOT entry with a R_X86_64_TPOFF64 relocation. The offset of
>the GOT entry relative to the end of the instruction is then used in
>the instruction. The R_X86_64_TPOFF64 relocation is processed at
>program startup time by the dynamic linker by looking up the symbol x
>in the modules loaded at that point. The offset is written in the GOT
>entry and later loaded by the addq instruction.
>
>The above code sequence looks wrong to me.

gcc/ChangeLog:

PR target/116043
* config/i386/constraints.md (Bk): Refine to
define_special_memory_constraint.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116043.c: New test.

Diff:
---
 gcc/config/i386/constraints.md   |  2 +-
 gcc/testsuite/gcc.target/i386/pr116043.c | 33 
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 7508d7a58bd7..b760e7c221a1 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -187,7 +187,7 @@
   "@internal Vector memory operand."
   (match_operand 0 "vector_memory_operand"))
 
-(define_memory_constraint "Bk"
+(define_special_memory_constraint "Bk"
   "@internal TLS address that allows insn using non-integer registers."
   (and (match_operand 0 "memory_operand")
(not (match_test "ix86_gpr_tls_address_pattern_p (op)"
diff --git a/gcc/testsuite/gcc.target/i386/pr116043.c 
b/gcc/testsuite/gcc.target/i386/pr116043.c
new file mode 100644
index ..76553496c109
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116043.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -O3" } */
+/* { dg-final { scan-assembler-not {(?n)lea.*@gottpoff} } } */
+
+extern __thread int a, c, i, j, k, l;
+int *b;
+struct d {
+  int e;
+} f, g;
+char *h;
+
+void m(struct d *n) {
+  b = &k;
+  for (; n->e; b++, n--) {
+i = b && a;
+if (i)
+  j = c;
+  }
+}
+
+char *o(struct d *n) {
+  for (; n->e;)
+return h;
+}
+
+int q() {
+  if (l)
+return 1;
+  int p = *o(&g);
+  m(&f);
+  m(&g);
+  l = p;
+}


[gcc r15-2539] Fix mismatch between constraint and predicate for ashl3_doubleword.

2024-08-01 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:64ca25aec4939aea79bd812b089fbb666ca6f2fd

commit r15-2539-g64ca25aec4939aea79bd812b089fbb666ca6f2fd
Author: liuhongt 
Date:   Fri Jul 26 09:56:03 2024 +0800

Fix mismatch between constraint and predicate for ashl3_doubleword.

(insn 98 94 387 2 (parallel [
(set (reg:TI 337 [ _32 ])
(ashift:TI (reg:TI 329)
(reg:QI 521)))
(clobber (reg:CC 17 flags))
]) "test.c":11:13 953 {ashlti3_doubleword}

is reloaded into

(insn 98 452 387 2 (parallel [
(set (reg:TI 0 ax [orig:337 _32 ] [337])
(ashift:TI (const_int 1671291085 [0x639de0cd])
(reg:QI 2 cx [521])))
(clobber (reg:CC 17 flags))

since constraint n in the pattern accepts that.
(Not sure why reload doesn't check predicate)

(define_insn "ashl3_doubleword"
  [(set (match_operand:DWI 0 "register_operand" "=&r,&r")
(ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0n,r")
(match_operand:QI 2 "nonmemory_operand" "c,c")))

The patch fixes the mismatch between constraint and predicate.

gcc/ChangeLog:

PR target/116096
* config/i386/constraints.md (Wc): New constraint for integer
1 or -1.
* config/i386/i386.md (ashl3_doubleword): Refine
constraint with Wc.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116096.c: New test.

Diff:
---
 gcc/config/i386/constraints.md   |  6 ++
 gcc/config/i386/i386.md  |  2 +-
 gcc/testsuite/gcc.target/i386/pr116096.c | 26 ++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index b760e7c221a1..18389c478002 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -254,6 +254,12 @@
   (and (match_code "const_int")
(match_test "IN_RANGE (ival, 0, 7)")))
 
+(define_constraint "Wc"
+  "Integer constant -1 or 1."
+  (and (match_code "const_int")
+   (ior (match_test "op == constm1_rtx")
+   (match_test "op == const1_rtx"
+
 (define_constraint "Ww"
   "Integer constant in the range 0 @dots{} 15, for 16-bit shifts."
   (and (match_code "const_int")
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 3c293c146569..caa3773a5212 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14764,7 +14764,7 @@
 
 (define_insn "ashl3_doubleword"
   [(set (match_operand:DWI 0 "register_operand" "=&r,&r")
-   (ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0n,r")
+   (ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0Wc,r")
(match_operand:QI 2 "nonmemory_operand" "c,c")))
(clobber (reg:CC FLAGS_REG))]
   ""
diff --git a/gcc/testsuite/gcc.target/i386/pr116096.c 
b/gcc/testsuite/gcc.target/i386/pr116096.c
new file mode 100644
index ..5ef39805f582
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116096.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -flive-range-shrinkage -fno-peephole2 -mstackrealign 
-Wno-psabi" } */
+
+typedef char U __attribute__((vector_size (32)));
+typedef unsigned V __attribute__((vector_size (32)));
+typedef __int128 W __attribute__((vector_size (32)));
+U g;
+
+W baz ();
+
+static inline U
+bar (V x, W y)
+{
+  y = y | y << (W) x;
+  return (U)y;
+}
+
+void
+foo (W w)
+{
+  g = g <<
+bar ((V){baz ()[1], 3, 3, 5, 7},
+(W){w[0], ~(int) 2623676210}) >>
+bar ((V){baz ()[1]},
+(W){-w[0], ~(int) 2623676210});
+}


[gcc r14-10551] Refine constraint "Bk" to define_special_memory_constraint.

2024-08-02 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:a295076bee293aa3112c615f9af7a27231816a36

commit r14-10551-ga295076bee293aa3112c615f9af7a27231816a36
Author: liuhongt 
Date:   Wed Jul 24 11:29:23 2024 +0800

Refine constraint "Bk" to define_special_memory_constraint.

For the pattern below, RA may still allocate r162 as a v/k register and try
to reload the address with leaq __libc_tsd_CTYPE_B@gottpoff(%rip), %rsi,
which results in a linker error.

(set (reg:DI 162)
 (mem/u/c:DI
   (const:DI (unspec:DI
 [(symbol_ref:DI ("a") [flags 0x60]  )]
 UNSPEC_GOTNTPOFF))

Quote from H.J. on why the linker issues an error:
>What do these do:
>
>leaq    __libc_tsd_CTYPE_B@gottpoff(%rip), %rax
>vmovq   (%rax), %xmm0
>
>From x86-64 TLS psABI:
>
>The assembler generates for the x@gottpoff(%rip) expressions a
>R_X86_64_GOTTPOFF relocation for the symbol x which requests the linker to
>generate a GOT entry with a R_X86_64_TPOFF64 relocation. The offset of
>the GOT entry relative to the end of the instruction is then used in
>the instruction. The R_X86_64_TPOFF64 relocation is processed at
>program startup time by the dynamic linker by looking up the symbol x
>in the modules loaded at that point. The offset is written in the GOT
>entry and later loaded by the addq instruction.
>
>The above code sequence looks wrong to me.

gcc/ChangeLog:

PR target/116043
* config/i386/constraints.md (Bk): Refine to
define_special_memory_constraint.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116043.c: New test.

(cherry picked from commit bc1fda00d5f20e2f3e77a50b2822562b6e0040b2)

Diff:
---
 gcc/config/i386/constraints.md   |  2 +-
 gcc/testsuite/gcc.target/i386/pr116043.c | 33 
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 7508d7a58bd7..b760e7c221a1 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -187,7 +187,7 @@
   "@internal Vector memory operand."
   (match_operand 0 "vector_memory_operand"))
 
-(define_memory_constraint "Bk"
+(define_special_memory_constraint "Bk"
   "@internal TLS address that allows insn using non-integer registers."
   (and (match_operand 0 "memory_operand")
(not (match_test "ix86_gpr_tls_address_pattern_p (op)"
diff --git a/gcc/testsuite/gcc.target/i386/pr116043.c 
b/gcc/testsuite/gcc.target/i386/pr116043.c
new file mode 100644
index ..76553496c109
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116043.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -O3" } */
+/* { dg-final { scan-assembler-not {(?n)lea.*@gottpoff} } } */
+
+extern __thread int a, c, i, j, k, l;
+int *b;
+struct d {
+  int e;
+} f, g;
+char *h;
+
+void m(struct d *n) {
+  b = &k;
+  for (; n->e; b++, n--) {
+i = b && a;
+if (i)
+  j = c;
+  }
+}
+
+char *o(struct d *n) {
+  for (; n->e;)
+return h;
+}
+
+int q() {
+  if (l)
+return 1;
+  int p = *o(&g);
+  m(&f);
+  m(&g);
+  l = p;
+}


[gcc r12-10668] Refine constraint "Bk" to define_special_memory_constraint.

2024-08-11 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:c94738e2462ff46f3013f6270f6a955b749d82b2

commit r12-10668-gc94738e2462ff46f3013f6270f6a955b749d82b2
Author: liuhongt 
Date:   Wed Jul 24 11:29:23 2024 +0800

Refine constraint "Bk" to define_special_memory_constraint.

For the pattern below, RA may still allocate r162 as a v/k register and try
to reload the address with leaq __libc_tsd_CTYPE_B@gottpoff(%rip), %rsi,
which results in a linker error.

(set (reg:DI 162)
 (mem/u/c:DI
   (const:DI (unspec:DI
 [(symbol_ref:DI ("a") [flags 0x60]  )]
 UNSPEC_GOTNTPOFF))

Quote from H.J. on why the linker issues an error:
>What do these do:
>
>leaq    __libc_tsd_CTYPE_B@gottpoff(%rip), %rax
>vmovq   (%rax), %xmm0
>
>From x86-64 TLS psABI:
>
>The assembler generates for the x@gottpoff(%rip) expressions a
>R_X86_64_GOTTPOFF relocation for the symbol x which requests the linker to
>generate a GOT entry with a R_X86_64_TPOFF64 relocation. The offset of
>the GOT entry relative to the end of the instruction is then used in
>the instruction. The R_X86_64_TPOFF64 relocation is processed at
>program startup time by the dynamic linker by looking up the symbol x
>in the modules loaded at that point. The offset is written in the GOT
>entry and later loaded by the addq instruction.
>
>The above code sequence looks wrong to me.

gcc/ChangeLog:

PR target/116043
* config/i386/constraints.md (Bk): Refine to
define_special_memory_constraint.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116043.c: New test.

(cherry picked from commit bc1fda00d5f20e2f3e77a50b2822562b6e0040b2)

Diff:
---
 gcc/config/i386/constraints.md   |  2 +-
 gcc/testsuite/gcc.target/i386/pr116043.c | 33 
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 7361687632fa..e4b66340589f 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -187,7 +187,7 @@
   (and (match_operand 0 "memory_operand")
(match_test "constant_address_p (XEXP (op, 0))")))
 
-(define_memory_constraint "Bk"
+(define_special_memory_constraint "Bk"
   "@internal TLS address that allows insn using non-integer registers."
   (and (match_operand 0 "memory_operand")
(not (match_test "ix86_gpr_tls_address_pattern_p (op)"
diff --git a/gcc/testsuite/gcc.target/i386/pr116043.c 
b/gcc/testsuite/gcc.target/i386/pr116043.c
new file mode 100644
index ..76553496c109
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116043.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -O3" } */
+/* { dg-final { scan-assembler-not {(?n)lea.*@gottpoff} } } */
+
+extern __thread int a, c, i, j, k, l;
+int *b;
+struct d {
+  int e;
+} f, g;
+char *h;
+
+void m(struct d *n) {
+  b = &k;
+  for (; n->e; b++, n--) {
+i = b && a;
+if (i)
+  j = c;
+  }
+}
+
+char *o(struct d *n) {
+  for (; n->e;)
+return h;
+}
+
+int q() {
+  if (l)
+return 1;
+  int p = *o(&g);
+  m(&f);
+  m(&g);
+  l = p;
+}


[gcc r13-8971] Refine constraint "Bk" to define_special_memory_constraint.

2024-08-11 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:617562e4e422c7bd282960b14abfffd994445009

commit r13-8971-g617562e4e422c7bd282960b14abfffd994445009
Author: liuhongt 
Date:   Wed Jul 24 11:29:23 2024 +0800

Refine constraint "Bk" to define_special_memory_constraint.

For the pattern below, RA may still allocate r162 as a v/k register and try
to reload the address with leaq __libc_tsd_CTYPE_B@gottpoff(%rip), %rsi,
which results in a linker error.

(set (reg:DI 162)
 (mem/u/c:DI
   (const:DI (unspec:DI
 [(symbol_ref:DI ("a") [flags 0x60]  )]
 UNSPEC_GOTNTPOFF))

Quote from H.J. on why the linker issues an error:
>What do these do:
>
>leaq    __libc_tsd_CTYPE_B@gottpoff(%rip), %rax
>vmovq   (%rax), %xmm0
>
>From x86-64 TLS psABI:
>
>The assembler generates for the x@gottpoff(%rip) expressions a
>R_X86_64_GOTTPOFF relocation for the symbol x which requests the linker to
>generate a GOT entry with a R_X86_64_TPOFF64 relocation. The offset of
>the GOT entry relative to the end of the instruction is then used in
>the instruction. The R_X86_64_TPOFF64 relocation is processed at
>program startup time by the dynamic linker by looking up the symbol x
>in the modules loaded at that point. The offset is written in the GOT
>entry and later loaded by the addq instruction.
>
>The above code sequence looks wrong to me.

gcc/ChangeLog:

PR target/116043
* config/i386/constraints.md (Bk): Refine to
define_special_memory_constraint.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116043.c: New test.

(cherry picked from commit bc1fda00d5f20e2f3e77a50b2822562b6e0040b2)

Diff:
---
 gcc/config/i386/constraints.md   |  2 +-
 gcc/testsuite/gcc.target/i386/pr116043.c | 33 
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index aeda1078471d..71265ddc7f42 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -190,7 +190,7 @@
   (and (match_operand 0 "memory_operand")
(match_test "constant_address_p (XEXP (op, 0))")))
 
-(define_memory_constraint "Bk"
+(define_special_memory_constraint "Bk"
   "@internal TLS address that allows insn using non-integer registers."
   (and (match_operand 0 "memory_operand")
(not (match_test "ix86_gpr_tls_address_pattern_p (op)"
diff --git a/gcc/testsuite/gcc.target/i386/pr116043.c 
b/gcc/testsuite/gcc.target/i386/pr116043.c
new file mode 100644
index ..76553496c109
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116043.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -O3" } */
+/* { dg-final { scan-assembler-not {(?n)lea.*@gottpoff} } } */
+
+extern __thread int a, c, i, j, k, l;
+int *b;
+struct d {
+  int e;
+} f, g;
+char *h;
+
+void m(struct d *n) {
+  b = &k;
+  for (; n->e; b++, n--) {
+i = b && a;
+if (i)
+  j = c;
+  }
+}
+
+char *o(struct d *n) {
+  for (; n->e;)
+return h;
+}
+
+int q() {
+  if (l)
+return 1;
+  int p = *o(&g);
+  m(&f);
+  m(&g);
+  l = p;
+}


[gcc r15-2906] Move ix86_align_loops into a separate pass and insert the pass after pass_endbr_and_patchable_area.

2024-08-13 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:c3c83d22d212a35cb1bfb8727477819463f0dcd8

commit r15-2906-gc3c83d22d212a35cb1bfb8727477819463f0dcd8
Author: liuhongt 
Date:   Mon Aug 12 14:35:31 2024 +0800

Move ix86_align_loops into a separate pass and insert the pass after 
pass_endbr_and_patchable_area.

gcc/ChangeLog:

PR target/116174
* config/i386/i386.cc (ix86_align_loops): Move this to ..
* config/i386/i386-features.cc (ix86_align_loops): .. here.
(class pass_align_tight_loops): New class.
(make_pass_align_tight_loops): New function.
* config/i386/i386-passes.def: Insert pass_align_tight_loops
after pass_insert_endbr_and_patchable_area.
* config/i386/i386-protos.h (make_pass_align_tight_loops): New
declare.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116174.c: New test.

Diff:
---
 gcc/config/i386/i386-features.cc | 190 +++
 gcc/config/i386/i386-passes.def  |   3 +
 gcc/config/i386/i386-protos.h|   1 +
 gcc/config/i386/i386.cc  | 146 
 gcc/testsuite/gcc.target/i386/pr116174.c |  12 ++
 5 files changed, 206 insertions(+), 146 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c36d181f2d64..7e80e7b0103f 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3417,6 +3417,196 @@ make_pass_apx_nf_convert (gcc::context *ctxt)
   return new pass_apx_nf_convert (ctxt);
 }
 
+/* When a hot loop can be fit into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+{
+  rtx_insn *label = BB_HEAD (bb);
+  bool has_fallthru = 0;
+  edge e;
+  edge_iterator ei;
+
+  if (!LABEL_P (label))
+   continue;
+
+  profile_count fallthru_count = profile_count::zero ();
+  profile_count branch_count = profile_count::zero ();
+
+  FOR_EACH_EDGE (e, ei, bb->preds)
+   {
+ if (e->flags & EDGE_FALLTHRU)
+   has_fallthru = 1, fallthru_count += e->count ();
+ else
+   branch_count += e->count ();
+   }
+
+  if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+   continue;
+
+  if (bb->loop_father
+ && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+ && (has_fallthru
+ ? (!(single_succ_p (bb)
+  && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+&& optimize_bb_for_speed_p (bb)
+&& branch_count + fallthru_count > count_threshold
+&& (branch_count > fallthru_count * 
param_align_loop_iterations))
+ /* In case there'no fallthru for the loop.
+Nops inserted won't be executed.  */
+ : (branch_count > count_threshold
+|| (bb->count > bb->prev_bb->count * 10
+&& (bb->prev_bb->count
+<= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)
+   {
+ rtx_insn* insn, *end_insn;
+ HOST_WIDE_INT size = 0;
+ bool padding_p = true;
+ basic_block tbb = bb;
+ unsigned cond_branch_num = 0;
+ bool detect_tight_loop_p = false;
+
+ for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+  i++, tbb = tbb->next_bb)
+   {
+ /* Only handle continuous cfg layout. */
+ if (bb->loop_father != tbb->loop_father)
+   {
+ padding_p = false;
+ break;
+   }
+
+ FOR_BB_INSNS (tbb, insn)
+   {
+ if (!NONDEBUG_INSN_P (insn))
+   continue;
+ size += ix86_min_insn_size (insn);
+
+ /* We don't know size of inline asm.
+Don't align loop for call.  */
+ if (asm_noperands (PATTERN (insn)) >= 0
+ || CALL_P (insn))
+   {
+ size = -1;
+ break;
+   }
+   }
+
+ if (size == -1 || size > ix86_cost->prefetch_block)
+   {
+ padding_p = false;
+ break;
+   }
+
+ FOR_EACH_EDGE (e, ei, tbb->succs)
+   {
+ /* It could be part of the loop.  */
+ if (e->dest == bb)
+   {
+ detect_tight_loop_p = true;
+ break;
+   }
+   }
+
+ if (detect_ti

[gcc r15-2930] Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload.

2024-08-15 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:f7e672da8fc3d416a6d07eb01f3be4400ef94fac

commit r15-2930-gf7e672da8fc3d416a6d07eb01f3be4400ef94fac
Author: liuhongt 
Date:   Mon Aug 12 18:24:34 2024 +0800

Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need 
secondary reload.

It results in 2 failures for x86_64-pc-linux-gnu{\
-march=cascadelake};

gcc: gcc.target/i386/extendditi3-1.c scan-assembler cqt?o
gcc: gcc.target/i386/pr113560.c scan-assembler-times \tmulq 1

For pr113560.c, now GCC generates mulx instead of mulq with
-march=cascadelake, which should be optimal, so adjust testcase for
that.
For gcc.target/i386/extendditi3-1.c, RA happens to choose another
register instead of rax, which results in

movq%rdi, %rbp
movq%rdi, %rax
sarq$63, %rbp
movq%rbp, %rdx

The patch adds a new define_peephole2 for that.

gcc/ChangeLog:

PR target/116274
* config/i386/i386-expand.cc (ix86_expand_vector_move):
Restrict special case TImode to 128-bit vector conversions via
V2DI under ix86_pre_reload_split ().
* config/i386/i386.cc (inline_secondary_memory_needed):
Movement between GENERAL_REGS and SSE_REGS for TImode doesn't
need secondary reload.
* config/i386/i386.md (*extendsidi2_rex64): Add a
define_peephole2 after it.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116274.c: New test.
* gcc.target/i386/pr113560.c: Scan either mulq or mulx.

Diff:
---
 gcc/config/i386/i386-expand.cc   |  2 +-
 gcc/config/i386/i386.cc  | 18 --
 gcc/config/i386/i386.md  | 19 +++
 gcc/testsuite/gcc.target/i386/pr113560.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr116274.c | 12 
 5 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index bdbc1423267..ed546eeed6b 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -751,7 +751,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
   && SUBREG_P (op1)
   && GET_MODE (SUBREG_REG (op1)) == TImode
   && TARGET_64BIT && TARGET_SSE
-  && can_create_pseudo_p ())
+  && ix86_pre_reload_split ())
 {
   rtx tmp = gen_reg_rtx (V2DImode);
   rtx lo = gen_reg_rtx (DImode);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 0721e38ab2a..d06e2141e56 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20292,6 +20292,18 @@ inline_secondary_memory_needed (machine_mode mode, 
reg_class_t class1,
   if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2)))
return true;
 
+  /* If the target says that inter-unit moves are more expensive
+than moving through memory, then don't generate them.  */
+  if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
+ || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
+   return true;
+
+  /* With SSE4.1, *mov{ti,di}_internal supports moves between
+SSE_REGS and GENERAL_REGS using pinsr{q,d} or pextr{q,d}.  */
+  if (TARGET_SSE4_1
+ && (TARGET_64BIT ? mode == TImode : mode == DImode))
+   return false;
+
   int msize = GET_MODE_SIZE (mode);
 
   /* Between SSE and general, we have moves no larger than word size.  */
@@ -20304,12 +20316,6 @@ inline_secondary_memory_needed (machine_mode mode, 
reg_class_t class1,
 
   if (msize < minsize)
return true;
-
-  /* If the target says that inter-unit moves are more expensive
-than moving through memory, then don't generate them.  */
-  if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
- || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
-   return true;
 }
 
   return false;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d3ba2425f16..efbab2f25ec 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5041,6 +5041,25 @@
   DONE;
 })
 
+(define_peephole2
+  [(set (match_operand:DI 0 "general_reg_operand")
+   (match_operand:DI 1 "general_reg_operand"))
+   (parallel [(set (match_dup 0)
+  (ashiftrt:DI (match_dup 0)
+   (const_int 63)))
+  (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:DI 2 "general_reg_operand") (match_dup 1))
+   (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))]
+  "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)
+   && REGNO (operands[2]) == AX_REG
+   && REGNO (operands[3]) == DX_REG
+   && peep2_reg_dead_p (4, operands[0])
+   && !reg_mentioned_p (operands[0], operands[1])
+   && !reg_mentioned_p (operands[2], operands[0])"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 3) (

[gcc r14-10588] Move ix86_align_loops into a separate pass and insert the pass after pass_endbr_and_patchable_area.

2024-08-15 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:4e7735a8d87559bbddfe3a985786996e22241f8d

commit r14-10588-g4e7735a8d87559bbddfe3a985786996e22241f8d
Author: liuhongt 
Date:   Mon Aug 12 14:35:31 2024 +0800

Move ix86_align_loops into a separate pass and insert the pass after 
pass_endbr_and_patchable_area.

gcc/ChangeLog:

PR target/116174
* config/i386/i386.cc (ix86_align_loops): Move this to ..
* config/i386/i386-features.cc (ix86_align_loops): .. here.
(class pass_align_tight_loops): New class.
(make_pass_align_tight_loops): New function.
* config/i386/i386-passes.def: Insert pass_align_tight_loops
after pass_insert_endbr_and_patchable_area.
* config/i386/i386-protos.h (make_pass_align_tight_loops): New
declaration.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116174.c: New test.

(cherry picked from commit c3c83d22d212a35cb1bfb8727477819463f0dcd8)

Diff:
---
 gcc/config/i386/i386-features.cc | 191 +++
 gcc/config/i386/i386-passes.def  |   3 +
 gcc/config/i386/i386-protos.h|   1 +
 gcc/config/i386/i386.cc  | 146 ---
 gcc/testsuite/gcc.target/i386/pr116174.c |  12 ++
 5 files changed, 207 insertions(+), 146 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index e3e004d5526..7de19d42363 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3253,6 +3253,197 @@ make_pass_remove_partial_avx_dependency (gcc::context 
*ctxt)
   return new pass_remove_partial_avx_dependency (ctxt);
 }
 
+/* When a hot loop can be fit into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+{
+  rtx_insn *label = BB_HEAD (bb);
+  bool has_fallthru = 0;
+  edge e;
+  edge_iterator ei;
+
+  if (!LABEL_P (label))
+   continue;
+
+  profile_count fallthru_count = profile_count::zero ();
+  profile_count branch_count = profile_count::zero ();
+
+  FOR_EACH_EDGE (e, ei, bb->preds)
+   {
+ if (e->flags & EDGE_FALLTHRU)
+   has_fallthru = 1, fallthru_count += e->count ();
+ else
+   branch_count += e->count ();
+   }
+
+  if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+   continue;
+
+  if (bb->loop_father
+ && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+ && (has_fallthru
+ ? (!(single_succ_p (bb)
+  && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+&& optimize_bb_for_speed_p (bb)
+&& branch_count + fallthru_count > count_threshold
+&& (branch_count > fallthru_count * 
param_align_loop_iterations))
+ /* In case there'no fallthru for the loop.
+Nops inserted won't be executed.  */
+ : (branch_count > count_threshold
+|| (bb->count > bb->prev_bb->count * 10
+&& (bb->prev_bb->count
+<= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)
+   {
+ rtx_insn* insn, *end_insn;
+ HOST_WIDE_INT size = 0;
+ bool padding_p = true;
+ basic_block tbb = bb;
+ unsigned cond_branch_num = 0;
+ bool detect_tight_loop_p = false;
+
+ for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+  i++, tbb = tbb->next_bb)
+   {
+ /* Only handle continuous cfg layout. */
+ if (bb->loop_father != tbb->loop_father)
+   {
+ padding_p = false;
+ break;
+   }
+
+ FOR_BB_INSNS (tbb, insn)
+   {
+ if (!NONDEBUG_INSN_P (insn))
+   continue;
+ size += ix86_min_insn_size (insn);
+
+ /* We don't know size of inline asm.
+Don't align loop for call.  */
+ if (asm_noperands (PATTERN (insn)) >= 0
+ || CALL_P (insn))
+   {
+ size = -1;
+ break;
+   }
+   }
+
+ if (size == -1 || size > ix86_cost->prefetch_block)
+   {
+ padding_p = false;
+ break;
+   }
+
+ FOR_EACH_EDGE (e, ei, tbb->succs)
+   {
+ /* It could be part of the loop.  */
+ if (e->dest == bb)
+   {
+ detect_tight_loop_p

[gcc r15-814] Fix typo in the testcase.

2024-05-24 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:51f4b47c4f4f61fe31a7bd1fa80e08c2438d76a8

commit r15-814-g51f4b47c4f4f61fe31a7bd1fa80e08c2438d76a8
Author: liuhongt 
Date:   Fri May 24 09:49:08 2024 +0800

Fix typo in the testcase.

gcc/testsuite/ChangeLog:

PR target/114148
* gcc.target/i386/pr106010-7b.c: Refine testcase.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr106010-7b.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c 
b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
index 26482cc10f5..917e56e45f7 100644
--- a/gcc/testsuite/gcc.target/i386/pr106010-7b.c
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
@@ -34,11 +34,11 @@ avx_test (void)
 p_init[i] = i % 2 + 3;
 
   memcpy (pd_src, p_init, 2 * N * sizeof (double));
-  memcpy (ps_dst, p_init, 2 * N * sizeof (float));
-  memcpy (epi64_dst, p_init, 2 * N * sizeof (long long));
-  memcpy (epi32_dst, p_init, 2 * N * sizeof (int));
-  memcpy (epi16_dst, p_init, 2 * N * sizeof (short));
-  memcpy (epi8_dst, p_init, 2 * N * sizeof (char));
+  memcpy (ps_src, p_init, 2 * N * sizeof (float));
+  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
 
   foo_pd (pd_dst, pd_src[0]);
   foo_ps (ps_dst, ps_src[0]);


[gcc r15-857] Fix predicate mismatch between vfcmaddcph's define_insn and define_expand.

2024-05-27 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:c65002347e595cda8b15e59e734d209283faf2b6

commit r15-857-gc65002347e595cda8b15e59e734d209283faf2b6
Author: liuhongt 
Date:   Tue May 28 10:32:12 2024 +0800

Fix predicate mismatch between vfcmaddcph's define_insn and define_expand.

When I applied Roger's patch [1], there was an ICE due to it.
The patch fixes the latent bug.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651365.html

gcc/ChangeLog:

* config/i386/sse.md
(___mask): Align
operands' predicate with corresponding expander.
(__):
Ditto.

Diff:
---
 gcc/config/i386/sse.md | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b59c988fc31..0f4fbcb2c5d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6867,9 +6867,9 @@
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=&v")
(vec_merge:VHF_AVX512VL
  (unspec:VHF_AVX512VL
-   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
-(match_operand:VHF_AVX512VL 2 "nonimmediate_operand" 
"")
-(match_operand:VHF_AVX512VL 3 "register_operand" "0")]
+   [(match_operand:VHF_AVX512VL 1 "" 
"v")
+(match_operand:VHF_AVX512VL 2 "" 
"")
+(match_operand:VHF_AVX512VL 3 "" "0")]
 UNSPEC_COMPLEX_F_C_MA)
  (match_dup 1)
  (unspec:
@@ -6892,8 +6892,8 @@
 (define_insn "__"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=&v")
  (unspec:VHF_AVX512VL
-   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
-(match_operand:VHF_AVX512VL 2 "nonimmediate_operand" 
"")]
+   [(match_operand:VHF_AVX512VL 1 "" 
"v")
+(match_operand:VHF_AVX512VL 2 "" 
"")]
 UNSPEC_COMPLEX_F_C_MUL))]
   "TARGET_AVX512FP16 && "
 {


[gcc r15-882] Reduce cost of MEM (A + imm).

2024-05-28 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:1d6199e5f8c1c08083eeb0279f71333234fe14ad

commit r15-882-g1d6199e5f8c1c08083eeb0279f71333234fe14ad
Author: liuhongt 
Date:   Mon Feb 19 13:57:24 2024 +0800

Reduce cost of MEM (A + imm).

For MEM, rtx_cost iterates each subrtx, and adds up the costs,
so for MEM (reg) and MEM (reg + 4), the former costs 5,
the latter costs 9, which is not accurate for x86. Ideally
address_cost should be used, but it reduces the cost too much.
So the current solution is to make a constant disp as cheap as possible.

gcc/ChangeLog:

PR target/67325
* config/i386/i386.cc (ix86_rtx_costs): Reduce cost of MEM (A
+ imm) to "cost of MEM (A)" + 1.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr67325.c: New test.

Diff:
---
 gcc/config/i386/i386.cc | 18 +-
 gcc/testsuite/gcc.target/i386/pr67325.c |  7 +++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3e2a3a194f1..85d87b9f778 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22194,7 +22194,23 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   /* An insn that accesses memory is slightly more expensive
  than one that does not.  */
   if (speed)
-*total += 1;
+   {
+ *total += 1;
+ rtx addr = XEXP (x, 0);
+ /* For MEM, rtx_cost iterates each subrtx, and adds up the costs,
+so for MEM (reg) and MEM (reg + 4), the former costs 5,
+the latter costs 9, it is not accurate for x86. Ideally
+address_cost should be used, but it reduce cost too much.
+So current solution is make constant disp as cheap as possible.  */
+ if (GET_CODE (addr) == PLUS
+ && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
+   {
+ *total += 1;
+ *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
+ return true;
+   }
+   }
+
   return false;
 
 case ZERO_EXTRACT:
diff --git a/gcc/testsuite/gcc.target/i386/pr67325.c 
b/gcc/testsuite/gcc.target/i386/pr67325.c
new file mode 100644
index 000..c3c1e4c5b4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr67325.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "(?:sar|shr)" } } */
+
+int f(long*l){
+  return *l>>32;
+}


[gcc r15-919] Don't reduce estimated unrolled size for innermost loop.

2024-05-29 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:ef27b91b62c3aa8841c02665dffa8914c742fd37

commit r15-919-gef27b91b62c3aa8841c02665dffa8914c742fd37
Author: liuhongt 
Date:   Tue Feb 27 15:34:57 2024 +0800

Don't reduce estimated unrolled size for innermost loop.

For the innermost loop, complete unrolling will most likely
not be able to reduce the body size to 2/3. The current 2/3 reduction
will make some of the larger loops completely unrolled during
cunrolli, which will then result in them not being able to be
vectorized. It also increases the register pressure.

The patch moves the 2/3 reduction from estimated_unrolled_size to
try_unroll_loop_completely.

gcc/ChangeLog:

PR tree-optimization/112325
* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
2 / 3 loop body size reduction to ..
(try_unroll_loop_completely): .. here, add it for the check of
body size shrink, and the check of comparison against
param_max_completely_peeled_insns when
(!cunrolli ||loop->inner).
(canonicalize_loop_induction_variables): Add new parameter
cunrolli and pass down.
(tree_unroll_loops_completely_1): Ditto.
(canonicalize_induction_variables): Pass cunrolli as false to
canonicalize_loop_induction_variables.
(tree_unroll_loops_completely): Set cunrolli to true at
beginning and set it to false after CHANGED is true.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr112325.c: New test.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr112325.c | 59 
 gcc/tree-ssa-loop-ivcanon.cc | 49 --
 2 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c 
b/gcc/testsuite/gcc.dg/vect/pr112325.c
new file mode 100644
index 000..71cf4099253
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr112325.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+unsigned short s;
+__builtin_memcpy(&s, &f, sizeof(unsigned short));
+return table_f32_f16[s];
+}
+
+typedef struct {
+ggml_fp16_t d;
+ggml_fp16_t m;
+unsigned char qh[4];
+unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+float d;
+float s;
+char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * 
restrict vx, const void * restrict vy) {
+const int qk = 32;
+const int nb = n / qk;
+
+const block_q5_1 * restrict x = vx;
+const block_q8_1 * restrict y = vy;
+
+float sumf = 0.0;
+
+for (int i = 0; i < nb; i++) {
+unsigned qh;
+__builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+int sumi = 0;
+
+for (int j = 0; j < qk/2; ++j) {
+const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+}
+
+sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + 
ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+}
+
+*s = sumf;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf017137260..5ef24a91917 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge 
edge_to_cancel,
It is (NUNROLL + 1) * size of loop body with taking into account
the fact that in last copy everything after exit conditional
is dead and that some instructions will be eliminated after
-   peeling.
-
-   Loop body is likely going to simplify further, this is difficult
-   to guess, we just decrease the result by 1/3.  */
-
+   peeling.  */
 static unsigned HOST_WIDE_INT
 estimated_unrolled_size (struct loop_size *size,
 unsigned HOST_WIDE_INT nunroll)
@@ -453,10 +449,6 @@ estimated_unrolled_size (struct loop_size *size,
 unr_insns = 0;
   unr_insns += size->last_iteration - 
size->last_iteration_eliminated_by_peeling;
 
-  unr_insns = unr_insns * 2 / 3;
-  if (unr_insns <= 0)
-unr_insns = 1;
-
   return unr_insns;
 }
 
@@ -734,7 +726,8 @@ try_unroll_loop_completely (class loop *loop,
edge exit, tree niter, bool may_be_zero,
enum unroll_level ul,
  

[gcc r15-920] Support vcond_mask_qiqi and friends.

2024-05-29 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:b6c6d5abf0d31c936f50f8f9073c5e335b9e24b7

commit r15-920-gb6c6d5abf0d31c936f50f8f9073c5e335b9e24b7
Author: liuhongt 
Date:   Wed Feb 28 11:17:10 2024 +0800

Support vcond_mask_qiqi and friends.

gcc/ChangeLog:

* config/i386/sse.md (vcond_mask_<mode><mode>): New expander.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pr114125.c: New test.

Diff:
---
 gcc/config/i386/sse.md   | 20 
 gcc/testsuite/gcc.target/i386/pr114125.c | 10 ++
 2 files changed, 30 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0f4fbcb2c5d..7cd912eeeb1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4807,6 +4807,26 @@
   DONE;
 })
 
+(define_expand "vcond_mask_"
+  [(match_operand:SWI1248_AVX512BW 0 "register_operand")
+   (match_operand:SWI1248_AVX512BW 1 "register_operand")
+   (match_operand:SWI1248_AVX512BW 2 "register_operand")
+   (match_operand:SWI1248_AVX512BW 3 "register_operand")]
+  "TARGET_AVX512F"
+{
+  /* (operand[1] & operand[3]) | (operand[2] & ~operand[3])  */
+  rtx op1 = gen_reg_rtx (mode);
+  rtx op2 = gen_reg_rtx (mode);
+  rtx op3 = gen_reg_rtx (mode);
+
+  emit_insn (gen_and3 (op1, operands[1], operands[3]));
+  emit_insn (gen_one_cmpl2 (op3, operands[3]));
+  emit_insn (gen_and3 (op2, operands[2], op3));
+  emit_insn (gen_ior3 (operands[0], op1, op2));
+
+  DONE;
+})
+
 ;
 ;;
 ;; Parallel floating point logical operations
diff --git a/gcc/testsuite/gcc.target/i386/pr114125.c 
b/gcc/testsuite/gcc.target/i386/pr114125.c
new file mode 100644
index 000..e63fbffe965
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114125.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -fdump-tree-forwprop3-raw " } */
+
+typedef long vec __attribute__((vector_size(16)));
+vec f(vec x){
+  vec y = x < 10;
+  return y & (y == 0);
+}
+
+/* { dg-final { scan-tree-dump-not "_expr" "forwprop3" } } */


[gcc r15-932] Rename double_u with __double_u to avoid polluting the namespace.

2024-05-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:3a873c0a7bc8183de95a6103b507101a25eed413

commit r15-932-g3a873c0a7bc8183de95a6103b507101a25eed413
Author: liuhongt 
Date:   Thu May 30 14:15:48 2024 +0800

Rename double_u with __double_u to avoid polluting the namespace.

gcc/ChangeLog:

* config/i386/emmintrin.h (__double_u): Rename from double_u.
(_mm_load_sd): Replace double_u with __double_u.
(_mm_store_sd): Ditto.
(_mm_loadh_pd): Ditto.
(_mm_loadl_pd): Ditto.
* config/i386/xmmintrin.h (__float_u): Rename from float_u.
(_mm_load_ss): Replace float_u with __float_u.
(_mm_store_ss): Ditto.

Diff:
---
 gcc/config/i386/emmintrin.h | 10 +-
 gcc/config/i386/xmmintrin.h |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index fa301103daf..356ca218fcb 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -56,7 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), 
__may_alias__));
 /* Unaligned version of the same types.  */
 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), 
__may_alias__, __aligned__ (1)));
 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef double double_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef double __double_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
@@ -146,7 +146,7 @@ _mm_load1_pd (double const *__P)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_load_sd (double const *__P)
 {
-  return __extension__ (__m128d) { *(double_u *)__P, 0.0 };
+  return __extension__ (__m128d) { *(__double_u *)__P, 0.0 };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -181,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *(double_u *)__P = ((__v2df)__A)[0] ;
+  *(__double_u *)__P = ((__v2df)__A)[0] ;
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -974,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { ((__v2df)__A)[0], *(double_u*)__B };
+  return __extension__ (__m128d) { ((__v2df)__A)[0], *(__double_u*)__B };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { *(double_u*)__B, ((__v2df)__A)[1] };
+  return __extension__ (__m128d) { *(__double_u*)__B, ((__v2df)__A)[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 87515ecb218..c90fc71331a 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -72,7 +72,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), 
__may_alias__));
 
 /* Unaligned version of the same type.  */
 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef float float_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef float __float_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Internal data types for implementing the intrinsics.  */
 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
@@ -910,7 +910,7 @@ _mm_set_ps1 (float __F)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_load_ss (float const *__P)
 {
-  return __extension__ (__m128) (__v4sf){ *(float_u *)__P, 0.0f, 0.0f, 0.0f };
+  return __extension__ (__m128) (__v4sf){ *(__float_u *)__P, 0.0f, 0.0f, 0.0f 
};
 }
 
 /* Create a vector with all four elements equal to *P.  */
@@ -966,7 +966,7 @@ _mm_setr_ps (float __Z, float __Y, float __X, float __W)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_store_ss (float *__P, __m128 __A)
 {
-  *(float_u *)__P = ((__v4sf)__A)[0];
+  *(__float_u *)__P = ((__v4sf)__A)[0];
 }
 
 extern __inline float __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))


[gcc r15-984] Add some preference for floating point rtl ifcvt when sse4.1 is not available

2024-06-03 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:ac306de7d5100d3682eae2270995a9abbe19db38

commit r15-984-gac306de7d5100d3682eae2270995a9abbe19db38
Author: liuhongt 
Date:   Fri May 31 14:38:07 2024 +0800

Add some preference for floating point rtl ifcvt when sse4.1 is not 
available

W/o TARGET_SSE4_1, it takes 3 instructions (pand, pandn and por) for
movdfcc/movsfcc, which could fail the cost comparison. Increasing the
branch cost could hurt performance for other modes, so add a specific
preference for floating point ifcvt instead.

gcc/ChangeLog:

PR target/115299
* config/i386/i386.cc (ix86_noce_conversion_profitable_p): Add
some preference for floating point ifcvt when SSE4.1 is not
available.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115299.c: New test.
* gcc.target/i386/pr86722.c: Adjust testcase.

Diff:
---
 gcc/config/i386/i386.cc  | 17 +
 gcc/testsuite/gcc.target/i386/pr115299.c | 10 ++
 gcc/testsuite/gcc.target/i386/pr86722.c  |  2 +-
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1a0206ab573..271da127a89 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24879,6 +24879,23 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, 
struct noce_if_info *if_info)
return false;
}
 }
+
+  /* W/o TARGET_SSE4_1, it takes 3 instructions (pand, pandn and por)
+ for movdfcc/movsfcc, and could possibly fail cost comparison.
+ Increase branch cost will hurt performance for other modes, so
+ specially add some preference for floating point ifcvt.  */
+  if (!TARGET_SSE4_1 && if_info->x
+  && GET_MODE_CLASS (GET_MODE (if_info->x)) == MODE_FLOAT
+  && if_info->speed_p)
+{
+  unsigned cost = seq_cost (seq, true);
+
+  if (cost <= if_info->original_cost)
+   return true;
+
+  return cost <= (if_info->max_seq_cost + COSTS_N_INSNS (2));
+}
+
   return default_noce_conversion_profitable_p (seq, if_info);
 }
 
diff --git a/gcc/testsuite/gcc.target/i386/pr115299.c 
b/gcc/testsuite/gcc.target/i386/pr115299.c
new file mode 100644
index 000..53c5899136a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115299.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-sse4.1 -msse2" } */
+
+void f(double*d,double*e){
+  for(;d

[gcc r15-1003] Adjust testcase for -march=cascadelake

2024-06-03 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:4d207044195b97ecb27c72a7dc987eb8b86644a0

commit r15-1003-g4d207044195b97ecb27c72a7dc987eb8b86644a0
Author: liuhongt 
Date:   Tue Jun 4 10:13:09 2024 +0800

Adjust testcase for -march=cascadelake

gcc/testsuite/ChangeLog:

PR target/115299
* gcc.target/i386/pr86722.c: Also scan for blendvpd.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr86722.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr86722.c 
b/gcc/testsuite/gcc.target/i386/pr86722.c
index e266a1e56c2..95ddbd8ddb9 100644
--- a/gcc/testsuite/gcc.target/i386/pr86722.c
+++ b/gcc/testsuite/gcc.target/i386/pr86722.c
@@ -6,5 +6,5 @@ void f(double*d,double*e){
 *d=(*d<.5)?.7:0;
 }
 
-/* { dg-final { scan-assembler-times {(?n)(?:andnpd|andpd)} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)(?:andnpd|andpd|blendvpd)} 1 } } */
 /* { dg-final { scan-assembler-not "orpd" } } */


[gcc r15-1022] Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.

2024-06-04 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:b05288d1f1e4b632eddf8830b4369d4659f6c2ff

commit r15-1022-gb05288d1f1e4b632eddf8830b4369d4659f6c2ff
Author: liuhongt 
Date:   Tue May 21 16:57:17 2024 +0800

Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.

According to the IEEE standard, for conversions from floating point to
integer: when a NaN or infinite operand cannot be represented in the
destination format and this cannot otherwise be indicated, the invalid
operation exception shall be signaled. When a numeric operand would
convert to an integer outside the range of the destination format, the
invalid operation exception shall be signaled if this situation cannot
otherwise be indicated.

The patch prevents simplification of the conversion from floating point
to integer for NAN/INF/out-of-range constants when flag_trapping_math.

gcc/ChangeLog:

PR rtl-optimization/100927
PR rtl-optimization/115161
PR rtl-optimization/115115
* simplify-rtx.cc (simplify_const_unary_operation): Prevent
simplification of FIX/UNSIGNED_FIX for NAN/INF/out-of-range
constant when flag_trapping_math.
* fold-const.cc (fold_convert_const_int_from_real): Don't fold
for overflow value when flag_trapping_math.

gcc/testsuite/ChangeLog:

* gcc.dg/pr100927.c: New test.
* c-c++-common/Wconversion-1.c: Add -fno-trapping-math.
* c-c++-common/dfp/convert-int-saturate.c: Ditto.
* g++.dg/ubsan/pr63956.C: Ditto.
* g++.dg/warn/Wconversion-real-integer.C: Ditto.
* gcc.c-torture/execute/20031003-1.c: Ditto.
* gcc.dg/Wconversion-complex-c99.c: Ditto.
* gcc.dg/Wconversion-real-integer.c: Ditto.
* gcc.dg/c90-const-expr-11.c: Ditto.
* gcc.dg/overflow-warn-8.c: Ditto.

Diff:
---
 gcc/fold-const.cc  | 13 -
 gcc/simplify-rtx.cc| 23 +---
 gcc/testsuite/c-c++-common/Wconversion-1.c |  2 +-
 .../c-c++-common/dfp/convert-int-saturate.c|  1 +
 gcc/testsuite/g++.dg/ubsan/pr63956.C   |  7 -
 .../g++.dg/warn/Wconversion-real-integer.C |  2 +-
 gcc/testsuite/gcc.c-torture/execute/20031003-1.c   |  2 ++
 gcc/testsuite/gcc.dg/Wconversion-complex-c99.c |  2 +-
 gcc/testsuite/gcc.dg/Wconversion-real-integer.c|  2 +-
 gcc/testsuite/gcc.dg/c90-const-expr-11.c   |  2 +-
 gcc/testsuite/gcc.dg/overflow-warn-8.c |  1 +
 gcc/testsuite/gcc.dg/pr100927.c| 31 ++
 12 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 92b048c307e..710d697c021 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -2246,7 +2246,18 @@ fold_convert_const_int_from_real (enum tree_code code, 
tree type, const_tree arg
   if (! overflow)
 val = real_to_integer (&r, &overflow, TYPE_PRECISION (type));
 
-  t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1));
+  /* According to IEEE standard, for conversions from floating point to
+ integer. When a NaN or infinite operand cannot be represented in the
+ destination format and this cannot otherwise be indicated, the invalid
+ operation exception shall be signaled. When a numeric operand would
+ convert to an integer outside the range of the destination format, the
+ invalid operation exception shall be signaled if this situation cannot
+ otherwise be indicated.  */
+  if (!flag_trapping_math || !overflow)
+t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1));
+  else
+t = NULL_TREE;
+
   return t;
 }
 
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 5caf1dfd957..f6b4d73b593 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -2256,14 +2256,25 @@ simplify_const_unary_operation (enum rtx_code code, 
machine_mode mode,
   switch (code)
{
case FIX:
+ /* According to IEEE standard, for conversions from floating point to
+integer. When a NaN or infinite operand cannot be represented in
+the destination format and this cannot otherwise be indicated, the
+invalid operation exception shall be signaled. When a numeric
+operand would convert to an integer outside the range of the
+destination format, the invalid operation exception shall be
+signaled if this situation cannot otherwise be indicated.  */
  if (REAL_VALUE_ISNAN (*x))
-   return const0_rtx;
+   return flag_trapping_math ? NULL_RTX : const0_rtx;
+
+ if (REAL_VALUE_ISINF (*x) && flag_trapping_math)
+   return NULL_RTX;
 
  /* Test against the signed upper bound.  */
  wmax = wi::max_value (width, SIGNED);
  real_from_integer (&t, VOIDmode, wmax

[gcc r15-1047] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

2024-06-05 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:7876cde25cbd2f026a0ae488e5263e72f8e9bfa0

commit r15-1047-g7876cde25cbd2f026a0ae488e5263e72f8e9bfa0
Author: liuhongt 
Date:   Fri Apr 19 10:29:34 2024 +0800

Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

When the mask is ((1 << (prec - imm)) - 1), i.e. it clears the upper imm
bits of A, the AND of the arithmetic shift can be simplified to LSHIFTRT.

i.e. simplify
(and:v8hi
  (ashiftrt:v8hi A 8)
  (const_vector 0xff x8))
to
(lshiftrt:v8hi A 8)

gcc/ChangeLog:

PR target/114428
* simplify-rtx.cc
(simplify_context::simplify_binary_operation_1):
Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
specific mask.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114428-1.c: New test.

Diff:
---
 gcc/simplify-rtx.cc| 25 +++
 gcc/testsuite/gcc.target/i386/pr114428-1.c | 39 ++
 2 files changed, 64 insertions(+)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index f6b4d73b593..9bc3ef9ad9f 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4065,6 +4065,31 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
return tem;
}
 
+  /* (and:v4si
+  (ashiftrt:v4si A 16)
+  (const_vector: 0x x4))
+is just (lshiftrt:v4si A 16).  */
+  if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
+ && (CONST_INT_P (XEXP (op0, 1))
+ || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1
+ && GET_CODE (op1) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (op1))
+   {
+ unsigned HOST_WIDE_INT shift_count
+   = (CONST_INT_P (XEXP (op0, 1))
+  ? UINTVAL (XEXP (op0, 1))
+  : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0)));
+ unsigned HOST_WIDE_INT inner_prec
+   = GET_MODE_PRECISION (GET_MODE_INNER (mode));
+
+ /* Avoid UD shift count.  */
+ if (shift_count < inner_prec
+ && (UINTVAL (XVECEXP (op1, 0, 0))
+ == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1))
+   return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP 
(op0, 1));
+   }
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
diff --git a/gcc/testsuite/gcc.target/i386/pr114428-1.c 
b/gcc/testsuite/gcc.target/i386/pr114428-1.c
new file mode 100644
index 000..927476f2269
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114428-1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "psrld" 1 } } */
+/* { dg-final { scan-assembler-times "psrlq" 1 { target { ! ia32 } } } } */
+
+
+#define SHIFTC 12
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+
+v8hi
+foo1 (v8hi a)
+{
+  return
+(a >> (16 - SHIFTC)) & (__extension__(v8hi){(1<> (32 - SHIFTC)) & (__extension__(v4si){(1<> (long long)(64 - SHIFTC)) & (__extension__(v2di){(1ULL<

[gcc r15-1048] Adjust rtx_cost for MEM to enable more simplification

2024-06-05 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:961dd0d635217c703a38c48903981e0d60962546

commit r15-1048-g961dd0d635217c703a38c48903981e0d60962546
Author: liuhongt 
Date:   Fri Apr 19 10:39:53 2024 +0800

Adjust rtx_cost for MEM to enable more simplification

A CONST_VECTOR_DUPLICATE_P constant in the constant pool is just a
broadcast, or one of the variants in ix86_vector_duplicate_simode_const.
Adjust its cost to COSTS_N_INSNS (2) + speed, which should be a little
bit larger than the cost of a broadcast.

gcc/ChangeLog:
PR target/114428
* config/i386/i386.cc (ix86_rtx_costs): Adjust cost for
CONST_VECTOR_DUPLICATE_P in constant_pool.
* config/i386/i386-expand.cc (ix86_broadcast_from_constant):
Remove static.
* config/i386/i386-protos.h (ix86_broadcast_from_constant):
Declare.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114428.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc   |  2 +-
 gcc/config/i386/i386-protos.h|  1 +
 gcc/config/i386/i386.cc  | 13 +
 gcc/testsuite/gcc.target/i386/pr114428.c | 18 ++
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 215a998fc26..56d29c15f9a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -588,7 +588,7 @@ ix86_expand_move (machine_mode mode, rtx operands[])
 
 /* OP is a memref of CONST_VECTOR, return scalar constant mem
if CONST_VECTOR is a vec_duplicate, else return NULL.  */
-static rtx
+rtx
 ix86_broadcast_from_constant (machine_mode mode, rtx op)
 {
   int nunits = GET_MODE_NUNITS (mode);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index dbc861fb1ea..90712769200 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -107,6 +107,7 @@ extern void ix86_expand_clear (rtx);
 extern void ix86_expand_move (machine_mode, rtx[]);
 extern void ix86_expand_vector_move (machine_mode, rtx[]);
 extern void ix86_expand_vector_move_misalign (machine_mode, rtx[]);
+extern rtx ix86_broadcast_from_constant (machine_mode, rtx);
 extern rtx ix86_fixup_binary_operands (enum rtx_code, machine_mode,
   rtx[], bool = false);
 extern void ix86_fixup_binary_operands_no_copy (enum rtx_code, machine_mode,
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 271da127a89..a9d62c84c52 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22191,6 +22191,19 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   return true;
 
 case MEM:
+  /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast.
+or variants in ix86_vector_duplicate_simode_const.  */
+
+  if (GET_MODE_SIZE (mode) >= 16
+ && VECTOR_MODE_P (mode)
+ && SYMBOL_REF_P (XEXP (x, 0))
+ && CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))
+ && ix86_broadcast_from_constant (mode, x))
+   {
+ *total = COSTS_N_INSNS (2) + speed;
+ return true;
+   }
+
   /* An insn that accesses memory is slightly more expensive
  than one that does not.  */
   if (speed)
diff --git a/gcc/testsuite/gcc.target/i386/pr114428.c 
b/gcc/testsuite/gcc.target/i386/pr114428.c
new file mode 100644
index 000..bbbc5a080f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114428.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -mno-avx512f -O2" } */
+/* { dg-final { scan-assembler-not "vpsra[dw]" } } */
+
+void
+foo2 (char* __restrict a, short* b)
+{
+  for (int i = 0; i != 32; i++)
+a[i] = b[i] >> (short)8;
+}
+
+void
+foo3 (char* __restrict a, short* b)
+{
+  for (int i = 0; i != 16; i++)
+a[i] = b[i] >> (short)8;
+}
+


[gcc r15-1050] Refine testcase for power10.

2024-06-05 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:fcfce55c85f842ed843cbc4aabe744c6a004dead

commit r15-1050-gfcfce55c85f842ed843cbc4aabe744c6a004dead
Author: liuhongt 
Date:   Thu Jun 6 11:27:53 2024 +0800

Refine testcase for power10.

On power10 there are 3 extra REG_EQUIV notes containing (fix:SI, which
makes the scan fail.  To avoid the failure, check that (fix:SI comes from
the insn pattern and not from a note.

gcc/testsuite/ChangeLog:

PR target/115365
* gcc.dg/pr100927.c: Don't scan fix:SI from the note.

Diff:
---
 gcc/testsuite/gcc.dg/pr100927.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/pr100927.c b/gcc/testsuite/gcc.dg/pr100927.c
index ea0e627befa..8a7d69c3831 100644
--- a/gcc/testsuite/gcc.dg/pr100927.c
+++ b/gcc/testsuite/gcc.dg/pr100927.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -ftrapping-math -fdump-tree-optimized -fdump-rtl-final" } 
*/
 /* { dg-final { scan-tree-dump-times {(?n)= \(int\)} 3 "optimized" } }  */
-/* { dg-final { scan-rtl-dump-times {(?n)\(fix:SI} 3 "final" } }  */
+/* { dg-final { scan-rtl-dump-times {(?n)^[ \t]*\(fix:SI} 3 "final" } }  */
 
 int
 foo_ofr ()


[gcc r15-1088] Add additional option --param max-completely-peeled-insns=200 for powerpc64*-*-*

2024-06-06 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:b24f2954dbc13d85e9fb62e05a88e9df21e4d4f4

commit r15-1088-gb24f2954dbc13d85e9fb62e05a88e9df21e4d4f4
Author: liuhongt 
Date:   Fri Jun 7 09:29:24 2024 +0800

Add additional option --param max-completely-peeled-insns=200 for
powerpc64*-*-*

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr112325.c: Add additional option --param
max-completely-peeled-insns=200 for powerpc64*-*-*.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr112325.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c 
b/gcc/testsuite/gcc.dg/vect/pr112325.c
index dea6cca3b86..143903beab2 100644
--- a/gcc/testsuite/gcc.dg/vect/pr112325.c
+++ b/gcc/testsuite/gcc.dg/vect/pr112325.c
@@ -3,6 +3,7 @@
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 /* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+/* { dg-additional-options "--param max-completely-peeled-insns=200" { target 
powerpc64*-*-* } } */
 
 typedef unsigned short ggml_fp16_t;
 static float table_f32_f16[1 << 16];


[gcc r13-8825] Disable FMADD in chains for Zen4 and generic

2024-06-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:e4f85ea6271a10e13c6874709a05e04ab0508fbf

commit r13-8825-ge4f85ea6271a10e13c6874709a05e04ab0508fbf
Author: Jan Hubicka 
Date:   Fri Dec 29 23:51:03 2023 +0100

Disable FMADD in chains for Zen4 and generic

This patch disables use of FMA in the matrix multiplication loop for generic
(for x86-64-v3) and zen4.  I tested this on zen4 and Xeon Gold 6212U.

For Intel this is neutral both on the matrix multiplication microbenchmark
(attached) and spec2k17 where the difference was within noise for Core.

On core the micro-benchmark runs as follows:

With FMA:

   578,500,241  cycles:u #3.645 GHz
( +-  0.12% )
   753,318,477  instructions:u   #1.30  insn per
cycle  ( +-  0.00% )
   125,417,701  branches:u   #  790.227 M/sec
( +-  0.00% )
  0.159146 +- 0.000363 seconds time elapsed  ( +-  0.23% )

No FMA:

   577,573,960  cycles:u #3.514 GHz
( +-  0.15% )
   878,318,479  instructions:u   #1.52  insn per
cycle  ( +-  0.00% )
   125,417,702  branches:u   #  763.035 M/sec
( +-  0.00% )
  0.164734 +- 0.000321 seconds time elapsed  ( +-  0.19% )

So the cycle count is unchanged and discrete multiply+add takes same time as
FMA.

While on zen:

With FMA:
 484875179  cycles:u #3.599 GHz
 ( +-  0.05% )  (82.11%)
 752031517  instructions:u   #1.55  insn per
cycle
 125106525  branches:u   #  928.712 M/sec
 ( +-  0.03% )  (85.09%)
128356  branch-misses:u  #0.10% of all
branches  ( +-  0.06% )  (83.58%)

No FMA:
 375875209  cycles:u #3.592 GHz
 ( +-  0.08% )  (80.74%)
 875725341  instructions:u   #2.33  insn per
cycle
 124903825  branches:u   #1.194 G/sec
 ( +-  0.04% )  (84.59%)
  0.105203 +- 0.000188 seconds time elapsed  ( +-  0.18% )

The difference is that Core CPUs understand the fact that fmadd does not need
all three parameters to start computation, while Zen cores don't.

Since this seems to be a noticeable win on Zen and no loss on Core, it seems
like a good default for generic.

float a[SIZE][SIZE];
float b[SIZE][SIZE];
float c[SIZE][SIZE];

void init(void)
{
   int i, j, k;
   for(i=0; i

[gcc r12-10497] Disable FMADD in chains for Zen4 and generic

2024-06-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:5d52558a531130675329d72ca5c4713abf5bf885

commit r12-10497-g5d52558a531130675329d72ca5c4713abf5bf885
Author: Jan Hubicka 
Date:   Fri Dec 29 23:51:03 2023 +0100

Disable FMADD in chains for Zen4 and generic

This patch disables use of FMA in the matrix multiplication loop for generic
(for x86-64-v3) and zen4.  I tested this on zen4 and Xeon Gold 6212U.

For Intel this is neutral both on the matrix multiplication microbenchmark
(attached) and spec2k17 where the difference was within noise for Core.

On core the micro-benchmark runs as follows:

With FMA:

   578,500,241  cycles:u #3.645 GHz
( +-  0.12% )
   753,318,477  instructions:u   #1.30  insn per
cycle  ( +-  0.00% )
   125,417,701  branches:u   #  790.227 M/sec
( +-  0.00% )
  0.159146 +- 0.000363 seconds time elapsed  ( +-  0.23% )

No FMA:

   577,573,960  cycles:u #3.514 GHz
( +-  0.15% )
   878,318,479  instructions:u   #1.52  insn per
cycle  ( +-  0.00% )
   125,417,702  branches:u   #  763.035 M/sec
( +-  0.00% )
  0.164734 +- 0.000321 seconds time elapsed  ( +-  0.19% )

So the cycle count is unchanged and discrete multiply+add takes same time as
FMA.

While on zen:

With FMA:
 484875179  cycles:u #3.599 GHz
 ( +-  0.05% )  (82.11%)
 752031517  instructions:u   #1.55  insn per
cycle
 125106525  branches:u   #  928.712 M/sec
 ( +-  0.03% )  (85.09%)
128356  branch-misses:u  #0.10% of all
branches  ( +-  0.06% )  (83.58%)

No FMA:
 375875209  cycles:u #3.592 GHz
 ( +-  0.08% )  (80.74%)
 875725341  instructions:u   #2.33  insn per
cycle
 124903825  branches:u   #1.194 G/sec
 ( +-  0.04% )  (84.59%)
  0.105203 +- 0.000188 seconds time elapsed  ( +-  0.18% )

The difference is that Core CPUs understand the fact that fmadd does not need
all three parameters to start computation, while Zen cores don't.

Since this seems to be a noticeable win on Zen and no loss on Core, it seems
like a good default for generic.

float a[SIZE][SIZE];
float b[SIZE][SIZE];
float c[SIZE][SIZE];

void init(void)
{
   int i, j, k;
   for(i=0; i

[gcc r15-1191] Fix ICE in rtl check due to CONST_WIDE_INT in CONST_VECTOR_DUPLICATE_P

2024-06-11 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:1d496d2cd1d5d8751a1637abca89339d6f9ddd3b

commit r15-1191-g1d496d2cd1d5d8751a1637abca89339d6f9ddd3b
Author: liuhongt 
Date:   Tue Jun 11 10:23:27 2024 +0800

Fix ICE in rtl check due to CONST_WIDE_INT in CONST_VECTOR_DUPLICATE_P

The patch adds an extra check to make sure the component of the
CONST_VECTOR is CONST_INT_P.

gcc/ChangeLog:

PR target/115384
* simplify-rtx.cc (simplify_context::simplify_binary_operation_1):
Only do the simplification of (AND (ASHIFTRT A imm) mask)
to (LSHIFTRT A imm) when the component of const_vector is
CONST_INT_P.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115384.c: New test.

Diff:
---
 gcc/simplify-rtx.cc  |  6 --
 gcc/testsuite/gcc.target/i386/pr115384.c | 12 
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 9bc3ef9ad9fd..3ee95f74d3db 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4072,9 +4072,11 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
   if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
  && (CONST_INT_P (XEXP (op0, 1))
  || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
- && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1
+ && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1))
+ && CONST_INT_P (XVECEXP (XEXP (op0, 1), 0, 0
  && GET_CODE (op1) == CONST_VECTOR
- && CONST_VECTOR_DUPLICATE_P (op1))
+ && CONST_VECTOR_DUPLICATE_P (op1)
+ && CONST_INT_P (XVECEXP (op1, 0, 0)))
{
  unsigned HOST_WIDE_INT shift_count
= (CONST_INT_P (XEXP (op0, 1))
diff --git a/gcc/testsuite/gcc.target/i386/pr115384.c 
b/gcc/testsuite/gcc.target/i386/pr115384.c
new file mode 100644
index ..31dd6f4eb18a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115384.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O" } */
+
+typedef __attribute__((__vector_size__(sizeof(__int128 __int128 W;
+
+W w;
+
+void
+foo()
+{
+  w = w >> 4 & 18446744073709551600llu;
+}


[gcc r15-1234] Fix ICE due to REGNO of a SUBREG.

2024-06-12 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:f8bf80a4e1682b2238baad8c44939682f96b1fe0

commit r15-1234-gf8bf80a4e1682b2238baad8c44939682f96b1fe0
Author: liuhongt 
Date:   Thu Jun 13 09:53:58 2024 +0800

Fix ICE due to REGNO of a SUBREG.

Use reg_or_subregno instead.

gcc/ChangeLog:

PR target/115452
* config/i386/i386-features.cc (scalar_chain::convert_op): Use
reg_or_subregno instead of REGNO to avoid ICE.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115452.c: New test.

Diff:
---
 gcc/config/i386/i386-features.cc | 2 +-
 gcc/testsuite/gcc.target/i386/pr115452.c | 4 
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index e3e004d55267..607d19914606 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1054,7 +1054,7 @@ scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 
   if (dump_file)
fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
-INSN_UID (insn), REGNO (tmp));
+INSN_UID (insn), reg_or_subregno (tmp));
 }
   else if (REG_P (*op))
 *op = gen_rtx_SUBREG (vmode, *op, 0);
diff --git a/gcc/testsuite/gcc.target/i386/pr115452.c 
b/gcc/testsuite/gcc.target/i386/pr115452.c
new file mode 100644
index ..6c7935feb9f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115452.c
@@ -0,0 +1,4 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -msse2 -mstv -mno-bmi -mno-stackrealign -fdump-rtl-stv2" 
} */
+
+#include "pr70322-2.c"


[gcc r15-1307] Remove one_if_conv for latest Intel processors.

2024-06-13 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:8b69efd9819f86b973d7a550e987ce455fce6d62

commit r15-1307-g8b69efd9819f86b973d7a550e987ce455fce6d62
Author: liuhongt 
Date:   Mon Jun 3 10:38:19 2024 +0800

Remove one_if_conv for latest Intel processors.

The tune was added by PR79390 for SciMark2 on Broadwell.
With the latest GCC, the same binary is generated for SciMark2 with and
without -mtune-ctrl=^one_if_conv_insn.  For SPEC2017, there's no big
impact for SKX/CLX/ICX, and there are small improvements on SPR and
later.

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_ONE_IF_CONV_INSN): Remove
latest Intel processors.

Co-authored by: Lingling Kong 

Diff:
---
 gcc/config/i386/x86-tune.def | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 0fa1484b48d9..66512992b7b5 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -346,8 +346,8 @@ DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", 
m_BDVER3 | m_BDVER4)
 /* X86_TUNE_ONE_IF_CONV_INSNS: Restrict a number of cmov insns in
if-converted sequence to one.  */
 DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
- m_SILVERMONT | m_INTEL | m_CORE_ALL | m_GOLDMONT | m_GOLDMONT_PLUS
- | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_ZHAOXIN | m_GENERIC)
+ m_SILVERMONT | m_HASWELL | m_SKYLAKE | m_GOLDMONT | m_GOLDMONT_PLUS
+ | m_TREMONT  | m_ZHAOXIN)
 
 /* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence.  */
 DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",


[gcc r15-1308] Adjust ix86_rtx_costs for pternlog_operand_p.

2024-06-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:d3fae2bea034edb001cd45d1d86c5ceef146899b

commit r15-1308-gd3fae2bea034edb001cd45d1d86c5ceef146899b
Author: liuhongt 
Date:   Tue Jun 11 21:22:42 2024 +0800

Adjust ix86_rtx_costs for pternlog_operand_p.

r15-1100-gec985bc97a0157 improves handling of ternlog instructions,
now GCC can recognize lots of pternlog_operand with different
variants.

The patch adjusts rtx_costs for that, so that pass_combine can
reasonably generate more optimal vpternlog instructions.

E.g., for avx512f-vpternlogd-3.c, with the patch 2 vpternlogd instructions
are combined into one.

1532,1533c1526
<   vpternlogd  $168, %zmm1, %zmm0, %zmm2
<   vpternlogd  $0x55, %zmm2, %zmm2, %zmm2

>   vpternlogd  $87, %zmm1, %zmm0, %zmm2
1732,1733c1725,1726
<   vpand   %xmm0, %xmm1, %xmm0
<   vpternlogd  $0x55, %zmm0, %zmm0, %zmm0

>   vpternlogd  $63, %zmm1, %zmm0, %zmm1
>   vmovdqa %xmm1, %xmm0
1804,1805c1797
<   vpternlogd  $188, %zmm2, %zmm0, %zmm1
<   vpternlogd  $0x55, %zmm1, %zmm1, %zmm1

>   vpternlogd  $37, %zmm0, %zmm2, %zmm1

gcc/ChangeLog:

* config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for
pternlog_operand under AVX512, also adjust VEC_DUPLICATE
accordingly since vec_dup:mem can't be that cheap.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-pr98461.c: Scan either notl or
vpternlog.
* gcc.target/i386/avx512f-pr96891-3.c: Also scan for inversed
condition.
* gcc.target/i386/avx512f-vpternlogd-3.c: Adjust vpternlog
number to 673.
* gcc.target/i386/avx512f-vpternlogd-4.c: Ditto.
* gcc.target/i386/avx512f-vpternlogd-5.c: Ditto.
* gcc.target/i386/sse2-v1ti-vne.c: Add -mno-avx512f.

Diff:
---
 gcc/config/i386/i386.cc| 39 +-
 gcc/testsuite/gcc.target/i386/avx2-pr98461.c   |  2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c  |  2 +-
 .../gcc.target/i386/avx512f-vpternlogd-3.c |  2 +-
 .../gcc.target/i386/avx512f-vpternlogd-4.c |  2 +-
 .../gcc.target/i386/avx512f-vpternlogd-5.c |  2 +-
 gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c  |  2 +-
 7 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index c72f64da983d..d4ccc24be6ec 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21571,6 +21571,31 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 = speed ? ix86_tune_cost : &ix86_size_cost;
   int src_cost;
 
+  /* Handling different vternlog variants.  */
+  if ((GET_MODE_SIZE (mode) == 64
+   ? (TARGET_AVX512F && TARGET_EVEX512)
+   : (TARGET_AVX512VL
+ || (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)))
+  && GET_MODE_SIZE (mode) >= 16
+  && outer_code_i == SET
+  && ternlog_operand (x, mode))
+{
+  rtx args[3];
+
+  args[0] = NULL_RTX;
+  args[1] = NULL_RTX;
+  args[2] = NULL_RTX;
+  int idx = ix86_ternlog_idx (x, args);
+  gcc_assert (idx >= 0);
+
+  *total = cost->sse_op;
+  for (int i = 0; i != 3; i++)
+   if (args[i])
+ *total += rtx_cost (args[i], GET_MODE (args[i]), UNSPEC, i, speed);
+  return true;
+}
+
+
   switch (code)
 {
 case SET:
@@ -22233,6 +22258,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   else if (XINT (x, 1) == UNSPEC_VTERNLOG)
{
  *total = cost->sse_op;
+ *total += rtx_cost (XVECEXP (x, 0, 0), mode, code, 0, speed);
+ *total += rtx_cost (XVECEXP (x, 0, 1), mode, code, 1, speed);
+ *total += rtx_cost (XVECEXP (x, 0, 2), mode, code, 2, speed);
  return true;
}
   else if (XINT (x, 1) == UNSPEC_PTEST)
@@ -22260,12 +22288,21 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 
 case VEC_SELECT:
 case VEC_CONCAT:
-case VEC_DUPLICATE:
   /* ??? Assume all of these vector manipulation patterns are
 recognizable.  In which case they all pretty much have the
 same cost.  */
  *total = cost->sse_op;
  return true;
+case VEC_DUPLICATE:
+  *total = rtx_cost (XEXP (x, 0),
+GET_MODE (XEXP (x, 0)),
+VEC_DUPLICATE, 0, speed);
+  /* It's broadcast instruction, not embedded broadcasting.  */
+  if (outer_code == SET)
+   *total += cost->sse_op;
+
+ return true;
+
 case VEC_MERGE:
   mask = XEXP (x, 2);
   /* This is masked instruction, assume the same cost,
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr98461.c 
b/gcc/testsuite/gcc.target/i386/avx2-pr98461.c
index 15f49b864daa..225f2ab00e5f 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-pr9846

[gcc r15-1563] AVX-512: Pacify -Wshift-overflow=2. [PR115409]

2024-06-22 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:4c957d7ba84d8bbce6e778048f38e92ef71806c8

commit r15-1563-g4c957d7ba84d8bbce6e778048f38e92ef71806c8
Author: Collin Funk 
Date:   Mon Jun 10 06:36:47 2024 +

AVX-512: Pacify -Wshift-overflow=2. [PR115409]

Left-shifting 1 by 31 in a signed int is undefined behavior.  Since unsigned
int is 32 bits wide, making the constant unsigned fixes it and silences
the warning.

gcc/ChangeLog:

PR target/115409
* config/i386/avx512fp16intrin.h (_mm512_conj_pch): Make the
constant unsigned before shifting.
* config/i386/avx512fp16vlintrin.h (_mm256_conj_pch): Likewise.
(_mm_conj_pch): Likewise.

Signed-off-by: Collin Funk 

Diff:
---
 gcc/config/i386/avx512fp16intrin.h   | 2 +-
 gcc/config/i386/avx512fp16vlintrin.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/avx512fp16intrin.h 
b/gcc/config/i386/avx512fp16intrin.h
index f86050b2087..1869a920dd3 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -3355,7 +3355,7 @@ extern __inline __m512h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_conj_pch (__m512h __A)
 {
-  return (__m512h) _mm512_xor_epi32 ((__m512i) __A, _mm512_set1_epi32 (1<<31));
+  return (__m512h) _mm512_xor_epi32 ((__m512i) __A, _mm512_set1_epi32 
(1U<<31));
 }
 
 extern __inline __m512h
diff --git a/gcc/config/i386/avx512fp16vlintrin.h 
b/gcc/config/i386/avx512fp16vlintrin.h
index a1e1cb567ff..405a06bbb9e 100644
--- a/gcc/config/i386/avx512fp16vlintrin.h
+++ b/gcc/config/i386/avx512fp16vlintrin.h
@@ -181,7 +181,7 @@ extern __inline __m256h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_conj_pch (__m256h __A)
 {
-  return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_avx512_set1_epi32 
(1<<31));
+  return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_avx512_set1_epi32 
(1U<<31));
 }
 
 extern __inline __m256h
@@ -209,7 +209,7 @@ extern __inline __m128h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_conj_pch (__m128h __A)
 {
-  return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_avx512_set1_epi32 
(1<<31));
+  return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_avx512_set1_epi32 
(1U<<31));
 }
 
 extern __inline __m128h


[gcc r14-10782] Add new microarchitecture tune for SRF/GRR/CWF.

2024-10-13 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:fe0692f689a18c432d6f59f404d4cd020cbebef2

commit r14-10782-gfe0692f689a18c432d6f59f404d4cd020cbebef2
Author: liuhongt 
Date:   Tue Sep 24 15:53:14 2024 +0800

Add new microarchitecture tune for SRF/GRR/CWF.

For Crestmont, 4-operand VEX blendv instructions come from MSROM and
are slower than the 3-instruction sequence (op1 & mask) | (op2 & ~mask).
The legacy blendv instruction can still be handled by the decoder.

The patch adds a new tune which is enabled for all processors except
for SRF/CWF. It will use vpand + vpandn + vpor instead of
vpblendvb (similarly for vblendvps/vblendvpd) for SRF/CWF.

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Guard
instruction blendv generation under new tune.
* config/i386/i386.h (TARGET_SSE_MOVCC_USE_BLENDV): New Macro.
* config/i386/x86-tune.def (X86_TUNE_SSE_MOVCC_USE_BLENDV):
New tune.

(cherry picked from commit 9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc)

Diff:
---
 gcc/config/i386/i386-expand.cc | 24 +++---
 gcc/config/i386/i386.h |  2 ++
 gcc/config/i386/x86-tune.def   |  8 
 .../gcc.target/i386/sse_movcc_use_blendv.c | 12 +++
 4 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1a15388d9099..cad8b6d58842 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4222,23 +4222,23 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
   switch (mode)
 {
 case E_V2SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_mmx_blendvps;
   break;
 case E_V4SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvps;
   break;
 case E_V2DFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvpd;
   break;
 case E_SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvss;
   break;
 case E_DFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvsd;
   break;
 case E_V8QImode:
@@ -4246,7 +4246,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V4HFmode:
 case E_V4BFmode:
 case E_V2SImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_mmx_pblendvb_v8qi;
  blend_mode = V8QImode;
@@ -4256,14 +4256,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V2HImode:
 case E_V2HFmode:
 case E_V2BFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_mmx_pblendvb_v4qi;
  blend_mode = V4QImode;
}
   break;
 case E_V2QImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_mmx_pblendvb_v2qi;
   break;
 case E_V16QImode:
@@ -4273,18 +4273,18 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V4SImode:
 case E_V2DImode:
 case E_V1TImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_sse4_1_pblendvb;
  blend_mode = V16QImode;
}
   break;
 case E_V8SFmode:
-  if (TARGET_AVX)
+  if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
gen = gen_avx_blendvps256;
   break;
 case E_V4DFmode:
-  if (TARGET_AVX)
+  if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
gen = gen_avx_blendvpd256;
   break;
 case E_V32QImode:
@@ -4293,7 +4293,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V16BFmode:
 case E_V8SImode:
 case E_V4DImode:
-  if (TARGET_AVX2)
+  if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
{
  gen = gen_avx2_pblendvb;
  blend_mode = V32QImode;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 6272d40bcc53..89bc894fcd1c 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -461,6 +461,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
 #define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
 #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR]
+#define TARGET_SSE_MOVCC_USE_BLENDV \
+   ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-

[gcc r14-10783] Add a new tune avx256_avoid_vec_perm for SRF.

2024-10-13 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:9b7d5ecbecfbd193899648e411f1a9b2a77471e2

commit r14-10783-g9b7d5ecbecfbd193899648e411f1a9b2a77471e2
Author: liuhongt 
Date:   Wed Sep 25 13:11:11 2024 +0800

Add a new tune avx256_avoid_vec_perm for SRF.

According to Intel SOM[1], for Crestmont, most 256-bit Intel AVX2
instructions can be decomposed into two independent 128-bit
micro-operations, except for a subset of Intel AVX2 instructions,
known as cross-lane operations, which can only compute the result for
an element by utilizing one or more sources belonging to other elements.

The 256-bit instructions listed below use more operand sources than
can be natively supported by a single reservation station within these
microarchitectures. They are decomposed into two μops, where the first
μop resolves a subset of operand dependencies across two cycles. The
dependent second μop executes the 256-bit operation by using a single
128-bit execution port for two consecutive cycles with a five-cycle
latency for a total latency of seven cycles.

VPERM2I128 ymm1, ymm2, ymm3/m256, imm8
VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
VPERMPD ymm1, ymm2/m256, imm8
VPERMPS ymm1, ymm2, ymm3/m256
VPERMD ymm1, ymm2, ymm3/m256
VPERMQ ymm1, ymm2/m256, imm8

Instead of setting tune avx128_optimal for SRF, the patch adds a new
tune avx256_avoid_vec_perm for it. So by default, the vectorizer still
uses 256-bit VF if the cost is profitable, but lowers to 128-bit whenever
a 256-bit vec_perm is needed for auto-vectorization. Without vec_perm,
the performance of 256-bit vectorization should be similar to that of
128-bit vectorization (some benchmark results show it's even better,
since it enables more parallelism for convert cases).

[1] 
https://www.intel.com/content/www/us/en/content-details/814198/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::ix86_vector_costs):
Add new member m_num_avx256_vec_perm.
(ix86_vector_costs::add_stmt_cost): Record 256-bit vec_perm.
(ix86_vector_costs::finish_cost): Prevent vectorization for
TARGET_AVX256_AVOID_VEC_PERM when there's a 256-bit vec_perm
instruction.
* config/i386/i386.h (TARGET_AVX256_AVOID_VEC_PERM): New
Macro.
* config/i386/x86-tune.def (X86_TUNE_AVX256_SPLIT_REGS): Add
m_CORE_ATOM.
(X86_TUNE_AVX256_AVOID_VEC_PERM): New tune.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx256_avoid_vec_perm.c: New test.

(cherry picked from commit 9eaecce3d8c1d9349adbf8c2cdaf8d87672ed29c)

Diff:
---
 gcc/config/i386/i386.cc| 14 +-
 gcc/config/i386/i386.h |  2 ++
 gcc/config/i386/x86-tune.def   |  7 ++-
 .../gcc.target/i386/avx256_avoid_vec_perm.c| 22 ++
 4 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f8ab1893985c..48921d422cf8 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24829,12 +24829,15 @@ private:
  where we know it's not loaded from memory.  */
   unsigned m_num_gpr_needed[3];
   unsigned m_num_sse_needed[3];
+  /* Number of 256-bit vector permutation.  */
+  unsigned m_num_avx256_vec_perm[3];
 };
 
 ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
   : vector_costs (vinfo, costing_for_scalar),
 m_num_gpr_needed (),
-m_num_sse_needed ()
+m_num_sse_needed (),
+m_num_avx256_vec_perm ()
 {
 }
 
@@ -25068,6 +25071,10 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
   if (stmt_cost == -1)
 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
+  if (kind == vec_perm && vectype
+  && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
+m_num_avx256_vec_perm[where]++;
+
   /* Penalize DFmode vector operations for Bonnell.  */
   if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
   && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
@@ -25137,6 +25144,11 @@ ix86_vector_costs::finish_cost (const vector_costs 
*scalar_costs)
 
   ix86_vect_estimate_reg_pressure ();
 
+  for (int i = 0; i != 3; i++)
+if (m_num_avx256_vec_perm[i]
+   && TARGET_AVX256_AVOID_VEC_PERM)
+  m_costs[i] = INT_MAX;
+
   vector_costs::finish_cost (scalar_costs);
 }
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 89bc894fcd1c..1c6e323d6551 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -436,6 +436,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
 #define TARGET_AVX256_SPLIT_REGS \
ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
+#defin

[gcc r13-9117] Add new microarchitecture tune for SRF/GRR/CWF.

2024-10-16 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:e9eadc29c1c57cd7be9ec8de231d8fb9e8ac0c7c

commit r13-9117-ge9eadc29c1c57cd7be9ec8de231d8fb9e8ac0c7c
Author: liuhongt 
Date:   Tue Sep 24 15:53:14 2024 +0800

Add new microarchitecture tune for SRF/GRR/CWF.

For Crestmont, 4-operand VEX blendv instructions come from MSROM and
are slower than the 3-instruction sequence (op1 & mask) | (op2 & ~mask).
Legacy blendv instructions can still be handled by the decoder.

The patch adds a new tune which is enabled for all processors except
SRF/CWF. For SRF/CWF it will use vpand + vpandn + vpor instead of
vpblendvb (similarly for vblendvps/vblendvpd).

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Guard
instruction blendv generation under new tune.
* config/i386/i386.h (TARGET_SSE_MOVCC_USE_BLENDV): New Macro.
* config/i386/x86-tune.def (X86_TUNE_SSE_MOVCC_USE_BLENDV):
New tune.

(cherry picked from commit 9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc)

Diff:
---
 gcc/config/i386/i386-expand.cc | 24 +++---
 gcc/config/i386/i386.h |  2 ++
 gcc/config/i386/x86-tune.def   |  8 
 .../gcc.target/i386/sse_movcc_use_blendv.c | 12 +++
 4 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 3112c0b78dcc..1130b6a51853 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4116,29 +4116,29 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
   switch (mode)
 {
 case E_V2SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_mmx_blendvps;
   break;
 case E_V4SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvps;
   break;
 case E_V2DFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvpd;
   break;
 case E_SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvss;
   break;
 case E_DFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvsd;
   break;
 case E_V8QImode:
 case E_V4HImode:
 case E_V2SImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_mmx_pblendvb_v8qi;
  blend_mode = V8QImode;
@@ -4146,14 +4146,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
   break;
 case E_V4QImode:
 case E_V2HImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_mmx_pblendvb_v4qi;
  blend_mode = V4QImode;
}
   break;
 case E_V2QImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_mmx_pblendvb_v2qi;
   break;
 case E_V16QImode:
@@ -4163,18 +4163,18 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V4SImode:
 case E_V2DImode:
 case E_V1TImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_sse4_1_pblendvb;
  blend_mode = V16QImode;
}
   break;
 case E_V8SFmode:
-  if (TARGET_AVX)
+  if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
gen = gen_avx_blendvps256;
   break;
 case E_V4DFmode:
-  if (TARGET_AVX)
+  if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
gen = gen_avx_blendvpd256;
   break;
 case E_V32QImode:
@@ -4183,7 +4183,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V16BFmode:
 case E_V8SImode:
 case E_V4DImode:
-  if (TARGET_AVX2)
+  if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
{
  gen = gen_avx2_pblendvb;
  blend_mode = V32QImode;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 79f7dc31b779..cda755b374d8 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -448,6 +448,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
 #define TARGET_DEST_FALSE_DEP_FOR_GLC \
ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
+#define TARGET_SSE_MOVCC_USE_BLENDV \
+   ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 4231ca90b0ed..ce903cf29a75 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -529,6 +529,14 @@ 

[gcc r13-9118] Add a new tune avx256_avoid_vec_perm for SRF.

2024-10-16 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:eecd5f8ce1729a214bf0a1edfdd3ee1cf79be881

commit r13-9118-geecd5f8ce1729a214bf0a1edfdd3ee1cf79be881
Author: liuhongt 
Date:   Wed Sep 25 13:11:11 2024 +0800

Add a new tune avx256_avoid_vec_perm for SRF.

According to Intel SOM[1], for Crestmont, most 256-bit Intel AVX2
instructions can be decomposed into two independent 128-bit
micro-operations, except for a subset of Intel AVX2 instructions,
known as cross-lane operations, which can only compute the result for
an element by utilizing one or more sources belonging to other elements.

The 256-bit instructions listed below use more operand sources than
can be natively supported by a single reservation station within these
microarchitectures. They are decomposed into two μops, where the first
μop resolves a subset of operand dependencies across two cycles. The
dependent second μop executes the 256-bit operation by using a single
128-bit execution port for two consecutive cycles with a five-cycle
latency for a total latency of seven cycles.

VPERM2I128 ymm1, ymm2, ymm3/m256, imm8
VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
VPERMPD ymm1, ymm2/m256, imm8
VPERMPS ymm1, ymm2, ymm3/m256
VPERMD ymm1, ymm2, ymm3/m256
VPERMQ ymm1, ymm2/m256, imm8

Instead of setting tune avx128_optimal for SRF, the patch adds a new
tune avx256_avoid_vec_perm for it. So by default, the vectorizer still
uses 256-bit VF if the cost is profitable, but lowers to 128-bit whenever
a 256-bit vec_perm is needed for auto-vectorization. Without vec_perm,
the performance of 256-bit vectorization should be similar to that of
128-bit vectorization (some benchmark results show it's even better,
since it enables more parallelism for convert cases).

[1] 
https://www.intel.com/content/www/us/en/content-details/814198/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::ix86_vector_costs):
Add new member m_num_avx256_vec_perm.
(ix86_vector_costs::add_stmt_cost): Record 256-bit vec_perm.
(ix86_vector_costs::finish_cost): Prevent vectorization for
TARGET_AVX256_AVOID_VEC_PERM when there's a 256-bit vec_perm
instruction.
* config/i386/i386.h (TARGET_AVX256_AVOID_VEC_PERM): New
Macro.
* config/i386/x86-tune.def (X86_TUNE_AVX256_SPLIT_REGS): Add
m_CORE_ATOM.
(X86_TUNE_AVX256_AVOID_VEC_PERM): New tune.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx256_avoid_vec_perm.c: New test.

(cherry picked from commit 9eaecce3d8c1d9349adbf8c2cdaf8d87672ed29c)

Diff:
---
 gcc/config/i386/i386.cc|  5 +
 gcc/config/i386/i386.h |  2 ++
 gcc/config/i386/x86-tune.def   |  7 ++-
 .../gcc.target/i386/avx256_avoid_vec_perm.c| 22 ++
 4 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1e43ae15d7bd..8323b2e7cd39 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23746,6 +23746,11 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
   if (stmt_cost == -1)
 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
+  if (kind == vec_perm && vectype
+  && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32
+  && TARGET_AVX256_AVOID_VEC_PERM)
+stmt_cost += 1000;
+
   /* Penalize DFmode vector operations for Bonnell.  */
   if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
   && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index cda755b374d8..08309367c18b 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -425,6 +425,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
 #define TARGET_AVX256_SPLIT_REGS \
ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
+#define TARGET_AVX256_AVOID_VEC_PERM \
+   ix86_tune_features[X86_TUNE_AVX256_AVOID_VEC_PERM]
 #define TARGET_AVX512_SPLIT_REGS \
ix86_tune_features[X86_TUNE_AVX512_SPLIT_REGS]
 #define TARGET_GENERAL_REGS_SSE_SPILL \
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index ce903cf29a75..773a4ea4ccf6 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -553,7 +553,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, 
"256_unaligned_store_optimal"
 
 /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 
ops.  */
 DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER | m_BTVER2
- | m_ZNVER1)
+ | m_ZNVER1 | m_CORE_ATOM)
 
 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction 

[gcc r14-10807] Refine splitters related to "combine vpcmpuw + zero_extend to vpcmpuw"

2024-10-20 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:79e7e02b7cc578d03eab2b50c029f44409ef8e26

commit r14-10807-g79e7e02b7cc578d03eab2b50c029f44409ef8e26
Author: liuhongt 
Date:   Wed Oct 16 13:43:48 2024 +0800

Refine splitters related to "combine vpcmpuw + zero_extend to vpcmpuw"

r12-6103-g1a7ce8570997eb combines vpcmpuw + zero_extend to vpcmpuw
with the pre_reload splitter, but the splitter transforms the
zero_extend into a subreg, which makes reload think the upper part is
garbage; that is not correct.

The patch adjusts the zero_extend define_insn_and_split to
define_insn to keep zero_extend.

gcc/ChangeLog:

PR target/117159
* config/i386/sse.md
(*_cmp3_zero_extend):
Change from define_insn_and_split to define_insn.
(*_cmp3_zero_extend):
Ditto.
(*_ucmp3_zero_extend):
Ditto.
(*_ucmp3_zero_extend):
Ditto.
(*_cmp3_zero_extend_2):
Split to the zero_extend pattern.
(*_cmp3_zero_extend_2):
Ditto.
(*_ucmp3_zero_extend_2):
Ditto.
(*_ucmp3_zero_extend_2):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117159.c: New test.
* gcc.target/i386/avx512bw-pr103750-1.c: Remove xfail.
* gcc.target/i386/avx512bw-pr103750-2.c: Remove xfail.

(cherry picked from commit 5259d3927c1c8e3a15b4b844adef59b48c241233)

Diff:
---
 gcc/config/i386/sse.md | 198 +
 .../gcc.target/i386/avx512bw-pr103750-1.c  |   3 +-
 .../gcc.target/i386/avx512bw-pr103750-2.c  |   3 +-
 gcc/testsuite/gcc.target/i386/pr117159.c   |  42 +
 4 files changed, 125 insertions(+), 121 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c91a7e07bc76..e186b6ad452d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4006,32 +4006,19 @@
 
 ;; Since vpcmpd implicitly clear the upper bits of dest, transform
 ;; vpcmpd + zero_extend to vpcmpd since the instruction
-(define_insn_and_split 
"*_cmp3_zero_extend"
-  [(set (match_operand:SWI248x 0 "register_operand")
+(define_insn "*_cmp3_zero_extend"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
(zero_extend:SWI248x
  (unspec:
-   [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand")
-(match_operand:V48H_AVX512VL 2 "nonimmediate_operand")
-(match_operand:SI 3 "const_0_to_7_operand")]
+   [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand" "v")
+(match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+(match_operand:SI 3 "const_0_to_7_operand" "n")]
UNSPEC_PCMP)))]
   "TARGET_AVX512F
&& (!VALID_MASK_AVX512BW_MODE (mode) || TARGET_AVX512BW)
-   && ix86_pre_reload_split ()
&& (GET_MODE_NUNITS (mode)
   < GET_MODE_PRECISION (mode))"
-  "#"
-  "&& 1"
-  [(set (match_dup 0)
-   (unspec:
- [(match_dup 1)
-  (match_dup 2)
-  (match_dup 3)]
- UNSPEC_PCMP))]
-{
-  operands[1] = force_reg (mode, operands[1]);
-  operands[0] = lowpart_subreg (mode,
-operands[0], mode);
-}
+  "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ssecmp")
(set_attr "length_immediate" "1")
(set_attr "prefix" "evex")
@@ -4059,21 +4046,22 @@
   "#"
   "&& 1"
   [(set (match_dup 0)
-   (unspec:
- [(match_dup 1)
-  (match_dup 2)
-  (match_dup 3)]
- UNSPEC_PCMP))
-   (set (match_dup 4) (match_dup 0))]
+(zero_extend:SWI248x
+ (unspec:
+   [(match_dup 1)
+(match_dup 2)
+(match_dup 3)]
+   UNSPEC_PCMP)))
+   (set (match_dup 4) (match_dup 5))]
 {
-  operands[1] = force_reg (mode, operands[1]);
-  operands[0] = lowpart_subreg (mode,
+  operands[5] = lowpart_subreg (mode,
operands[0], mode);
-}
-  [(set_attr "type" "ssecmp")
-   (set_attr "length_immediate" "1")
-   (set_attr "prefix" "evex")
-   (set_attr "mode" "")])
+  if (SUBREG_P (operands[5]))
+{
+  SUBREG_PROMOTED_VAR_P (operands[5]) = 1;
+  SUBREG_PROMOTED_SET (operands[5], 1);
+}
+})
 
 (define_insn_and_split "*_cmp3"
   [(set (match_operand: 0 "register_operand")
@@ -4108,31 +4096,18 @@
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
-(define_insn_and_split 
"*_cmp3_zero_extend"
-  [(set (match_operand:SWI248x 0 "register_operand")
+(define_insn "*_cmp3_zero_extend"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
(zero_extend:SWI248x
  (unspec:
-   [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
-(match_operand:VI12_AVX512VL 2 "nonimmediate_operand")
-(match_operand:SI 3 "const_0_to_7_operand")]
+   [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v")
+

[gcc r15-4510] Refine splitters related to "combine vpcmpuw + zero_extend to vpcmpuw"

2024-10-20 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:5259d3927c1c8e3a15b4b844adef59b48c241233

commit r15-4510-g5259d3927c1c8e3a15b4b844adef59b48c241233
Author: liuhongt 
Date:   Wed Oct 16 13:43:48 2024 +0800

Refine splitters related to "combine vpcmpuw + zero_extend to vpcmpuw"

r12-6103-g1a7ce8570997eb combines vpcmpuw + zero_extend to vpcmpuw
with the pre_reload splitter, but the splitter transforms the
zero_extend into a subreg, which makes reload think the upper part is
garbage; that is not correct.

The patch adjusts the zero_extend define_insn_and_split to
define_insn to keep zero_extend.

gcc/ChangeLog:

PR target/117159
* config/i386/sse.md
(*_cmp3_zero_extend):
Change from define_insn_and_split to define_insn.
(*_cmp3_zero_extend):
Ditto.
(*_ucmp3_zero_extend):
Ditto.
(*_ucmp3_zero_extend):
Ditto.
(*_cmp3_zero_extend_2):
Split to the zero_extend pattern.
(*_cmp3_zero_extend_2):
Ditto.
(*_ucmp3_zero_extend_2):
Ditto.
(*_ucmp3_zero_extend_2):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117159.c: New test.
* gcc.target/i386/avx512bw-pr103750-1.c: Remove xfail.
* gcc.target/i386/avx512bw-pr103750-2.c: Remove xfail.

Diff:
---
 gcc/config/i386/sse.md | 198 +
 .../gcc.target/i386/avx512bw-pr103750-1.c  |   3 +-
 .../gcc.target/i386/avx512bw-pr103750-2.c  |   3 +-
 gcc/testsuite/gcc.target/i386/pr117159.c   |  42 +
 4 files changed, 125 insertions(+), 121 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 685bce3094ab..6c28b74ac3f2 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4298,32 +4298,19 @@
 
 ;; Since vpcmpd implicitly clear the upper bits of dest, transform
 ;; vpcmpd + zero_extend to vpcmpd since the instruction
-(define_insn_and_split 
"*_cmp3_zero_extend"
-  [(set (match_operand:SWI248x 0 "register_operand")
+(define_insn "*_cmp3_zero_extend"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
(zero_extend:SWI248x
  (unspec:
-   [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand")
-(match_operand:V48H_AVX512VL 2 "nonimmediate_operand")
-(match_operand:SI 3 "const_0_to_7_operand")]
+   [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand" "v")
+(match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+(match_operand:SI 3 "const_0_to_7_operand" "n")]
UNSPEC_PCMP)))]
   "TARGET_AVX512F
&& (!VALID_MASK_AVX512BW_MODE (mode) || TARGET_AVX512BW)
-   && ix86_pre_reload_split ()
&& (GET_MODE_NUNITS (mode)
   < GET_MODE_PRECISION (mode))"
-  "#"
-  "&& 1"
-  [(set (match_dup 0)
-   (unspec:
- [(match_dup 1)
-  (match_dup 2)
-  (match_dup 3)]
- UNSPEC_PCMP))]
-{
-  operands[1] = force_reg (mode, operands[1]);
-  operands[0] = lowpart_subreg (mode,
-operands[0], mode);
-}
+  "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ssecmp")
(set_attr "length_immediate" "1")
(set_attr "prefix" "evex")
@@ -4351,21 +4338,22 @@
   "#"
   "&& 1"
   [(set (match_dup 0)
-   (unspec:
- [(match_dup 1)
-  (match_dup 2)
-  (match_dup 3)]
- UNSPEC_PCMP))
-   (set (match_dup 4) (match_dup 0))]
+(zero_extend:SWI248x
+ (unspec:
+   [(match_dup 1)
+(match_dup 2)
+(match_dup 3)]
+   UNSPEC_PCMP)))
+   (set (match_dup 4) (match_dup 5))]
 {
-  operands[1] = force_reg (mode, operands[1]);
-  operands[0] = lowpart_subreg (mode,
+  operands[5] = lowpart_subreg (mode,
operands[0], mode);
-}
-  [(set_attr "type" "ssecmp")
-   (set_attr "length_immediate" "1")
-   (set_attr "prefix" "evex")
-   (set_attr "mode" "")])
+  if (SUBREG_P (operands[5]))
+{
+  SUBREG_PROMOTED_VAR_P (operands[5]) = 1;
+  SUBREG_PROMOTED_SET (operands[5], 1);
+}
+})
 
 (define_insn_and_split "*_cmp3"
   [(set (match_operand: 0 "register_operand")
@@ -4400,31 +4388,18 @@
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
-(define_insn_and_split 
"*_cmp3_zero_extend"
-  [(set (match_operand:SWI248x 0 "register_operand")
+(define_insn "*_cmp3_zero_extend"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
(zero_extend:SWI248x
  (unspec:
-   [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
-(match_operand:VI12_AVX512VL 2 "nonimmediate_operand")
-(match_operand:SI 3 "const_0_to_7_operand")]
+   [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v")
+(match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+(match

[gcc r13-9139] Refine splitters related to "combine vpcmpuw + zero_extend to vpcmpuw"

2024-10-20 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:fca35b417c236e3448bc3666820fd1ba423fe6e9

commit r13-9139-gfca35b417c236e3448bc3666820fd1ba423fe6e9
Author: liuhongt 
Date:   Wed Oct 16 13:43:48 2024 +0800

Refine splitters related to "combine vpcmpuw + zero_extend to vpcmpuw"

r12-6103-g1a7ce8570997eb combines vpcmpuw + zero_extend to vpcmpuw
with the pre_reload splitter, but the splitter transforms the
zero_extend into a subreg, which makes reload think the upper part is
garbage; that is not correct.

The patch adjusts the zero_extend define_insn_and_split to
define_insn to keep zero_extend.

gcc/ChangeLog:

PR target/117159
* config/i386/sse.md
(*_cmp3_zero_extend):
Change from define_insn_and_split to define_insn.
(*_cmp3_zero_extend):
Ditto.
(*_ucmp3_zero_extend):
Ditto.
(*_ucmp3_zero_extend):
Ditto.
(*_cmp3_zero_extend_2):
Split to the zero_extend pattern.
(*_cmp3_zero_extend_2):
Ditto.
(*_ucmp3_zero_extend_2):
Ditto.
(*_ucmp3_zero_extend_2):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117159.c: New test.
* gcc.target/i386/avx512bw-pr103750-1.c: Remove xfail.
* gcc.target/i386/avx512bw-pr103750-2.c: Remove xfail.

(cherry picked from commit 5259d3927c1c8e3a15b4b844adef59b48c241233)

Diff:
---
 gcc/config/i386/sse.md | 198 +
 .../gcc.target/i386/avx512bw-pr103750-1.c  |   3 +-
 .../gcc.target/i386/avx512bw-pr103750-2.c  |   3 +-
 gcc/testsuite/gcc.target/i386/pr117159.c   |  42 +
 4 files changed, 125 insertions(+), 121 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6550a72f76e0..50cc5882dbb2 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3783,32 +3783,19 @@
 
 ;; Since vpcmpd implicitly clear the upper bits of dest, transform
 ;; vpcmpd + zero_extend to vpcmpd since the instruction
-(define_insn_and_split 
"*_cmp3_zero_extend"
-  [(set (match_operand:SWI248x 0 "register_operand")
+(define_insn "*_cmp3_zero_extend"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
(zero_extend:SWI248x
  (unspec:
-   [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand")
-(match_operand:V48H_AVX512VL 2 "nonimmediate_operand")
-(match_operand:SI 3 "const_0_to_7_operand")]
+   [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand" "v")
+(match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+(match_operand:SI 3 "const_0_to_7_operand" "n")]
UNSPEC_PCMP)))]
   "TARGET_AVX512F
&& (!VALID_MASK_AVX512BW_MODE (mode) || TARGET_AVX512BW)
-   && ix86_pre_reload_split ()
&& (GET_MODE_NUNITS (mode)
   < GET_MODE_PRECISION (mode))"
-  "#"
-  "&& 1"
-  [(set (match_dup 0)
-   (unspec:
- [(match_dup 1)
-  (match_dup 2)
-  (match_dup 3)]
- UNSPEC_PCMP))]
-{
-  operands[1] = force_reg (mode, operands[1]);
-  operands[0] = lowpart_subreg (mode,
-operands[0], mode);
-}
+  "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ssecmp")
(set_attr "length_immediate" "1")
(set_attr "prefix" "evex")
@@ -3836,21 +3823,22 @@
   "#"
   "&& 1"
   [(set (match_dup 0)
-   (unspec:
- [(match_dup 1)
-  (match_dup 2)
-  (match_dup 3)]
- UNSPEC_PCMP))
-   (set (match_dup 4) (match_dup 0))]
+(zero_extend:SWI248x
+ (unspec:
+   [(match_dup 1)
+(match_dup 2)
+(match_dup 3)]
+   UNSPEC_PCMP)))
+   (set (match_dup 4) (match_dup 5))]
 {
-  operands[1] = force_reg (mode, operands[1]);
-  operands[0] = lowpart_subreg (mode,
+  operands[5] = lowpart_subreg (mode,
operands[0], mode);
-}
-  [(set_attr "type" "ssecmp")
-   (set_attr "length_immediate" "1")
-   (set_attr "prefix" "evex")
-   (set_attr "mode" "")])
+  if (SUBREG_P (operands[5]))
+{
+  SUBREG_PROMOTED_VAR_P (operands[5]) = 1;
+  SUBREG_PROMOTED_SET (operands[5], 1);
+}
+})
 
 (define_insn_and_split "*_cmp3"
   [(set (match_operand: 0 "register_operand")
@@ -3885,31 +3873,18 @@
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
-(define_insn_and_split 
"*_cmp3_zero_extend"
-  [(set (match_operand:SWI248x 0 "register_operand")
+(define_insn "*_cmp3_zero_extend"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
(zero_extend:SWI248x
  (unspec:
-   [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
-(match_operand:VI12_AVX512VL 2 "nonimmediate_operand")
-(match_operand:SI 3 "const_0_to_7_operand")]
+   [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v")
+(

[gcc r12-10778] Refine splitters related to "combine vpcmpuw + zero_extend to vpcmpuw"

2024-10-20 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:91800a70a2af1349eefc5f3380be2b254b1db395

commit r12-10778-g91800a70a2af1349eefc5f3380be2b254b1db395
Author: liuhongt 
Date:   Wed Oct 16 13:43:48 2024 +0800

Refine splitters related to "combine vpcmpuw + zero_extend to vpcmpuw"

r12-6103-g1a7ce8570997eb combines vpcmpuw + zero_extend to vpcmpuw
with the pre_reload splitter, but the splitter transforms the
zero_extend into a subreg, which makes reload think the upper part is
garbage; that is not correct.

The patch adjusts the zero_extend define_insn_and_split to
define_insn to keep zero_extend.

gcc/ChangeLog:

PR target/117159
* config/i386/sse.md
(*_cmp3_zero_extend):
Change from define_insn_and_split to define_insn.
(*_cmp3_zero_extend):
Ditto.
(*_ucmp3_zero_extend):
Ditto.
(*_ucmp3_zero_extend):
Ditto.
(*_cmp3_zero_extend_2):
Split to the zero_extend pattern.
(*_cmp3_zero_extend_2):
Ditto.
(*_ucmp3_zero_extend_2):
Ditto.
(*_ucmp3_zero_extend_2):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117159.c: New test.
* gcc.target/i386/avx512bw-pr103750-1.c: Remove xfail.
* gcc.target/i386/avx512bw-pr103750-2.c: Remove xfail.

(cherry picked from commit 5259d3927c1c8e3a15b4b844adef59b48c241233)

Diff:
---
 gcc/config/i386/sse.md | 196 +
 .../gcc.target/i386/avx512bw-pr103750-1.c  |   3 +-
 .../gcc.target/i386/avx512bw-pr103750-2.c  |   3 +-
 gcc/testsuite/gcc.target/i386/pr117159.c   |  42 +
 4 files changed, 124 insertions(+), 120 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 924effce2b55..c94c8eceb338 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3724,32 +3724,19 @@
 
 ;; Since vpcmpd implicitly clear the upper bits of dest, transform
 ;; vpcmpd + zero_extend to vpcmpd since the instruction
-(define_insn_and_split 
"*_cmp3_zero_extend"
-  [(set (match_operand:SWI248x 0 "register_operand")
+(define_insn "*_cmp3_zero_extend"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
(zero_extend:SWI248x
  (unspec:
-   [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand")
-(match_operand:V48H_AVX512VL 2 "nonimmediate_operand")
+   [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand" "v")
+(match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
 (match_operand:SI 3 "const_0_to_7_operand" "n")]
UNSPEC_PCMP)))]
   "TARGET_AVX512F
&& (!VALID_MASK_AVX512BW_MODE (mode) || TARGET_AVX512BW)
-   && ix86_pre_reload_split ()
&& (GET_MODE_NUNITS (mode)
   < GET_MODE_PRECISION (mode))"
-  "#"
-  "&& 1"
-  [(set (match_dup 0)
-   (unspec:
- [(match_dup 1)
-  (match_dup 2)
-  (match_dup 3)]
- UNSPEC_PCMP))]
-{
-  operands[1] = force_reg (mode, operands[1]);
-  operands[0] = lowpart_subreg (mode,
-operands[0], mode);
-}
+  "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ssecmp")
(set_attr "length_immediate" "1")
(set_attr "prefix" "evex")
@@ -3777,21 +3764,22 @@
   "#"
   "&& 1"
   [(set (match_dup 0)
-   (unspec:
- [(match_dup 1)
-  (match_dup 2)
-  (match_dup 3)]
- UNSPEC_PCMP))
-   (set (match_dup 4) (match_dup 0))]
+(zero_extend:SWI248x
+ (unspec:
+   [(match_dup 1)
+(match_dup 2)
+(match_dup 3)]
+   UNSPEC_PCMP)))
+   (set (match_dup 4) (match_dup 5))]
 {
-  operands[1] = force_reg (mode, operands[1]);
-  operands[0] = lowpart_subreg (mode,
+  operands[5] = lowpart_subreg (mode,
operands[0], mode);
-}
-  [(set_attr "type" "ssecmp")
-   (set_attr "length_immediate" "1")
-   (set_attr "prefix" "evex")
-   (set_attr "mode" "")])
+  if (SUBREG_P (operands[5]))
+{
+  SUBREG_PROMOTED_VAR_P (operands[5]) = 1;
+  SUBREG_PROMOTED_SET (operands[5], 1);
+}
+})
 
 (define_insn_and_split "*_cmp3"
   [(set (match_operand: 0 "register_operand")
@@ -3826,31 +3814,18 @@
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
-(define_insn_and_split 
"*_cmp3_zero_extend"
-  [(set (match_operand:SWI248x 0 "register_operand")
+(define_insn "*_cmp3_zero_extend"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
(zero_extend:SWI248x
  (unspec:
-   [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
-(match_operand:VI12_AVX512VL 2 "nonimmediate_operand")
-(match_operand:SI 3 "const_0_to_7_operand")]
+   [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v")
+(match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm"

[gcc r13-9142] [GCC13/GCC12] Fix testcase.

2024-10-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:8b43518a01cbbbafe042b85a48fa09a32948380a

commit r13-9142-g8b43518a01cbbbafe042b85a48fa09a32948380a
Author: liuhongt 
Date:   Tue Oct 22 11:24:23 2024 +0800

[GCC13/GCC12] Fix testcase.

The optimization relies on other patterns which are only available in
GCC14 and above, so restore the xfail for the GCC13/12 branches.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-pr103750-2.c: Add xfail for ia32.

Diff:
---
 gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c 
b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
index 3392e193222a..7303f5403ba8 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
@@ -1,7 +1,8 @@
 /* PR target/103750 */
 /* { dg-do compile }  */
 /* { dg-options "-O2 -mavx512dq -mavx512bw -mavx512vl" } */
-/* { dg-final { scan-assembler-not "kmov" } } */
+/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */
+/* xfail need to be fixed.  */
 
 #include 
 extern __m128i* pi128;


[gcc r12-10781] [GCC13/GCC12] Fix testcase.

2024-10-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:45bde60836d04cce4637b74ecadbb0aff90b832f

commit r12-10781-g45bde60836d04cce4637b74ecadbb0aff90b832f
Author: liuhongt 
Date:   Tue Oct 22 11:24:23 2024 +0800

[GCC13/GCC12] Fix testcase.

The optimization relies on other patterns which are only available in
GCC14 and above, so restore the xfail for the GCC13/12 branches.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-pr103750-2.c: Add xfail for ia32.

(cherry picked from commit 8b43518a01cbbbafe042b85a48fa09a32948380a)

Diff:
---
 gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c 
b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
index 3392e193222a..7303f5403ba8 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
@@ -1,7 +1,8 @@
 /* PR target/103750 */
 /* { dg-do compile }  */
 /* { dg-options "-O2 -mavx512dq -mavx512bw -mavx512vl" } */
-/* { dg-final { scan-assembler-not "kmov" } } */
+/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */
+/* xfail need to be fixed.  */
 
 #include 
 extern __m128i* pi128;


[gcc r15-4225] Enable vectorization for unknown tripcount in very cheap cost model but disable epilog vectorization

2024-10-09 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:70c3db511ba14ff5fa68cb41d0714a9fb957ea5d

commit r15-4225-g70c3db511ba14ff5fa68cb41d0714a9fb957ea5d
Author: liuhongt 
Date:   Mon Mar 25 21:28:14 2024 -0700

Enable vectorization for unknown tripcount in very cheap cost model but 
disable epilog vectorization.

gcc/ChangeLog:

* tree-vect-loop.cc (vect_analyze_loop_costing): Enable
vectorization for LOOP_VINFO_PEELING_FOR_NITER in very cheap
cost model.
(vect_analyze_loop): Disable epilogue vectorization in very
cheap cost model.
* doc/invoke.texi: Adjust documents for very-cheap cost model.

Diff:
---
 gcc/doc/invoke.texi   | 11 ---
 gcc/tree-vect-loop.cc |  6 +++---
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index c0c8bf1c29a9..12477e6f9df3 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14315,13 +14315,10 @@ counts that will likely execute faster than when 
executing the original
 scalar loop.  The @samp{cheap} model disables vectorization of
 loops where doing so would be cost prohibitive for example due to
 required runtime checks for data dependence or alignment but otherwise
-is equal to the @samp{dynamic} model.  The @samp{very-cheap} model only
-allows vectorization if the vector code would entirely replace the
-scalar code that is being vectorized.  For example, if each iteration
-of a vectorized loop would only be able to handle exactly four iterations
-of the scalar loop, the @samp{very-cheap} model would only allow
-vectorization if the scalar iteration count is known to be a multiple
-of four.
+is equal to the @samp{dynamic} model.  The @samp{very-cheap} model disables
+vectorization of loops when any runtime check for data dependence or alignment
+is required, it also disables vectorization of epilogue loops but otherwise is
+equal to the @samp{cheap} model.
 
 The default cost model depends on other optimization flags and is
 either @samp{dynamic} or @samp{cheap}.
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 9be50aaa621c..ade72a5124f7 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2375,8 +2375,7 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo,
  a copy of the scalar code (even if we might be able to vectorize it).  */
   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
   && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
- || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
- || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
+ || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
 {
   if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -3683,7 +3682,8 @@ vect_analyze_loop (class loop *loop, gimple 
*loop_vectorized_call,
   /* No code motion support for multiple epilogues so 
for now
  not supported when multiple exits.  */
 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
-&& !loop->simduid);
+&& !loop->simduid
+&& loop_cost_model (loop) > 
VECT_COST_MODEL_VERY_CHEAP);
   if (!vect_epilogues)
 return first_loop_vinfo;


[gcc r15-4226] Adjust testcase after relax O2 vectorization.

2024-10-09 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:d5d1189c12199db79f6feb5cfcc7e6475c3a4d91

commit r15-4226-gd5d1189c12199db79f6feb5cfcc7e6475c3a4d91
Author: liuhongt 
Date:   Thu Sep 19 13:38:34 2024 +0800

Adjust testcase after relax O2 vectorization.

gcc/testsuite/ChangeLog:

* gcc.dg/fstack-protector-strong.c: Adjust
scan-assembler-times.
* gcc.dg/graphite/scop-6.c: Refine the testcase to avoid array
out of bounds.
* gcc.dg/graphite/scop-9.c: Ditto.
* gcc.dg/tree-ssa/ivopts-lt-2.c: Add -fno-tree-vectorize.
* gcc.dg/tree-ssa/ivopts-lt.c: Ditto.
* gcc.dg/tree-ssa/loop-16.c: Ditto.
* gcc.dg/tree-ssa/loop-28.c: Ditto.
* gcc.dg/tree-ssa/loop-bound-2.c: Ditto.
* gcc.dg/tree-ssa/loop-bound-4.c: Ditto.
* gcc.dg/tree-ssa/loop-bound-6.c: Ditto.
* gcc.dg/tree-ssa/predcom-4.c: Ditto.
* gcc.dg/tree-ssa/predcom-5.c: Ditto.
* gcc.dg/tree-ssa/scev-11.c: Ditto.
* gcc.dg/tree-ssa/scev-9.c: Ditto.
* gcc.dg/tree-ssa/split-path-11.c: Ditto.
* gcc.dg/unroll-8.c: Ditto.
* gcc.dg/var-expand1.c: Ditto.
* gcc.dg/vect/vect-cost-model-6.c: Removed.
* gcc.target/i386/pr86270.c: Ditto.
* gcc.target/i386/pr86722.c: Ditto.
* gcc.target/x86_64/abi/callabi/leaf-2.c: Ditto.

Diff:
---
 gcc/testsuite/gcc.dg/fstack-protector-strong.c   |  2 +-
 gcc/testsuite/gcc.dg/graphite/scop-6.c   |  7 +++
 gcc/testsuite/gcc.dg/graphite/scop-9.c   |  4 ++--
 gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c  |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c|  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-16.c  |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-28.c  |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-bound-2.c |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-bound-4.c |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-bound-6.c |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/predcom-4.c|  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/predcom-5.c|  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/scev-11.c  |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/scev-9.c   |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-11.c|  2 +-
 gcc/testsuite/gcc.dg/unroll-8.c  |  3 +--
 gcc/testsuite/gcc.dg/var-expand1.c   |  2 +-
 gcc/testsuite/gcc.dg/vect/vect-cost-model-6.c| 12 
 gcc/testsuite/gcc.target/i386/pr86270.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pr86722.c  |  2 +-
 gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c |  2 +-
 21 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/fstack-protector-strong.c 
b/gcc/testsuite/gcc.dg/fstack-protector-strong.c
index 94dc3508f1ad..b9f63966b7cc 100644
--- a/gcc/testsuite/gcc.dg/fstack-protector-strong.c
+++ b/gcc/testsuite/gcc.dg/fstack-protector-strong.c
@@ -154,4 +154,4 @@ void foo12 ()
   global3 ();
 }
 
-/* { dg-final { scan-assembler-times "stack_chk_fail" 12 } } */
+/* { dg-final { scan-assembler-times "stack_chk_fail" 11 } } */
diff --git a/gcc/testsuite/gcc.dg/graphite/scop-6.c 
b/gcc/testsuite/gcc.dg/graphite/scop-6.c
index 9bc1d9f4ccd0..e7e0a080c5fd 100644
--- a/gcc/testsuite/gcc.dg/graphite/scop-6.c
+++ b/gcc/testsuite/gcc.dg/graphite/scop-6.c
@@ -4,7 +4,7 @@ int toto()
 {
   int i, j, k;
   int a[100][100];
-  int b[100];
+  int b[200];
 
   for (i = 1; i < 100; i++)
 {
@@ -18,9 +18,8 @@ int toto()
 for (k = 1; k < 100; k++)
   b[i+k] = b[i+k-1] + 2;
 }
-  
-  for (k = 1; k < 100; k++)
-b[i+k] = b[i+k-5] + 2;
+  for (k = 4; k < 100; k++)
+   b[i+k] = b[i+k-5] + 2;
 }
 
   return a[3][5] + b[2];
diff --git a/gcc/testsuite/gcc.dg/graphite/scop-9.c 
b/gcc/testsuite/gcc.dg/graphite/scop-9.c
index b19291be2f81..2676452b1e60 100644
--- a/gcc/testsuite/gcc.dg/graphite/scop-9.c
+++ b/gcc/testsuite/gcc.dg/graphite/scop-9.c
@@ -4,7 +4,7 @@ int toto()
 {
   int i, j, k;
   int a[100][100];
-  int b[100];
+  int b[200];
 
   for (i = 1; i < 100; i++)
 {
@@ -14,7 +14,7 @@ int toto()
   if (i * 2 == i + 8)
a[i][i] = 2;
 
-  for (k = 1; k < 100; k++)
+  for (k = 4; k < 100; k++)
 b[i+k] = b[i+k-5] + 2;
 }
 
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c
index bdbdbff19ffb..be325775fbb7 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -fno-tree-loop-distribute-patterns -fdump-tree-ivopts" } 
*/
+/* { dg-options "-O2 -fno-tree-vectorize -fno-tree-loop-distribute-patterns 
-fdump-tree-ivopts" } */
 /* { dg-skip-if "PR68644" { hppa*-*-* powerpc*-*-* } } */
 
 void
diff --git a/gcc/testsuite

[gcc r15-4234] Add a new tune avx256_avoid_vec_perm for SRF.

2024-10-09 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:9eaecce3d8c1d9349adbf8c2cdaf8d87672ed29c

commit r15-4234-g9eaecce3d8c1d9349adbf8c2cdaf8d87672ed29c
Author: liuhongt 
Date:   Wed Sep 25 13:11:11 2024 +0800

Add a new tune avx256_avoid_vec_perm for SRF.

According to Intel SOM[1], for Crestmont, most 256-bit Intel AVX2
instructions can be decomposed into two independent 128-bit
micro-operations, except for a subset of Intel AVX2 instructions,
known as cross-lane operations, which can only compute the result for
an element by utilizing one or more sources belonging to other elements.

The 256-bit instructions listed below use more operand sources than
can be natively supported by a single reservation station within these
microarchitectures. They are decomposed into two μops, where the first
μop resolves a subset of operand dependencies across two cycles. The
dependent second μop executes the 256-bit operation by using a single
128-bit execution port for two consecutive cycles with a five-cycle
latency for a total latency of seven cycles.

VPERM2I128 ymm1, ymm2, ymm3/m256, imm8
VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
VPERMPD ymm1, ymm2/m256, imm8
VPERMPS ymm1, ymm2, ymm3/m256
VPERMD ymm1, ymm2, ymm3/m256
VPERMQ ymm1, ymm2/m256, imm8

Instead of setting tune avx128_optimal for SRF, the patch adds a new
tune avx256_avoid_vec_perm for it. So by default, the vectorizer still
uses 256-bit VF if cost is profitable, but lowers to 128-bit whenever
256-bit vec_perm is needed for auto-vectorization. W/o vec_perm,
performance of 256-bit vectorization should be similar to 128-bit
ones (some benchmark results show it's even better than 128-bit
vectorization since it enables more parallelism for convert cases).

[1] 
https://www.intel.com/content/www/us/en/content-details/814198/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::ix86_vector_costs):
Add new member m_num_avx256_vec_perm.
(ix86_vector_costs::add_stmt_cost): Record 256-bit vec_perm.
(ix86_vector_costs::finish_cost): Prevent vectorization for
TARGET_AVX256_AVOID_VEC_PERM when there's 256-bit vec_perm
instruction.
* config/i386/i386.h (TARGET_AVX256_AVOID_VEC_PERM): New
Macro.
* config/i386/x86-tune.def (X86_TUNE_AVX256_SPLIT_REGS): Add
m_CORE_ATOM.
(X86_TUNE_AVX256_AVOID_VEC_PERM): New tune.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx256_avoid_vec_perm.c: New test.

Diff:
---
 gcc/config/i386/i386.cc| 14 +-
 gcc/config/i386/i386.h |  2 ++
 gcc/config/i386/x86-tune.def   |  7 ++-
 .../gcc.target/i386/avx256_avoid_vec_perm.c| 22 ++
 4 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 90a564b2ffaa..ab0ade3790f2 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25025,12 +25025,15 @@ private:
  where we know it's not loaded from memory.  */
   unsigned m_num_gpr_needed[3];
   unsigned m_num_sse_needed[3];
+  /* Number of 256-bit vector permutation.  */
+  unsigned m_num_avx256_vec_perm[3];
 };
 
 ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
   : vector_costs (vinfo, costing_for_scalar),
 m_num_gpr_needed (),
-m_num_sse_needed ()
+m_num_sse_needed (),
+m_num_avx256_vec_perm ()
 {
 }
 
@@ -25264,6 +25267,10 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
   if (stmt_cost == -1)
 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
+  if (kind == vec_perm && vectype
+  && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
+m_num_avx256_vec_perm[where]++;
+
   /* Penalize DFmode vector operations for Bonnell.  */
   if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
   && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
@@ -25333,6 +25340,11 @@ ix86_vector_costs::finish_cost (const vector_costs 
*scalar_costs)
 
   ix86_vect_estimate_reg_pressure ();
 
+  for (int i = 0; i != 3; i++)
+if (m_num_avx256_vec_perm[i]
+   && TARGET_AVX256_AVOID_VEC_PERM)
+  m_costs[i] = INT_MAX;
+
   vector_costs::finish_cost (scalar_costs);
 }
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index d5d54ee66040..f5204aa1ed23 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -439,6 +439,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
 #define TARGET_AVX256_SPLIT_REGS \
ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
+#define TARGET_AVX256_AVOID_VEC_PERM \
+   ix86_tune_features[X86_TUNE_AVX256_AVO

[gcc r15-4233] Add new microarchitecture tune for SRF/GRR/CWF.

2024-10-09 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc

commit r15-4233-g9c8cea8feb6cd54ef73113a0b74f1df7b60d09dc
Author: liuhongt 
Date:   Tue Sep 24 15:53:14 2024 +0800

Add new microarchitecture tune for SRF/GRR/CWF.

For Crestmont, 4-operand vex blendv instructions come from MSROM and
are slower than the 3-instruction sequence (op1 & mask) | (op2 & ~mask).
Legacy blendv instructions can still be handled by the decoder.

The patch adds a new tune which is enabled for all processors except
for SRF/CWF. It will use vpand + vpandn + vpor instead of
vpblendvb (similar for vblendvps/vblendvpd) for SRF/CWF.

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Guard
instruction blendv generation under new tune.
* config/i386/i386.h (TARGET_SSE_MOVCC_USE_BLENDV): New Macro.
* config/i386/x86-tune.def (X86_TUNE_SSE_MOVCC_USE_BLENDV):
New tune.

Diff:
---
 gcc/config/i386/i386-expand.cc | 24 +++---
 gcc/config/i386/i386.h |  2 ++
 gcc/config/i386/x86-tune.def   |  8 
 .../gcc.target/i386/sse_movcc_use_blendv.c | 12 +++
 4 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 32840113cf60..0734399e4955 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4344,23 +4344,23 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
   switch (mode)
 {
 case E_V2SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_mmx_blendvps;
   break;
 case E_V4SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvps;
   break;
 case E_V2DFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvpd;
   break;
 case E_SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvss;
   break;
 case E_DFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvsd;
   break;
 case E_V8QImode:
@@ -4368,7 +4368,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V4HFmode:
 case E_V4BFmode:
 case E_V2SImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_mmx_pblendvb_v8qi;
  blend_mode = V8QImode;
@@ -4378,14 +4378,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V2HImode:
 case E_V2HFmode:
 case E_V2BFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_mmx_pblendvb_v4qi;
  blend_mode = V4QImode;
}
   break;
 case E_V2QImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_mmx_pblendvb_v2qi;
   break;
 case E_V16QImode:
@@ -4395,18 +4395,18 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V4SImode:
 case E_V2DImode:
 case E_V1TImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_sse4_1_pblendvb;
  blend_mode = V16QImode;
}
   break;
 case E_V8SFmode:
-  if (TARGET_AVX)
+  if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
gen = gen_avx_blendvps256;
   break;
 case E_V4DFmode:
-  if (TARGET_AVX)
+  if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
gen = gen_avx_blendvpd256;
   break;
 case E_V32QImode:
@@ -4415,7 +4415,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V16BFmode:
 case E_V8SImode:
 case E_V4DImode:
-  if (TARGET_AVX2)
+  if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
{
  gen = gen_avx2_pblendvb;
  blend_mode = V32QImode;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 82177b9d3839..d5d54ee66040 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -462,6 +462,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
 #define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
 #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR]
+#define TARGET_SSE_MOVCC_USE_BLENDV \
+   ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 3d123da95f0c..b815b6dc255b 100644
--- a/gcc/config/i386/x86-tune

[gcc r15-4560] i386: Optimize EQ/NE comparison between avx512 kmask and -1.

2024-10-22 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:ee7e77e9c121f5a6f27c92b6b24b2abf9cd66a4d

commit r15-4560-gee7e77e9c121f5a6f27c92b6b24b2abf9cd66a4d
Author: liuhongt 
Date:   Mon Oct 21 02:22:08 2024 -0700

i386: Optimize EQ/NE comparison between avx512 kmask and -1.

r15-974-gbf7745f887c765e06f2e75508f263debb60aeb2e has optimized for
jcc/setcc, but missed movcc.
The patch supports movcc.

gcc/ChangeLog:

PR target/117232
* config/i386/sse.md 
(*kortest_cmp_movqicc):
New define_insn_and_split.
(*kortest_cmp_movcc):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117232-1.c: New test.
* gcc.target/i386/pr117232-apx-1.c: New test.

Diff:
---
 gcc/config/i386/sse.md | 85 ++
 gcc/testsuite/gcc.target/i386/pr117232-1.c | 47 ++
 gcc/testsuite/gcc.target/i386/pr117232-apx-1.c | 48 +++
 3 files changed, 180 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6c28b74ac3f2..2345015db1b3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2423,6 +2423,91 @@
   DONE;
 })
 
+;; Optimize cmp + movcc with mask register by kortest + movcc.
+(define_insn_and_split "*kortest_cmp_movqicc"
+   [(set (match_operand:QI 0 "register_operand" "=r,r,r,r,r,r")
+  (if_then_else:QI
+   (match_operator 1 "bt_comparison_operator"
+ [(match_operand:SWI1248_AVX512BWDQ_64 4 "register_operand"
+ "?k,,?k, ,?k,r")
+  (const_int -1)])
+   (match_operand:QI 2 "register_operand"  "r,r,0,0,r,r")
+   (match_operand:QI 3 "register_operand" " 0,0,r,r,r,r")))
+(clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW && TARGET_CMOVE && !TARGET_PARTIAL_REG_STALL"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (if_then_else:SI
+ (match_dup 5)
+ (match_dup 2)
+ (match_dup 3)))]
+{
+  rtx flag_reg;
+  if (MASK_REGNO_P (REGNO (operands[4])))
+{
+  emit_insn (gen_kortest_ccc (operands[4], 
operands[4]));
+  flag_reg = gen_rtx_REG (CCCmode, FLAGS_REG);
+}
+  else
+{
+  flag_reg = gen_rtx_REG (CCZmode, FLAGS_REG);
+  emit_insn (gen_rtx_SET (flag_reg,
+ gen_rtx_COMPARE (CCZmode,
+  operands[4],
+  constm1_rtx)));
+}
+  operands[5] = gen_rtx_fmt_ee (GET_CODE (operands[1]), VOIDmode,
+   flag_reg,const0_rtx);
+  operands[0] = gen_lowpart (SImode, operands[0]);
+  operands[2] = gen_lowpart (SImode, operands[2]);
+  operands[3] = gen_lowpart (SImode, operands[3]);
+}
+  [(set_attr "isa" "*,*,*,*,apx_ndd,apx_ndd")
+   (set_attr "type" "icmov")
+   (set_attr "mode" "QI")])
+
+(define_insn_and_split 
"*kortest_cmp_movcc"
+   [(set (match_operand:SWI248 0 "register_operand" "=r,r,r,r,r,r,r,r")
+  (if_then_else:SWI248
+   (match_operator 1 "bt_comparison_operator"
+ [(match_operand:SWI1248_AVX512BWDQ_64 4 "register_operand"
+ "?k,,?k, ,?k,r,?k, 
r")
+  (const_int -1)])
+   (match_operand:SWI248 2 "nonimmediate_operand" "rm,rm, 0, 0,rm,rm, r, 
r")
+   (match_operand:SWI248 3 "nonimmediate_operand" " 0, 0,rm,rm, r, 
r,rm,rm")))
+(clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW && TARGET_CMOVE
+   && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (if_then_else:SWI248
+ (match_dup 5)
+ (match_dup 2)
+ (match_dup 3)))]
+{
+  rtx flag_reg;
+  if (MASK_REGNO_P (REGNO (operands[4])))
+{
+  emit_insn (gen_kortest_ccc (operands[4], 
operands[4]));
+  flag_reg = gen_rtx_REG (CCCmode, FLAGS_REG);
+}
+  else
+{
+  flag_reg = gen_rtx_REG (CCZmode, FLAGS_REG);
+  emit_insn (gen_rtx_SET (flag_reg,
+ gen_rtx_COMPARE (CCZmode,
+  operands[4],
+  constm1_rtx)));
+}
+  operands[5] = gen_rtx_fmt_ee (GET_CODE (operands[1]), VOIDmode,
+   flag_reg,const0_rtx);
+}
+  [(set_attr "isa" "*,*,*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd")
+   (set_attr "type" "icmov")
+   (set_attr "mode" "")])
+
 (define_insn "kunpckhi"
   [(set (match_operand:HI 0 "register_operand" "=k")
(ior:HI
diff --git a/gcc/testsuite/gcc.target/i386/pr117232-1.c 
b/gcc/testsuite/gcc.target/i386/pr117232-1.c
new file mode 100644
index ..cd7f5d112a79
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117232-1.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-times {(?n)kortest[bwqd]} 7 { target { ! ia32 } 
} } } */
+/* { dg-final { scan-assembler-times {(?n)cmovn?c} 7 { target { ! ia32 } } } } 
*/
+
+#include 
+int
+foo (__m512i a, __m512i b, int c, i

[gcc r14-10831] Fix ICE due to isa mismatch for the builtins.

2024-10-23 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:b718f6ec1674c0db30f26c65b7a9215e9388dd6c

commit r14-10831-gb718f6ec1674c0db30f26c65b7a9215e9388dd6c
Author: liuhongt 
Date:   Tue Oct 22 01:54:40 2024 -0700

Fix ICE due to isa mismatch for the builtins.

gcc/ChangeLog:

PR target/117240
* config/i386/i386-builtin.def: Add avx/avx512f to vaes
ymm/zmm builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117240_avx.c: New test.
* gcc.target/i386/pr117240_avx512f.c: New test.

(cherry picked from commit 403e361d5aa620e77c9832578b2409a0fdd79d96)

Diff:
---
 gcc/config/i386/i386-builtin.def | 16 
 gcc/testsuite/gcc.target/i386/pr117240_avx.c | 10 ++
 gcc/testsuite/gcc.target/i386/pr117240_avx512f.c | 10 ++
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index ab73e20121aa..fdd9dba6e542 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2832,17 +2832,17 @@ BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, 
"__builtin_ia32_rdpid", IX86_B
 
 /* VAES.  */
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, 
UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, 
"__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, 
UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, 
"__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | 
OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesdec_v64qi, 
"__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", 
IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, 
"__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", 
IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, 
"__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | 
OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesdeclast_v64qi, 
"__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, 
UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, 
"__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, 
UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, 
"__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | 
OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesenc_v64qi, 
"__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", 
IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, 
"__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", 
IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, 
"__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | 
OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesenclast_v64qi, 
"__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
 
 /* BF16 */
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16 | OPTION_MASK_ISA2_EVEX512

[gcc r13-9145] Fix ICE due to isa mismatch for the builtins.

2024-10-23 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:2452387468423882c0732e0fad3a83e887574ccc

commit r13-9145-g2452387468423882c0732e0fad3a83e887574ccc
Author: liuhongt 
Date:   Tue Oct 22 01:54:40 2024 -0700

Fix ICE due to isa mismatch for the builtins.

gcc/ChangeLog:

PR target/117240
* config/i386/i386-builtin.def: Add avx/avx512f to vaes
ymm/zmm builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117240_avx.c: New test.
* gcc.target/i386/pr117240_avx512f.c: New test.

(cherry picked from commit 403e361d5aa620e77c9832578b2409a0fdd79d96)

Diff:
---
 gcc/config/i386/i386-builtin.def | 16 
 gcc/testsuite/gcc.target/i386/pr117240_avx.c | 10 ++
 gcc/testsuite/gcc.target/i386/pr117240_avx512f.c | 10 ++
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 0037fb34891a..63861a6d8329 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2802,17 +2802,17 @@ BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, 
"__builtin_ia32_rdpid", IX86_B
 
 /* VAES.  */
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, 
UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, 
"__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v64qi, 
"__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, 
"__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v64qi, 
"__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", 
IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, 
"__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, 
"__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, 
"__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", 
IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, 
UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, 
"__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v64qi, 
"__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, 
"__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v64qi, 
"__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", 
IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, 
"__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, 
"__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, 
"__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", 
IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
 
 /* BF16 */
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf, 
"__builtin_ia32_cvtne2ps2bf16_v32bf", IX86_BUILTIN_CVTNE2PS2BF16_V32BF, 
UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF)
diff --git a/gcc/testsuite/gcc.target/i386/pr117240_avx.c 
b/gcc/testsuite/gcc.target/i386/pr1

  1   2   >