[gcc r15-3942] Implement CSHIFT and EOSHIFT for unsigned.

2024-09-28 Thread Thomas König via Gcc-cvs
https://gcc.gnu.org/g:1c928004cf0bc2131b6199905d11133d23a7cef2

commit r15-3942-g1c928004cf0bc2131b6199905d11133d23a7cef2
Author: Thomas Koenig 
Date:   Sat Sep 28 22:28:59 2024 +0200

Implement CSHIFT and EOSHIFT for unsigned.

gcc/fortran/ChangeLog:

* check.cc (gfc_check_eoshift): Handle BT_UNSIGNED.
* simplify.cc (gfc_simplify_eoshift): Likewise.
* gfortran.texi: Document CSHIFT and EOSHIFT for UNSIGNED.

gcc/testsuite/ChangeLog:

* gfortran.dg/unsigned_31.f90: New test.
* gfortran.dg/unsigned_32.f90: New test.

Diff:
---
 gcc/fortran/check.cc  |  6 ++
 gcc/fortran/gfortran.texi |  3 ++-
 gcc/fortran/simplify.cc   |  4 
 gcc/testsuite/gfortran.dg/unsigned_31.f90 | 27 +++
 gcc/testsuite/gfortran.dg/unsigned_32.f90 | 27 +++
 5 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/gcc/fortran/check.cc b/gcc/fortran/check.cc
index 1851cfb8d4ad..1da269f5b725 100644
--- a/gcc/fortran/check.cc
+++ b/gcc/fortran/check.cc
@@ -3073,6 +3073,12 @@ gfc_check_eoshift (gfc_expr *array, gfc_expr *shift, 
gfc_expr *boundary,
case BT_CHARACTER:
  break;
 
+   case BT_UNSIGNED:
+ if (flag_unsigned)
+   break;
+
+ gcc_fallthrough();
+
default:
  gfc_error ("Missing %qs argument to %qs intrinsic at %L for %qs "
 "of type %qs", gfc_current_intrinsic_arg[2]->name,
diff --git a/gcc/fortran/gfortran.texi b/gcc/fortran/gfortran.texi
index a5ebadff3bb8..b42d0095e571 100644
--- a/gcc/fortran/gfortran.texi
+++ b/gcc/fortran/gfortran.texi
@@ -2790,7 +2790,8 @@ As of now, the following intrinsics take unsigned 
arguments:
 @item @code{TRANSFER}
 @item @code{SUM}, @code{PRODUCT}, @code{MATMUL} and @code{DOT_PRODUCT}
 @item @code{IANY}, @code{IALL} and @code{IPARITY}
-@item @code{RANDOM_NUMBER}.
+@item @code{RANDOM_NUMBER}
+@item @code{CSHIFT} and @code{EOSHIFT}.
 @end itemize
 This list will grow in the near future.
 @c -
diff --git a/gcc/fortran/simplify.cc b/gcc/fortran/simplify.cc
index bd2f6485c95e..2f6c3c39dad8 100644
--- a/gcc/fortran/simplify.cc
+++ b/gcc/fortran/simplify.cc
@@ -2630,6 +2630,10 @@ gfc_simplify_eoshift (gfc_expr *array, gfc_expr *shift, 
gfc_expr *boundary,
  bnd = gfc_get_int_expr (array->ts.kind, NULL, 0);
  break;
 
+   case BT_UNSIGNED:
+ bnd = gfc_get_unsigned_expr (array->ts.kind, NULL, 0);
+ break;
+
case BT_LOGICAL:
  bnd = gfc_get_logical_expr (array->ts.kind, NULL, 0);
  break;
diff --git a/gcc/testsuite/gfortran.dg/unsigned_31.f90 
b/gcc/testsuite/gfortran.dg/unsigned_31.f90
new file mode 100644
index ..2a7c08ddba86
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/unsigned_31.f90
@@ -0,0 +1,27 @@
+! { dg-do run }
+! { dg-options "-funsigned" }
+program memain
+  call test1
+  call test2
+contains
+  subroutine test1
+unsigned, dimension(3) :: v
+unsigned, dimension(3,3) :: w, x
+integer, dimension(3) :: shft
+v = [1u, 2u, 3u]
+if (any(eoshift(v,1) /= [2u,3u,0u])) error stop 1
+w = reshape([1u,2u,3u,4u,5u,6u,7u,8u,9u],[3,3])
+x = eoshift(w, shift=[1,-2,1], boundary=10u, dim=1)
+if (any(x /= reshape([2u,3u,10u,10u,10u,4u,8u,9u,10u],[3,3]))) error stop 2
+shft = [2,-1,-2]
+x = eoshift(w,shift=shft,boundary=20u,dim=2)
+if (any(x /= reshape([7u,20u,20u,20u,2u,20u,20u,5u,3u],[3,3]))) error stop 
3
+  end subroutine test1
+  subroutine test2
+unsigned, dimension(3), parameter :: v = eoshift([1u,2u,3u],1)
+unsigned, dimension(3,3), parameter :: w = 
reshape([1u,2u,3u,4u,5u,6u,7u,8u,9u],[3,3])
+unsigned, dimension(3,3), parameter :: x = eoshift(w,shift=[1,-2,1], 
boundary=10u, dim=1)
+if (any(v /= [2u,3u,0u])) error stop 11
+if (any(x /= reshape([2u,3u,10u,10u,10u,4u,8u,9u,10u],[3,3]))) error stop 2
+  end subroutine test2
+end program memain
diff --git a/gcc/testsuite/gfortran.dg/unsigned_32.f90 
b/gcc/testsuite/gfortran.dg/unsigned_32.f90
new file mode 100644
index ..7d41988b0420
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/unsigned_32.f90
@@ -0,0 +1,27 @@
+! { dg-do run }
+! { dg-options "-funsigned" }
+program memain
+  call test1
+  call test2
+contains
+  subroutine test1
+unsigned, dimension(3) :: v
+unsigned, dimension(3,3) :: w, x
+integer, dimension(3) :: shft
+v = [1u, 2u, 3u]
+if (any(cshift(v,1) /= [2u,3u,1u])) error stop 1
+w = reshape([1u,2u,3u,4u,5u,6u,7u,8u,9u],[3,3])
+x = cshift(w, shift=[1,-2,1], dim=1)
+if (any(x /= reshape([2u,3u,1u,5u,6u,4u,8u,9u,7u],[3,3]))) error stop 2
+shft = [2,-1,-2]
+x = cshift(w,shift=shft,dim=2)
+if (any(x /= reshape([7u,8u,6u,1u,2u,9u,4u,5u,3u],[3,3]))) error stop 3
+  end subroutine test1
+  subroutine test2
+unsigned, dimens

[gcc r13-9061] Reduce recursive inlining of always_inline functions

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:2532944e3588cf69bce019eaf03de9c63b78568f

commit r13-9061-g2532944e3588cf69bce019eaf03de9c63b78568f
Author: Jan Hubicka 
Date:   Tue May 14 12:58:56 2024 +0200

Reduce recursive inlining of always_inline functions

this patch tames down inliner on (multiply) self-recursive always_inline 
functions.
While we already have caps on recursive inlining, the testcase combines 
early inliner
and late inliner to get very wide recursive inlining tree.  The basic idea 
is to
ignore DISREGARD_INLINE_LIMITS when deciding on inlining self recursive 
functions
(so we cut on function being large) and clear the flag once it is detected.

I did not include the testcase since it still produces a lot of code and 
would
slow down testing.  It also outputs many inlining failed messages that is 
not
very nice, but it is hard to detect self recursion cycles in full generality
when indirect calls and other tricks may happen.

gcc/ChangeLog:

PR ipa/113291

* ipa-inline.cc (enum can_inline_edge_by_limits_flags): New enum.
(can_inline_edge_by_limits_p): Take flags instead of multiple 
bools; add flag
for forcing inline limits.
(can_early_inline_edge_p): Update.
(want_inline_self_recursive_call_p): Update; use FORCE_LIMITS mode.
(check_callers): Update.
(update_caller_keys): Update.
(update_callee_keys): Update.
(recursive_inlining): Update.
(add_new_edges_to_heap): Update.
(speculation_useful_p): Update.
(inline_small_functions): Clear DECL_DISREGARD_INLINE_LIMITS on 
self recursion.
(flatten_function): Update.
(inline_to_all_callers_1): Update.

(cherry picked from commit 1ec49897253e093e1ef6261eb104ac0c111bac83)

Diff:
---
 gcc/ipa-inline.cc | 79 +--
 1 file changed, 53 insertions(+), 26 deletions(-)

diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc
index 474fbff20574..77cb0726f9f0 100644
--- a/gcc/ipa-inline.cc
+++ b/gcc/ipa-inline.cc
@@ -443,24 +443,33 @@ inline_insns_auto (cgraph_node *n, bool hint, bool hint2)
   return max_inline_insns_auto;
 }
 
+enum can_inline_edge_by_limits_flags
+{
+  /* True if we are early inlining.  */
+  CAN_INLINE_EARLY = 1,
+  /* Ignore size limits.  */
+  CAN_INLINE_DISREGARD_LIMITS = 2,
+  /* Force size limits (ignore always_inline).  This is used for
+ recrusive inlining where always_inline may lead to inline bombs
+ and technically it is non-sential anyway.  */
+  CAN_INLINE_FORCE_LIMITS = 4,
+  /* Report decision to dump file.  */
+  CAN_INLINE_REPORT = 8,
+};
+
 /* Decide if we can inline the edge and possibly update
inline_failed reason.  
We check whether inlining is possible at all and whether
-   caller growth limits allow doing so.  
-
-   if REPORT is true, output reason to the dump file.
-
-   if DISREGARD_LIMITS is true, ignore size limits.  */
+   caller growth limits allow doing so.  */
 
 static bool
-can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report,
-bool disregard_limits = false, bool early = false)
+can_inline_edge_by_limits_p (struct cgraph_edge *e, int flags)
 {
   gcc_checking_assert (e->inline_failed);
 
   if (cgraph_inline_failed_type (e->inline_failed) == CIF_FINAL_ERROR)
 {
-  if (report)
+  if (flags & CAN_INLINE_REPORT)
 report_inline_failed_reason (e);
   return false;
 }
@@ -474,10 +483,11 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
   tree callee_tree
 = callee ? DECL_FUNCTION_SPECIFIC_OPTIMIZATION (callee->decl) : NULL;
   /* Check if caller growth allows the inlining.  */
-  if (!DECL_DISREGARD_INLINE_LIMITS (callee->decl)
-  && !disregard_limits
-  && !lookup_attribute ("flatten",
-DECL_ATTRIBUTES (caller->decl))
+  if (!(flags & CAN_INLINE_DISREGARD_LIMITS)
+  && ((flags & CAN_INLINE_FORCE_LIMITS)
+ || (!DECL_DISREGARD_INLINE_LIMITS (callee->decl)
+ && !lookup_attribute ("flatten",
+DECL_ATTRIBUTES (caller->decl
   && !caller_growth_limits (e))
 inlinable = false;
   else if (callee->externally_visible
@@ -505,7 +515,7 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
to inline library always_inline functions. See PR65873.
Disable the check for early inlining for now until better solution
is found.  */
- if (always_inline && early)
+ if (always_inline && (flags & CAN_INLINE_EARLY))
;
   /* There are some options that change IL semantics which means
  we cannot inline in these cases for correctness reason.
@@ -541,7 +551,7 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool 
report,
  /* When devirtualization is disabled for callee, it is not safe
  

[gcc r14-10717] Zen5 tuning part 1: avoid FMA chains

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:fce2fe0406aa66c5d6f6465984a6af9ccc63370d

commit r14-10717-gfce2fe0406aa66c5d6f6465984a6af9ccc63370d
Author: Jan Hubicka 
Date:   Tue Sep 3 13:38:33 2024 +0200

Zen5 tuning part 1: avoid FMA chains

testing matrix multiplication benchmarks shows that FMA on a critical chain
is a performance loss over separate multiply and add. While the latency of 4
is lower than multiply + add (3+2) the problem is that all values needs to
be ready before computation starts.

While on znver4 AVX512 code fared well with FMA, it was because of the split
registers. Znver5 benefits from avoiding FMA on all widths.  This may be 
different
with the mobile version though.

On naive matrix multiplication benchmark the difference is 8% with -O3
only since with -Ofast loop interchange solves the problem differently.
It is 30% win, for example, on S323 from TSVC:

real_t s323(struct args_t * func_args)
{

//recurrences
//coupled recurrence

initialise_arrays(__func__);
gettimeofday(&func_args->t1, NULL);

for (int nl = 0; nl < iterations/2; nl++) {
for (int i = 1; i < LEN_1D; i++) {
a[i] = b[i-1] + c[i] * d[i];
b[i] = a[i] + c[i] * e[i];
}
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}

gettimeofday(&func_args->t2, NULL);
return calc_checksum(__func__);
}

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable 
for
znver5.
(X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
(X86_TUNE_AVOID_512FMA_CHAINS): Likewise.

(cherry picked from commit d6360b4083695970789fd65b9c515c11a5ce25b4)

Diff:
---
 gcc/config/i386/x86-tune.def | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 1ab2f444b569..4a3bd15d0ad2 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -515,17 +515,18 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
"use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
   | m_YONGFENG | m_GENERIC)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3 | m_ZNVER4
- | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
+ m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
+ | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
for v2df vector reduction.  */


[gcc r14-10718] Zen5 tuning part 2: disable gather and scatter

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:3d0a91130eceaf428387ba314cfdfceb99b51709

commit r14-10718-g3d0a91130eceaf428387ba314cfdfceb99b51709
Author: Jan Hubicka 
Date:   Tue Sep 3 15:07:41 2024 +0200

Zen5 tuning part 2: disable gather and scatter

We disable gathers for zen4.  It seems that gather has improved a bit 
compared
to zen4 and Zen5 optimization manual suggests "Avoid GATHER instructions 
when
the indices are known ahead of time. Vector loads followed by shuffles 
result
in a higher load bandwidth." however the situation seems to be more
complicated.

gather is 5-10% loss on parest benchmark as well as 30% loss on sparse dot
products in TSVC. Curiously enough breaking these out into microbenchmark
reversed the situation and it turns out that the performance depends on
how indices are distributed.  gather is loss if indices are sequential,
neutral if they are random and win for some strides (4, 8).

This seems to be similar to earlier zens, so I think (especially for
backporting znver5 support) that it makes sense to be consistent and disable
gather unless we work out a good heuristics on when to use it. Since we
typically do not know the indices in advance, I don't see how that can be 
done.

I opened PR116582 with some examples of wins and losses

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Disable for
ZNVER5.
(X86_TUNE_USE_SCATTER_2PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_8PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_8PARTS): Disable for ZNVER5.

(cherry picked from commit d82edbe92eed53a479736fcbbe6d54d0fb42daa4)

Diff:
---
 gcc/config/i386/x86-tune.def | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 4a3bd15d0ad2..01324a88a3be 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -483,35 +483,35 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, 
"avoid_4byte_prefixes",
 /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+ ~(m_ZNVER | m_CORE_HYBRID
| m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+ ~(m_ZNVER | m_CORE_HYBRID
| m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID | 
m_CORE_ATOM
| m_YONGFENG | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */


[gcc r14-10721] Zen5 tuning part 4: update reassociation width

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:b17cb7ed709ea7250eaa4ddc4a713ebbb6b94b37

commit r14-10721-gb17cb7ed709ea7250eaa4ddc4a713ebbb6b94b37
Author: Jan Hubicka 
Date:   Tue Sep 3 18:20:34 2024 +0200

Zen5 tuning part 4: update reassociation width

Zen5 has 6 instead of 4 ALUs and the integer multiplication can now execute 
in
3 of them.  FP units can do 2 additions and 2 multiplications with latency 2
and 3.  This patch updates reassociation width accordingly.  This has 
potential
of increasing register pressure but unlike while benchmarking znver1 tuning
I did not noticed this actually causing problem on spec, so this patch bumps
up reassociation width to 6 for everything except for integer vectors, where
there are 4 units with typical latency of 1.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_reassociation_width): Update for Znver5.
* config/i386/x86-tune-costs.h (znver5_costs): Update reassociation
widths.

(cherry picked from commit f0ab3de6ec0e3540f2e57f3f5628005f0a4e3fa5)

Diff:
---
 gcc/config/i386/i386.cc  | 10 +++---
 gcc/config/i386/x86-tune-costs.h | 23 +--
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 93d05a301c92..2a0a79888be3 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24537,13 +24537,17 @@ ix86_reassociation_width (unsigned int op, 
machine_mode mode)
   if (width == 1)
return 1;
 
-  /* Integer vector instructions execute in FP unit
+  /* Znver1-4 Integer vector instructions execute in FP unit
 and can execute 3 additions and one multiplication per cycle.  */
   if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2
-  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4
-  || ix86_tune == PROCESSOR_ZNVER5)
+  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4)
  && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
return 1;
+  /* Znver5 can do 2 integer multiplications per cycle with latency
+of 3.  */
+  if (ix86_tune == PROCESSOR_ZNVER5
+ && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
+   width = 6;
 
   /* Account for targets that splits wide vectors into multiple parts.  */
   if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 8348ab8230ad..da36d2adfeca 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2100,16 +2100,19 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (13),  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (20),  /* cost of SQRTSD instruction.  */
-  /* Zen can execute 4 integer operations per cycle.  FP operations
- take 3 cycles and it can execute 2 integer additions and 2
- multiplications thus reassociation may make sense up to with of 6.
- SPEC2k6 bencharks suggests
- that 4 works better than 6 probably due to register pressure.
-
- Integer vector operations are taken by FP unit and execute 3 vector
- plus/minus operations per cycle but only one multiply.  This is adjusted
- in ix86_reassociation_width.  */
-  4, 4, 3, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
+  /* Zen5 can execute:
+  - integer ops: 6 per cycle, at most 3 multiplications.
+   latency 1 for additions, 3 for multiplications (pipelined)
+
+   Setting width of 9 for multiplication is probably excessive
+   for register pressure.
+  - fp ops: 2 additions per cycle, latency 2-3
+   2 multiplicaitons per cycle, latency 3
+  - vector intger ops: 4 additions, latency 1
+  2 multiplications, latency 4
+   We increase width to 6 for multiplications
+   in ix86_reassociation_width.  */
+  6, 6, 4, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),   /* cond_taken_branch_cost.  */


[gcc r13-9062] Add AMD znver5 processor enablement with scheduler model

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:499afa8e6899d8e866bbd1e6cc340e5a52557883

commit r13-9062-g499afa8e6899d8e866bbd1e6cc340e5a52557883
Author: Jan Hubicka 
Date:   Mon Mar 18 10:22:44 2024 +0100

Add AMD znver5 processor enablement with scheduler model

2024-02-14  Jan Hubicka  
Karthiban Anbazhagan  

gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_amd_cpu): Recognize znver5.
* common/config/i386/i386-common.cc (processor_names): Add znver5.
(processor_alias_table): Likewise.
* common/config/i386/i386-cpuinfo.h (processor_types): Add new zen
family.
(processor_subtypes): Add znver5.
* config.gcc (x86_64-*-* |...): Likewise.
* config/i386/driver-i386.cc (host_detect_local_cpu): Let
march=native detect znver5 cpu's.
* config/i386/i386-c.cc (ix86_target_macros_internal): Add
znver5.
* config/i386/i386-options.cc (m_ZNVER5): New definition
(processor_cost_table): Add znver5.
* config/i386/i386.cc (ix86_reassociation_width): Likewise.
* config/i386/i386.h (processor_type): Add PROCESSOR_ZNVER5
(PTA_ZNVER5): New definition.
* config/i386/i386.md (define_attr "cpu"): Add znver5.
(Scheduling descriptions) Add znver5.md.
* config/i386/x86-tune-costs.h (znver5_cost): New definition.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Add znver5.
(ix86_adjust_cost): Likewise.
* config/i386/x86-tune.def (avx512_move_by_pieces): Add m_ZNVER5.
(avx512_store_by_pieces): Add m_ZNVER5.
* doc/extend.texi: Add znver5.
* doc/invoke.texi: Likewise.
* config/i386/znver4.md: Rename to zn4zn5.md; combine znver4 and 
znver5 Scheduler.

gcc/testsuite/ChangeLog:
* g++.target/i386/mv29.C: Handle znver5 arch.
* gcc.target/i386/funcspec-56.inc:Likewise.

(cherry picked from commit d0aa0af9a9b7dd709a8c7ff6604ed6b7da0fc23a)

Diff:
---
 gcc/common/config/i386/cpuinfo.h  |  16 +
 gcc/common/config/i386/i386-common.cc |   6 +-
 gcc/common/config/i386/i386-cpuinfo.h |   2 +
 gcc/config.gcc|  14 +-
 gcc/config/i386/driver-i386.cc|   5 +
 gcc/config/i386/i386-c.cc |   7 +
 gcc/config/i386/i386-options.cc   |   6 +-
 gcc/config/i386/i386.cc   |   3 +-
 gcc/config/i386/i386.h|   3 +
 gcc/config/i386/i386.md   |   4 +-
 gcc/config/i386/x86-tune-costs.h  | 136 +
 gcc/config/i386/x86-tune-sched.cc |   2 +
 gcc/config/i386/x86-tune.def  |   4 +-
 gcc/config/i386/{znver4.md => zn4zn5.md}  | 817 --
 gcc/doc/extend.texi   |   3 +
 gcc/doc/invoke.texi   |  10 +
 gcc/testsuite/g++.target/i386/mv29.C  |   6 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 18 files changed, 985 insertions(+), 61 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 441fae0cdc9f..a2e28e47a7d2 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -310,6 +310,22 @@ get_amd_cpu (struct __processor_model *cpu_model,
  cpu_model->__cpu_subtype = AMDFAM19H_ZNVER3;
}
   break;
+case 0x1a:
+  cpu_model->__cpu_type = AMDFAM1AH;
+  if (model <= 0x77)
+   {
+ cpu = "znver5";
+ CHECK___builtin_cpu_is ("znver5");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+   }
+  else if (has_cpu_feature (cpu_model, cpu_features2,
+   FEATURE_AVX512VP2INTERSECT))
+   {
+ cpu = "znver5";
+ CHECK___builtin_cpu_is ("znver5");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+   }
+  break;
 default:
   break;
 }
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index a8809889360b..f36101558077 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1983,7 +1983,8 @@ const char *const processor_names[] =
   "znver1",
   "znver2",
   "znver3",
-  "znver4"
+  "znver4",
+  "znver5"
 };
 
 /* Guarantee that the array is aligned with enum processor_type.  */
@@ -2243,6 +2244,9 @@ const pta processor_alias_table[] =
   {"znver4", PROCESSOR_ZNVER4, CPU_ZNVER4,
 PTA_ZNVER4,
 M_CPU_SUBTYPE (AMDFAM19H_ZNVER4), P_PROC_AVX512F},
+  {"znver5", PROCESSOR_ZNVER5, CPU_ZNVER5,
+PTA_ZNVER5,
+M_CPU_SUBTYPE (AMDFAM1AH_ZNVER5), P_PROC_AVX512F},
   {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
   | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
diff --git a/gcc/common

[gcc r13-9063] Fixup unaligned load/store cost for znver5

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:22f4e4a5043fa69c54b1010d04bcd77958646189

commit r13-9063-g22f4e4a5043fa69c54b1010d04bcd77958646189
Author: Richard Biener 
Date:   Tue Jul 16 10:45:27 2024 +0200

Fixup unaligned load/store cost for znver5

Currently unaligned YMM and ZMM load and store costs are cheaper than
aligned which causes the vectorizer to purposely mis-align accesses
by adding an alignment prologue.  It looks like the unaligned costs
were simply copied from the bogus znver4 costs.  The following makes
the unaligned costs equal to the aligned costs like in the fixed znver4
version.

* config/i386/x86-tune-costs.h (znver5_cost): Update unaligned
load and store cost from the aligned costs.

(cherry picked from commit 896393791ee34ffc176c87d232dfee735db3aaab)

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 4d3194323e14..02fad74c4d1c 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2060,8 +2060,8 @@ struct processor_costs znver5_cost = {
   in 32bit, 64bit, 128bit, 256bit and 
512bit */
   {8, 8, 8, 12, 12},   /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 
512bit */
-  {6, 6, 6, 6, 6}, /* cost of unaligned loads.  */
-  {8, 8, 8, 8, 8}, /* cost of unaligned stores.  */
+  {6, 6, 10, 10, 12},  /* cost of unaligned loads.  */
+  {8, 8, 8, 12, 12},   /* cost of unaligned stores.  */
   2, 2, 2, /* cost of moving XMM,YMM,ZMM
   register.  */
   6,   /* cost of moving SSE register to 
integer.  */


[gcc r14-10719] Zen5 tuning part 3: scheduler tweaks

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:2c01292411044adbd67f79355c1e24decd2fd3c0

commit r14-10719-g2c01292411044adbd67f79355c1e24decd2fd3c0
Author: Jan Hubicka 
Date:   Tue Sep 3 16:26:16 2024 +0200

Zen5 tuning part 3: scheduler tweaks

this patch adds support for new fusion in znver5 documented in the
optimization manual:

   The Zen5 microarchitecture adds support to fuse reg-reg MOV Instructions
   with certain ALU instructions. The following conditions need to be met 
for
   fusion to happen:
 - The MOV should be reg-reg mov with Opcode 0x89 or 0x8B
 - The MOV is followed by an ALU instruction where the MOV and ALU 
destination register match.
 - The ALU instruction may source only registers or immediate data. 
There cannot be any memory source.
 - The ALU instruction sources either the source or dest of MOV 
instruction.
 - If ALU instruction has 2 reg sources, they should be different.
 - The following ALU instructions can fuse with an older qualified MOV 
instruction:
   ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR
   (I assume OP is OR)

I also increased issue rate from 4 to 6.  Theoretically znver5 can do more, 
but
with our model we can't really use it.
Increasing issue rate to 8 leads to infinite loop in scheduler.

Finally, I also enabled fuse_alu_and_branch since it is supported by
znver5 (I think by earlier zens too).

New fusion pattern moves quite a few instructions around in common code:
@@ -2210,13 +2210,13 @@
.cfi_offset 3, -32
leaq63(%rsi), %rbx
movq%rbx, %rbp
+   shrq$6, %rbp
+   salq$3, %rbp
subq$16, %rsp
.cfi_def_cfa_offset 48
movq%rdi, %r12
-   shrq$6, %rbp
-   movq%rsi, 8(%rsp)
-   salq$3, %rbp
movq%rbp, %rdi
+   movq%rsi, 8(%rsp)
call_Znwm
movq8(%rsp), %rsi
movl$0, 8(%r12)
@@ -2224,8 +2224,8 @@
movq%rax, (%r12)
movq%rbp, 32(%r12)
testq   %rsi, %rsi
-   movq%rsi, %rdx
cmovns  %rsi, %rbx
+   movq%rsi, %rdx
sarq$63, %rdx
shrq$58, %rdx
sarq$6, %rbx
which should help decoder bandwidth and perhaps also cache, though I was not
able to measure off-noise effect on SPEC.

gcc/ChangeLog:

* config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Updat for znver5.
(ix86_adjust_cost): Add TODO about znver5 memory latency.
(ix86_fuse_mov_alu_p): New.
(ix86_macro_fusion_pair_p): Use it.
* config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add 
ZNVER5.
(X86_TUNE_FUSE_MOV_AND_ALU): New tune;

(cherry picked from commit e2125a600552bc6e0329e3f1224eea14804db8d3)

Diff:
---
 gcc/config/i386/i386.h|  2 ++
 gcc/config/i386/x86-tune-sched.cc | 67 ++-
 gcc/config/i386/x86-tune.def  | 11 +--
 3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 26e15d2677fb..2de838ef15ce 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -427,6 +427,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
 #define TARGET_FUSE_ALU_AND_BRANCH \
ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_FUSE_MOV_AND_ALU \
+   ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
 #define TARGET_AVOID_LEA_FOR_ADDR \
ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index 578ba57e6b22..07b79876c36f 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -69,7 +69,6 @@ ix86_issue_rate (void)
 case PROCESSOR_ZNVER2:
 case PROCESSOR_ZNVER3:
 case PROCESSOR_ZNVER4:
-case PROCESSOR_ZNVER5:
 case PROCESSOR_CORE2:
 case PROCESSOR_NEHALEM:
 case PROCESSOR_SANDYBRIDGE:
@@ -92,6 +91,13 @@ ix86_issue_rate (void)
   return 5;
 
 case PROCESSOR_SAPPHIRERAPIDS:
+/* For znver5 decoder can handle 4 or 8 instructions per cycle,
+   op cache 12 instruction/cycle, dispatch 8 instructions
+   integer rename 8 instructions and Fp 6 instructions.
+
+   The scheduler, without understanding out of order nature of the CPU
+   is unlikely going to be able to fill all of these.  */
+case PROCESSOR_ZNVER5:
   return 6;
 
 default:
@@ -435,6 +441,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn 
*dep_insn, int cost,
  enum attr_un

[gcc r14-10720] Zen5 tuning part 3: fix typo in previous patch

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:2eade72b0e2ac9dd18ef517bc3b868157f1ddf48

commit r14-10720-g2eade72b0e2ac9dd18ef517bc3b868157f1ddf48
Author: Jan Hubicka 
Date:   Tue Sep 3 17:25:05 2024 +0200

Zen5 tuning part 3: fix typo in previous patch

gcc/ChangeLog:

* config/i386/x86-tune-sched.cc (ix86_fuse_mov_alu_p): Fix
typo.

(cherry picked from commit 910e1769a0653ac32bd8c1d6aabb39c797d5d773)

Diff:
---
 gcc/config/i386/x86-tune-sched.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index 07b79876c36f..746f23b3cbc4 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -615,7 +615,7 @@ ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
   /* One of operands should be register.  */
   if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
 std::swap (op0, op1);
-  if (!REG_P (op0) || REGNO (op1) != REGNO (reg))
+  if (!REG_P (op0) || REGNO (op0) != REGNO (reg))
 return false;
   if (op1
   && !REG_P (op1)


[gcc r12-10732] Add AMD znver5 processor enablement with scheduler model

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:54806268b47775449c7e237f8f03e922d6da26f6

commit r12-10732-g54806268b47775449c7e237f8f03e922d6da26f6
Author: Jan Hubicka 
Date:   Mon Mar 18 10:22:44 2024 +0100

Add AMD znver5 processor enablement with scheduler model

2024-02-14  Jan Hubicka  
Karthiban Anbazhagan  

gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_amd_cpu): Recognize znver5.
* common/config/i386/i386-common.cc (processor_names): Add znver5.
(processor_alias_table): Likewise.
* common/config/i386/i386-cpuinfo.h (processor_types): Add new zen
family.
(processor_subtypes): Add znver5.
* config.gcc (x86_64-*-* |...): Likewise.
* config/i386/driver-i386.cc (host_detect_local_cpu): Let
march=native detect znver5 cpu's.
* config/i386/i386-c.cc (ix86_target_macros_internal): Add
znver5.
* config/i386/i386-options.cc (m_ZNVER5): New definition
(processor_cost_table): Add znver5.
* config/i386/i386.cc (ix86_reassociation_width): Likewise.
* config/i386/i386.h (processor_type): Add PROCESSOR_ZNVER5
(PTA_ZNVER5): New definition.
* config/i386/i386.md (define_attr "cpu"): Add znver5.
(Scheduling descriptions) Add znver5.md.
* config/i386/x86-tune-costs.h (znver5_cost): New definition.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Add znver5.
(ix86_adjust_cost): Likewise.
* config/i386/x86-tune.def (avx512_move_by_pieces): Add m_ZNVER5.
(avx512_store_by_pieces): Add m_ZNVER5.
* doc/extend.texi: Add znver5.
* doc/invoke.texi: Likewise.
* config/i386/znver4.md: Rename to zn4zn5.md; combine znver4 and 
znver5 Scheduler.

gcc/testsuite/ChangeLog:
* g++.target/i386/mv29.C: Handle znver5 arch.
* gcc.target/i386/funcspec-56.inc:Likewise.

(cherry picked from commit d0aa0af9a9b7dd709a8c7ff6604ed6b7da0fc23a)

Diff:
---
 gcc/common/config/i386/cpuinfo.h  |  16 +
 gcc/common/config/i386/i386-common.cc |   6 +-
 gcc/common/config/i386/i386-cpuinfo.h |   2 +
 gcc/config.gcc|  14 +-
 gcc/config/i386/driver-i386.cc|   5 +
 gcc/config/i386/i386-c.cc |   7 +
 gcc/config/i386/i386-options.cc   |   6 +-
 gcc/config/i386/i386.cc   |   3 +-
 gcc/config/i386/i386.h|   3 +
 gcc/config/i386/i386.md   |   4 +-
 gcc/config/i386/x86-tune-costs.h  | 134 +
 gcc/config/i386/x86-tune-sched.cc |   2 +
 gcc/config/i386/x86-tune.def  |   4 +-
 gcc/config/i386/{znver4.md => zn4zn5.md}  | 817 --
 gcc/doc/extend.texi   |   3 +
 gcc/doc/invoke.texi   |  10 +
 gcc/testsuite/g++.target/i386/mv29.C  |   6 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 18 files changed, 983 insertions(+), 61 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 316ad3cb3e9b..d79534331f77 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -282,6 +282,22 @@ get_amd_cpu (struct __processor_model *cpu_model,
  cpu_model->__cpu_subtype = AMDFAM19H_ZNVER3;
}
   break;
+case 0x1a:
+  cpu_model->__cpu_type = AMDFAM1AH;
+  if (model <= 0x77)
+   {
+ cpu = "znver5";
+ CHECK___builtin_cpu_is ("znver5");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+   }
+  else if (has_cpu_feature (cpu_model, cpu_features2,
+   FEATURE_AVX512VP2INTERSECT))
+   {
+ cpu = "znver5";
+ CHECK___builtin_cpu_is ("znver5");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+   }
+  break;
 default:
   break;
 }
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index e2594cae4cc1..a01172cab2fb 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1831,7 +1831,8 @@ const char *const processor_names[] =
   "znver1",
   "znver2",
   "znver3",
-  "znver4"
+  "znver4",
+  "znver5"
 };
 
 /* Guarantee that the array is aligned with enum processor_type.  */
@@ -2067,6 +2068,9 @@ const pta processor_alias_table[] =
   {"znver4", PROCESSOR_ZNVER4, CPU_ZNVER4,
 PTA_ZNVER4,
 M_CPU_SUBTYPE (AMDFAM19H_ZNVER4), P_PROC_AVX512F},
+  {"znver5", PROCESSOR_ZNVER5, CPU_ZNVER5,
+PTA_ZNVER5,
+M_CPU_SUBTYPE (AMDFAM1AH_ZNVER5), P_PROC_AVX512F},
   {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
   | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
diff --git a/gcc/commo

[gcc r12-10733] Fixup unaligned load/store cost for znver5

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:c77b1c833e84b62928a729556c502e1311782b2d

commit r12-10733-gc77b1c833e84b62928a729556c502e1311782b2d
Author: Richard Biener 
Date:   Tue Jul 16 10:45:27 2024 +0200

Fixup unaligned load/store cost for znver5

Currently unaligned YMM and ZMM load and store costs are cheaper than
aligned which causes the vectorizer to purposely mis-align accesses
by adding an alignment prologue.  It looks like the unaligned costs
were simply copied from the bogus znver4 costs.  The following makes
the unaligned costs equal to the aligned costs like in the fixed znver4
version.

* config/i386/x86-tune-costs.h (znver5_cost): Update unaligned
load and store cost from the aligned costs.

(cherry picked from commit 896393791ee34ffc176c87d232dfee735db3aaab)

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 11a9dd0ff9ed..b8e7ab9372ea 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2028,8 +2028,8 @@ struct processor_costs znver5_cost = {
   in 32bit, 64bit, 128bit, 256bit and 
512bit */
   {8, 8, 8, 12, 12},   /* cost of storing SSE register
   in 32bit, 64bit, 128bit, 256bit and 
512bit */
-  {6, 6, 6, 6, 6}, /* cost of unaligned loads.  */
-  {8, 8, 8, 8, 8}, /* cost of unaligned stores.  */
+  {6, 6, 10, 10, 12},  /* cost of unaligned loads.  */
+  {8, 8, 8, 12, 12},   /* cost of unaligned stores.  */
   2, 2, 2, /* cost of moving XMM,YMM,ZMM
   register.  */
   6,   /* cost of moving SSE register to 
integer.  */


[gcc r13-9065] Re-add m_ZNVER4 to X86_TUNE_AVOID_256FMA_CHAINS

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:ad9ba1eccec5086b84f1030fb3e87947242ba904

commit r13-9065-gad9ba1eccec5086b84f1030fb3e87947242ba904
Author: Jan Hubicka 
Date:   Sun Sep 29 02:10:14 2024 +0200

Re-add m_ZNVER4 to X86_TUNE_AVOID_256FMA_CHAINS

* config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Re-add 
m_ZNVER4
accidentally removed during znver5 merge.

Diff:
---
 gcc/config/i386/x86-tune.def | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 0ef75e986be9..629e1fdf5f77 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -518,7 +518,7 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", 
m_ZNVER)
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3
- | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER5)
+ | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER4 
| m_ZNVER5)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */


[gcc r13-9064] Zen5 tuning part 1: avoid FMA chains

2024-09-28 Thread Jan Hubicka via Gcc-cvs
https://gcc.gnu.org/g:7c0c772e4fb89bf4d9bc09f7d8e41c6bc0b0e093

commit r13-9064-g7c0c772e4fb89bf4d9bc09f7d8e41c6bc0b0e093
Author: Jan Hubicka 
Date:   Tue Sep 3 13:38:33 2024 +0200

Zen5 tuning part 1: avoid FMA chains

testing matrix multiplication benchmarks shows that FMA on a critical chain
is a performance loss over separate multiply and add. While the latency of 4
is lower than multiply + add (3+2) the problem is that all values needs to
be ready before computation starts.

While on znver4 AVX512 code fared well with FMA, it was because of the split
registers. Znver5 benefits from avoiding FMA on all widths.  This may be 
different
with the mobile version though.

On naive matrix multiplication benchmark the difference is 8% with -O3
only since with -Ofast loop interchange solves the problem differently.
It is 30% win, for example, on S323 from TSVC:

real_t s323(struct args_t * func_args)
{

//recurrences
//coupled recurrence

initialise_arrays(__func__);
gettimeofday(&func_args->t1, NULL);

for (int nl = 0; nl < iterations/2; nl++) {
for (int i = 1; i < LEN_1D; i++) {
a[i] = b[i-1] + c[i] * d[i];
b[i] = a[i] + c[i] * e[i];
}
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}

gettimeofday(&func_args->t2, NULL);
return calc_checksum(__func__);
}

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable 
for
znver5.
(X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
(X86_TUNE_AVOID_512FMA_CHAINS): Likewise.

(cherry picked from commit d6360b4083695970789fd65b9c515c11a5ce25b4)

Diff:
---
 gcc/config/i386/x86-tune.def | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 9cc44e2b628c..0ef75e986be9 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -513,16 +513,16 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
"use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3
- | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER4)
+ | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER5)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
for v2df vector reduction.  */


[gcc r14-10716] x86: Don't use address override with segment register

2024-09-28 Thread H.J. Lu via Gcc-cvs
https://gcc.gnu.org/g:25cb153f93bb9ff3543ba8e31bbe7be4f6168aa4

commit r14-10716-g25cb153f93bb9ff3543ba8e31bbe7be4f6168aa4
Author: H.J. Lu 
Date:   Wed Sep 25 16:39:04 2024 +0800

x86: Don't use address override with segment register

Address override only applies to the (reg32) part in the thread address
fs:(reg32).  Don't rewrite thread address like

(set (reg:CCZ 17 flags)
(compare:CCZ (reg:SI 98 [ __gmpfr_emax.0_1 ])
(mem/c:SI (plus:SI (plus:SI (unspec:SI [
(const_int 0 [0])
] UNSPEC_TP)
(reg:SI 107))
(const:SI (unspec:SI [
(symbol_ref:SI ("previous_emax") [flags 0x1a] 
)
] UNSPEC_DTPOFF))) [1 previous_emax+0 S4 A32])))

if address override is used to avoid the invalid memory operand like

cmpl%fs:previous_emax@dtpoff(%eax), %r12d

gcc/

PR target/116839
* config/i386/i386.cc (ix86_rewrite_tls_address_1): Make it
static.  Return if TLS address is thread register plus an integer
register.

gcc/testsuite/

PR target/116839
* gcc.target/i386/pr116839.c: New file.

Signed-off-by: H.J. Lu 
(cherry picked from commit c79cc30862d7255ca15884aa956d1ccfa279d86a)

Diff:
---
 gcc/config/i386/i386.cc  |  9 +-
 gcc/testsuite/gcc.target/i386/pr116839.c | 48 
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 8f1c1f9ccd0a..93d05a301c92 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -12458,7 +12458,7 @@ ix86_tls_address_pattern_p (rtx op)
 }
 
 /* Rewrite *LOC so that it refers to a default TLS address space.  */
-void
+static void
 ix86_rewrite_tls_address_1 (rtx *loc)
 {
   subrtx_ptr_iterator::array_type array;
@@ -12480,6 +12480,13 @@ ix86_rewrite_tls_address_1 (rtx *loc)
  if (GET_CODE (u) == UNSPEC
  && XINT (u, 1) == UNSPEC_TP)
{
+ /* NB: Since address override only applies to the
+(reg32) part in fs:(reg32), return if address
+override is used.  */
+ if (Pmode != word_mode
+ && REG_P (XEXP (*x, 1 - i)))
+   return;
+
  addr_space_t as = DEFAULT_TLS_SEG_REG;
 
  *x = XEXP (*x, 1 - i);
diff --git a/gcc/testsuite/gcc.target/i386/pr116839.c 
b/gcc/testsuite/gcc.target/i386/pr116839.c
new file mode 100644
index ..e5df82562518
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116839.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target maybe_x32 } */
+/* { dg-options "-mx32 -O2 -fPIC -mtls-dialect=gnu2" } */
+/* { dg-final { scan-assembler-not "cmpl\[ 
\t\]+%fs:previous_emax@dtpoff\\(%eax\\)" } } */
+
+typedef long mpfr_prec_t;
+typedef long mpfr_exp_t;
+typedef struct {
+  mpfr_prec_t _mpfr_prec;
+} __mpfr_struct;
+typedef __mpfr_struct mpfr_t[1];
+extern _Thread_local mpfr_exp_t __gmpfr_emax;
+static _Thread_local mpfr_exp_t previous_emax;
+static _Thread_local mpfr_t bound_emax;
+extern const mpfr_t __gmpfr_const_log2_RNDD;
+extern const mpfr_t __gmpfr_const_log2_RNDU;
+
+typedef enum {
+  MPFR_RNDN=0,
+  MPFR_RNDZ,
+  MPFR_RNDU,
+  MPFR_RNDD,
+  MPFR_RNDA,
+  MPFR_RNDF,
+  MPFR_RNDNA=-1
+} mpfr_rnd_t;
+typedef __mpfr_struct *mpfr_ptr;
+typedef const __mpfr_struct *mpfr_srcptr;
+void mpfr_mul (mpfr_ptr, mpfr_srcptr, mpfr_rnd_t);
+
+void
+foo (void)
+{
+  mpfr_exp_t saved_emax;
+
+  if (__gmpfr_emax != previous_emax)
+{
+  saved_emax = __gmpfr_emax;
+
+  bound_emax->_mpfr_prec = 32;
+
+  mpfr_mul (bound_emax, saved_emax < 0 ?
+__gmpfr_const_log2_RNDD : __gmpfr_const_log2_RNDU,
+MPFR_RNDU);
+  previous_emax = saved_emax;
+  __gmpfr_emax = saved_emax;
+}
+}


[gcc r12-10731] x86: Don't use address override with segment register

2024-09-28 Thread H.J. Lu via Gcc-cvs
https://gcc.gnu.org/g:2e66eb7e7eae82bcd6675e79eabbdd6decfa9fe5

commit r12-10731-g2e66eb7e7eae82bcd6675e79eabbdd6decfa9fe5
Author: H.J. Lu 
Date:   Wed Sep 25 16:39:04 2024 +0800

x86: Don't use address override with segment register

Address override only applies to the (reg32) part in the thread address
fs:(reg32).  Don't rewrite thread address like

(set (reg:CCZ 17 flags)
(compare:CCZ (reg:SI 98 [ __gmpfr_emax.0_1 ])
(mem/c:SI (plus:SI (plus:SI (unspec:SI [
(const_int 0 [0])
] UNSPEC_TP)
(reg:SI 107))
(const:SI (unspec:SI [
(symbol_ref:SI ("previous_emax") [flags 0x1a] 
)
] UNSPEC_DTPOFF))) [1 previous_emax+0 S4 A32])))

if address override is used to avoid the invalid memory operand like

cmpl%fs:previous_emax@dtpoff(%eax), %r12d

gcc/

PR target/116839
* config/i386/i386.cc (ix86_rewrite_tls_address_1): Make it
static.  Return if TLS address is thread register plus an integer
register.

gcc/testsuite/

PR target/116839
* gcc.target/i386/pr116839.c: New file.

Signed-off-by: H.J. Lu 
(cherry picked from commit c79cc30862d7255ca15884aa956d1ccfa279d86a)

Diff:
---
 gcc/config/i386/i386.cc  |  9 +-
 gcc/testsuite/gcc.target/i386/pr116839.c | 48 
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b52eb0d5f7b7..bf8553e3dd00 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11787,7 +11787,7 @@ ix86_tls_address_pattern_p (rtx op)
 }
 
 /* Rewrite *LOC so that it refers to a default TLS address space.  */
-void
+static void
 ix86_rewrite_tls_address_1 (rtx *loc)
 {
   subrtx_ptr_iterator::array_type array;
@@ -11809,6 +11809,13 @@ ix86_rewrite_tls_address_1 (rtx *loc)
  if (GET_CODE (u) == UNSPEC
  && XINT (u, 1) == UNSPEC_TP)
{
+ /* NB: Since address override only applies to the
+(reg32) part in fs:(reg32), return if address
+override is used.  */
+ if (Pmode != word_mode
+ && REG_P (XEXP (*x, 1 - i)))
+   return;
+
  addr_space_t as = DEFAULT_TLS_SEG_REG;
 
  *x = XEXP (*x, 1 - i);
diff --git a/gcc/testsuite/gcc.target/i386/pr116839.c 
b/gcc/testsuite/gcc.target/i386/pr116839.c
new file mode 100644
index ..e5df82562518
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116839.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target maybe_x32 } */
+/* { dg-options "-mx32 -O2 -fPIC -mtls-dialect=gnu2" } */
+/* { dg-final { scan-assembler-not "cmpl\[ 
\t\]+%fs:previous_emax@dtpoff\\(%eax\\)" } } */
+
+typedef long mpfr_prec_t;
+typedef long mpfr_exp_t;
+typedef struct {
+  mpfr_prec_t _mpfr_prec;
+} __mpfr_struct;
+typedef __mpfr_struct mpfr_t[1];
+extern _Thread_local mpfr_exp_t __gmpfr_emax;
+static _Thread_local mpfr_exp_t previous_emax;
+static _Thread_local mpfr_t bound_emax;
+extern const mpfr_t __gmpfr_const_log2_RNDD;
+extern const mpfr_t __gmpfr_const_log2_RNDU;
+
+typedef enum {
+  MPFR_RNDN=0,
+  MPFR_RNDZ,
+  MPFR_RNDU,
+  MPFR_RNDD,
+  MPFR_RNDA,
+  MPFR_RNDF,
+  MPFR_RNDNA=-1
+} mpfr_rnd_t;
+typedef __mpfr_struct *mpfr_ptr;
+typedef const __mpfr_struct *mpfr_srcptr;
+void mpfr_mul (mpfr_ptr, mpfr_srcptr, mpfr_rnd_t);
+
+void
+foo (void)
+{
+  mpfr_exp_t saved_emax;
+
+  if (__gmpfr_emax != previous_emax)
+{
+  saved_emax = __gmpfr_emax;
+
+  bound_emax->_mpfr_prec = 32;
+
+  mpfr_mul (bound_emax, saved_emax < 0 ?
+__gmpfr_const_log2_RNDD : __gmpfr_const_log2_RNDU,
+MPFR_RNDU);
+  previous_emax = saved_emax;
+  __gmpfr_emax = saved_emax;
+}
+}


[gcc r13-9060] x86: Don't use address override with segment register

2024-09-28 Thread H.J. Lu via Gcc-cvs
https://gcc.gnu.org/g:bf5d8d44f7a8f90a2ebfe3f28689bc3d86e185fb

commit r13-9060-gbf5d8d44f7a8f90a2ebfe3f28689bc3d86e185fb
Author: H.J. Lu 
Date:   Wed Sep 25 16:39:04 2024 +0800

x86: Don't use address override with segment register

Address override only applies to the (reg32) part in the thread address
fs:(reg32).  Don't rewrite thread address like

(set (reg:CCZ 17 flags)
(compare:CCZ (reg:SI 98 [ __gmpfr_emax.0_1 ])
(mem/c:SI (plus:SI (plus:SI (unspec:SI [
(const_int 0 [0])
] UNSPEC_TP)
(reg:SI 107))
(const:SI (unspec:SI [
(symbol_ref:SI ("previous_emax") [flags 0x1a] 
)
] UNSPEC_DTPOFF))) [1 previous_emax+0 S4 A32])))

if address override is used to avoid the invalid memory operand like

cmpl%fs:previous_emax@dtpoff(%eax), %r12d

gcc/

PR target/116839
* config/i386/i386.cc (ix86_rewrite_tls_address_1): Make it
static.  Return if TLS address is thread register plus an integer
register.

gcc/testsuite/

PR target/116839
* gcc.target/i386/pr116839.c: New file.

Signed-off-by: H.J. Lu 
(cherry picked from commit c79cc30862d7255ca15884aa956d1ccfa279d86a)

Diff:
---
 gcc/config/i386/i386.cc  |  9 +-
 gcc/testsuite/gcc.target/i386/pr116839.c | 48 
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index e4d3ce1acc87..b39d18ab5767 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11852,7 +11852,7 @@ ix86_tls_address_pattern_p (rtx op)
 }
 
 /* Rewrite *LOC so that it refers to a default TLS address space.  */
-void
+static void
 ix86_rewrite_tls_address_1 (rtx *loc)
 {
   subrtx_ptr_iterator::array_type array;
@@ -11874,6 +11874,13 @@ ix86_rewrite_tls_address_1 (rtx *loc)
  if (GET_CODE (u) == UNSPEC
  && XINT (u, 1) == UNSPEC_TP)
{
+ /* NB: Since address override only applies to the
+(reg32) part in fs:(reg32), return if address
+override is used.  */
+ if (Pmode != word_mode
+ && REG_P (XEXP (*x, 1 - i)))
+   return;
+
  addr_space_t as = DEFAULT_TLS_SEG_REG;
 
  *x = XEXP (*x, 1 - i);
diff --git a/gcc/testsuite/gcc.target/i386/pr116839.c 
b/gcc/testsuite/gcc.target/i386/pr116839.c
new file mode 100644
index ..e5df82562518
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116839.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target maybe_x32 } */
+/* { dg-options "-mx32 -O2 -fPIC -mtls-dialect=gnu2" } */
+/* { dg-final { scan-assembler-not "cmpl\[ 
\t\]+%fs:previous_emax@dtpoff\\(%eax\\)" } } */
+
+typedef long mpfr_prec_t;
+typedef long mpfr_exp_t;
+typedef struct {
+  mpfr_prec_t _mpfr_prec;
+} __mpfr_struct;
+typedef __mpfr_struct mpfr_t[1];
+extern _Thread_local mpfr_exp_t __gmpfr_emax;
+static _Thread_local mpfr_exp_t previous_emax;
+static _Thread_local mpfr_t bound_emax;
+extern const mpfr_t __gmpfr_const_log2_RNDD;
+extern const mpfr_t __gmpfr_const_log2_RNDU;
+
+typedef enum {
+  MPFR_RNDN=0,
+  MPFR_RNDZ,
+  MPFR_RNDU,
+  MPFR_RNDD,
+  MPFR_RNDA,
+  MPFR_RNDF,
+  MPFR_RNDNA=-1
+} mpfr_rnd_t;
+typedef __mpfr_struct *mpfr_ptr;
+typedef const __mpfr_struct *mpfr_srcptr;
+void mpfr_mul (mpfr_ptr, mpfr_srcptr, mpfr_rnd_t);
+
+void
+foo (void)
+{
+  mpfr_exp_t saved_emax;
+
+  if (__gmpfr_emax != previous_emax)
+{
+  saved_emax = __gmpfr_emax;
+
+  bound_emax->_mpfr_prec = 32;
+
+  mpfr_mul (bound_emax, saved_emax < 0 ?
+__gmpfr_const_log2_RNDD : __gmpfr_const_log2_RNDU,
+MPFR_RNDU);
+  previous_emax = saved_emax;
+  __gmpfr_emax = saved_emax;
+}
+}