[gcc r15-3942] Implement CSHIFT and EOSHIFT for unsigned.
https://gcc.gnu.org/g:1c928004cf0bc2131b6199905d11133d23a7cef2

commit r15-3942-g1c928004cf0bc2131b6199905d11133d23a7cef2
Author: Thomas Koenig
Date:   Sat Sep 28 22:28:59 2024 +0200

    Implement CSHIFT and EOSHIFT for unsigned.

    gcc/fortran/ChangeLog:

        * check.cc (gfc_check_eoshift): Handle BT_UNSIGNED.
        * simplify.cc (gfc_simplify_eoshift): Likewise.
        * gfortran.texi: Document CSHIFT and EOSHIFT for UNSIGNED.

    gcc/testsuite/ChangeLog:

        * gfortran.dg/unsigned_31.f90: New test.
        * gfortran.dg/unsigned_32.f90: New test.

Diff:
---
 gcc/fortran/check.cc                      |  6 ++
 gcc/fortran/gfortran.texi                 |  3 ++-
 gcc/fortran/simplify.cc                   |  4
 gcc/testsuite/gfortran.dg/unsigned_31.f90 | 27 +++
 gcc/testsuite/gfortran.dg/unsigned_32.f90 | 27 +++
 5 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/gcc/fortran/check.cc b/gcc/fortran/check.cc
index 1851cfb8d4ad..1da269f5b725 100644
--- a/gcc/fortran/check.cc
+++ b/gcc/fortran/check.cc
@@ -3073,6 +3073,12 @@ gfc_check_eoshift (gfc_expr *array, gfc_expr *shift, gfc_expr *boundary,
     case BT_CHARACTER:
       break;
 
+    case BT_UNSIGNED:
+      if (flag_unsigned)
+	break;
+
+      gcc_fallthrough();
+
     default:
       gfc_error ("Missing %qs argument to %qs intrinsic at %L for %qs "
 		 "of type %qs", gfc_current_intrinsic_arg[2]->name,

diff --git a/gcc/fortran/gfortran.texi b/gcc/fortran/gfortran.texi
index a5ebadff3bb8..b42d0095e571 100644
--- a/gcc/fortran/gfortran.texi
+++ b/gcc/fortran/gfortran.texi
@@ -2790,7 +2790,8 @@ As of now, the following intrinsics take unsigned arguments:
 @item @code{TRANSFER}
 @item @code{SUM}, @code{PRODUCT}, @code{MATMUL} and @code{DOT_PRODUCT}
 @item @code{IANY}, @code{IALL} and @code{IPARITY}
-@item @code{RANDOM_NUMBER}.
+@item @code{RANDOM_NUMBER}
+@item @code{CSHIFT} and @code{EOSHIFT}.
 @end itemize
 This list will grow in the near future.
 @c -

diff --git a/gcc/fortran/simplify.cc b/gcc/fortran/simplify.cc
index bd2f6485c95e..2f6c3c39dad8 100644
--- a/gcc/fortran/simplify.cc
+++ b/gcc/fortran/simplify.cc
@@ -2630,6 +2630,10 @@ gfc_simplify_eoshift (gfc_expr *array, gfc_expr *shift, gfc_expr *boundary,
 	  bnd = gfc_get_int_expr (array->ts.kind, NULL, 0);
 	  break;
 
+	case BT_UNSIGNED:
+	  bnd = gfc_get_unsigned_expr (array->ts.kind, NULL, 0);
+	  break;
+
 	case BT_LOGICAL:
 	  bnd = gfc_get_logical_expr (array->ts.kind, NULL, 0);
 	  break;

diff --git a/gcc/testsuite/gfortran.dg/unsigned_31.f90 b/gcc/testsuite/gfortran.dg/unsigned_31.f90
new file mode 100644
index ..2a7c08ddba86
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/unsigned_31.f90
@@ -0,0 +1,27 @@
+! { dg-do run }
+! { dg-options "-funsigned" }
+program memain
+ call test1
+ call test2
+contains
+ subroutine test1
+unsigned, dimension(3) :: v
+unsigned, dimension(3,3) :: w, x
+integer, dimension(3) :: shft
+v = [1u, 2u, 3u]
+if (any(eoshift(v,1) /= [2u,3u,0u])) error stop 1
+w = reshape([1u,2u,3u,4u,5u,6u,7u,8u,9u],[3,3])
+x = eoshift(w, shift=[1,-2,1], boundary=10u, dim=1)
+if (any(x /= reshape([2u,3u,10u,10u,10u,4u,8u,9u,10u],[3,3]))) error stop 2
+shft = [2,-1,-2]
+x = eoshift(w,shift=shft,boundary=20u,dim=2)
+if (any(x /= reshape([7u,20u,20u,20u,2u,20u,20u,5u,3u],[3,3]))) error stop 3
+ end subroutine test1
+ subroutine test2
+unsigned, dimension(3), parameter :: v = eoshift([1u,2u,3u],1)
+unsigned, dimension(3,3), parameter :: w = reshape([1u,2u,3u,4u,5u,6u,7u,8u,9u],[3,3])
+unsigned, dimension(3,3), parameter :: x = eoshift(w,shift=[1,-2,1], boundary=10u, dim=1)
+if (any(v /= [2u,3u,0u])) error stop 11
+if (any(x /= reshape([2u,3u,10u,10u,10u,4u,8u,9u,10u],[3,3]))) error stop 2
+ end subroutine test2
+end program memain

diff --git a/gcc/testsuite/gfortran.dg/unsigned_32.f90 b/gcc/testsuite/gfortran.dg/unsigned_32.f90
new file mode 100644
index ..7d41988b0420
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/unsigned_32.f90
@@ -0,0 +1,27 @@
+! { dg-do run }
+! { dg-options "-funsigned" }
+program memain
+ call test1
+ call test2
+contains
+ subroutine test1
+unsigned, dimension(3) :: v
+unsigned, dimension(3,3) :: w, x
+integer, dimension(3) :: shft
+v = [1u, 2u, 3u]
+if (any(cshift(v,1) /= [2u,3u,1u])) error stop 1
+w = reshape([1u,2u,3u,4u,5u,6u,7u,8u,9u],[3,3])
+x = cshift(w, shift=[1,-2,1], dim=1)
+if (any(x /= reshape([2u,3u,1u,5u,6u,4u,8u,9u,7u],[3,3]))) error stop 2
+shft = [2,-1,-2]
+x = cshift(w,shift=shft,dim=2)
+if (any(x /= reshape([7u,8u,6u,1u,2u,9u,4u,5u,3u],[3,3]))) error stop 3
+ end subroutine test1
+ subroutine test2
+unsigned, dimens
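
For readers unfamiliar with the two intrinsics, here is a minimal C sketch of the semantics the tests above exercise on a rank-1 array (illustrative helper names, not GCC internals): CSHIFT rotates elements circularly, while EOSHIFT drops shifted-out elements and fills with a boundary value, which per the gfc_simplify_eoshift hunk defaults to 0 for UNSIGNED.

    #include <stdio.h>

    /* CSHIFT: element i takes the value of element (i+shift) mod n.  */
    static void cshift1 (unsigned *dst, const unsigned *src, int n, int shift)
    {
      for (int i = 0; i < n; i++)
        dst[i] = src[((i + shift) % n + n) % n];
    }

    /* EOSHIFT: shifted-out elements are dropped; BOUNDARY is shifted in.  */
    static void eoshift1 (unsigned *dst, const unsigned *src, int n,
                          int shift, unsigned boundary)
    {
      for (int i = 0; i < n; i++)
        {
          int j = i + shift;
          dst[i] = (j >= 0 && j < n) ? src[j] : boundary;
        }
    }

    int main (void)
    {
      unsigned v[3] = { 1, 2, 3 }, r[3];
      cshift1 (r, v, 3, 1);      /* 2 3 1, as checked in unsigned_32.f90 */
      printf ("%u %u %u\n", r[0], r[1], r[2]);
      eoshift1 (r, v, 3, 1, 0);  /* 2 3 0, as checked in unsigned_31.f90 */
      printf ("%u %u %u\n", r[0], r[1], r[2]);
      return 0;
    }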
[gcc r13-9061] Reduce recursive inlining of always_inline functions
https://gcc.gnu.org/g:2532944e3588cf69bce019eaf03de9c63b78568f

commit r13-9061-g2532944e3588cf69bce019eaf03de9c63b78568f
Author: Jan Hubicka
Date:   Tue May 14 12:58:56 2024 +0200

    Reduce recursive inlining of always_inline functions

    This patch tames down the inliner on (multiply) self-recursive
    always_inline functions.  While we already have caps on recursive
    inlining, the testcase combines the early inliner and the late inliner
    to get a very wide recursive inlining tree.

    The basic idea is to ignore DISREGARD_INLINE_LIMITS when deciding on
    inlining self-recursive functions (so we cut off once the function
    becomes large) and to clear the flag once self-recursion is detected.

    I did not include the testcase since it still produces a lot of code
    and would slow down testing.  It also outputs many "inlining failed"
    messages, which is not very nice, but it is hard to detect
    self-recursion cycles in full generality when indirect calls and other
    tricks may happen.

    gcc/ChangeLog:

        PR ipa/113291
        * ipa-inline.cc (enum can_inline_edge_by_limits_flags): New enum.
        (can_inline_edge_by_limits_p): Take flags instead of multiple
        bools; add flag for forcing inline limits.
        (can_early_inline_edge_p): Update.
        (want_inline_self_recursive_call_p): Update; use FORCE_LIMITS mode.
        (check_callers): Update.
        (update_caller_keys): Update.
        (update_callee_keys): Update.
        (recursive_inlining): Update.
        (add_new_edges_to_heap): Update.
        (speculation_useful_p): Update.
        (inline_small_functions): Clear DECL_DISREGARD_INLINE_LIMITS
        on self recursion.
        (flatten_function): Update.
        (inline_to_all_callers_1): Update.

    (cherry picked from commit 1ec49897253e093e1ef6261eb104ac0c111bac83)

Diff:
---
 gcc/ipa-inline.cc | 79 +--
 1 file changed, 53 insertions(+), 26 deletions(-)

diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc
index 474fbff20574..77cb0726f9f0 100644
--- a/gcc/ipa-inline.cc
+++ b/gcc/ipa-inline.cc
@@ -443,24 +443,33 @@ inline_insns_auto (cgraph_node *n, bool hint, bool hint2)
   return max_inline_insns_auto;
 }
 
+enum can_inline_edge_by_limits_flags
+{
+  /* True if we are early inlining.  */
+  CAN_INLINE_EARLY = 1,
+  /* Ignore size limits.  */
+  CAN_INLINE_DISREGARD_LIMITS = 2,
+  /* Force size limits (ignore always_inline).  This is used for
+     recrusive inlining where always_inline may lead to inline bombs
+     and technically it is non-sential anyway.  */
+  CAN_INLINE_FORCE_LIMITS = 4,
+  /* Report decision to dump file.  */
+  CAN_INLINE_REPORT = 8,
+};
+
 /* Decide if we can inline the edge and possibly update
    inline_failed reason.
    We check whether inlining is possible at all and whether
-   caller growth limits allow doing so.
-
-   if REPORT is true, output reason to the dump file.
-
-   if DISREGARD_LIMITS is true, ignore size limits.  */
+   caller growth limits allow doing so.  */
 
 static bool
-can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report,
-			     bool disregard_limits = false, bool early = false)
+can_inline_edge_by_limits_p (struct cgraph_edge *e, int flags)
 {
   gcc_checking_assert (e->inline_failed);
 
   if (cgraph_inline_failed_type (e->inline_failed) == CIF_FINAL_ERROR)
     {
-      if (report)
+      if (flags & CAN_INLINE_REPORT)
 	report_inline_failed_reason (e);
       return false;
     }
@@ -474,10 +483,11 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report,
   tree callee_tree
     = callee ? DECL_FUNCTION_SPECIFIC_OPTIMIZATION (callee->decl) : NULL;
   /* Check if caller growth allows the inlining.  */
-  if (!DECL_DISREGARD_INLINE_LIMITS (callee->decl)
-      && !disregard_limits
-      && !lookup_attribute ("flatten",
-			    DECL_ATTRIBUTES (caller->decl))
+  if (!(flags & CAN_INLINE_DISREGARD_LIMITS)
+      && ((flags & CAN_INLINE_FORCE_LIMITS)
+	  || (!DECL_DISREGARD_INLINE_LIMITS (callee->decl)
+	      && !lookup_attribute ("flatten",
+				    DECL_ATTRIBUTES (caller->decl))))
      && !caller_growth_limits (e))
    inlinable = false;
  else if (callee->externally_visible
@@ -505,7 +515,7 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report,
	 to inline library always_inline functions. See PR65873.
	 Disable the check for early inlining for now until better solution
	 is found.  */
-      if (always_inline && early)
+      if (always_inline && (flags & CAN_INLINE_EARLY))
	;
      /* There are some options that change IL semantics which means
	 we cannot inline in these cases for correctness reason.
@@ -541,7 +551,7 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report,
      /* When devirtualization is disabled for callee, it is not safe
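
As a rough illustration of the "multiply self-recursive" shape the patch targets (an assumed sketch; the actual PR ipa/113291 testcase was not committed), consider a function with two self-recursive call sites: every level of recursive inlining doubles the number of remaining recursive edges, so an inline tree that honors always_inline without size limits grows exponentially wide.

    /* Hypothetical reduced shape, not the PR testcase.  With
       __attribute__ ((always_inline)) on such a function the inliner
       formerly ignored all size caps on the recursive edges; the patch
       forces the limits (CAN_INLINE_FORCE_LIMITS) instead.  */
    static int fib_like (int n)
    {
      if (n <= 1)
        return n;
      /* Two self-recursive call sites: each inlined level doubles the
         number of recursive edges still to be considered.  */
      return fib_like (n - 1) + fib_like (n - 2);
    }

    int entry (int n)
    {
      return fib_like (n);
    }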
[gcc r14-10717] Zen5 tuning part 1: avoid FMA chains
https://gcc.gnu.org/g:fce2fe0406aa66c5d6f6465984a6af9ccc63370d

commit r14-10717-gfce2fe0406aa66c5d6f6465984a6af9ccc63370d
Author: Jan Hubicka
Date:   Tue Sep 3 13:38:33 2024 +0200

    Zen5 tuning part 1: avoid FMA chains

    Testing matrix multiplication benchmarks shows that FMA on a critical
    chain is a performance loss over separate multiply and add.  While the
    latency of 4 is lower than multiply + add (3+2), the problem is that
    all values need to be ready before the computation starts.

    While on znver4 AVX512 code fared well with FMA, that was because of
    the split registers.  Znver5 benefits from avoiding FMA on all widths.
    This may be different with the mobile version, though.

    On a naive matrix multiplication benchmark the difference is 8% with
    -O3 only, since with -Ofast loop interchange solves the problem
    differently.  It is a 30% win, for example, on s323 from TSVC:

    real_t s323(struct args_t * func_args)
    {
        //    recurrences
        //    coupled recurrence

        initialise_arrays(__func__);
        gettimeofday(&func_args->t1, NULL);

        for (int nl = 0; nl < iterations/2; nl++) {
            for (int i = 1; i < LEN_1D; i++) {
                a[i] = b[i-1] + c[i] * d[i];
                b[i] = a[i] + c[i] * e[i];
            }
            dummy(a, b, c, d, e, aa, bb, cc, 0.);
        }

        gettimeofday(&func_args->t2, NULL);

        return calc_checksum(__func__);
    }

    gcc/ChangeLog:

        * config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable
        for znver5.
        (X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
        (X86_TUNE_AVOID_512FMA_CHAINS): Likewise.

    (cherry picked from commit d6360b4083695970789fd65b9c515c11a5ce25b4)

Diff:
---
 gcc/config/i386/x86-tune.def | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 1ab2f444b569..4a3bd15d0ad2 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -515,17 +515,18 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
 	  | m_YONGFENG | m_GENERIC)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 | m_ZNVER4
-	  | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
+	  m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
+	  | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
    for v2df vector reduction.  */
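
A sketch of the kind of critical chain this tuning is about (using the latencies quoted in the commit message, not new measurements): in the reduction below every fused multiply-add depends on the previous one, so with FMA the loop is limited by the latency-4 fma chain, whereas with separate instructions the latency-3 multiplies are independent and pipeline, and only the latency-2 adds remain on the chain.

    /* With -O3 -ffast-math the a[i]*b[i] products feed one running sum;
       whether they get contracted into vfmadd on the accumulator chain
       is what X86_TUNE_AVOID_*FMA_CHAINS now suppresses for
       -mtune=znver5.  */
    double dot (const double *a, const double *b, int n)
    {
      double acc = 0.0;
      for (int i = 0; i < n; i++)
        acc += a[i] * b[i];
      return acc;
    }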
[gcc r14-10718] Zen5 tuning part 2: disable gather and scatter
https://gcc.gnu.org/g:3d0a91130eceaf428387ba314cfdfceb99b51709

commit r14-10718-g3d0a91130eceaf428387ba314cfdfceb99b51709
Author: Jan Hubicka
Date:   Tue Sep 3 15:07:41 2024 +0200

    Zen5 tuning part 2: disable gather and scatter

    We disable gathers for zen4.  It seems that gather has improved a bit
    compared to zen4, and the Zen5 optimization manual suggests "Avoid
    GATHER instructions when the indices are known ahead of time.  Vector
    loads followed by shuffles result in a higher load bandwidth."
    However, the situation seems to be more complicated.

    Gather is a 5-10% loss on the parest benchmark as well as a 30% loss
    on sparse dot products in TSVC.  Curiously enough, breaking these out
    into a microbenchmark reversed the situation, and it turns out that
    the performance depends on how the indices are distributed: gather is
    a loss if the indices are sequential, neutral if they are random, and
    a win for some strides (4, 8).

    This seems to be similar to earlier Zens, so I think (especially for
    backporting znver5 support) that it makes sense to be consistent and
    disable gather unless we work out a good heuristic on when to use it.
    Since we typically do not know the indices in advance, I don't see how
    that can be done.

    I opened PR116582 with some examples of wins and losses.

    gcc/ChangeLog:

        * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Disable
        for ZNVER5.
        (X86_TUNE_USE_SCATTER_2PARTS): Disable for ZNVER5.
        (X86_TUNE_USE_GATHER_4PARTS): Disable for ZNVER5.
        (X86_TUNE_USE_SCATTER_4PARTS): Disable for ZNVER5.
        (X86_TUNE_USE_GATHER_8PARTS): Disable for ZNVER5.
        (X86_TUNE_USE_SCATTER_8PARTS): Disable for ZNVER5.

    (cherry picked from commit d82edbe92eed53a479736fcbbe6d54d0fb42daa4)

Diff:
---
 gcc/config/i386/x86-tune.def | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 4a3bd15d0ad2..01324a88a3be 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -483,35 +483,35 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
 /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
    elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+	  ~(m_ZNVER | m_CORE_HYBRID
 	    | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
    elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
-	  ~(m_ZNVER4))
+	  ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
    elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+	  ~(m_ZNVER | m_CORE_HYBRID
 	    | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
    elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
-	  ~(m_ZNVER4))
+	  ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
    elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM
+	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID | m_CORE_ATOM
 	    | m_YONGFENG | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
    elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
-	  ~(m_ZNVER4))
+	  ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */
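
The sparse dot product mentioned above is the canonical gather-shaped kernel; a minimal sketch (illustrative, not the TSVC source):

    /* The indexed load b[idx[i]] is what the vectorizer can implement
       either with vgather* or with scalar loads plus shuffles; with this
       change -mtune=znver5 chooses the latter.  Per the commit message,
       which one is faster depends on how idx[] is distributed
       (sequential, random, or strided).  */
    double sparse_dot (const double *a, const double *b,
                       const int *idx, int n)
    {
      double sum = 0.0;
      for (int i = 0; i < n; i++)
        sum += a[i] * b[idx[i]];
      return sum;
    }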
[gcc r14-10721] Zen5 tuning part 4: update reassociation width
https://gcc.gnu.org/g:b17cb7ed709ea7250eaa4ddc4a713ebbb6b94b37

commit r14-10721-gb17cb7ed709ea7250eaa4ddc4a713ebbb6b94b37
Author: Jan Hubicka
Date:   Tue Sep 3 18:20:34 2024 +0200

    Zen5 tuning part 4: update reassociation width

    Zen5 has 6 instead of 4 ALUs, and the integer multiplication can now
    execute in 3 of them.  The FP units can do 2 additions and 2
    multiplications with latency 2 and 3.  This patch updates the
    reassociation width accordingly.  This has the potential of increasing
    register pressure, but unlike when benchmarking znver1 tuning I did
    not notice this actually causing problems on SPEC, so this patch bumps
    up the reassociation width to 6 for everything except for integer
    vectors, where there are 4 units with a typical latency of 1.

    Bootstrapped/regtested x86_64-linux, committed.

    gcc/ChangeLog:

        * config/i386/i386.cc (ix86_reassociation_width): Update for
        Znver5.
        * config/i386/x86-tune-costs.h (znver5_costs): Update reassociation
        widths.

    (cherry picked from commit f0ab3de6ec0e3540f2e57f3f5628005f0a4e3fa5)

Diff:
---
 gcc/config/i386/i386.cc          | 10 +++---
 gcc/config/i386/x86-tune-costs.h | 23 +-
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 93d05a301c92..2a0a79888be3 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24537,13 +24537,17 @@ ix86_reassociation_width (unsigned int op, machine_mode mode)
       if (width == 1)
 	return 1;
 
-      /* Integer vector instructions execute in FP unit
+      /* Znver1-4 Integer vector instructions execute in FP unit
 	 and can execute 3 additions and one multiplication per cycle.  */
       if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2
-	   || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4
-	   || ix86_tune == PROCESSOR_ZNVER5)
+	   || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4)
 	  && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
 	return 1;
+      /* Znver5 can do 2 integer multiplications per cycle with latency
+	 of 3.  */
+      if (ix86_tune == PROCESSOR_ZNVER5
+	  && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
+	width = 6;
 
       /* Account for targets that splits wide vectors into multiple parts.  */
       if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 8348ab8230ad..da36d2adfeca 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2100,16 +2100,19 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (20),			/* cost of SQRTSD instruction.  */
-  /* Zen can execute 4 integer operations per cycle.  FP operations
-     take 3 cycles and it can execute 2 integer additions and 2
-     multiplications thus reassociation may make sense up to with of 6.
-     SPEC2k6 bencharks suggests
-     that 4 works better than 6 probably due to register pressure.
-
-     Integer vector operations are taken by FP unit and execute 3 vector
-     plus/minus operations per cycle but only one multiply.  This is adjusted
-     in ix86_reassociation_width.  */
-  4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
+  /* Zen5 can execute:
+      - integer ops: 6 per cycle, at most 3 multiplications.
+	latency 1 for additions, 3 for multiplications (pipelined)
+
+	Setting width of 9 for multiplication is probably excessive
+	for register pressure.
+      - fp ops: 2 additions per cycle, latency 2-3
+	2 multiplicaitons per cycle, latency 3
+      - vector intger ops: 4 additions, latency 1
+	2 multiplications, latency 4
+     We increase width to 6 for multiplications
+     in ix86_reassociation_width.  */
+  6, 6, 4, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
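
What the width buys, in a sketch: a reassociation width of N lets the reassoc pass rewrite a long associative chain into up to N independent partial chains that the execution units can retire in parallel.

    /* With width 1 this sum is one serial dependency chain; with the new
       width 6 on znver5 it can be split roughly like
           t0 = a + b;  t1 = c + d;  t2 = e + f;   (independent)
           return (t0 + t1) + t2;
       For floating point this rewrite needs -ffast-math, since FP
       addition is not associative.  */
    double sum6 (double a, double b, double c, double d, double e, double f)
    {
      return a + b + c + d + e + f;
    }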
[gcc r13-9062] Add AMD znver5 processor enablement with scheduler model
https://gcc.gnu.org/g:499afa8e6899d8e866bbd1e6cc340e5a52557883

commit r13-9062-g499afa8e6899d8e866bbd1e6cc340e5a52557883
Author: Jan Hubicka
Date:   Mon Mar 18 10:22:44 2024 +0100

    Add AMD znver5 processor enablement with scheduler model

    2024-02-14  Jan Hubicka
                Karthiban Anbazhagan

    gcc/ChangeLog:

        * common/config/i386/cpuinfo.h (get_amd_cpu): Recognize znver5.
        * common/config/i386/i386-common.cc (processor_names): Add znver5.
        (processor_alias_table): Likewise.
        * common/config/i386/i386-cpuinfo.h (processor_types): Add new zen
        family.
        (processor_subtypes): Add znver5.
        * config.gcc (x86_64-*-* |...): Likewise.
        * config/i386/driver-i386.cc (host_detect_local_cpu): Let
        march=native detect znver5 cpu's.
        * config/i386/i386-c.cc (ix86_target_macros_internal): Add znver5.
        * config/i386/i386-options.cc (m_ZNVER5): New definition.
        (processor_cost_table): Add znver5.
        * config/i386/i386.cc (ix86_reassociation_width): Likewise.
        * config/i386/i386.h (processor_type): Add PROCESSOR_ZNVER5.
        (PTA_ZNVER5): New definition.
        * config/i386/i386.md (define_attr "cpu"): Add znver5.
        (Scheduling descriptions) Add znver5.md.
        * config/i386/x86-tune-costs.h (znver5_cost): New definition.
        * config/i386/x86-tune-sched.cc (ix86_issue_rate): Add znver5.
        (ix86_adjust_cost): Likewise.
        * config/i386/x86-tune.def (avx512_move_by_pieces): Add m_ZNVER5.
        (avx512_store_by_pieces): Add m_ZNVER5.
        * doc/extend.texi: Add znver5.
        * doc/invoke.texi: Likewise.
        * config/i386/znver4.md: Rename to zn4zn5.md; combine znver4 and
        znver5 Scheduler.

    gcc/testsuite/ChangeLog:

        * g++.target/i386/mv29.C: Handle znver5 arch.
        * gcc.target/i386/funcspec-56.inc: Likewise.

    (cherry picked from commit d0aa0af9a9b7dd709a8c7ff6604ed6b7da0fc23a)

Diff:
---
 gcc/common/config/i386/cpuinfo.h              |  16 +
 gcc/common/config/i386/i386-common.cc         |   6 +-
 gcc/common/config/i386/i386-cpuinfo.h         |   2 +
 gcc/config.gcc                                |  14 +-
 gcc/config/i386/driver-i386.cc                |   5 +
 gcc/config/i386/i386-c.cc                     |   7 +
 gcc/config/i386/i386-options.cc               |   6 +-
 gcc/config/i386/i386.cc                       |   3 +-
 gcc/config/i386/i386.h                        |   3 +
 gcc/config/i386/i386.md                       |   4 +-
 gcc/config/i386/x86-tune-costs.h              | 136 +
 gcc/config/i386/x86-tune-sched.cc             |   2 +
 gcc/config/i386/x86-tune.def                  |   4 +-
 gcc/config/i386/{znver4.md => zn4zn5.md}      | 817 --
 gcc/doc/extend.texi                           |   3 +
 gcc/doc/invoke.texi                           |  10 +
 gcc/testsuite/g++.target/i386/mv29.C          |   6 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 18 files changed, 985 insertions(+), 61 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 441fae0cdc9f..a2e28e47a7d2 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -310,6 +310,22 @@ get_amd_cpu (struct __processor_model *cpu_model,
 	  cpu_model->__cpu_subtype = AMDFAM19H_ZNVER3;
 	}
       break;
+    case 0x1a:
+      cpu_model->__cpu_type = AMDFAM1AH;
+      if (model <= 0x77)
+	{
+	  cpu = "znver5";
+	  CHECK___builtin_cpu_is ("znver5");
+	  cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+	}
+      else if (has_cpu_feature (cpu_model, cpu_features2,
+				FEATURE_AVX512VP2INTERSECT))
+	{
+	  cpu = "znver5";
+	  CHECK___builtin_cpu_is ("znver5");
+	  cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+	}
+      break;
     default:
       break;
     }

diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index a8809889360b..f36101558077 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1983,7 +1983,8 @@ const char *const processor_names[] =
   "znver1",
   "znver2",
   "znver3",
-  "znver4"
+  "znver4",
+  "znver5"
 };
 
 /* Guarantee that the array is aligned with enum processor_type.  */
@@ -2243,6 +2244,9 @@ const pta processor_alias_table[] =
   {"znver4", PROCESSOR_ZNVER4, CPU_ZNVER4,
     PTA_ZNVER4,
     M_CPU_SUBTYPE (AMDFAM19H_ZNVER4), P_PROC_AVX512F},
+  {"znver5", PROCESSOR_ZNVER5, CPU_ZNVER5,
+    PTA_ZNVER5,
+    M_CPU_SUBTYPE (AMDFAM1AH_ZNVER5), P_PROC_AVX512F},
   {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
     PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
     | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW

diff --git a/gcc/common
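
What the enablement exposes to user code, as a small sketch: -march=znver5 and -mtune=znver5 become valid, and the CPU-detection builtins recognize the new part (this is exactly what the CHECK___builtin_cpu_is hunk above exercises):

    #include <stdio.h>

    int main (void)
    {
      __builtin_cpu_init ();
      /* "znver5" is accepted by __builtin_cpu_is after this commit.  */
      if (__builtin_cpu_is ("znver5"))
        puts ("running on Zen 5");
      else
        puts ("not Zen 5");
      return 0;
    }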
[gcc r13-9063] Fixup unaligned load/store cost for znver5
https://gcc.gnu.org/g:22f4e4a5043fa69c54b1010d04bcd77958646189

commit r13-9063-g22f4e4a5043fa69c54b1010d04bcd77958646189
Author: Richard Biener
Date:   Tue Jul 16 10:45:27 2024 +0200

    Fixup unaligned load/store cost for znver5

    Currently unaligned YMM and ZMM load and store costs are cheaper than
    the aligned ones, which causes the vectorizer to purposely mis-align
    accesses by adding an alignment prologue.  It looks like the unaligned
    costs were simply copied from the bogus znver4 costs.  The following
    makes the unaligned costs equal to the aligned costs, like in the
    fixed znver4 version.

        * config/i386/x86-tune-costs.h (znver5_cost): Update unaligned
        load and store cost from the aligned costs.

    (cherry picked from commit 896393791ee34ffc176c87d232dfee735db3aaab)

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 4d3194323e14..02fad74c4d1c 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2060,8 +2060,8 @@ struct processor_costs znver5_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {8, 8, 8, 12, 12},			/* cost of storing SSE register
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-  {6, 6, 6, 6, 6},			/* cost of unaligned loads.  */
-  {8, 8, 8, 8, 8},			/* cost of unaligned stores.  */
+  {6, 6, 10, 10, 12},			/* cost of unaligned loads.  */
+  {8, 8, 8, 12, 12},			/* cost of unaligned stores.  */
   2, 2, 2,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
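
The loops affected are ordinary vectorizable loops over possibly unaligned data; a minimal example (an illustrative suggestion, not taken from the commit):

    /* The vectorizer weighs peeling a few scalar iterations (so the
       vector body uses aligned accesses) against issuing unaligned
       vector loads/stores directly; that decision is driven by the cost
       entries patched above.  Compare -fopt-info-vec output with
       -O3 -mavx512f -mtune=znver5 before and after the fix.  */
    void saxpy (float *restrict y, const float *restrict x, float a, int n)
    {
      for (int i = 0; i < n; i++)
        y[i] = a * x[i] + y[i];
    }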
[gcc r14-10719] Zen5 tuning part 3: scheduler tweaks
https://gcc.gnu.org/g:2c01292411044adbd67f79355c1e24decd2fd3c0

commit r14-10719-g2c01292411044adbd67f79355c1e24decd2fd3c0
Author: Jan Hubicka
Date:   Tue Sep 3 16:26:16 2024 +0200

    Zen5 tuning part 3: scheduler tweaks

    This patch adds support for the new fusion in znver5 documented in the
    optimization manual:

      The Zen5 microarchitecture adds support to fuse reg-reg MOV
      Instructions with certain ALU instructions.  The following
      conditions need to be met for fusion to happen:
        - The MOV should be reg-reg mov with Opcode 0x89 or 0x8B
        - The MOV is followed by an ALU instruction where the MOV and
          ALU destination register match.
        - The ALU instruction may source only registers or immediate
          data.  There cannot be any memory source.
        - The ALU instruction sources either the source or dest of MOV
          instruction.
        - If ALU instruction has 2 reg sources, they should be different.
        - The following ALU instructions can fuse with an older qualified
          MOV instruction:
          ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR
          (I assume OP is OR)

    I also increased the issue rate from 4 to 6.  Theoretically znver5 can
    do more, but with our model we can't really use it.  Increasing the
    issue rate to 8 leads to an infinite loop in the scheduler.

    Finally, I also enabled fuse_alu_and_branch since it is supported by
    znver5 (I think by earlier Zens too).

    The new fusion pattern moves quite a few instructions around in common
    code:

    @@ -2210,13 +2210,13 @@
            .cfi_offset 3, -32
            leaq    63(%rsi), %rbx
            movq    %rbx, %rbp
    +       shrq    $6, %rbp
    +       salq    $3, %rbp
            subq    $16, %rsp
            .cfi_def_cfa_offset 48
            movq    %rdi, %r12
    -       shrq    $6, %rbp
    -       movq    %rsi, 8(%rsp)
    -       salq    $3, %rbp
            movq    %rbp, %rdi
    +       movq    %rsi, 8(%rsp)
            call    _Znwm
            movq    8(%rsp), %rsi
            movl    $0, 8(%r12)
    @@ -2224,8 +2224,8 @@
            movq    %rax, (%r12)
            movq    %rbp, 32(%r12)
            testq   %rsi, %rsi
    -       movq    %rsi, %rdx
            cmovns  %rsi, %rbx
    +       movq    %rsi, %rdx
            sarq    $63, %rdx
            shrq    $58, %rdx
            sarq    $6, %rbx

    which should help decoder bandwidth and perhaps also cache, though I
    was not able to measure an off-noise effect on SPEC.

    gcc/ChangeLog:

        * config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune.
        * config/i386/x86-tune-sched.cc (ix86_issue_rate): Update for
        znver5.
        (ix86_adjust_cost): Add TODO about znver5 memory latency.
        (ix86_fuse_mov_alu_p): New.
        (ix86_macro_fusion_pair_p): Use it.
        * config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add
        ZNVER5.
        (X86_TUNE_FUSE_MOV_AND_ALU): New tune.

    (cherry picked from commit e2125a600552bc6e0329e3f1224eea14804db8d3)

Diff:
---
 gcc/config/i386/i386.h            |  2 ++
 gcc/config/i386/x86-tune-sched.cc | 67 ++-
 gcc/config/i386/x86-tune.def      | 11 +--
 3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 26e15d2677fb..2de838ef15ce 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -427,6 +427,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
 #define TARGET_FUSE_ALU_AND_BRANCH \
 	ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_FUSE_MOV_AND_ALU \
+	ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
 #define TARGET_AVOID_LEA_FOR_ADDR \
 	ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]

diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc
index 578ba57e6b22..07b79876c36f 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -69,7 +69,6 @@ ix86_issue_rate (void)
     case PROCESSOR_ZNVER2:
     case PROCESSOR_ZNVER3:
     case PROCESSOR_ZNVER4:
-    case PROCESSOR_ZNVER5:
     case PROCESSOR_CORE2:
     case PROCESSOR_NEHALEM:
     case PROCESSOR_SANDYBRIDGE:
@@ -92,6 +91,13 @@ ix86_issue_rate (void)
       return 5;
 
     case PROCESSOR_SAPPHIRERAPIDS:
+    /* For znver5 decoder can handle 4 or 8 instructions per cycle,
+       op cache 12 instruction/cycle, dispatch 8 instructions
+       integer rename 8 instructions and Fp 6 instructions.
+
+       The scheduler, without understanding out of order nature of the CPU
+       is unlikely going to be able to fill all of these.  */
+    case PROCESSOR_ZNVER5:
       return 6;
 
     default:
@@ -435,6 +441,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
   enum attr_un
[gcc r14-10720] Zen5 tuning part 3: fix typo in previous patch
https://gcc.gnu.org/g:2eade72b0e2ac9dd18ef517bc3b868157f1ddf48

commit r14-10720-g2eade72b0e2ac9dd18ef517bc3b868157f1ddf48
Author: Jan Hubicka
Date:   Tue Sep 3 17:25:05 2024 +0200

    Zen5 tuning part 3: fix typo in previous patch

    gcc/ChangeLog:

        * config/i386/x86-tune-sched.cc (ix86_fuse_mov_alu_p): Fix typo.

    (cherry picked from commit 910e1769a0653ac32bd8c1d6aabb39c797d5d773)

Diff:
---
 gcc/config/i386/x86-tune-sched.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc
index 07b79876c36f..746f23b3cbc4 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -615,7 +615,7 @@ ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
   /* One of operands should be register.  */
   if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
     std::swap (op0, op1);
-  if (!REG_P (op0) || REGNO (op1) != REGNO (reg))
+  if (!REG_P (op0) || REGNO (op0) != REGNO (reg))
     return false;
   if (op1 && !REG_P (op1)
[gcc r12-10732] Add AMD znver5 processor enablement with scheduler model
https://gcc.gnu.org/g:54806268b47775449c7e237f8f03e922d6da26f6

commit r12-10732-g54806268b47775449c7e237f8f03e922d6da26f6
Author: Jan Hubicka
Date:   Mon Mar 18 10:22:44 2024 +0100

    Add AMD znver5 processor enablement with scheduler model

    2024-02-14  Jan Hubicka
                Karthiban Anbazhagan

    gcc/ChangeLog:

        * common/config/i386/cpuinfo.h (get_amd_cpu): Recognize znver5.
        * common/config/i386/i386-common.cc (processor_names): Add znver5.
        (processor_alias_table): Likewise.
        * common/config/i386/i386-cpuinfo.h (processor_types): Add new zen
        family.
        (processor_subtypes): Add znver5.
        * config.gcc (x86_64-*-* |...): Likewise.
        * config/i386/driver-i386.cc (host_detect_local_cpu): Let
        march=native detect znver5 cpu's.
        * config/i386/i386-c.cc (ix86_target_macros_internal): Add znver5.
        * config/i386/i386-options.cc (m_ZNVER5): New definition.
        (processor_cost_table): Add znver5.
        * config/i386/i386.cc (ix86_reassociation_width): Likewise.
        * config/i386/i386.h (processor_type): Add PROCESSOR_ZNVER5.
        (PTA_ZNVER5): New definition.
        * config/i386/i386.md (define_attr "cpu"): Add znver5.
        (Scheduling descriptions) Add znver5.md.
        * config/i386/x86-tune-costs.h (znver5_cost): New definition.
        * config/i386/x86-tune-sched.cc (ix86_issue_rate): Add znver5.
        (ix86_adjust_cost): Likewise.
        * config/i386/x86-tune.def (avx512_move_by_pieces): Add m_ZNVER5.
        (avx512_store_by_pieces): Add m_ZNVER5.
        * doc/extend.texi: Add znver5.
        * doc/invoke.texi: Likewise.
        * config/i386/znver4.md: Rename to zn4zn5.md; combine znver4 and
        znver5 Scheduler.

    gcc/testsuite/ChangeLog:

        * g++.target/i386/mv29.C: Handle znver5 arch.
        * gcc.target/i386/funcspec-56.inc: Likewise.

    (cherry picked from commit d0aa0af9a9b7dd709a8c7ff6604ed6b7da0fc23a)

Diff:
---
 gcc/common/config/i386/cpuinfo.h              |  16 +
 gcc/common/config/i386/i386-common.cc         |   6 +-
 gcc/common/config/i386/i386-cpuinfo.h         |   2 +
 gcc/config.gcc                                |  14 +-
 gcc/config/i386/driver-i386.cc                |   5 +
 gcc/config/i386/i386-c.cc                     |   7 +
 gcc/config/i386/i386-options.cc               |   6 +-
 gcc/config/i386/i386.cc                       |   3 +-
 gcc/config/i386/i386.h                        |   3 +
 gcc/config/i386/i386.md                       |   4 +-
 gcc/config/i386/x86-tune-costs.h              | 134 +
 gcc/config/i386/x86-tune-sched.cc             |   2 +
 gcc/config/i386/x86-tune.def                  |   4 +-
 gcc/config/i386/{znver4.md => zn4zn5.md}      | 817 --
 gcc/doc/extend.texi                           |   3 +
 gcc/doc/invoke.texi                           |  10 +
 gcc/testsuite/g++.target/i386/mv29.C          |   6 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 18 files changed, 983 insertions(+), 61 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 316ad3cb3e9b..d79534331f77 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -282,6 +282,22 @@ get_amd_cpu (struct __processor_model *cpu_model,
 	  cpu_model->__cpu_subtype = AMDFAM19H_ZNVER3;
 	}
       break;
+    case 0x1a:
+      cpu_model->__cpu_type = AMDFAM1AH;
+      if (model <= 0x77)
+	{
+	  cpu = "znver5";
+	  CHECK___builtin_cpu_is ("znver5");
+	  cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+	}
+      else if (has_cpu_feature (cpu_model, cpu_features2,
+				FEATURE_AVX512VP2INTERSECT))
+	{
+	  cpu = "znver5";
+	  CHECK___builtin_cpu_is ("znver5");
+	  cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
+	}
+      break;
     default:
       break;
     }

diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index e2594cae4cc1..a01172cab2fb 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1831,7 +1831,8 @@ const char *const processor_names[] =
   "znver1",
   "znver2",
   "znver3",
-  "znver4"
+  "znver4",
+  "znver5"
 };
 
 /* Guarantee that the array is aligned with enum processor_type.  */
@@ -2067,6 +2068,9 @@ const pta processor_alias_table[] =
   {"znver4", PROCESSOR_ZNVER4, CPU_ZNVER4,
     PTA_ZNVER4,
     M_CPU_SUBTYPE (AMDFAM19H_ZNVER4), P_PROC_AVX512F},
+  {"znver5", PROCESSOR_ZNVER5, CPU_ZNVER5,
+    PTA_ZNVER5,
+    M_CPU_SUBTYPE (AMDFAM1AH_ZNVER5), P_PROC_AVX512F},
   {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
     PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
     | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW

diff --git a/gcc/commo
[gcc r12-10733] Fixup unaligned load/store cost for znver5
https://gcc.gnu.org/g:c77b1c833e84b62928a729556c502e1311782b2d

commit r12-10733-gc77b1c833e84b62928a729556c502e1311782b2d
Author: Richard Biener
Date:   Tue Jul 16 10:45:27 2024 +0200

    Fixup unaligned load/store cost for znver5

    Currently unaligned YMM and ZMM load and store costs are cheaper than
    the aligned ones, which causes the vectorizer to purposely mis-align
    accesses by adding an alignment prologue.  It looks like the unaligned
    costs were simply copied from the bogus znver4 costs.  The following
    makes the unaligned costs equal to the aligned costs, like in the
    fixed znver4 version.

        * config/i386/x86-tune-costs.h (znver5_cost): Update unaligned
        load and store cost from the aligned costs.

    (cherry picked from commit 896393791ee34ffc176c87d232dfee735db3aaab)

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 11a9dd0ff9ed..b8e7ab9372ea 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2028,8 +2028,8 @@ struct processor_costs znver5_cost = {
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
   {8, 8, 8, 12, 12},			/* cost of storing SSE register
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-  {6, 6, 6, 6, 6},			/* cost of unaligned loads.  */
-  {8, 8, 8, 8, 8},			/* cost of unaligned stores.  */
+  {6, 6, 10, 10, 12},			/* cost of unaligned loads.  */
+  {8, 8, 8, 12, 12},			/* cost of unaligned stores.  */
   2, 2, 2,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
   6,					/* cost of moving SSE register to integer.  */
[gcc r13-9065] Re-add m_ZNVER4 to X86_TUNE_AVOID_256FMA_CHAINS
https://gcc.gnu.org/g:ad9ba1eccec5086b84f1030fb3e87947242ba904

commit r13-9065-gad9ba1eccec5086b84f1030fb3e87947242ba904
Author: Jan Hubicka
Date:   Sun Sep 29 02:10:14 2024 +0200

    Re-add m_ZNVER4 to X86_TUNE_AVOID_256FMA_CHAINS

        * config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Re-add
        m_ZNVER4 accidentally removed during znver5 merge.

Diff:
---
 gcc/config/i386/x86-tune.def | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 0ef75e986be9..629e1fdf5f77 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -518,7 +518,7 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
    smaller FMA chain.  */
 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
-	  | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER5)
+	  | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER4 | m_ZNVER5)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
    smaller FMA chain.  */
[gcc r13-9064] Zen5 tuning part 1: avoid FMA chains
https://gcc.gnu.org/g:7c0c772e4fb89bf4d9bc09f7d8e41c6bc0b0e093

commit r13-9064-g7c0c772e4fb89bf4d9bc09f7d8e41c6bc0b0e093
Author: Jan Hubicka
Date:   Tue Sep 3 13:38:33 2024 +0200

    Zen5 tuning part 1: avoid FMA chains

    Testing matrix multiplication benchmarks shows that FMA on a critical
    chain is a performance loss over separate multiply and add.  While the
    latency of 4 is lower than multiply + add (3+2), the problem is that
    all values need to be ready before the computation starts.

    While on znver4 AVX512 code fared well with FMA, that was because of
    the split registers.  Znver5 benefits from avoiding FMA on all widths.
    This may be different with the mobile version, though.

    On a naive matrix multiplication benchmark the difference is 8% with
    -O3 only, since with -Ofast loop interchange solves the problem
    differently.  It is a 30% win, for example, on s323 from TSVC:

    real_t s323(struct args_t * func_args)
    {
        //    recurrences
        //    coupled recurrence

        initialise_arrays(__func__);
        gettimeofday(&func_args->t1, NULL);

        for (int nl = 0; nl < iterations/2; nl++) {
            for (int i = 1; i < LEN_1D; i++) {
                a[i] = b[i-1] + c[i] * d[i];
                b[i] = a[i] + c[i] * e[i];
            }
            dummy(a, b, c, d, e, aa, bb, cc, 0.);
        }

        gettimeofday(&func_args->t2, NULL);

        return calc_checksum(__func__);
    }

    gcc/ChangeLog:

        * config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable
        for znver5.
        (X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
        (X86_TUNE_AVOID_512FMA_CHAINS): Likewise.

    (cherry picked from commit d6360b4083695970789fd65b9c515c11a5ce25b4)

Diff:
---
 gcc/config/i386/x86-tune.def | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 9cc44e2b628c..0ef75e986be9 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -513,16 +513,16 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
    smaller FMA chain.  */
 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
-	  | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER4)
+	  | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC | m_ZNVER5)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
    for v2df vector reduction.  */
[gcc r14-10716] x86: Don't use address override with segment register
https://gcc.gnu.org/g:25cb153f93bb9ff3543ba8e31bbe7be4f6168aa4

commit r14-10716-g25cb153f93bb9ff3543ba8e31bbe7be4f6168aa4
Author: H.J. Lu
Date:   Wed Sep 25 16:39:04 2024 +0800

    x86: Don't use address override with segment register

    Address override only applies to the (reg32) part in the thread
    address fs:(reg32).  Don't rewrite a thread address like

    (set (reg:CCZ 17 flags)
         (compare:CCZ (reg:SI 98 [ __gmpfr_emax.0_1 ])
                      (mem/c:SI (plus:SI (plus:SI (unspec:SI [
                                      (const_int 0 [0])
                                    ] UNSPEC_TP)
                                  (reg:SI 107))
                          (const:SI (unspec:SI [
                              (symbol_ref:SI ("previous_emax") [flags 0x1a] )
                            ] UNSPEC_DTPOFF))) [1 previous_emax+0 S4 A32])))

    if address override is used, to avoid an invalid memory operand like

        cmpl    %fs:previous_emax@dtpoff(%eax), %r12d

    gcc/

        PR target/116839
        * config/i386/i386.cc (ix86_rewrite_tls_address_1): Make it
        static.  Return if TLS address is thread register plus an integer
        register.

    gcc/testsuite/

        PR target/116839
        * gcc.target/i386/pr116839.c: New file.

    Signed-off-by: H.J. Lu

    (cherry picked from commit c79cc30862d7255ca15884aa956d1ccfa279d86a)

Diff:
---
 gcc/config/i386/i386.cc                  |  9 +-
 gcc/testsuite/gcc.target/i386/pr116839.c | 48
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 8f1c1f9ccd0a..93d05a301c92 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -12458,7 +12458,7 @@ ix86_tls_address_pattern_p (rtx op)
 }
 
 /* Rewrite *LOC so that it refers to a default TLS address space.  */
-void
+static void
 ix86_rewrite_tls_address_1 (rtx *loc)
 {
   subrtx_ptr_iterator::array_type array;
@@ -12480,6 +12480,13 @@ ix86_rewrite_tls_address_1 (rtx *loc)
 	      if (GET_CODE (u) == UNSPEC
 		  && XINT (u, 1) == UNSPEC_TP)
 		{
+		  /* NB: Since address override only applies to the
+		     (reg32) part in fs:(reg32), return if address
+		     override is used.  */
+		  if (Pmode != word_mode
+		      && REG_P (XEXP (*x, 1 - i)))
+		    return;
+
 		  addr_space_t as = DEFAULT_TLS_SEG_REG;
 
 		  *x = XEXP (*x, 1 - i);

diff --git a/gcc/testsuite/gcc.target/i386/pr116839.c b/gcc/testsuite/gcc.target/i386/pr116839.c
new file mode 100644
index ..e5df82562518
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116839.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target maybe_x32 } */
+/* { dg-options "-mx32 -O2 -fPIC -mtls-dialect=gnu2" } */
+/* { dg-final { scan-assembler-not "cmpl\[ \t\]+%fs:previous_emax@dtpoff\\(%eax\\)" } } */
+
+typedef long mpfr_prec_t;
+typedef long mpfr_exp_t;
+typedef struct {
+  mpfr_prec_t _mpfr_prec;
+} __mpfr_struct;
+typedef __mpfr_struct mpfr_t[1];
+extern _Thread_local mpfr_exp_t __gmpfr_emax;
+static _Thread_local mpfr_exp_t previous_emax;
+static _Thread_local mpfr_t bound_emax;
+extern const mpfr_t __gmpfr_const_log2_RNDD;
+extern const mpfr_t __gmpfr_const_log2_RNDU;
+
+typedef enum {
+  MPFR_RNDN=0,
+  MPFR_RNDZ,
+  MPFR_RNDU,
+  MPFR_RNDD,
+  MPFR_RNDA,
+  MPFR_RNDF,
+  MPFR_RNDNA=-1
+} mpfr_rnd_t;
+typedef __mpfr_struct *mpfr_ptr;
+typedef const __mpfr_struct *mpfr_srcptr;
+void mpfr_mul (mpfr_ptr, mpfr_srcptr, mpfr_rnd_t);
+
+void
+foo (void)
+{
+  mpfr_exp_t saved_emax;
+
+  if (__gmpfr_emax != previous_emax)
+    {
+      saved_emax = __gmpfr_emax;
+
+      bound_emax->_mpfr_prec = 32;
+
+      mpfr_mul (bound_emax, saved_emax < 0 ?
+                __gmpfr_const_log2_RNDD : __gmpfr_const_log2_RNDU,
+                MPFR_RNDU);
+      previous_emax = saved_emax;
+      __gmpfr_emax = saved_emax;
+    }
+}
[gcc r12-10731] x86: Don't use address override with segment register
https://gcc.gnu.org/g:2e66eb7e7eae82bcd6675e79eabbdd6decfa9fe5

commit r12-10731-g2e66eb7e7eae82bcd6675e79eabbdd6decfa9fe5
Author: H.J. Lu
Date:   Wed Sep 25 16:39:04 2024 +0800

    x86: Don't use address override with segment register

    Address override only applies to the (reg32) part in the thread
    address fs:(reg32).  Don't rewrite a thread address like

    (set (reg:CCZ 17 flags)
         (compare:CCZ (reg:SI 98 [ __gmpfr_emax.0_1 ])
                      (mem/c:SI (plus:SI (plus:SI (unspec:SI [
                                      (const_int 0 [0])
                                    ] UNSPEC_TP)
                                  (reg:SI 107))
                          (const:SI (unspec:SI [
                              (symbol_ref:SI ("previous_emax") [flags 0x1a] )
                            ] UNSPEC_DTPOFF))) [1 previous_emax+0 S4 A32])))

    if address override is used, to avoid an invalid memory operand like

        cmpl    %fs:previous_emax@dtpoff(%eax), %r12d

    gcc/

        PR target/116839
        * config/i386/i386.cc (ix86_rewrite_tls_address_1): Make it
        static.  Return if TLS address is thread register plus an integer
        register.

    gcc/testsuite/

        PR target/116839
        * gcc.target/i386/pr116839.c: New file.

    Signed-off-by: H.J. Lu

    (cherry picked from commit c79cc30862d7255ca15884aa956d1ccfa279d86a)

Diff:
---
 gcc/config/i386/i386.cc                  |  9 +-
 gcc/testsuite/gcc.target/i386/pr116839.c | 48
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b52eb0d5f7b7..bf8553e3dd00 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11787,7 +11787,7 @@ ix86_tls_address_pattern_p (rtx op)
 }
 
 /* Rewrite *LOC so that it refers to a default TLS address space.  */
-void
+static void
 ix86_rewrite_tls_address_1 (rtx *loc)
 {
   subrtx_ptr_iterator::array_type array;
@@ -11809,6 +11809,13 @@ ix86_rewrite_tls_address_1 (rtx *loc)
 	      if (GET_CODE (u) == UNSPEC
 		  && XINT (u, 1) == UNSPEC_TP)
 		{
+		  /* NB: Since address override only applies to the
+		     (reg32) part in fs:(reg32), return if address
+		     override is used.  */
+		  if (Pmode != word_mode
+		      && REG_P (XEXP (*x, 1 - i)))
+		    return;
+
 		  addr_space_t as = DEFAULT_TLS_SEG_REG;
 
 		  *x = XEXP (*x, 1 - i);

diff --git a/gcc/testsuite/gcc.target/i386/pr116839.c b/gcc/testsuite/gcc.target/i386/pr116839.c
new file mode 100644
index ..e5df82562518
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116839.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target maybe_x32 } */
+/* { dg-options "-mx32 -O2 -fPIC -mtls-dialect=gnu2" } */
+/* { dg-final { scan-assembler-not "cmpl\[ \t\]+%fs:previous_emax@dtpoff\\(%eax\\)" } } */
+
+typedef long mpfr_prec_t;
+typedef long mpfr_exp_t;
+typedef struct {
+  mpfr_prec_t _mpfr_prec;
+} __mpfr_struct;
+typedef __mpfr_struct mpfr_t[1];
+extern _Thread_local mpfr_exp_t __gmpfr_emax;
+static _Thread_local mpfr_exp_t previous_emax;
+static _Thread_local mpfr_t bound_emax;
+extern const mpfr_t __gmpfr_const_log2_RNDD;
+extern const mpfr_t __gmpfr_const_log2_RNDU;
+
+typedef enum {
+  MPFR_RNDN=0,
+  MPFR_RNDZ,
+  MPFR_RNDU,
+  MPFR_RNDD,
+  MPFR_RNDA,
+  MPFR_RNDF,
+  MPFR_RNDNA=-1
+} mpfr_rnd_t;
+typedef __mpfr_struct *mpfr_ptr;
+typedef const __mpfr_struct *mpfr_srcptr;
+void mpfr_mul (mpfr_ptr, mpfr_srcptr, mpfr_rnd_t);
+
+void
+foo (void)
+{
+  mpfr_exp_t saved_emax;
+
+  if (__gmpfr_emax != previous_emax)
+    {
+      saved_emax = __gmpfr_emax;
+
+      bound_emax->_mpfr_prec = 32;
+
+      mpfr_mul (bound_emax, saved_emax < 0 ?
+                __gmpfr_const_log2_RNDD : __gmpfr_const_log2_RNDU,
+                MPFR_RNDU);
+      previous_emax = saved_emax;
+      __gmpfr_emax = saved_emax;
+    }
+}
[gcc r13-9060] x86: Don't use address override with segment register
https://gcc.gnu.org/g:bf5d8d44f7a8f90a2ebfe3f28689bc3d86e185fb

commit r13-9060-gbf5d8d44f7a8f90a2ebfe3f28689bc3d86e185fb
Author: H.J. Lu
Date:   Wed Sep 25 16:39:04 2024 +0800

    x86: Don't use address override with segment register

    Address override only applies to the (reg32) part in the thread
    address fs:(reg32).  Don't rewrite a thread address like

    (set (reg:CCZ 17 flags)
         (compare:CCZ (reg:SI 98 [ __gmpfr_emax.0_1 ])
                      (mem/c:SI (plus:SI (plus:SI (unspec:SI [
                                      (const_int 0 [0])
                                    ] UNSPEC_TP)
                                  (reg:SI 107))
                          (const:SI (unspec:SI [
                              (symbol_ref:SI ("previous_emax") [flags 0x1a] )
                            ] UNSPEC_DTPOFF))) [1 previous_emax+0 S4 A32])))

    if address override is used, to avoid an invalid memory operand like

        cmpl    %fs:previous_emax@dtpoff(%eax), %r12d

    gcc/

        PR target/116839
        * config/i386/i386.cc (ix86_rewrite_tls_address_1): Make it
        static.  Return if TLS address is thread register plus an integer
        register.

    gcc/testsuite/

        PR target/116839
        * gcc.target/i386/pr116839.c: New file.

    Signed-off-by: H.J. Lu

    (cherry picked from commit c79cc30862d7255ca15884aa956d1ccfa279d86a)

Diff:
---
 gcc/config/i386/i386.cc                  |  9 +-
 gcc/testsuite/gcc.target/i386/pr116839.c | 48
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index e4d3ce1acc87..b39d18ab5767 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11852,7 +11852,7 @@ ix86_tls_address_pattern_p (rtx op)
 }
 
 /* Rewrite *LOC so that it refers to a default TLS address space.  */
-void
+static void
 ix86_rewrite_tls_address_1 (rtx *loc)
 {
   subrtx_ptr_iterator::array_type array;
@@ -11874,6 +11874,13 @@ ix86_rewrite_tls_address_1 (rtx *loc)
 	      if (GET_CODE (u) == UNSPEC
 		  && XINT (u, 1) == UNSPEC_TP)
 		{
+		  /* NB: Since address override only applies to the
+		     (reg32) part in fs:(reg32), return if address
+		     override is used.  */
+		  if (Pmode != word_mode
+		      && REG_P (XEXP (*x, 1 - i)))
+		    return;
+
 		  addr_space_t as = DEFAULT_TLS_SEG_REG;
 
 		  *x = XEXP (*x, 1 - i);

diff --git a/gcc/testsuite/gcc.target/i386/pr116839.c b/gcc/testsuite/gcc.target/i386/pr116839.c
new file mode 100644
index ..e5df82562518
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116839.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target maybe_x32 } */
+/* { dg-options "-mx32 -O2 -fPIC -mtls-dialect=gnu2" } */
+/* { dg-final { scan-assembler-not "cmpl\[ \t\]+%fs:previous_emax@dtpoff\\(%eax\\)" } } */
+
+typedef long mpfr_prec_t;
+typedef long mpfr_exp_t;
+typedef struct {
+  mpfr_prec_t _mpfr_prec;
+} __mpfr_struct;
+typedef __mpfr_struct mpfr_t[1];
+extern _Thread_local mpfr_exp_t __gmpfr_emax;
+static _Thread_local mpfr_exp_t previous_emax;
+static _Thread_local mpfr_t bound_emax;
+extern const mpfr_t __gmpfr_const_log2_RNDD;
+extern const mpfr_t __gmpfr_const_log2_RNDU;
+
+typedef enum {
+  MPFR_RNDN=0,
+  MPFR_RNDZ,
+  MPFR_RNDU,
+  MPFR_RNDD,
+  MPFR_RNDA,
+  MPFR_RNDF,
+  MPFR_RNDNA=-1
+} mpfr_rnd_t;
+typedef __mpfr_struct *mpfr_ptr;
+typedef const __mpfr_struct *mpfr_srcptr;
+void mpfr_mul (mpfr_ptr, mpfr_srcptr, mpfr_rnd_t);
+
+void
+foo (void)
+{
+  mpfr_exp_t saved_emax;
+
+  if (__gmpfr_emax != previous_emax)
+    {
+      saved_emax = __gmpfr_emax;
+
+      bound_emax->_mpfr_prec = 32;
+
+      mpfr_mul (bound_emax, saved_emax < 0 ?
+                __gmpfr_const_log2_RNDD : __gmpfr_const_log2_RNDU,
+                MPFR_RNDU);
+      previous_emax = saved_emax;
+      __gmpfr_emax = saved_emax;
+    }
+}