https://gcc.gnu.org/g:be6334fffdf2a7df3b7f92ea933b804664dfc383

commit r12-10736-gbe6334fffdf2a7df3b7f92ea933b804664dfc383
Author: Jan Hubicka <j...@suse.cz>
Date:   Tue Sep 3 13:38:33 2024 +0200

    Zen5 tuning part 1: avoid FMA chains
    
    Testing matrix multiplication benchmarks shows that FMA on a critical chain
    is a performance loss over separate multiply and add. While the latency of 4
    is lower than multiply + add (3+2), the problem is that all values need to
    be ready before computation starts.
    
    While on znver4 AVX512 code fared well with FMA, it was because of the split
    registers. Znver5 benefits from avoiding FMA on all widths.  This may be
    different with the mobile version though.
    
    On a naive matrix multiplication benchmark the difference is 8% with -O3
    only, since with -Ofast loop interchange solves the problem differently.
    It is a 30% win, for example, on S323 from TSVC:
    
    real_t s323(struct args_t * func_args)
    {
    
    //    recurrences
    //    coupled recurrence
    
        initialise_arrays(__func__);
        gettimeofday(&func_args->t1, NULL);
    
        for (int nl = 0; nl < iterations/2; nl++) {
            for (int i = 1; i < LEN_1D; i++) {
                a[i] = b[i-1] + c[i] * d[i];
                b[i] = a[i] + c[i] * e[i];
            }
            dummy(a, b, c, d, e, aa, bb, cc, 0.);
        }
    
        gettimeofday(&func_args->t2, NULL);
        return calc_checksum(__func__);
    }
    
    gcc/ChangeLog:
    
            * config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable
            for znver5.
            (X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
            (X86_TUNE_AVOID_512FMA_CHAINS): Likewise.
    
    (cherry picked from commit d6360b4083695970789fd65b9c515c11a5ce25b4)

Diff:
---
 gcc/config/i386/x86-tune.def | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index f5bf331242aa..249a239de775 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -499,16 +499,16 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
    smaller FMA chain.  */
 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
-         | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC | m_ZNVER4)
+         | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC | m_ZNVER4 | m_ZNVER5)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
    for v2df vector reduction.  */

Reply via email to