https://gcc.gnu.org/g:d82edbe92eed53a479736fcbbe6d54d0fb42daa4

commit r15-3417-gd82edbe92eed53a479736fcbbe6d54d0fb42daa4
Author: Jan Hubicka <j...@suse.cz>
Date:   Tue Sep 3 15:07:41 2024 +0200

    Zen5 tuning part 2: disable gather and scatter
    
    We disable gathers for zen4.  It seems that gather has improved a bit
    compared to zen4, and the Zen5 optimization manual suggests "Avoid GATHER
    instructions when the indices are known ahead of time. Vector loads
    followed by shuffles result in a higher load bandwidth."  However, the
    situation seems to be more complicated.
    
    gather is a 5-10% loss on the parest benchmark as well as a 30% loss on
    sparse dot products in TSVC. Curiously enough, breaking these out into
    microbenchmarks reversed the situation, and it turns out that the
    performance depends on how the indices are distributed.  gather is a loss
    if indices are sequential, neutral if they are random, and a win for some
    strides (4, 8).
    
    This seems to be similar to earlier zens, so I think (especially for
    backporting znver5 support) that it makes sense to be consistent and
    disable gather unless we work out a good heuristic for when to use it.
    Since we typically do not know the indices in advance, I don't see how
    that can be done.
    
    I opened PR116582 with some examples of wins and losses.
    
    gcc/ChangeLog:
    
            * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Disable for
            ZNVER5.
            (X86_TUNE_USE_SCATTER_2PARTS): Disable for ZNVER5.
            (X86_TUNE_USE_GATHER_4PARTS): Disable for ZNVER5.
            (X86_TUNE_USE_SCATTER_4PARTS): Disable for ZNVER5.
            (X86_TUNE_USE_GATHER_8PARTS): Disable for ZNVER5.
            (X86_TUNE_USE_SCATTER_8PARTS): Disable for ZNVER5.

Diff:
---
 gcc/config/i386/x86-tune.def | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index da1a3d6a3c6c..ed26136faee5 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -476,35 +476,35 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
 /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
    elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
-         ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+         ~(m_ZNVER | m_CORE_HYBRID
            | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
    elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
-         ~(m_ZNVER4))
+         ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
    elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
-         ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+         ~(m_ZNVER | m_CORE_HYBRID
            | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
    elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
-         ~(m_ZNVER4))
+         ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
    elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
-         ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM
+         ~(m_ZNVER | m_CORE_HYBRID | m_CORE_ATOM
            | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
    elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
-         ~(m_ZNVER4))
+         ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */

Reply via email to