Hi,
as seen on TSVC and SPEC2017, the Zen3 gather instruction is a win only for
vectors with 8 elements.  At the time I was implementing the tuning, the
vectorizer did not know how to open-code gathers, and thus it was still a win
to enable it for shorter vectors, but this has changed.

The following are results on a Zen3 machine:

| Benchmark       | Master | Rate | Patch | Rate |     % |
|-----------------+--------+------+-------+------+-------|
| 500.perlbench_r |    246 | 6.47 |   250 | 6.36 |  1.63 |
| 502.gcc_r       |    215 | 6.59 |   215 | 6.59 |  0.00 |
| 505.mcf_r       |    299 | 5.40 |   299 | 5.41 |  0.00 |
| 520.omnetpp_r   |    250 | 5.25 |   249 | 5.27 | -0.40 |
| 523.xalancbmk_r |    197 | 5.37 |   195 | 5.43 | -1.02 |
| 525.x264_r      |    160 | 11.0 |   160 | 11.0 |  0.00 |
| 531.deepsjeng_r |    242 | 4.73 |   240 | 4.78 | -0.83 |
| 541.leela_r     |    353 | 4.70 |   355 | 4.67 |  0.57 |
| 548.exchange2_r |    146 | 17.9 |   146 | 17.9 |  0.00 |
| 557.xz_r        |    290 | 3.72 |   291 | 3.71 |  0.34 |
|-----------------+--------+------+-------+------+-------|
| Geomean         |        | 6.34 |       | 6.34 |       |

| Benchmark       | Master | Rate | Patch | Rate |      % |
|-----------------+--------+------+-------+------+--------|
| 503.bwaves_r    |    130 | 77.2 |   130 | 77.1 |   0.00 |
| 507.cactuBSSN_r |    246 | 5.16 |   245 | 5.17 |  -0.41 |
| 508.namd_r      |    163 | 5.84 |   162 | 5.85 |  -0.61 |
| 510.parest_r    |    277 | 9.45 |   218 | 12.0 | -21.30 |
| 511.povray_r    |    286 | 8.17 |   281 | 8.31 |  -1.75 |
| 519.lbm_r       |    138 | 7.62 |   137 | 7.67 |  -0.72 |
| 521.wrf_r       |    166 | 13.5 |   167 | 13.5 |   0.60 |
| 526.blender_r   |    214 | 7.13 |   215 | 7.10 |   0.47 |
| 527.cam4_r      |    176 | 9.92 |   173 | 10.1 |  -1.70 |
| 538.imagick_r   |    306 | 8.13 |   315 | 7.90 |   2.94 |
| 544.nab_r       |    199 | 8.46 |   199 | 8.44 |   0.00 |
| 549.fotonik3d_r |    254 | 15.4 |   243 | 16.1 |  -4.33 |
| 554.roms_r      |    210 | 7.57 |   210 | 7.58 |   0.00 |
|-----------------+--------+------+-------+------+--------|
| Geomean         |        | 10.0 |       | 10.3 |        |

So the main wins are on parest and fotonik3d.  I looked into imagick and it
looks like noise - the benchmarks were run by Martin and the regression did not
reproduce for me on my Zen box.

Bootstrapped/regtested x86_64-linux.  I plan to commit tomorrow if there are no
complaints.

Honza

gcc/ChangeLog:

2022-03-28  Jan Hubicka  <hubi...@ucw.cz>

        * config/i386/i386-builtins.cc (ix86_vectorize_builtin_gather): Test
        TARGET_USE_GATHER_2PARTS and TARGET_USE_GATHER_4PARTS.
        * config/i386/i386.h (TARGET_USE_GATHER_2PARTS): New macro.
        (TARGET_USE_GATHER_4PARTS): New macro.
        * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): New tune.
        (X86_TUNE_USE_GATHER_4PARTS): New tune.

diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
index 2570501ae7e..4a222c9f2c7 100644
--- a/gcc/config/i386/i386-builtins.cc
+++ b/gcc/config/i386/i386-builtins.cc
@@ -1785,7 +1785,12 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
   bool si;
   enum ix86_builtins code;
 
-  if (! TARGET_AVX2 || !TARGET_USE_GATHER)
+  if (! TARGET_AVX2
+      || (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), (unsigned)2)
+         ? !TARGET_USE_GATHER_2PARTS
+         : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), (unsigned)4)
+            ? !TARGET_USE_GATHER_4PARTS
+            : !TARGET_USE_GATHER)))
     return NULL_TREE;
 
   if ((TREE_CODE (index_type) != INTEGER_TYPE
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index b92955177fe..363082ba47b 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -390,6 +390,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
        ix86_tune_features[X86_TUNE_SLOW_PSHUFB]
 #define TARGET_AVOID_4BYTE_PREFIXES \
        ix86_tune_features[X86_TUNE_AVOID_4BYTE_PREFIXES]
+#define TARGET_USE_GATHER_2PARTS \
+       ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS]
+#define TARGET_USE_GATHER_4PARTS \
+       ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS]
 #define TARGET_USE_GATHER \
        ix86_tune_features[X86_TUNE_USE_GATHER]
 #define TARGET_FUSE_CMP_AND_BRANCH_32 \
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 82ca0ae63ac..09e3cf794db 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -464,7 +464,18 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
          | m_INTEL)
 
-/* X86_TUNE_USE_GATHER: Use gather instructions.  */
+/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
+   elements.  */
+DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
+         ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC))
+
+/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
+   elements.  */
+DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
+         ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC))
+
+/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
+   elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
          ~(m_ZNVER1 | m_ZNVER2 | m_ALDERLAKE | m_GENERIC))
 

Reply via email to