Hi, as seen on TSVC and SPEC2017, the Zen3 gather instruction is a win only for vectors with 8 elements. At the time I implemented the tuning, the vectorizer did not know how to open-code gathers, so it was still a win to enable the instruction for shorter vectors, but this has since changed.
The following are results on a Zen3 machine (the % column is the change in run time, so negative values are improvements):

| Benchmark | Master | Rate | Patch | Rate | % |
|-----------------+--------+------+-------+------+-------|
| 500.perlbench_r | 246 | 6.47 | 250 | 6.36 | 1.63 |
| 502.gcc_r | 215 | 6.59 | 215 | 6.59 | 0.00 |
| 505.mcf_r | 299 | 5.40 | 299 | 5.41 | 0.00 |
| 520.omnetpp_r | 250 | 5.25 | 249 | 5.27 | -0.40 |
| 523.xalancbmk_r | 197 | 5.37 | 195 | 5.43 | -1.02 |
| 525.x264_r | 160 | 11.0 | 160 | 11.0 | 0.00 |
| 531.deepsjeng_r | 242 | 4.73 | 240 | 4.78 | -0.83 |
| 541.leela_r | 353 | 4.70 | 355 | 4.67 | 0.57 |
| 548.exchange2_r | 146 | 17.9 | 146 | 17.9 | 0.00 |
| 557.xz_r | 290 | 3.72 | 291 | 3.71 | 0.34 |
|-----------------+--------+------+-------+------+-------|
| Geomean | | 6.34 | | 6.34 | |

| Benchmark | Master | Rate | Patch | Rate | % |
|-----------------+--------+------+-------+------+--------|
| 503.bwaves_r | 130 | 77.2 | 130 | 77.1 | 0.00 |
| 507.cactuBSSN_r | 246 | 5.16 | 245 | 5.17 | -0.41 |
| 508.namd_r | 163 | 5.84 | 162 | 5.85 | -0.61 |
| 510.parest_r | 277 | 9.45 | 218 | 12.0 | -21.30 |
| 511.povray_r | 286 | 8.17 | 281 | 8.31 | -1.75 |
| 519.lbm_r | 138 | 7.62 | 137 | 7.67 | -0.72 |
| 521.wrf_r | 166 | 13.5 | 167 | 13.5 | 0.60 |
| 526.blender_r | 214 | 7.13 | 215 | 7.10 | 0.47 |
| 527.cam4_r | 176 | 9.92 | 173 | 10.1 | -1.70 |
| 538.imagick_r | 306 | 8.13 | 315 | 7.90 | 2.94 |
| 544.nab_r | 199 | 8.46 | 199 | 8.44 | 0.00 |
| 549.fotonik3d_r | 254 | 15.4 | 243 | 16.1 | -4.33 |
| 554.roms_r | 210 | 7.57 | 210 | 7.58 | 0.00 |
|-----------------+--------+------+-------+------+--------|
| Geomean | | 10.0 | | 10.3 | |

So the main wins are on parest and fotonik3d. I looked into imagick and it looks like noise - the benchmarks were run by Martin and the regression did not reproduce for me on my Zen box. Bootstrapped/regtested x86_64-linux. I plan to commit tomorrow if there are no complaints.
Honza gcc/ChangeLog: 2022-03-28 Jan Hubicka <hubi...@ucw.cz> * config/i386/i386-builtins.cc (ix86_vectorize_builtin_gather): Test TARGET_USE_GATHER_2PARTS and TARGET_USE_GATHER_4PARTS. * config/i386/i386.h (TARGET_USE_GATHER_2PARTS): New macro. (TARGET_USE_GATHER_4PARTS): New macro. * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): New tune (X86_TUNE_USE_GATHER_4PARTS): New tune diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc index 2570501ae7e..4a222c9f2c7 100644 --- a/gcc/config/i386/i386-builtins.cc +++ b/gcc/config/i386/i386-builtins.cc @@ -1785,7 +1785,12 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype, bool si; enum ix86_builtins code; - if (! TARGET_AVX2 || !TARGET_USE_GATHER) + if (! TARGET_AVX2 + || (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), (unsigned)2) + ? !TARGET_USE_GATHER_2PARTS + : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), (unsigned)4) + ? !TARGET_USE_GATHER_4PARTS + : !TARGET_USE_GATHER))) return NULL_TREE; if ((TREE_CODE (index_type) != INTEGER_TYPE diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index b92955177fe..363082ba47b 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -390,6 +390,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_SLOW_PSHUFB] #define TARGET_AVOID_4BYTE_PREFIXES \ ix86_tune_features[X86_TUNE_AVOID_4BYTE_PREFIXES] +#define TARGET_USE_GATHER_2PARTS \ + ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] +#define TARGET_USE_GATHER_4PARTS \ + ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] #define TARGET_USE_GATHER \ ix86_tune_features[X86_TUNE_USE_GATHER] #define TARGET_FUSE_CMP_AND_BRANCH_32 \ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 82ca0ae63ac..09e3cf794db 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -464,7 +464,18 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | 
m_TREMONT | m_ALDERLAKE | m_INTEL) -/* X86_TUNE_USE_GATHER: Use gather instructions. */ +/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2 + elements. */ +DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", + ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC)) + +/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4 + elements. */ +DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", + ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC)) + +/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more + elements. */ DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather", ~(m_ZNVER1 | m_ZNVER2 | m_ALDERLAKE | m_GENERIC))