On Wed, Feb 9, 2022 at 10:53 AM H.J. Lu via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > commit 9775e465c1fbfc32656de77c618c61acf5bd905d > Author: H.J. Lu <hjl.to...@gmail.com> > Date: Tue Jul 27 07:46:04 2021 -0700 > > x86: Don't set AVX_U128_DIRTY when zeroing YMM/ZMM register > > called ix86_check_avx_upper_register to check mode on source operand. > But ix86_check_avx_upper_register doesn't work on source operand like > The new function ix86_avx_u128_mode_source just takes the code from the *else if (ix86_check_avx_upper_register (src))* branch to check each component of src that meets the ix86_check_avx_upper_register condition, which seems reasonable.
The patch LGTM. > (vec_select:V2DI (reg/v:V4DI 23 xmm3 [orig:91 ymm ] [91]) > (parallel [ > (const_int 2 [0x2]) > (const_int 3 [0x3]) > ])) > > Add ix86_avx_u128_mode_source to check mode for each component of source > operand. > > gcc/ > > PR target/104441 > * config/i386/i386.cc (ix86_avx_u128_mode_source): New function. > (ix86_avx_u128_mode_needed): Return AVX_U128_ANY for debug INSN. > Call ix86_avx_u128_mode_source to check mode for each component > of source operand. > > gcc/testsuite/ > > PR target/104441 > * gcc.target/i386/pr104441-1a.c: New test. > * gcc.target/i386/pr104441-1b.c: Likewise. > --- > gcc/config/i386/i386.cc | 145 +++++++++++--------- > gcc/testsuite/gcc.target/i386/pr104441-1a.c | 57 ++++++++ > gcc/testsuite/gcc.target/i386/pr104441-1b.c | 32 +++++ > 3 files changed, 168 insertions(+), 66 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr104441-1a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr104441-1b.c > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index dd5584fb8ed..2d87acca7ff 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -14365,11 +14365,82 @@ ix86_check_avx_upper_stores (rtx dest, const_rtx, > void *data) > } > } > > +/* For YMM/ZMM store or YMM/ZMM extract. Return mode for the source > + operand of SRC DEFs in the same basic block before INSN. */ > + > +static int > +ix86_avx_u128_mode_source (rtx_insn *insn, const_rtx src) > +{ > + basic_block bb = BLOCK_FOR_INSN (insn); > + rtx_insn *end = BB_END (bb); > + > + /* Return AVX_U128_DIRTY if there is no DEF in the same basic > + block. */ > + int status = AVX_U128_DIRTY; > + > + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src)); > + def; def = DF_REF_NEXT_REG (def)) > + if (DF_REF_BB (def) == bb) > + { > + /* Ignore DEF from different basic blocks. */ > + rtx_insn *def_insn = DF_REF_INSN (def); > + > + /* Check if DEF_INSN is before INSN. */ > + rtx_insn *next; > + for (next = NEXT_INSN (def_insn); > + next != nullptr && next != end && next != insn; > + next = NEXT_INSN (next)) > + ; > + > + /* Skip if DEF_INSN isn't before INSN. */ > + if (next != insn) > + continue; > + > + /* Return AVX_U128_DIRTY if the source operand of DEF_INSN > + isn't constant zero. */ > + > + if (CALL_P (def_insn)) > + { > + bool avx_upper_reg_found = false; > + note_stores (def_insn, > + ix86_check_avx_upper_stores, > + &avx_upper_reg_found); > + > + /* Return AVX_U128_DIRTY if call returns AVX. */ > + if (avx_upper_reg_found) > + return AVX_U128_DIRTY; > + > + continue; > + } > + > + rtx set = single_set (def_insn); > + if (!set) > + return AVX_U128_DIRTY; > + > + rtx dest = SET_DEST (set); > + > + /* Skip if DEF_INSN is not an AVX load. Return AVX_U128_DIRTY > + if the source operand isn't constant zero. */ > + if (ix86_check_avx_upper_register (dest) > + && standard_sse_constant_p (SET_SRC (set), > + GET_MODE (dest)) != 1) > + return AVX_U128_DIRTY; > + > + /* We get here only if all AVX loads are from constant zero. */ > + status = AVX_U128_ANY; > + } > + > + return status; > +} > + > /* Return needed mode for entity in optimize_mode_switching pass. */ > > static int > ix86_avx_u128_mode_needed (rtx_insn *insn) > { > + if (DEBUG_INSN_P (insn)) > + return AVX_U128_ANY; > + > if (CALL_P (insn)) > { > rtx link; > @@ -14409,6 +14480,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) > return AVX_U128_CLEAN; > } > > + subrtx_iterator::array_type array; > + > rtx set = single_set (insn); > if (set) > { > @@ -14423,74 +14496,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) > else > return AVX_U128_ANY; > } > - else if (ix86_check_avx_upper_register (src)) > + else > { > - /* This is an YMM/ZMM store. Check for the source operand > - of SRC DEFs in the same basic block before INSN. */ > - basic_block bb = BLOCK_FOR_INSN (insn); > - rtx_insn *end = BB_END (bb); > - > - /* Return AVX_U128_DIRTY if there is no DEF in the same basic > - block. */ > - int status = AVX_U128_DIRTY; > - > - for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src)); > - def; def = DF_REF_NEXT_REG (def)) > - if (DF_REF_BB (def) == bb) > + FOR_EACH_SUBRTX (iter, array, src, NONCONST) > + if (ix86_check_avx_upper_register (*iter)) > { > - /* Ignore DEF from different basic blocks. */ > - rtx_insn *def_insn = DF_REF_INSN (def); > - > - /* Check if DEF_INSN is before INSN. */ > - rtx_insn *next; > - for (next = NEXT_INSN (def_insn); > - next != nullptr && next != end && next != insn; > - next = NEXT_INSN (next)) > - ; > - > - /* Skip if DEF_INSN isn't before INSN. */ > - if (next != insn) > - continue; > - > - /* Return AVX_U128_DIRTY if the source operand of > - DEF_INSN isn't constant zero. */ > - > - if (CALL_P (def_insn)) > - { > - bool avx_upper_reg_found = false; > - note_stores (def_insn, ix86_check_avx_upper_stores, > - &avx_upper_reg_found); > - > - /* Return AVX_U128_DIRTY if call returns AVX. */ > - if (avx_upper_reg_found) > - return AVX_U128_DIRTY; > - > - continue; > - } > - > - set = single_set (def_insn); > - if (!set) > - return AVX_U128_DIRTY; > - > - dest = SET_DEST (set); > - > - /* Skip if DEF_INSN is not an AVX load. */ > - if (ix86_check_avx_upper_register (dest)) > - { > - src = SET_SRC (set); > - /* Return AVX_U128_DIRTY if the source operand isn't > - constant zero. */ > - if (standard_sse_constant_p (src, GET_MODE (dest)) > - != 1) > - return AVX_U128_DIRTY; > - } > - > - /* We get here only if all AVX loads are from constant > - zero. */ > - status = AVX_U128_ANY; > + int status = ix86_avx_u128_mode_source (insn, *iter); > + if (status == AVX_U128_DIRTY) > + return status; > } > - > - return status; > } > > /* This isn't YMM/ZMM load/store. */ > @@ -14501,7 +14515,6 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) > Hardware changes state only when a 256bit register is written to, > but we need to prevent the compiler from moving optimal insertion > point above eventual read from 256bit or 512 bit register. */ > - subrtx_iterator::array_type array; > FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) > if (ix86_check_avx_upper_register (*iter)) > return AVX_U128_DIRTY; > diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1a.c > b/gcc/testsuite/gcc.target/i386/pr104441-1a.c > new file mode 100644 > index 00000000000..f4d263205f8 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr104441-1a.c > @@ -0,0 +1,57 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mtune=skylake -Wno-attributes" } */ > + > +#include <x86intrin.h> > +#include <stdint.h> > + > +__attribute__((always_inline, target("avx2"))) > +static __m256i > +load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride) > +{ > + __m128i src01, src23; > + src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride)); > + src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1); > + return _mm256_setr_m128i(src01, src23); > +} > + > +__attribute__ ((noinline, noipa, target("avx2"))) > +uint32_t > +compute4x_m_sad_avx2_intrin(uint8_t *src, uint32_t src_stride, > + uint8_t *ref, uint32_t ref_stride, > + uint32_t height) > +{ > + __m128i xmm0; > + __m256i ymm = _mm256_setzero_si256(); > + uint32_t y; > + > + for (y = 0; y < height; y += 4) { > + const __m256i src0123 = load8bit_4x4_avx2(src, src_stride); > + const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride); > + ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123)); > + src += src_stride << 2; > + ref += ref_stride << 2; > + } > + > + xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm), > + _mm256_extracti128_si256(ymm, 1)); > + > + return (uint32_t)_mm_cvtsi128_si32(xmm0); > +} > + > +/* Expect assembly like: > + > + vextracti128 $0x1, %ymm3, %xmm3 > + vpaddd %xmm3, %xmm0, %xmm0 > + vmovd %xmm0, %eax > + vzeroupper > + > +rather than: > + > + vzeroupper > + vextracti128 $0x1, %ymm3, %xmm3 > + vpaddd %xmm3, %xmm0, %xmm0 > + vmovd %xmm0, %eax > + > + */ > + > +/* { dg-final { scan-assembler "\[ \t\]+vextracti128\[ \t\]+\[^\n\]+\n\[ > \t\]+vpaddd\[ \t\]+\[^\n\]+\n\[ \t\]+vmovd\[ \t\]+\[^\n\]+\n\[ > \t\]+vzeroupper" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1b.c > b/gcc/testsuite/gcc.target/i386/pr104441-1b.c > new file mode 100644 > index 00000000000..0b8a796d93c > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr104441-1b.c > @@ -0,0 +1,32 @@ > +/* { dg-do run } */ > +/* { dg-options "-O3 -mvzeroupper -Wno-attributes" } */ > + > +#include "pr104441-1a.c" > + > +#define ARRAY_SIZE 255 > + > +__attribute__ ((noinline, noipa)) > +static void > +do_test (void) > +{ > + uint8_t src[ARRAY_SIZE]; > + uint8_t ref[ARRAY_SIZE]; > + uint32_t x; > + uint32_t i; > + for (i = 0; i < ARRAY_SIZE; i++) > + { > + src[i] = i; > + ref[i] = i; > + } > + x = compute4x_m_sad_avx2_intrin(src, 64 >> 2, ref, 64, 4); > + if (x != 0x240) > + __builtin_abort (); > +} > + > +int > +main () > +{ > + if (__builtin_cpu_supports ("avx2")) > + do_test (); > + return 0; > +} > -- > 2.34.1 > -- BR, Hongtao