On Tue, Mar 16, 2021 at 10:51 AM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > My r10-6451-gb7b3378f91c0641f2ef4d88db22af62a571c9359 fix for > vzeroupper vs. ms ABI apparently broke the explicit vzeroupper handling > when the implicit vzeroupper handling is disabled. > The epilogue_completed splitter for vzeroupper now adds clobbers for all > registers which don't have explicit sets in the pattern and the sets are > added during vzeroupper pass. Before my changes, for explicit user > vzeroupper, we just weren't modelling its effects at all, it was just > unspec that didn't tell that it clobbers the upper parts of all XMM < %xmm16 > registers. But now the splitter will even for those add clobbers and as > it has no sets, it will add clobbers for all registers, which means > we optimize away anything that lived across that vzeroupper. > > The vzeroupper pass has two parts, one is the mode switching that computes > where to put the implicit vzeroupper calls and puts them there, and then > another that uses df to figure out what sets to add to all the vzeroupper. > The former part should be done only under the conditions we have in the > gate, but the latter as this PR shows needs to happen either if we perform > the implicit vzeroupper additions, or if there are (or could be) any > explicit vzeroupper instructions. As that function does df_analyze and > walks the whole IL, I think it would be too expensive to run it always > whenever TARGET_AVX, so this patch remembers if we've expanded at least > one __builtin_ia32_vzeroupper in the function and runs that part of the > vzeroupper pass both when the old condition is true or when this new > flag is set. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2021-03-16 Jakub Jelinek <ja...@redhat.com> > > PR target/99563 > * config/i386/i386.h (struct machine_function): Add > has_explicit_vzeroupper bitfield. 
> * config/i386/i386-expand.c (ix86_expand_builtin): Set > cfun->machine->has_explicit_vzeroupper when expanding > IX86_BUILTIN_VZEROUPPER. > * config/i386/i386-features.c (rest_of_handle_insert_vzeroupper): > Do the mode switching only when TARGET_VZEROUPPER, expensive > optimizations turned on and not optimizing for size. > (pass_insert_vzeroupper::gate): Enable even when > cfun->machine->has_explicit_vzeroupper is set. > > * gcc.target/i386/avx-pr99563.c: New test.
OK. Thanks, Uros. > > --- gcc/config/i386/i386.h.jj 2021-02-22 17:54:05.617799002 +0100 > +++ gcc/config/i386/i386.h 2021-03-15 12:30:00.814841624 +0100 > @@ -2941,6 +2941,10 @@ struct GTY(()) machine_function { > /* True if the function needs a stack frame. */ > BOOL_BITFIELD stack_frame_required : 1; > > + /* True if __builtin_ia32_vzeroupper () has been expanded in current > + function. */ > + BOOL_BITFIELD has_explicit_vzeroupper : 1; > + > /* The largest alignment, in bytes, of stack slot actually used. */ > unsigned int max_used_stack_alignment; > > --- gcc/config/i386/i386-expand.c.jj 2021-02-09 12:28:14.069323264 +0100 > +++ gcc/config/i386/i386-expand.c 2021-03-15 12:34:26.549901726 +0100 > @@ -13210,6 +13210,10 @@ rdseed_step: > > return 0; > > + case IX86_BUILTIN_VZEROUPPER: > + cfun->machine->has_explicit_vzeroupper = true; > + break; > + > default: > break; > } > --- gcc/config/i386/i386-features.c.jj 2021-02-01 09:55:45.953519272 +0100 > +++ gcc/config/i386/i386-features.c 2021-03-15 12:37:07.886116827 +0100 > @@ -1837,19 +1837,22 @@ ix86_add_reg_usage_to_vzerouppers (void) > static unsigned int > rest_of_handle_insert_vzeroupper (void) > { > - int i; > - > - /* vzeroupper instructions are inserted immediately after reload to > - account for possible spills from 256bit or 512bit registers. The pass > - reuses mode switching infrastructure by re-running mode insertion > - pass, so disable entities that have already been processed. */ > - for (i = 0; i < MAX_386_ENTITIES; i++) > - ix86_optimize_mode_switching[i] = 0; > + if (TARGET_VZEROUPPER > + && flag_expensive_optimizations > + && !optimize_size) > + { > + /* vzeroupper instructions are inserted immediately after reload to > + account for possible spills from 256bit or 512bit registers. The pass > + reuses mode switching infrastructure by re-running mode insertion > + pass, so disable entities that have already been processed. 
*/ > + for (int i = 0; i < MAX_386_ENTITIES; i++) > + ix86_optimize_mode_switching[i] = 0; > > - ix86_optimize_mode_switching[AVX_U128] = 1; > + ix86_optimize_mode_switching[AVX_U128] = 1; > > - /* Call optimize_mode_switching. */ > - g->get_passes ()->execute_pass_mode_switching (); > + /* Call optimize_mode_switching. */ > + g->get_passes ()->execute_pass_mode_switching (); > + } > ix86_add_reg_usage_to_vzerouppers (); > return 0; > } > @@ -1880,8 +1883,10 @@ public: > virtual bool gate (function *) > { > return TARGET_AVX > - && TARGET_VZEROUPPER && flag_expensive_optimizations > - && !optimize_size; > + && ((TARGET_VZEROUPPER > + && flag_expensive_optimizations > + && !optimize_size) > + || cfun->machine->has_explicit_vzeroupper); > } > > virtual unsigned int execute (function *) > --- gcc/testsuite/gcc.target/i386/avx-pr99563.c.jj 2021-03-15 13:18:08.896950279 +0100 > +++ gcc/testsuite/gcc.target/i386/avx-pr99563.c 2021-03-15 13:17:28.881392012 +0100 > @@ -0,0 +1,38 @@ > +/* PR target/99563 */ > +/* { dg-do run { target avx } } */ > +/* { dg-options "-O2 -mavx -mno-vzeroupper" } */ > + > +#include "avx-check.h" > +#include <immintrin.h> > + > + > +__attribute__((noipa)) float > +compute_generic (void) > +{ > + return 0.0f; > +} > + > +static inline __attribute__((always_inline)) > +float compute_avx (unsigned long block_count) > +{ > + __m128d mm_res = _mm_set1_pd (256.0); > + float res = (float) (_mm_cvtsd_f64 (mm_res) / (double) block_count); > + _mm256_zeroupper (); > + return res; > +} > + > +__attribute__((noipa)) float > +compute (unsigned long block_count) > +{ > + if (block_count >= 64) > + return compute_avx (block_count); > + else > + return compute_generic (); > +} > + > +static void > +avx_test (void) > +{ > + if (compute (128) != 2.0f || compute (32) != 0.0f) > + abort (); > +} > > Jakub >