On Wed, 5 Feb 2025, Jakub Jelinek wrote:

> On Tue, Feb 04, 2025 at 01:11:10PM +0100, Richard Biener wrote:
> > OK, fair enough.  I think there should be a comment indicating this
> > isn't a fully conservative approach but handles only a certain class
> > of accesses, with the hope that this is all that's actually needed.
>
> Ok, here is an updated patch.
>
> cselib_invalidate_mem (callmem[1]) is now done solely for
> !ACCUMULATE_OUTGOING_ARGS || cfun->calls_alloca functions, I've
> renamed the variables to x_{base,off} and sp_{base,off} - the first
> pair is for x's address, the second pair for the current value of the
> stack pointer - tweaked/extended the comments, and sp_{base,off} is
> now initialized just once per cselib_invalidate_mem call, though
> still only lazily (i.e. only if we actually need it for something).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

LGTM.  For the archives I add a couple of standalone sketches below;
they are illustrations only, not requested changes.
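First, the gate for the new invalidation reads as a single per-function
predicate.  A minimal standalone sketch (the function name and the
plain-bool parameters standing in for the ACCUMULATE_OUTGOING_ARGS
target macro and cfun->calls_alloca are invented for illustration):

#include <stdbool.h>

/* Sketch: memory below the stack pointer needs invalidating across
   const/pure calls only when the stack pointer can move in the middle
   of the function.  */
static bool
sp_may_move_mid_function (bool accumulate_outgoing_args,
			  bool calls_alloca)
{
  /* Without accumulated outgoing arguments, pushing call arguments
     moves the stack pointer; alloca and VLAs move it regardless of
     the target setting.  */
  return !accumulate_outgoing_args || calls_alloca;
}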
> 2025-02-05  Jakub Jelinek  <ja...@redhat.com>
>
> 	PR rtl-optimization/117239
> 	* cselib.cc: Include predict.h.
> 	(callmem): Change type from rtx to rtx[2].
> 	(cselib_preserve_only_values): Use callmem[0] rather than callmem.
> 	(cselib_invalidate_mem): Optimize and don't try to invalidate
> 	for the mem_rtx == callmem[1] case MEMs which clearly can't be
> 	below the stack pointer.
> 	(cselib_process_insn): Use callmem[0] rather than callmem.
> 	For const/pure calls also call cselib_invalidate_mem (callmem[1])
> 	in !ACCUMULATE_OUTGOING_ARGS or cfun->calls_alloca functions.
> 	(cselib_init): Initialize callmem[0] rather than callmem and also
> 	initialize callmem[1].
>
> 	* gcc.dg/pr117239.c: New test.
>
> --- gcc/cselib.cc.jj	2025-02-01 00:46:53.073275225 +0100
> +++ gcc/cselib.cc	2025-02-05 09:15:47.394065496 +0100
> @@ -33,6 +33,7 @@ along with GCC; see the file COPYING3.
>  #include "cselib.h"
>  #include "function-abi.h"
>  #include "alias.h"
> +#include "predict.h"
>
>  /* A list of cselib_val structures.  */
>  struct elt_list
> @@ -248,8 +249,9 @@ static unsigned int *used_regs;
>  static unsigned int n_used_regs;
>
>  /* We pass this to cselib_invalidate_mem to invalidate all of
> -   memory for a non-const call instruction.  */
> -static GTY(()) rtx callmem;
> +   memory for a non-const call instruction and memory below the stack
> +   pointer for const/pure calls.  */
> +static GTY(()) rtx callmem[2];
>
>  /* Set by discard_useless_locs if it deleted the last location of any
>     value.  */
> @@ -808,7 +810,7 @@ cselib_preserve_only_values (void)
>    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
>      cselib_invalidate_regno (i, reg_raw_mode[i]);
>
> -  cselib_invalidate_mem (callmem);
> +  cselib_invalidate_mem (callmem[0]);
>
>    remove_useless_values ();
>
> @@ -2630,6 +2632,8 @@ cselib_invalidate_mem (rtx mem_rtx)
>        struct elt_loc_list **p = &v->locs;
>        bool had_locs = v->locs != NULL;
>        rtx_insn *setting_insn = v->locs ? v->locs->setting_insn : NULL;
> +      rtx sp_base = NULL_RTX;
> +      HOST_WIDE_INT sp_off = 0;
>
>        while (*p)
>  	{
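The hunk that follows is the heart of the patch: it tries to rewrite a
MEM's address as an sp-derived base VALUE plus a constant offset before
deciding whether the MEM can be below the stack pointer.  For readers
not fluent in cselib's VALUE/location-list representation, a standalone
model of just that decomposition (all types and names are invented;
real locations are rtx expressions, reduced here to a flat list):

#include <stdbool.h>

typedef long hwi;	/* stand-in for HOST_WIDE_INT */

/* A location either marks the VALUE as sp-derived itself, or models
   a (plus (value SP-derived) (const_int OFF)) expression.  */
struct loc
{
  bool is_sp_plus_const;
  hwi off;
  struct loc *next;
};

/* Model of the x_base/x_off (and sp_base/sp_off) computation: return
   true and set *OFF if the value decomposes as an sp-derived base plus
   a constant offset.  */
static bool
decompose_sp_address (bool value_is_sp_derived, const struct loc *locs,
		      hwi *off)
{
  if (value_is_sp_derived)
    {
      *off = 0;
      return true;
    }
  for (const struct loc *l = locs; l; l = l->next)
    if (l->is_sp_plus_const)
      {
	*off = l->off;
	return true;
      }
  return false;
}

Note the asymmetry in how a failed decomposition is used: if x's
address doesn't decompose at all, the patch leaves x_base NULL and the
MEM is *not* invalidated, on the grounds that memory below the stack
pointer should only ever be accessed through sp-derived addresses.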
> @@ -2644,6 +2648,114 @@
>  	      p = &(*p)->next;
>  	      continue;
>  	    }
> +
> +	  /* When invalidating memory below the stack pointer for
> +	     const/pure calls and alloca/VLAs aren't used, attempt to
> +	     optimize.  Values stored into the area sometimes below the
> +	     stack pointer shouldn't be addressable and should be stored
> +	     only through stack pointer derived expressions, so don't
> +	     invalidate MEMs not using stack derived addresses, or MEMs
> +	     which clearly aren't below the stack pointer.  This isn't a
> +	     fully conservative approach; the hope is that invalidating
> +	     more MEMs than this isn't actually needed.  */
> +	  if (mem_rtx == callmem[1]
> +	      && num_mems < param_max_cselib_memory_locations
> +	      && GET_CODE (XEXP (x, 0)) == VALUE
> +	      && !cfun->calls_alloca)
> +	    {
> +	      cselib_val *v2 = CSELIB_VAL_PTR (XEXP (x, 0));
> +	      rtx x_base = NULL_RTX;
> +	      HOST_WIDE_INT x_off = 0;
> +	      if (SP_DERIVED_VALUE_P (v2->val_rtx))
> +		x_base = v2->val_rtx;
> +	      else
> +		for (struct elt_loc_list *l = v2->locs; l; l = l->next)
> +		  if (GET_CODE (l->loc) == PLUS
> +		      && GET_CODE (XEXP (l->loc, 0)) == VALUE
> +		      && SP_DERIVED_VALUE_P (XEXP (l->loc, 0))
> +		      && CONST_INT_P (XEXP (l->loc, 1)))
> +		    {
> +		      x_base = XEXP (l->loc, 0);
> +		      x_off = INTVAL (XEXP (l->loc, 1));
> +		      break;
> +		    }
> +	      /* If x_base is NULL here, don't invalidate x; its address
> +		 isn't derived from sp in a way that could place it in
> +		 the outgoing argument area of some call in a
> +		 !ACCUMULATE_OUTGOING_ARGS function.  */
> +	      if (x_base)
> +		{
> +		  if (sp_base == NULL_RTX)
> +		    {
> +		      if (cselib_val *v3
> +			    = cselib_lookup_1 (stack_pointer_rtx, Pmode, 0,
> +					       VOIDmode))
> +			{
> +			  if (SP_DERIVED_VALUE_P (v3->val_rtx))
> +			    sp_base = v3->val_rtx;
> +			  else
> +			    for (struct elt_loc_list *l = v3->locs;
> +				 l; l = l->next)
> +			      if (GET_CODE (l->loc) == PLUS
> +				  && GET_CODE (XEXP (l->loc, 0)) == VALUE
> +				  && SP_DERIVED_VALUE_P (XEXP (l->loc, 0))
> +				  && CONST_INT_P (XEXP (l->loc, 1)))
> +				{
> +				  sp_base = XEXP (l->loc, 0);
> +				  sp_off = INTVAL (XEXP (l->loc, 1));
> +				  break;
> +				}
> +			}
> +		      if (sp_base == NULL_RTX)
> +			sp_base = pc_rtx;
> +		    }
> +		  /* Otherwise, if x_base and sp_base are the same, we
> +		     know that x_base + x_off is x's address and
> +		     sp_base + sp_off is the current value of the stack
> +		     pointer, so try to determine whether x is certainly
> +		     not below the stack pointer.  */
> +		  if (sp_base == x_base)
> +		    {
> +		      if (STACK_GROWS_DOWNWARD)
> +			{
> +			  HOST_WIDE_INT off = sp_off;
> +#ifdef STACK_ADDRESS_OFFSET
> +			  /* On SPARC take the stack pointer bias into
> +			     account as well.  */
> +			  off += (STACK_ADDRESS_OFFSET
> +				  - FIRST_PARM_OFFSET (current_function_decl));
> +#endif
> +			  if (x_off >= off)
> +			    /* x is at or above the current stack
> +			       pointer, no need to invalidate it.  */
> +			    x_base = NULL_RTX;
> +			}
> +		      else
> +			{
> +			  HOST_WIDE_INT sz;
> +			  enum machine_mode mode = GET_MODE (x);
> +			  if ((MEM_SIZE_KNOWN_P (x)
> +			       && MEM_SIZE (x).is_constant (&sz))
> +			      || (mode != BLKmode
> +				  && GET_MODE_SIZE (mode).is_constant (&sz)))
> +			    if (x_off < sp_off
> +				&& ((HOST_WIDE_INT) ((unsigned HOST_WIDE_INT)
> +						     x_off + sz) <= sp_off))
> +			      /* x's end is below or at the current stack
> +				 pointer on a !STACK_GROWS_DOWNWARD
> +				 target, no need to invalidate it.  */
> +			      x_base = NULL_RTX;
> +			}
> +		    }
> +		}
> +	      if (x_base == NULL_RTX)
> +		{
> +		  has_mem = true;
> +		  num_mems++;
> +		  p = &(*p)->next;
> +		  continue;
> +		}
> +	    }
> +
>  	  if (num_mems < param_max_cselib_memory_locations
>  	      && ! canon_anti_dependence (x, false, mem_rtx,
>  					  GET_MODE (mem_rtx), mem_addr))
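Once both x's address and the current stack pointer decompose to the
same base VALUE, the "certainly not below the stack pointer" test above
is plain offset arithmetic.  Restated standalone (invented name, sizes
and offsets as plain integers, ignoring the unsigned-overflow guard and
the SPARC stack-bias adjustment the patch applies):

#include <stdbool.h>

typedef long hwi;	/* stand-in for HOST_WIDE_INT */

/* Return true if a MEM of SIZE bytes at base + X_OFF provably cannot
   overlap the area a const/pure call may clobber, given that the
   stack pointer is at base + SP_OFF.  */
static bool
mem_certainly_not_clobbered (bool stack_grows_downward,
			     hwi x_off, hwi size, hwi sp_off)
{
  if (stack_grows_downward)
    /* The clobberable area lies strictly below the stack pointer, so
       a MEM starting at or above it is safe whatever its size.  */
    return x_off >= sp_off;
  /* Stack grows upward: the clobberable area lies at and above the
     stack pointer, so the MEM must end at or below it.  */
  return x_off < sp_off && x_off + size <= sp_off;
}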
> @@ -3196,14 +3308,24 @@ cselib_process_insn (rtx_insn *insn)
>  	 as if they were regular functions.  */
>        if (RTL_LOOPING_CONST_OR_PURE_CALL_P (insn)
>  	  || !(RTL_CONST_OR_PURE_CALL_P (insn)))
> -	cselib_invalidate_mem (callmem);
> +	cselib_invalidate_mem (callmem[0]);
>        else
> -	/* For const/pure calls, invalidate any argument slots because
> -	   they are owned by the callee.  */
> -	for (x = CALL_INSN_FUNCTION_USAGE (insn); x; x = XEXP (x, 1))
> -	  if (GET_CODE (XEXP (x, 0)) == USE
> -	      && MEM_P (XEXP (XEXP (x, 0), 0)))
> -	    cselib_invalidate_mem (XEXP (XEXP (x, 0), 0));
> +	{
> +	  /* For const/pure calls, invalidate any argument slots because
> +	     they are owned by the callee.  */
> +	  for (x = CALL_INSN_FUNCTION_USAGE (insn); x; x = XEXP (x, 1))
> +	    if (GET_CODE (XEXP (x, 0)) == USE
> +		&& MEM_P (XEXP (XEXP (x, 0), 0)))
> +	      cselib_invalidate_mem (XEXP (XEXP (x, 0), 0));
> +	  /* And invalidate memory below the stack pointer (or above it
> +	     for !STACK_GROWS_DOWNWARD), as even a const/pure call can
> +	     clobber that.  Do this only if !ACCUMULATE_OUTGOING_ARGS or
> +	     if cfun->calls_alloca; otherwise the stack pointer shouldn't
> +	     change in the middle of the function and nothing should be
> +	     stored below the stack pointer.  */
> +	  if (!ACCUMULATE_OUTGOING_ARGS || cfun->calls_alloca)
> +	    cselib_invalidate_mem (callmem[1]);
> +	}
>      }
>
>    cselib_record_sets (insn);
> @@ -3256,8 +3378,31 @@ cselib_init (int record_what)
>
>    /* (mem:BLK (scratch)) is a special mechanism to conflict with everything,
>       see canon_true_dependence.  This is only created once.  */
> -  if (! callmem)
> -    callmem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
> +  if (! callmem[0])
> +    callmem[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
> +  /* Similarly create a MEM representing roughly everything below
> +     the stack for STACK_GROWS_DOWNWARD targets or everything above
> +     it otherwise.  Do this only when !ACCUMULATE_OUTGOING_ARGS or
> +     if cfun->calls_alloca; otherwise the stack pointer shouldn't
> +     change in the middle of the function and nothing should be
> +     stored below the stack pointer.  */
> +  if (!callmem[1] && (!ACCUMULATE_OUTGOING_ARGS || cfun->calls_alloca))
> +    {
> +      if (STACK_GROWS_DOWNWARD)
> +	{
> +	  unsigned HOST_WIDE_INT off = -(GET_MODE_MASK (Pmode) >> 1);
> +#ifdef STACK_ADDRESS_OFFSET
> +	  /* On SPARC take the stack pointer bias into account as well.  */
> +	  off += (STACK_ADDRESS_OFFSET
> +		  - FIRST_PARM_OFFSET (current_function_decl));
> +#endif
> +	  callmem[1] = plus_constant (Pmode, stack_pointer_rtx, off);
> +	}
> +      else
> +	callmem[1] = stack_pointer_rtx;
> +      callmem[1] = gen_rtx_MEM (BLKmode, callmem[1]);
> +      set_mem_size (callmem[1], GET_MODE_MASK (Pmode) >> 1);
> +    }
>
>    cselib_nregs = max_reg_num ();
>
> --- gcc/testsuite/gcc.dg/pr117239.c.jj	2025-02-03 11:13:59.399159640 +0100
> +++ gcc/testsuite/gcc.dg/pr117239.c	2025-02-03 11:13:59.399159640 +0100
> @@ -0,0 +1,42 @@
> +/* PR rtl-optimization/117239 */
> +/* { dg-do run } */
> +/* { dg-options "-fno-inline -O2" } */
> +/* { dg-additional-options "-fschedule-insns" { target i?86-*-* x86_64-*-* } } */
> +
> +int a, b, c = 1, d;
> +
> +int
> +foo (void)
> +{
> +  return a;
> +}
> +
> +struct A {
> +  int e, f, g, h;
> +  short i;
> +  int j;
> +};
> +
> +void
> +bar (int x, struct A y)
> +{
> +  if (y.j == 1)
> +    c = 0;
> +}
> +
> +int
> +baz (struct A x)
> +{
> +  return b;
> +}
> +
> +int
> +main ()
> +{
> +  struct A k = { 0, 0, 0, 0, 0, 1 };
> +  d = baz (k);
> +  bar (foo (), k);
> +  if (c != 0)
> +    __builtin_abort ();
> +  return 0;
> +}
>
> 	Jakub

Richard.

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)