On Mon, May 26, 2025 at 1:40 PM Andrew Pinski <pins...@gmail.com> wrote: > > On Mon, May 26, 2025 at 5:36 AM Richard Biener > <richard.guent...@gmail.com> wrote: > > > > On Sun, May 18, 2025 at 10:58 PM Andrew Pinski <quic_apin...@quicinc.com> > > wrote: > > > > > > This implements a simple copy propagation for aggregates in the similar > > > fashion as we already do for copy prop of zeroing. > > > > > > Right now this only looks at the previous vdef statement but this allows > > > us > > > to catch a lot of cases that show up in C++ code. > > > > > > Also deletes aggregate copies that are to the same location (PR57361), > > > this was > > > already done in DSE but we should do it here also since it is simple to > > > add and > > > when doing a copy to a temporary and back to itself should be deleted too. > > > So we need a variant that tests DSE and one for forwprop. > > > > > > Also adds a variant of pr22237.c which was found while working on this > > > patch. > > > > > > PR tree-optimization/14295 > > > PR tree-optimization/108358 > > > PR tree-optimization/114169 > > > > > > gcc/ChangeLog: > > > > > > * tree-ssa-forwprop.cc (optimize_agr_copyprop): New function. > > > (pass_forwprop::execute): Call optimize_agr_copyprop for > > > load/store statements. > > > > > > gcc/testsuite/ChangeLog: > > > > > > * gcc.dg/tree-ssa/20031106-6.c: Un-xfail. Add scan for forwprop1. > > > * g++.dg/opt/pr66119.C: Disable forwprop since that does > > > the copy prop now. > > > * gcc.dg/tree-ssa/pr108358-a.c: New test. > > > * gcc.dg/tree-ssa/pr114169-1.c: New test. > > > * gcc.c-torture/execute/builtins/pr22237-1-lib.c: New test. > > > * gcc.c-torture/execute/builtins/pr22237-1.c: New test. > > > * gcc.dg/tree-ssa/pr57361.c: Disable forwprop1. > > > * gcc.dg/tree-ssa/pr57361-1.c: New test. > > > > > > Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com> > > > --- > > > gcc/testsuite/g++.dg/opt/pr66119.C | 2 +- > > > .../execute/builtins/pr22237-1-lib.c | 27 +++++ > > > .../execute/builtins/pr22237-1.c | 57 ++++++++++ > > > gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c | 8 +- > > > gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c | 33 ++++++ > > > gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c | 39 +++++++ > > > gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c | 9 ++ > > > gcc/testsuite/gcc.dg/tree-ssa/pr57361.c | 2 +- > > > gcc/tree-ssa-forwprop.cc | 103 ++++++++++++++++++ > > > 9 files changed, 276 insertions(+), 4 deletions(-) > > > create mode 100644 > > > gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c > > > create mode 100644 > > > gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c > > > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c > > > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c > > > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c > > > > > > diff --git a/gcc/testsuite/g++.dg/opt/pr66119.C > > > b/gcc/testsuite/g++.dg/opt/pr66119.C > > > index d1b1845a258..52362e44434 100644 > > > --- a/gcc/testsuite/g++.dg/opt/pr66119.C > > > +++ b/gcc/testsuite/g++.dg/opt/pr66119.C > > > @@ -3,7 +3,7 @@ > > > the value of MOVE_RATIO now is. */ > > > > > > /* { dg-do compile { target { { i?86-*-* x86_64-*-* } && c++11 } } } > > > */ > > > -/* { dg-options "-O3 -mavx -fdump-tree-sra -march=slm -mtune=slm > > > -fno-early-inlining" } */ > > > +/* { dg-options "-O3 -mavx -fdump-tree-sra -fno-tree-forwprop -march=slm > > > -mtune=slm -fno-early-inlining" } */ > > > // { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! > > > hostedlib } } > > > > > > #include <immintrin.h> > > > diff --git a/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c > > > b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c > > > new file mode 100644 > > > index 00000000000..44032357405 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c > > > @@ -0,0 +1,27 @@ > > > +extern void abort (void); > > > + > > > +void * > > > +memcpy (void *dst, const void *src, __SIZE_TYPE__ n) > > > +{ > > > + const char *srcp; > > > + char *dstp; > > > + > > > + srcp = src; > > > + dstp = dst; > > > + > > > + if (dst < src) > > > + { > > > + if (dst + n > src) > > > + abort (); > > > + } > > > + else > > > + { > > > + if (src + n > dst) > > > + abort (); > > > + } > > > + > > > + while (n-- != 0) > > > + *dstp++ = *srcp++; > > > + > > > + return dst; > > > +} > > > diff --git a/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c > > > b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c > > > new file mode 100644 > > > index 00000000000..0a12b0fc9a1 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c > > > @@ -0,0 +1,57 @@ > > > +extern void abort (void); > > > +extern void exit (int); > > > +struct s { unsigned char a[256]; }; > > > +union u { struct { struct s b; int c; } d; struct { int c; struct s b; } > > > e; }; > > > +static union u v; > > > +static union u v0; > > > +static struct s *p = &v.d.b; > > > +static struct s *q = &v.e.b; > > > + > > > +struct outers > > > +{ > > > + struct s inner; > > > +}; > > > + > > > +static inline struct s rp (void) { return *p; } > > > +static inline struct s rq (void) { return *q; } > > > +static void pq (void) > > > +{ > > > + struct outers o = {rq () }; > > > + *p = o.inner; > > > +} > > > +static void qp (void) > > > +{ > > > + struct outers o = {rp () }; > > > + *q = o.inner; > > > +} > > > + > > > +static void > > > +init (struct s *sp) > > > +{ > > > + int i; > > > + for (i = 0; i < 256; i++) > > > + sp->a[i] = i; > > > +} > > > + > > > +static void > > > +check (struct s *sp) > > > +{ > > > + int i; > > > + for (i = 0; i < 256; i++) > > > + if (sp->a[i] != i) > > > + abort (); > > > +} > > > + > > > +void > > > +main_test (void) > > > +{ > > > + v = v0; > > > + init (p); > > > + qp (); > > > + check (q); > > > + v = v0; > > > + init (q); > > > + pq (); > > > + check (p); > > > + exit (0); > > > +} > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c > > > b/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c > > > index 56d1887bd78..c7e00887c16 100644 > > > --- a/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c > > > @@ -1,5 +1,7 @@ > > > /* { dg-do compile } */ > > > -/* { dg-options "-O1 -fno-tree-sra -fdump-tree-optimized" } */ > > > +/* { dg-options "-O1 -fno-tree-sra -fdump-tree-optimized > > > -fdump-tree-forwprop1-details" } */ > > > + > > > +/* PR tree-optimization/14295 */ > > > > > > extern void link_error (void); > > > > > > @@ -25,4 +27,6 @@ struct s foo (struct s r) > > > > > > /* There should be no references to any of "temp_struct*" > > > temporaries. */ > > > -/* { dg-final { scan-tree-dump-times "temp_struct" 0 "optimized" { xfail > > > *-*-* } } } */ > > > +/* { dg-final { scan-tree-dump-times "temp_struct" 0 "optimized" } } */ > > > +/* Also check that forwprop pass did the copy prop. */ > > > +/* { dg-final { scan-tree-dump-times "after previous" 3 "forwprop1" } } > > > */ > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c > > > b/gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c > > > new file mode 100644 > > > index 00000000000..342e1c1a5c2 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c > > > @@ -0,0 +1,33 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-Os -fdump-tree-optimized" } */ > > > + > > > +/* PR tree-optimization/108358 */ > > > + > > > +struct a { > > > + int b; > > > + int c; > > > + short d; > > > + int e; > > > + int f; > > > +}; > > > +struct g { > > > + struct a f; > > > + struct a h; > > > +}; > > > +int i; > > > +void foo(); > > > +void bar31_(void); > > > +int main() { > > > + struct g j, l = {2, 1, 6, 1, 1, 7, 5, 1, 0, 1}; > > > + for (; i; ++i) > > > + bar31_(); > > > + j = l; > > > + struct g m = j; > > > + struct g k = m; > > > + if (k.h.b) > > > + ; > > > + else > > > + foo(); > > > +} > > > +/* The call to foo should be optimized away. */ > > > +/* { dg-final { scan-tree-dump-not "foo " "optimized" } } */ > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c > > > b/gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c > > > new file mode 100644 > > > index 00000000000..37766fbe296 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c > > > @@ -0,0 +1,39 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-O2 -fdump-tree-forwprop-details -fdump-tree-optimized" > > > } */ > > > + > > > + > > > +/* PR tree-optimization/114169 */ > > > + > > > +#include <stdint.h> > > > + > > > +struct S1 { > > > + uint32_t f0; > > > + uint8_t f1; > > > + uint64_t f2; > > > + uint64_t f3; > > > + int32_t f4; > > > +}; > > > + > > > +union U8 { > > > + struct S1 f0; > > > + int32_t f1; > > > + int64_t f2; > > > + uint8_t f3; > > > + const int64_t f4; > > > +}; > > > + > > > +/* --- GLOBAL VARIABLES --- */ > > > +struct S1 g_16 = {4294967293UL,1UL,1UL,0xA9C1C73B017290B1LL,0x5ADF851FL}; > > > +union U8 g_37 = {{1UL,1UL,0x2361AE7D51263067LL,0xEEFD7F9B64A47447LL,0L}}; > > > +struct S1 g_50 = > > > {0x0CFC2012L,1UL,0x43E1243B3BE7B8BBLL,0x03C5CEC10C1A6FE1LL,1L}; > > > + > > > + > > > +/* --- FORWARD DECLARATIONS --- */ > > > + > > > +void func_32(union U8 e) { > > > + e.f3 = e.f0.f4; > > > + g_16 = e.f0 = g_50; > > > +} > > > +/* The union e should not make a difference here. */ > > > +/* { dg-final { scan-tree-dump-times "after previous" 1 "forwprop1" } } > > > */ > > > +/* { dg-final { scan-tree-dump "g_16 = g_50;" "optimized" } } */ > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c > > > b/gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c > > > new file mode 100644 > > > index 00000000000..0accffd7fe4 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr57361-1.c > > > @@ -0,0 +1,9 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-O -fdump-tree-forwprop1-details" } */ > > > + > > > +struct A { int x; double y; }; > > > +void f (struct A *a) { > > > + *a = *a; > > > +} > > > + > > > +/* { dg-final { scan-tree-dump "into a NOP" "forwprop1"} } */ > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c > > > b/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c > > > index 81f27b3cd1f..7e273dbe2e1 100644 > > > --- a/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr57361.c > > > @@ -1,5 +1,5 @@ > > > /* { dg-do compile } */ > > > -/* { dg-options "-O -fdump-tree-dse1-details" } */ > > > +/* { dg-options "-O -fdump-tree-dse1-details -fno-tree-forwprop" } */ > > > > > > struct A { int x; double y; }; > > > void f (struct A *a) { > > > diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc > > > index 4c048a9a298..412e5faa106 100644 > > > --- a/gcc/tree-ssa-forwprop.cc > > > +++ b/gcc/tree-ssa-forwprop.cc > > > @@ -1341,6 +1341,103 @@ optimize_memcpy_to_memset (gimple_stmt_iterator > > > *gsip, tree dest, tree src, tree > > > } > > > return true; > > > } > > > +/* Optimizes > > > + a = c; > > > + b = a; > > > + into > > > + a = c; > > > + b = c; > > > + GSIP is the second statement and SRC is the common > > > + between the statements. > > > +*/ > > > +static bool > > > +optimize_agr_copyprop (gimple_stmt_iterator *gsip, tree dest, tree src) > > > +{ > > > + gimple *stmt = gsi_stmt (*gsip); > > > > Since you get at the stmt itself here, there seems to be no need to pass > > dest and src separately? > > Yes, that makes sense. I was copying what was originally done for > optimize_memcpy_to_memset here. > > > > > > + if (gimple_has_volatile_ops (stmt)) > > > + return false; > > > + > > > + tree vuse = gimple_vuse (stmt); > > > + if (vuse == NULL || TREE_CODE (vuse) != SSA_NAME) > > > > hopefully the latter doesn't happen too often and the former never? > > This was copied from optimize_memcpy_to_memset; let me look into why > it was done. I think the latter now shows up and the former was what > it was when it was part of fold-all-builtins. > > > > > > > + return false; > > > + /* If the statement is `src = src;` change it to be a NOP. */ > > > + if (operand_equal_p (dest, src, 0)) > > > + { > > > + if (dump_file && (dump_flags & TDF_DETAILS)) > > > + { > > > + fprintf (dump_file, "Simplified\n "); > > > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > > > + fprintf (dump_file, "into a NOP\n "); > > > + } > > > + unlink_stmt_vdef (stmt); > > > + release_defs (stmt); > > > + gsi_replace (gsip, gimple_build_nop (), false); > > > + return true; > > > + } > > > + gimple *defstmt = SSA_NAME_DEF_STMT (vuse); > > > + if (!gimple_assign_load_p (defstmt) > > > + || !gimple_store_p (defstmt)) > > > + return false; > > > + if (gimple_has_volatile_ops (defstmt)) > > > + return false; > > > + > > > + tree dest2 = gimple_assign_lhs (defstmt); > > > + tree src2 = gimple_assign_rhs1 (defstmt); > > > + > > > + /* If the original store is `src2 = src2;` skip over it. */ > > > + if (operand_equal_p (src2, dest2, 0)) > > > + return false; > > > + if (!operand_equal_p (src, dest2, 0)) > > > + return false; > > > + > > > + /* If replacing with the same thing, then change the statement > > > + into a NOP. */ > > > + if (operand_equal_p (src2, dest, 0)) > > > > Note this is redundant store removal - I'm not sure operand_equal_p > > is good enough to catch all cases of effective type changes done? > > Esp. as infering the old effective type from the read side (src2) > > is impossible in principle. Consider > > > > (T{ref-all} *)dest = {}; > > dest2 = dest; > > dest = dest2; > > > > now, the last stmt is good to elide (we'll keep a ref-all store), but can > > we construct a case where it goes wrong? Possibly when 'dest' > > is a may-alias type but the first store (not visible in the IL as far > > as you analyze) was done with the ref-all stripped? > > > > I realize all the other redundant store removal instances have > > similar problems (and there's at least one open PR still). > > operand_equal_p has some code for MEM_REF to check alias compatible. I > will add a few testcases for this. I did try to find some testcases > where we might delete the store but I could not find it. > Note tree-ssa-dse uses exactly operand_equal_p check on the MEM_REF to > see if it is storing the same thing to itself. > I do wonder if that might break in some cases too. > > > > > > We have refs_same_for_tbaa_p that is used by DOM and RTL CSE. > > FRE open-codes something, likewise postreload. > > I will look into that in a little bit (as I mentioned tree-ssa-dse > might be broken too) > > > > > > > + { > > > + if (dump_file && (dump_flags & TDF_DETAILS)) > > > + { > > > + fprintf (dump_file, "Simplified\n "); > > > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > > > + fprintf (dump_file, "after previous\n "); > > > + print_gimple_stmt (dump_file, defstmt, 0, dump_flags); > > > + fprintf (dump_file, "into a NOP\n "); > > > > Can you say "Deleted redundant store" or so for cases we remove > > a later store? That makes it easy to grep for such cases. > > Yes that makes sense. > > > > > > + } > > > + unlink_stmt_vdef (stmt); > > > + release_defs (stmt); > > > + gsi_replace (gsip, gimple_build_nop (), false); > > > + return true; > > > + } > > > + /* For 2 memory refences and using a temporary to do the copy, > > > + don't remove the temporary as the 2 memory references might overlap. > > > + Note t does not need to be decl as it could be field. > > > + See PR 22237 for full details. > > > + E.g. > > > + t = *a; > > > + *b = t; > > > + Cannot be convert into > > > + t = *a; > > > + *b = *a; > > > + */ > > > + if (!DECL_P (dest) && !DECL_P (src2)) > > > + return false; > > > > FRE also uses size vs. alignment for further cases but I guess > > for aggregates (aka large size) alignment is usually too small to help. > > So operand_equal_p already checks the alignment and size if we have a > MEM_REF. Unless I am misunderstanding the point here. > > > > > What I see is that you now increase the lifetime of '*a' when > > there are other uses of 't' between t = *a; and *b = t; This > > might for example kill stack-reuse. It might also confuse SRA > > heuristics with respect to total scalarization? > > Yes we do increase the lifetime of `*a`. And the SRA heuristics for > total scalarization is part of the problem and what this change is > trying to avoid in the first place. > > > > > IMO it would be worth gating this on single_use (vuse) and thus > > elide the t = *a; copy as well here? Can you add simple statistics > > via statistics_counter_event and check whether this makes a difference > > on say GCC itself? > > I will add the statistics for both this and the memset one which is missing > one. > > Yes it does makes a difference for GCC itself (at least with earlier > versions of the patch); wi::wide_int and gimple_stmt_iterator are the > structs where the prop happened the most. Though those do get fully > SRA scalarized anyways so overall I think it was just a slight > decrease in compile time.
So here is a summary of what I noticed when looking at code generation of GCC before/after this (using preprocessed sources from June 2021): most places where this happens don't make a difference because SRA will do a similar job. But the places where it does make a difference: * wide-int-print.cc: SRA does something slightly different for the setting of this->D.18729 in _ZN20wide_int_ref_storageILb0ELb1EEC2I16generic_wide_intIS0_EEERKT_j which GCC seems to do a better job at doing DSE afterwards (due to type info is already longer?) and then function splitting and inlining of the function split happens in some more cases * value-range.cc: GCC inlines tree_class_check in one case more than before. GCC does not inline irange::tree_upper_bound.isra compared to before (previously it was not inlined once but afterwards twice) * ubsan.cc: before ``` gsi = MEM[(const struct gimple_stmt_iterator &)gsip_41(D)]; i$ptr_61 = gsi.ptr; movdqu (%rdi), %xmm0 movq %xmm0, %rbx ``` after ``` gsi = MEM[(const struct gimple_stmt_iterator &)gsip_41(D)]; i$ptr_61 = MEM[(const struct gimple_stmt_iterator &)gsip_41(D)].ptr; movq (%rdi), %rbx movdqu (%rdi), %xmm0 ``` So similar enough. * tree-vect-loop.cc: Inlining difference due to removal of extra copy after call to profile_count::from_gcov_type NOTE: there is still one extra copy; I have a patch later on which should be able to remove that one too. * tree-vect-data-refs.cc: (vect_permute_store_chain): ssa names are slightly different which looks causes RA difference and increase of stack? (that is just happens accidently) * tree-switch-conversion.cc : since we removed a reference to a copy but kept it around due to using it in a BIT_FIELD_REF, SRA decides to make the final copy as a scalized code and for profile_probability, it is horrible generated code for x86 (profile_probability). I suspect my later patch will fix this. * tree-ssa-threadupdate.cc: inlining difference due to removal of an extra copy of `profile_count` struct * tree-ssa-reassoc.cc: Removal of one copy of gimple_stmt_iterator; loads ptr directly from gsip instead of a full copy (code improvement) * tree-ssa-loop-unswitch.cc: similar issue as tree-switch-conversion.cc (profile_probability) * tree-loop-distribution.cc: (loop_distribution::finalize_partitions): Some jump threading differences due to similar SRA change of wide-int-print.cc * tree-if-conv.cc: removes (correctly) copies during forwprop2; dse removes the variables too; then things differ afterwards due to removal of some clobbers * tree-complex.cc: 2x 2 bb order swapped on gimple level; no real code gen change * tree-cfg.cc: profile_count::operator> is inlined now due to removal of a copy * tracer.cc: profile_count::operator> is inlined now due to removal of a copy > > Thanks, > Andrew Pinski > > > > > Otherwise thanks for working on this. > > Richard. > > > > > + > > > + if (dump_file && (dump_flags & TDF_DETAILS)) > > > + { > > > + fprintf (dump_file, "Simplified\n "); > > > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > > > + fprintf (dump_file, "after previous\n "); > > > + print_gimple_stmt (dump_file, defstmt, 0, dump_flags); > > > + } > > > + gimple_assign_set_rhs_from_tree (gsip, unshare_expr (src2)); > > > + update_stmt (stmt); > > > + > > > + if (dump_file && (dump_flags & TDF_DETAILS)) > > > + { > > > + fprintf (dump_file, "into\n "); > > > + print_gimple_stmt (dump_file, stmt, 0, dump_flags); > > > + } > > > + return true; > > > +} > > > > > > /* *GSI_P is a GIMPLE_CALL to a builtin function. > > > Optimize > > > @@ -4717,6 +4814,12 @@ pass_forwprop::execute (function *fun) > > > changed = true; > > > break; > > > } > > > + if (optimize_agr_copyprop (&gsi, > > > gimple_assign_lhs (stmt), > > > + gimple_assign_rhs1 (stmt))) > > > + { > > > + changed = true; > > > + break; > > > + } > > > } > > > > > > if (TREE_CODE_CLASS (code) == tcc_comparison) > > > -- > > > 2.43.0 > > >