On Sat, Sep 20, 2025 at 4:24 AM Andrew Pinski <[email protected]> wrote:
>
> On Fri, Sep 19, 2025, 7:08 PM Peter0x44 <[email protected]> wrote:
>>
>> On 2025-09-20 02:33, Andrew Pinski wrote:
>> > On Fri, Sep 19, 2025, 6:22 PM Peter Damianov <[email protected]> wrote:
>> >
>> >> This patch implements folding of aggregate assignments (*dest = *src)
>> >> by converting them to scalar MEM_REF operations when the size permits.
>> >> This enables vectorization opportunities.
>> >
>> > I am not sure this will work with my recent changes to forwprop.  Plus
>> > it might make SRA not handle them later on.
>> >
>> > Plus INDIRECT_REF will never show up in gimple.
>> >
>> > IIRC it is ldist that should be handling aggregate assignments to get
>> > them converted into memcpys.  It is not the vectorizer really.
>>
>> Wouldn't converting it into memcpy then lose the stride info of the
>> copying loop?  Does that matter?
>
> Stride?  Huh?
> The loop in the bug report is just
>
>   void
>   foo_memmov (pixel* p, pixel* q, int n)
>   {
>     for (int i = 0; i != n; i++)
>       *p++ = *q++;
>   }
>
> There is no stride there.  This is copying 4 bytes each time through the
> loop, so it should be converted into just
>
>   memmove (p, q, n * sizeof (pixel));
>
> Yes, that does lose some aliasing information, but I think that is ok.
> The reason why your conversion of an aggregate copy works is because it
> converts it into an integer load/store which currently ldist understands
> (specifically dependence analysis).
>
> Now for the vectorizer, maybe this lowering should happen in ifcvt rather
> than forwprop.
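
For concreteness, the folding in the patch below turns a 4-byte aggregate
copy into a scalar load into a temporary followed by a scalar store.  A
rough C-level sketch, using the testcase's struct pixel_4 (the function
name and the use of memcpy are only illustrative; the pass itself emits the
equivalent GIMPLE MEM_REFs of a 32-bit bitwise integer type):

#include <stdint.h>
#include <string.h>

struct pixel_4 { uint8_t r, g, b, a; };   /* 4-byte aggregate */

void
copy_one (struct pixel_4 *dest, struct pixel_4 *src)
{
  /* Was: *dest = *src; (one aggregate assignment).  */
  uint32_t tmp;
  memcpy (&tmp, src, sizeof tmp);   /* scalar 4-byte load  */
  memcpy (dest, &tmp, sizeof tmp);  /* scalar 4-byte store */
}

It is this scalar load/store form that ldist's dependence analysis (and
potentially the vectorizer) can work with.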

I agree this doesn't belong to forwprop.  I agree the loop should be
turned into memcpy by ldist (there are more cases missed by ldist).
I think for a single aggregate copy stmt the simplification would belong
to stmt folding - it should really behave the same as
memmove (p, q, sizeof (T)), which would inline if the size is appropriate.

But I also agree it might 'break' SRA for cases like 'a = b', which is why
I think we should work to get these kinds of things integrated into SRA.

There is no good reason why the vectorizer doesn't handle aggregate copies
- besides the lack of an actual implementation.  It's an obscure case, and
the plain copy case is better handled by ldist indeed.

Richard.

> Thanks,
> Andrew
>
>>
>> >
>> > Thanks,
>> > Andrew
>> >
>> >> gcc/ChangeLog:
>> >>
>> >>         PR tree-optimization/99504
>> >>         * tree-ssa-forwprop.cc (fold_aggregate_assignment): New function.
>> >>         Folds aggregate assignments to scalar MEM_REF operations.
>> >>         (pass_forwprop::execute): Call fold_aggregate_assignment for
>> >>         applicable assignment statements.
>> >>
>> >> gcc/testsuite/ChangeLog:
>> >>
>> >>         PR tree-optimization/99504
>> >>         * gcc.dg/tree-ssa/forwprop-42.c: New test.  Verify that aggregate
>> >>         assignments of various sizes get folded to scalar MEM_REF
>> >>         operations.
>> >>
>> >> Signed-off-by: Peter Damianov <[email protected]>
>> >> ---
>> >>  gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c |  66 +++++++++
>> >>  gcc/tree-ssa-forwprop.cc                    | 140 ++++++++++++++++++++
>> >>  2 files changed, 206 insertions(+)
>> >>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>> >>
>> >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>> >> new file mode 100644
>> >> index 00000000000..7fef9821e9e
>> >> --- /dev/null
>> >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>> >> @@ -0,0 +1,66 @@
>> >> +/* PR tree-optimization/99504 */
>> >> +/* Test that aggregate assignments get folded to scalar MEM_REF operations */
>> >> +/* { dg-do compile } */
>> >> +/* { dg-options "-O2 -fdump-tree-forwprop1" } */
>> >> +
>> >> +#include <stdint.h>
>> >> +
>> >> +struct pixel_4 {
>> >> +  uint8_t r, g, b, a;
>> >> +};
>> >> +
>> >> +struct pixel_8 {
>> >> +  uint16_t r, g, b, a;
>> >> +};
>> >> +
>> >> +struct pixel_16 {
>> >> +  uint32_t r, g, b, a;
>> >> +};
>> >> +
>> >> +struct pixel_32 {
>> >> +  uint64_t r, g, b, a;
>> >> +};
>> >> +
>> >> +#ifdef __SIZEOF_INT128__
>> >> +struct pixel_64 {
>> >> +  __int128 r, g, b, a;
>> >> +};
>> >> +#endif
>> >> +
>> >> +void test_4_bytes(struct pixel_4 *dest, struct pixel_4 *src)
>> >> +{
>> >> +  *dest = *src;
>> >> +}
>> >> +
>> >> +void test_8_bytes(struct pixel_8 *dest, struct pixel_8 *src)
>> >> +{
>> >> +  *dest = *src;
>> >> +}
>> >> +
>> >> +void test_16_bytes(struct pixel_16 *dest, struct pixel_16 *src)
>> >> +{
>> >> +  *dest = *src;
>> >> +}
>> >> +
>> >> +void test_32_bytes(struct pixel_32 *dest, struct pixel_32 *src)
>> >> +{
>> >> +  *dest = *src;
>> >> +}
>> >> +
>> >> +#ifdef __SIZEOF_INT128__
>> >> +void test_64_bytes(struct pixel_64 *dest, struct pixel_64 *src)
>> >> +{
>> >> +  *dest = *src;
>> >> +}
>> >> +#endif
>> >> +
>> >> +void copy_pixels(struct pixel_4 *dest, struct pixel_4 *src, int n)
>> >> +{
>> >> +  for (int i = 0; i < n; i++)
>> >> +    dest[i] = src[i];
>> >> +}
>> >> +
>> >> +/* { dg-final { scan-tree-dump-times "MEM\\\[" 10 "forwprop1" } } */
>> >> +/* Check that we generate scalar temporaries for the folded assignments */
>> >> +/* { dg-final { scan-tree-dump "_\[0-9\]+ = MEM\\\[" "forwprop1" } } */
>> >> +/* { dg-final { scan-tree-dump "MEM\\\[.*\] = _\[0-9\]+" "forwprop1" } } */
>> >> \ No newline at end of file
>> >> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
>> >> index 43b1c9d696f..3ce94a737c6 100644
>> >> --- a/gcc/tree-ssa-forwprop.cc
>> >> +++ b/gcc/tree-ssa-forwprop.cc
>> >> @@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
>> >>  typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
>> >>
>> >>  static bool forward_propagate_addr_expr (tree, tree, bool);
>> >> +static bool fold_aggregate_assignment (gimple_stmt_iterator *);
>> >>
>> >>  /* Set to true if we delete dead edges during the optimization.  */
>> >>  static bool cfg_changed;
>> >> @@ -981,6 +982,141 @@ forward_propagate_addr_expr (tree name, tree rhs, bool parent_single_use_p)
>> >>  }
>> >>
>> >> +/* Try to optimize aggregate assignments by converting them to scalar
>> >> +   MEM_REF operations when profitable for vectorization.
>> >> +   This applies the same folding as memcpy to aggregate assignments.  */
>> >> +
>> >> +static bool
>> >> +fold_aggregate_assignment (gimple_stmt_iterator *gsi)
>> >> +{
>> >> +  gimple *stmt = gsi_stmt (*gsi);
>> >> +
>> >> +  if (!is_gimple_assign (stmt) || !gimple_assign_single_p (stmt))
>> >> +    return false;
>> >> +
>> >> +  tree lhs = gimple_assign_lhs (stmt);
>> >> +  tree rhs = gimple_assign_rhs1 (stmt);
>> >> +
>> >> +  /* Check if this is an aggregate assignment: *dest = *src
>> >> +     where both sides are aggregate types (can be MEM_REF or indirection).  */
>> >> +  bool lhs_is_indirect = (TREE_CODE (lhs) == INDIRECT_REF);
>> >> +  bool rhs_is_indirect = (TREE_CODE (rhs) == INDIRECT_REF);
>> >> +
>> >> +  if ((TREE_CODE (lhs) != MEM_REF && !lhs_is_indirect)
>> >> +      || (TREE_CODE (rhs) != MEM_REF && !rhs_is_indirect))
>> >> +    return false;
>> >> +
>> >> +  tree lhs_type = TREE_TYPE (lhs);
>> >> +  tree rhs_type = TREE_TYPE (rhs);
>> >> +
>> >> +  if (!AGGREGATE_TYPE_P (lhs_type) || !AGGREGATE_TYPE_P (rhs_type))
>> >> +    return false;
>> >> +
>> >> +  if (!types_compatible_p (lhs_type, rhs_type))
>> >> +    return false;
>> >> +
>> >> +  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (lhs_type)))
>> >> +    return false;
>> >> +
>> >> +  unsigned HOST_WIDE_INT ilen = tree_to_uhwi (TYPE_SIZE_UNIT (lhs_type));
>> >> +  if (!pow2p_hwi (ilen) || ilen > MOVE_MAX)
>> >> +    return false;
>> >> +
>> >> +  tree lhs_base = TREE_OPERAND (lhs, 0);
>> >> +  tree rhs_base = TREE_OPERAND (rhs, 0);
>> >> +
>> >> +  unsigned int lhs_align = get_pointer_alignment (lhs_base);
>> >> +  unsigned int rhs_align = get_pointer_alignment (rhs_base);
>> >> +
>> >> +  scalar_int_mode imode;
>> >> +  machine_mode mode;
>> >> +  if (!int_mode_for_size (ilen * BITS_PER_UNIT, 0).exists (&imode)
>> >> +      || !bitwise_mode_for_size (ilen * BITS_PER_UNIT).exists (&mode)
>> >> +      || !known_eq (GET_MODE_BITSIZE (mode), ilen * BITS_PER_UNIT))
>> >> +    return false;
>> >> +
>> >> +  if ((lhs_align < GET_MODE_ALIGNMENT (mode)
>> >> +       && targetm.slow_unaligned_access (mode, lhs_align)
>> >> +       && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
>> >> +      || (rhs_align < GET_MODE_ALIGNMENT (mode)
>> >> +          && targetm.slow_unaligned_access (mode, rhs_align)
>> >> +          && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing))
>> >> +    return false;
>> >>
>> >> +  tree type = bitwise_type_for_mode (mode);
>> >> +  tree srctype = type;
>> >> +  tree desttype = type;
>> >>
>> >> +  if (rhs_align < GET_MODE_ALIGNMENT (mode))
>> >> +    srctype = build_aligned_type (type, rhs_align);
>> >> +  if (lhs_align < GET_MODE_ALIGNMENT (mode))
>> >> +    desttype = build_aligned_type (type, lhs_align);
>> >>
>> >> +  tree off0 = build_int_cst (build_pointer_type_for_mode (char_type_node,
>> >> +                                                          ptr_mode, true), 0);
>> >>
>> >> +  tree srcmem, destmem;
>> >>
>> >> +  if (rhs_is_indirect)
>> >> +    {
>> >> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, off0);
>> >> +    }
>> >> +  else
>> >> +    {
>> >> +      tree rhs_offset = TREE_OPERAND (rhs, 1);
>> >> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, rhs_offset);
>> >> +    }
>> >>
>> >> +  if (lhs_is_indirect)
>> >> +    {
>> >> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, off0);
>> >> +    }
>> >> +  else
>> >> +    {
>> >> +      tree lhs_offset = TREE_OPERAND (lhs, 1);
>> >> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, lhs_offset);
>> >> +    }
>> >> +  gimple *new_stmt;
>> >> +  if (is_gimple_reg_type (srctype))
>> >> +    {
>> >> +      new_stmt = gimple_build_assign (NULL_TREE, srcmem);
>> >> +      tree tmp_var = make_ssa_name (srctype, new_stmt);
>> >> +      gimple_assign_set_lhs (new_stmt, tmp_var);
>> >> +      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
>> >> +      gimple_set_location (new_stmt, gimple_location (stmt));
>> >> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>> >>
>> >> +      new_stmt = gimple_build_assign (destmem, tmp_var);
>> >> +      gimple_move_vops (new_stmt, stmt);
>> >> +      gimple_set_location (new_stmt, gimple_location (stmt));
>> >> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>> >> +      gsi_remove (gsi, true);
>> >> +    }
>> >> +  else
>> >> +    {
>> >> +      new_stmt = gimple_build_assign (destmem, srcmem);
>> >> +      gimple_move_vops (new_stmt, stmt);
>> >> +      gimple_set_location (new_stmt, gimple_location (stmt));
>> >> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>> >> +      gsi_remove (gsi, true);
>> >> +    }
>> >>
>> >> +  if (dump_file && (dump_flags & TDF_DETAILS))
>> >> +    {
>> >> +      fprintf (dump_file,
>> >> +               "Converted aggregate assignment to scalar MEM_REF:\n");
>> >> +      fprintf (dump_file, "  Original: ");
>> >> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
>> >> +      fprintf (dump_file, "  Size: %u bytes, Mode: %s\n",
>> >> +               (unsigned)ilen, GET_MODE_NAME (mode));
>> >> +    }
>> >>
>> >> +  statistics_counter_event (cfun, "aggregate assignment to scalar MEM_REF", 1);
>> >>
>> >> +  return true;
>> >> +}
>> >> +
>> >> +
>> >>  /* Helper function for simplify_gimple_switch.  Remove case labels that
>> >>     have values outside the range of the new type.  */
>> >>
>> >> @@ -4477,6 +4613,10 @@ pass_forwprop::execute (function *fun)
>> >>       if (TREE_CODE (lhs) != SSA_NAME
>> >>           || has_zero_uses (lhs))
>> >>         {
>> >> +         if (TREE_CODE (lhs) != SSA_NAME
>> >> +             && fold_aggregate_assignment (&gsi))
>> >> +           continue;
>> >> +
>> >>           process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
>> >>           gsi_next (&gsi);
>> >>           continue;
>> >> --
>> >> 2.39.5
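
A side note on the memmove equivalence mentioned in the reply above: for a
small fixed-size struct the aggregate assignment and an explicit memmove of
sizeof (T) express the same copy, and GCC's gimple folding already turns the
fixed-size memmove into a single scalar copy when size and alignment permit.
A minimal sketch, with made-up struct and function names:

struct pixel { unsigned char r, g, b, a; };   /* 4-byte aggregate */

void
copy_assign (struct pixel *p, const struct pixel *q)
{
  *p = *q;                              /* plain aggregate copy statement */
}

void
copy_memmove (struct pixel *p, const struct pixel *q)
{
  __builtin_memmove (p, q, sizeof *q);  /* fixed-size memmove of the same data */
}

Both versions should end up as a single 4-byte load and store; the question
in this thread is at which point in the pipeline the plain copy gets exposed
in that scalar form.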
