On Fri, Sep 19, 2025 at 6:44 PM Andrew Pinski <[email protected]> wrote:
>
>
>
> On Fri, Sep 19, 2025, 6:28 PM Peter Damianov <[email protected]> wrote:
>>
>> This patch implements folding of aggregate assignments (*dest = *src)
>> by converting them to scalar MEM_REF operations when the aggregate's
>> size is a power of two no larger than MOVE_MAX. This enables
>> vectorization opportunities.
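>>
>> For example, for an 8-byte struct the aggregate copy
>>
>> *dest_2(D) = *src_3(D);
>>
>> becomes a scalar load into a fresh SSA name followed by a scalar store,
>> roughly like this (illustrative GIMPLE only; the exact mode and MEM_REF
>> dump syntax depend on the target):
>>
>> _1 = MEM <long unsigned int> [(char * {ref-all})src_3(D)];
>> MEM <long unsigned int> [(char * {ref-all})dest_2(D)] = _1;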
>>
>> gcc/ChangeLog:
>>
>> PR tree-optimization/99504
>> * tree-ssa-forwprop.cc (fold_aggregate_assignment): New function.
>> Folds aggregate assignments to scalar MEM_REF operations.
>> (pass_forwprop::execute): Call fold_aggregate_assignment for
>> applicable assignment statements.
>>
>> gcc/testsuite/ChangeLog:
>>
>> PR tree-optimization/99504
>> * gcc.dg/tree-ssa/forwprop-42.c: New test. Verify that aggregate
>> assignments of various sizes get folded to scalar MEM_REF
>> operations.
>>
>> Signed-off-by: Peter Damianov <[email protected]>
>> ---
>> v2: Remove the int128 part of the test because it does not apply to every
>> target and would throw off the count of MEM\\\[ matches.
>> gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c | 53 ++++++++
>> gcc/tree-ssa-forwprop.cc | 140 ++++++++++++++++++++
>> 2 files changed, 193 insertions(+)
>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>>
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>> new file mode 100644
>> index 00000000000..aa49cf6a238
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>> @@ -0,0 +1,53 @@
>> +/* PR tree-optimization/99504 */
>> +/* Test that aggregate assignments get folded to scalar MEM_REF operations */
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fdump-tree-forwprop1" } */
>> +
>> +#include <stdint.h>
>> +
>> +struct pixel_4 {
>> + uint8_t r, g, b, a;
>> +};
>> +
>> +struct pixel_8 {
>> + uint16_t r, g, b, a;
>> +};
>> +
>> +struct pixel_16 {
>> + uint32_t r, g, b, a;
>> +};
>> +
>> +struct pixel_32 {
>> + uint64_t r, g, b, a;
>> +};
>> +
>> +void test_4_bytes(struct pixel_4 *dest, struct pixel_4 *src)
>> +{
>> + *dest = *src;
>> +}
>> +
>> +void test_8_bytes(struct pixel_8 *dest, struct pixel_8 *src)
>> +{
>> + *dest = *src;
>> +}
>> +
>> +void test_16_bytes(struct pixel_16 *dest, struct pixel_16 *src)
>> +{
>> + *dest = *src;
>> +}
>> +
>> +void test_32_bytes(struct pixel_32 *dest, struct pixel_32 *src)
>> +{
>> + *dest = *src;
>> +}
>> +
>> +void copy_pixels(struct pixel_4 *dest, struct pixel_4 *src, int n)
>> +{
>> + for (int i = 0; i < n; i++)
>> + dest[i] = src[i];
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "MEM\\\[" 8 "forwprop1" } } */
>> +/* Check that we generate scalar temporaries for the folded assignments */
>> +/* { dg-final { scan-tree-dump "_\[0-9\]+ = MEM\\\[" "forwprop1" } } */
>> +/* { dg-final { scan-tree-dump "MEM\\\[.*\] = _\[0-9\]+" "forwprop1" } } */
>> \ No newline at end of file
>> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
>> index 43b1c9d696f..3ce94a737c6 100644
>> --- a/gcc/tree-ssa-forwprop.cc
>> +++ b/gcc/tree-ssa-forwprop.cc
>> @@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
>> typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
>>
>> static bool forward_propagate_addr_expr (tree, tree, bool);
>> +static bool fold_aggregate_assignment (gimple_stmt_iterator *);
>>
>> /* Set to true if we delete dead edges during the optimization. */
>> static bool cfg_changed;
>> @@ -981,6 +982,141 @@ forward_propagate_addr_expr (tree name, tree rhs, bool parent_single_use_p)
>> }
>>
>>
>> +/* Try to optimize aggregate assignments by converting them to scalar
>> + MEM_REF operations when profitable for vectorization.
>> + This applies the same folding as memcpy to aggregate assignments. */
>> +
>> +static bool
>> +fold_aggregate_assignment (gimple_stmt_iterator *gsi)
>> +{
>> + gimple *stmt = gsi_stmt (*gsi);
>> +
>> + if (!is_gimple_assign (stmt) || !gimple_assign_single_p (stmt))
>> + return false;
>> +
>> + tree lhs = gimple_assign_lhs (stmt);
>> + tree rhs = gimple_assign_rhs1 (stmt);
>> +
>> + /* Check if this is an aggregate assignment: *dest = *src
>> + where both sides are aggregate types (can be MEM_REF or indirection).  */
>> + bool lhs_is_indirect = (TREE_CODE (lhs) == INDIRECT_REF);
>> + bool rhs_is_indirect = (TREE_CODE (rhs) == INDIRECT_REF);
>> +
>> + if ((TREE_CODE (lhs) != MEM_REF && !lhs_is_indirect)
>> + || (TREE_CODE (rhs) != MEM_REF && !rhs_is_indirect))
>> + return false;
>> +
>> + tree lhs_type = TREE_TYPE (lhs);
>> + tree rhs_type = TREE_TYPE (rhs);
>> +
>> + if (!AGGREGATE_TYPE_P (lhs_type) || !AGGREGATE_TYPE_P (rhs_type))
>> + return false;
>> +
>> + if (!types_compatible_p (lhs_type, rhs_type))
>> + return false;
>> +
>> + if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (lhs_type)))
>> + return false;
>> +
>> + unsigned HOST_WIDE_INT ilen = tree_to_uhwi (TYPE_SIZE_UNIT (lhs_type));
>> + if (!pow2p_hwi (ilen) || ilen > MOVE_MAX)
>> + return false;
>> +
>> + tree lhs_base = TREE_OPERAND (lhs, 0);
>> + tree rhs_base = TREE_OPERAND (rhs, 0);
>> +
>> + unsigned int lhs_align = get_pointer_alignment (lhs_base);
>> + unsigned int rhs_align = get_pointer_alignment (rhs_base);
>> +
>> + scalar_int_mode imode;
>> + machine_mode mode;
>> + if (!int_mode_for_size (ilen * BITS_PER_UNIT, 0).exists (&imode)
>> + || !bitwise_mode_for_size (ilen * BITS_PER_UNIT).exists (&mode)
>> + || !known_eq (GET_MODE_BITSIZE (mode), ilen * BITS_PER_UNIT))
>> + return false;
>> +
>> + if ((lhs_align < GET_MODE_ALIGNMENT (mode)
>> + && targetm.slow_unaligned_access (mode, lhs_align)
>> + && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
>> + || (rhs_align < GET_MODE_ALIGNMENT (mode)
>> + && targetm.slow_unaligned_access (mode, rhs_align)
>> + && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing))
>> + return false;
>> +
>> + tree type = bitwise_type_for_mode (mode);
>> + tree srctype = type;
>> + tree desttype = type;
>> +
>> + if (rhs_align < GET_MODE_ALIGNMENT (mode))
>> + srctype = build_aligned_type (type, rhs_align);
>> + if (lhs_align < GET_MODE_ALIGNMENT (mode))
>> + desttype = build_aligned_type (type, lhs_align);
>> +
>> + tree off0 = build_int_cst (build_pointer_type_for_mode (char_type_node,
>> + ptr_mode, true), 0);
>
>
> So you lose the aliasing type here?
> Wait, that is only used for INDIRECT_REF, which does not exist in gimple.
>
> So this does not handle:
> A->field1 = b->field1;
Do you mean code like this?
```
struct s1
{
  char t[4];
};
struct s2 { struct s1 field1; };
void f (struct s2 *a, struct s2 *b, int n)
{
  for (int i = 0; i < n; i++)
    {
      a->field1 = b->field1;
      a++;
      b++;
    }
}
```
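If so: in GIMPLE the copy in that loop is a COMPONENT_REF wrapping a
MEM_REF rather than a bare MEM_REF, roughly (illustrative tree shape,
not an exact dump):
```
/* LHS of a->field1 = b->field1: TREE_CODE (lhs) is COMPONENT_REF,
   so the MEM_REF/INDIRECT_REF check in fold_aggregate_assignment
   never matches it.  */
COMPONENT_REF (MEM_REF (a_5, 0), field1)
```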
> Where field1 is an aggregate which is the same size as the outer aggregate.
> There are a few other things, like non-power-of-2 sizes.
> Plus FRE/VN have a harder time reading through a copy done via an integer.
> Also the copy prop for aggregates does not handle copies via scalars yet.
> It is one thing to remove a call (e.g. memcpy), but it is another to remove
> aggregate copies.
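On the non-power-of-2 point, a hypothetical case (not from the testcase) that
the current size check rejects, since pow2p_hwi (3) is false:
```c
/* sizeof (struct rgb) == 3, which is not a power of two, so the
   pow2p_hwi (ilen) test in fold_aggregate_assignment bails out and
   this stays an aggregate copy.  */
struct rgb { unsigned char r, g, b; };

void copy_rgb (struct rgb *dest, struct rgb *src)
{
  *dest = *src;
}
```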
>
> Thanks,
> Andrew
>
>
>
>> +
>> + tree srcmem, destmem;
>> +
>> + if (rhs_is_indirect)
>> + {
>> + srcmem = fold_build2 (MEM_REF, srctype, rhs_base, off0);
>> + }
>> + else
>> + {
>> + tree rhs_offset = TREE_OPERAND (rhs, 1);
>> + srcmem = fold_build2 (MEM_REF, srctype, rhs_base, rhs_offset);
>> + }
>> +
>> + if (lhs_is_indirect)
>> + {
>> + destmem = fold_build2 (MEM_REF, desttype, lhs_base, off0);
>> + }
>> + else
>> + {
>> + tree lhs_offset = TREE_OPERAND (lhs, 1);
>> + destmem = fold_build2 (MEM_REF, desttype, lhs_base, lhs_offset);
>> + }
>> + gimple *new_stmt;
>> + if (is_gimple_reg_type (srctype))
>> + {
>> + new_stmt = gimple_build_assign (NULL_TREE, srcmem);
>> + tree tmp_var = make_ssa_name (srctype, new_stmt);
>> + gimple_assign_set_lhs (new_stmt, tmp_var);
>> + gimple_set_vuse (new_stmt, gimple_vuse (stmt));
>> + gimple_set_location (new_stmt, gimple_location (stmt));
>> + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>> +
>> + new_stmt = gimple_build_assign (destmem, tmp_var);
>> + gimple_move_vops (new_stmt, stmt);
>> + gimple_set_location (new_stmt, gimple_location (stmt));
>> + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>> + gsi_remove (gsi, true);
>> + }
>> + else
>> + {
>> + new_stmt = gimple_build_assign (destmem, srcmem);
>> + gimple_move_vops (new_stmt, stmt);
>> + gimple_set_location (new_stmt, gimple_location (stmt));
>> + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>> + gsi_remove (gsi, true);
>> + }
>> +
>> + if (dump_file && (dump_flags & TDF_DETAILS))
>> + {
>> + fprintf (dump_file,
>> + "Converted aggregate assignment to scalar MEM_REF:\n");
>> + fprintf (dump_file, " Original: ");
>> + print_gimple_stmt (dump_file, stmt, 0, dump_flags);
>> + fprintf (dump_file, " Size: %u bytes, Mode: %s\n",
>> + (unsigned)ilen, GET_MODE_NAME (mode));
>> + }
>> +
>> + statistics_counter_event (cfun, "aggregate assignment to scalar MEM_REF", 1);
>> +
>> + return true;
>> +}
>> +
>> +
>> /* Helper function for simplify_gimple_switch. Remove case labels that
>> have values outside the range of the new type. */
>>
>> @@ -4477,6 +4613,10 @@ pass_forwprop::execute (function *fun)
>> if (TREE_CODE (lhs) != SSA_NAME
>> || has_zero_uses (lhs))
>> {
>> + if (TREE_CODE (lhs) != SSA_NAME
>> + && fold_aggregate_assignment (&gsi))
>> + continue;
>> +
>> process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
>> gsi_next (&gsi);
>> continue;
>> --
>> 2.39.5
>>