On Fri, Sep 19, 2025 at 6:44 PM Andrew Pinski <[email protected]> wrote:
>
>
>
> On Fri, Sep 19, 2025, 6:28 PM Peter Damianov <[email protected]> wrote:
>>
>> This patch implements folding of aggregate assignments (*dest = *src)
>> by converting them to scalar MEM_REF operations when the size
>> permits. This enables vectorization opportunities.
>>
>> gcc/ChangeLog:
>>
>>         PR tree-optimization/99504
>>         * tree-ssa-forwprop.cc (fold_aggregate_assignment): New function.
>>         Folds aggregate assignments to scalar MEM_REF operations.
>>         (pass_forwprop::execute): Call fold_aggregate_assignment for
>>         applicable assignment statements.
>>
>> gcc/testsuite/ChangeLog:
>>
>>         PR tree-optimization/99504
>>         * gcc.dg/tree-ssa/forwprop-42.c: New test. Verify that aggregate
>>         assignments of various sizes get folded to scalar MEM_REF
>>         operations.
>>
>> Signed-off-by: Peter Damianov <[email protected]>
>> ---
>> v2: Remove int128 part of test because it cannot apply to every target
>> and would throw off the count of MEM\\\[
>>  gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c |  53 ++++++++
>>  gcc/tree-ssa-forwprop.cc                    | 140 ++++++++++++++++++++
>>  2 files changed, 193 insertions(+)
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>>
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c 
>> b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>> new file mode 100644
>> index 00000000000..aa49cf6a238
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>> @@ -0,0 +1,53 @@
>> +/* PR tree-optimization/99504 */
>> +/* Test that aggregate assignments get folded to scalar MEM_REF operations 
>> */
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -fdump-tree-forwprop1" } */
>> +
>> +#include <stdint.h>
>> +
>> +struct pixel_4 {
>> +  uint8_t r, g, b, a;
>> +};
>> +
>> +struct pixel_8 {
>> +  uint16_t r, g, b, a;
>> +};
>> +
>> +struct pixel_16 {
>> +  uint32_t r, g, b, a;
>> +};
>> +
>> +struct pixel_32 {
>> +  uint64_t r, g, b, a;
>> +};
>> +
>> +void test_4_bytes(struct pixel_4 *dest, struct pixel_4 *src)
>> +{
>> +  *dest = *src;
>> +}
>> +
>> +void test_8_bytes(struct pixel_8 *dest, struct pixel_8 *src)
>> +{
>> +  *dest = *src;
>> +}
>> +
>> +void test_16_bytes(struct pixel_16 *dest, struct pixel_16 *src)
>> +{
>> +  *dest = *src;
>> +}
>> +
>> +void test_32_bytes(struct pixel_32 *dest, struct pixel_32 *src)
>> +{
>> +  *dest = *src;
>> +}
>> +
>> +void copy_pixels(struct pixel_4 *dest, struct pixel_4 *src, int n)
>> +{
>> +  for (int i = 0; i < n; i++)
>> +    dest[i] = src[i];
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "MEM\\\[" 8 "forwprop1" } } */
>>
>> +/* Check that we generate scalar temporaries for the folded assignments */
>> +/* { dg-final { scan-tree-dump "_\[0-9\]+ = MEM\\\[" "forwprop1" } } */
>> +/* { dg-final { scan-tree-dump "MEM\\\[.*\] = _\[0-9\]+" "forwprop1" } } */
>> \ No newline at end of file
>> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
>> index 43b1c9d696f..3ce94a737c6 100644
>> --- a/gcc/tree-ssa-forwprop.cc
>> +++ b/gcc/tree-ssa-forwprop.cc
>> @@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
>>  typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
>>
>>  static bool forward_propagate_addr_expr (tree, tree, bool);
>> +static bool fold_aggregate_assignment (gimple_stmt_iterator *);
>>
>>  /* Set to true if we delete dead edges during the optimization.  */
>>  static bool cfg_changed;
>> @@ -981,6 +982,141 @@ forward_propagate_addr_expr (tree name, tree rhs, bool 
>> parent_single_use_p)
>>  }
>>
>>
>> +/* Try to optimize aggregate assignments by converting them to scalar
>> +   MEM_REF operations when profitable for vectorization.
>> +   This applies the same folding as memcpy to aggregate assignments.  */
>> +
>> +static bool
>> +fold_aggregate_assignment (gimple_stmt_iterator *gsi)
>> +{
>> +  gimple *stmt = gsi_stmt (*gsi);
>> +
>> +  if (!is_gimple_assign (stmt) || !gimple_assign_single_p (stmt))
>> +    return false;
>> +
>> +  tree lhs = gimple_assign_lhs (stmt);
>> +  tree rhs = gimple_assign_rhs1 (stmt);
>> +
>> +  /* Check if this is an aggregate assignment: *dest = *src
>> +     where both sides are aggregate types (can be MEM_REF or indirection).  
>> */
>> +  bool lhs_is_indirect = (TREE_CODE (lhs) == INDIRECT_REF);
>> +  bool rhs_is_indirect = (TREE_CODE (rhs) == INDIRECT_REF);
>> +
>> +  if ((TREE_CODE (lhs) != MEM_REF && !lhs_is_indirect)
>> +      || (TREE_CODE (rhs) != MEM_REF && !rhs_is_indirect))
>> +    return false;
>> +
>> +  tree lhs_type = TREE_TYPE (lhs);
>> +  tree rhs_type = TREE_TYPE (rhs);
>> +
>> +  if (!AGGREGATE_TYPE_P (lhs_type) || !AGGREGATE_TYPE_P (rhs_type))
>> +    return false;
>> +
>> +  if (!types_compatible_p (lhs_type, rhs_type))
>> +    return false;
>> +
>> +  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (lhs_type)))
>> +    return false;
>> +
>> +  unsigned HOST_WIDE_INT ilen = tree_to_uhwi (TYPE_SIZE_UNIT (lhs_type));
>> +  if (!pow2p_hwi (ilen) || ilen > MOVE_MAX)
>> +    return false;
>> +
>> +  tree lhs_base = TREE_OPERAND (lhs, 0);
>> +  tree rhs_base = TREE_OPERAND (rhs, 0);
>> +
>> +  unsigned int lhs_align = get_pointer_alignment (lhs_base);
>> +  unsigned int rhs_align = get_pointer_alignment (rhs_base);
>> +
>> +  scalar_int_mode imode;
>> +  machine_mode mode;
>> +  if (!int_mode_for_size (ilen * BITS_PER_UNIT, 0).exists (&imode)
>> +      || !bitwise_mode_for_size (ilen * BITS_PER_UNIT).exists (&mode)
>> +      || !known_eq (GET_MODE_BITSIZE (mode), ilen * BITS_PER_UNIT))
>> +    return false;
>> +
>> +  if ((lhs_align < GET_MODE_ALIGNMENT (mode)
>> +       && targetm.slow_unaligned_access (mode, lhs_align)
>> +       && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
>> +      || (rhs_align < GET_MODE_ALIGNMENT (mode)
>> +         && targetm.slow_unaligned_access (mode, rhs_align)
>> +         && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing))
>> +    return false;
>> +
>> +  tree type = bitwise_type_for_mode (mode);
>> +  tree srctype = type;
>> +  tree desttype = type;
>> +
>> +  if (rhs_align < GET_MODE_ALIGNMENT (mode))
>> +    srctype = build_aligned_type (type, rhs_align);
>> +  if (lhs_align < GET_MODE_ALIGNMENT (mode))
>> +    desttype = build_aligned_type (type, lhs_align);
>> +
>> +  tree off0 = build_int_cst (build_pointer_type_for_mode (char_type_node,
>> +                                                         ptr_mode, true), 
>> 0);
>
>
> So you lose the aliasing type here?
> Wait, that is only used for INDIRECT_REF, which does not exist in GIMPLE.
>
> So this does not handle:
> A->field1 = b->field1;

Like this code:
```
/* Small aggregate whose only member is a 4-element char array.  */
struct s1
{
  char t[4];
};
/* Outer aggregate whose only member, field1, is itself an aggregate.  */
struct s2 { struct s1 field1; };

/* Example for the review: for each of N consecutive elements, copy the
   aggregate member field1 from *B to *A.  The assignment is a member-level
   aggregate copy (a->field1 = b->field1), not a whole-object *a = *b.  */
void f(struct s2 *a, struct s2 *b, int n)
{
  for(int i =0; i < n; i++)
  {
        /* Aggregate copy of the member only.  */
        a->field1 = b->field1;
        /* Step both pointers to the next element.  */
        a++;
        b++;
  }
}
```


> Where field1 is an aggregate which is the same size as the outer aggregate.
> There are a few other things, like non-power-of-2 sizes.
> Plus FRE/VN have a harder time reading through a copy via an integer.
> Also, the copy prop for aggregates does not handle copies via scalars yet.
> It is one thing to remove a call, e.g. memcpy, but it is another to remove
> aggregate copies.
>
> Thanks,
> Andrew
>
>
>
>> +
>> +  tree srcmem, destmem;
>> +
>> +  if (rhs_is_indirect)
>> +    {
>> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, off0);
>> +    }
>> +  else
>> +    {
>> +      tree rhs_offset = TREE_OPERAND (rhs, 1);
>> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, rhs_offset);
>> +    }
>> +
>> +  if (lhs_is_indirect)
>> +    {
>> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, off0);
>> +    }
>> +  else
>> +    {
>> +      tree lhs_offset = TREE_OPERAND (lhs, 1);
>> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, lhs_offset);
>> +    }
>> +  gimple *new_stmt;
>> +  if (is_gimple_reg_type (srctype))
>> +    {
>> +      new_stmt = gimple_build_assign (NULL_TREE, srcmem);
>> +      tree tmp_var = make_ssa_name (srctype, new_stmt);
>> +      gimple_assign_set_lhs (new_stmt, tmp_var);
>> +      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
>> +      gimple_set_location (new_stmt, gimple_location (stmt));
>> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>> +
>> +      new_stmt = gimple_build_assign (destmem, tmp_var);
>> +      gimple_move_vops (new_stmt, stmt);
>> +      gimple_set_location (new_stmt, gimple_location (stmt));
>> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>> +      gsi_remove (gsi, true);
>> +    }
>> +  else
>> +    {
>> +      new_stmt = gimple_build_assign (destmem, srcmem);
>> +      gimple_move_vops (new_stmt, stmt);
>> +      gimple_set_location (new_stmt, gimple_location (stmt));
>> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>> +      gsi_remove (gsi, true);
>> +    }
>> +
>> +  if (dump_file && (dump_flags & TDF_DETAILS))
>> +    {
>> +      fprintf (dump_file,
>> +              "Converted aggregate assignment to scalar MEM_REF:\n");
>> +      fprintf (dump_file, "  Original: ");
>> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
>> +      fprintf (dump_file, "  Size: %u bytes, Mode: %s\n",
>> +              (unsigned)ilen, GET_MODE_NAME (mode));
>> +    }
>> +
>> +  statistics_counter_event (cfun, "aggregate assignment to scalar MEM_REF", 
>> 1);
>> +
>> +  return true;
>> +}
>> +
>> +
>>  /* Helper function for simplify_gimple_switch.  Remove case labels that
>>     have values outside the range of the new type.  */
>>
>> @@ -4477,6 +4613,10 @@ pass_forwprop::execute (function *fun)
>>           if (TREE_CODE (lhs) != SSA_NAME
>>               || has_zero_uses (lhs))
>>             {
>> +             if (TREE_CODE (lhs) != SSA_NAME
>> +                 && fold_aggregate_assignment (&gsi))
>> +               continue;
>> +
>>               process_vec_perm_simplify_seq_list 
>> (&vec_perm_simplify_seq_list);
>>               gsi_next (&gsi);
>>               continue;
>> --
>> 2.39.5
>>

Reply via email to