On Fri, Sep 19, 2025, 6:22 PM Peter Damianov <[email protected]> wrote:

> This patch implements folding of aggregate assignments (*dest = *src)
> by converting them to scalar MEM_REF operations when the size is a
> power of two no larger than MOVE_MAX. This enables vectorization
> opportunities.
>


I am not sure this will work with my recent changes to forwprop.  Plus it
might keep SRA from handling these assignments later on.
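
(E.g., after inlining something like

  struct pixel_4 p;
  test_4_bytes (&p, src);
  ... p.r ...

the copy targets a local aggregate that SRA wants to split into its
fields, and rewriting the copy early into a bitwise-type load/store
could hide that from it.  A sketch only, not a tested case.)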

Plus INDIRECT_REF never shows up in gimple, so the lhs_is_indirect /
rhs_is_indirect paths in the patch are dead code.
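
(By the time forwprop runs, a struct copy is already a plain MEM_REF
assignment internally; the forwprop1 dump shows something like this,
approximately and from memory:

  MEM[(struct pixel_4 *)dest_2(D)] = MEM[(struct pixel_4 *)src_3(D)];

so matching on MEM_REF alone is enough.)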

IIRC it is ldist that should be handling aggregate assignments to get
them converted into memcpy calls.
It is not really the vectorizer.
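
E.g. the copy_pixels loop in the testcase is the shape ldist's pattern
detection (-ftree-loop-distribute-patterns) is meant to rewrite; a rough
sketch of the intended transform, not actual ldist output:

  for (int i = 0; i < n; i++)
    dest[i] = src[i];
  /* becomes, modulo the aliasing and trip-count checks ldist inserts:
     __builtin_memcpy (dest, src, (size_t) n * sizeof (struct pixel_4));  */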

Thanks,
Andrew





> gcc/ChangeLog:
>
>         PR tree-optimization/99504
>         * tree-ssa-forwprop.cc (fold_aggregate_assignment): New function.
>         Folds aggregate assignments to scalar MEM_REF operations.
>         (pass_forwprop::execute): Call fold_aggregate_assignment for
>         applicable assignment statements.
>
> gcc/testsuite/ChangeLog:
>
>         PR tree-optimization/99504
>         * gcc.dg/tree-ssa/forwprop-42.c: New test.  Verifies that
>         aggregate assignments of various sizes get folded to scalar
>         MEM_REF operations.
>
> Signed-off-by: Peter Damianov <[email protected]>
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c |  66 +++++++++
>  gcc/tree-ssa-forwprop.cc                    | 140 ++++++++++++++++++++
>  2 files changed, 206 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> new file mode 100644
> index 00000000000..7fef9821e9e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> @@ -0,0 +1,66 @@
> +/* PR tree-optimization/99504 */
> +/* Test that aggregate assignments get folded to scalar MEM_REF operations */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-forwprop1" } */
> +
> +#include <stdint.h>
> +
> +struct pixel_4 {
> +  uint8_t r, g, b, a;
> +};
> +
> +struct pixel_8 {
> +  uint16_t r, g, b, a;
> +};
> +
> +struct pixel_16 {
> +  uint32_t r, g, b, a;
> +};
> +
> +struct pixel_32 {
> +  uint64_t r, g, b, a;
> +};
> +
> +#ifdef __SIZEOF_INT128__
> +struct pixel_64 {
> +  __int128 r, g, b, a;
> +};
> +#endif
> +
> +void test_4_bytes(struct pixel_4 *dest, struct pixel_4 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void test_8_bytes(struct pixel_8 *dest, struct pixel_8 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void test_16_bytes(struct pixel_16 *dest, struct pixel_16 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void test_32_bytes(struct pixel_32 *dest, struct pixel_32 *src)
> +{
> +  *dest = *src;
> +}
> +
> +#ifdef __SIZEOF_INT128__
> +void test_64_bytes(struct pixel_64 *dest, struct pixel_64 *src)
> +{
> +  *dest = *src;
> +}
> +#endif
> +
> +void copy_pixels(struct pixel_4 *dest, struct pixel_4 *src, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    dest[i] = src[i];
> +}
> +
> +/* { dg-final { scan-tree-dump-times "MEM\\\[" 10 "forwprop1" } } */
> +/* Check that we generate scalar temporaries for the folded assignments */
> +/* { dg-final { scan-tree-dump "_\[0-9\]+ = MEM\\\[" "forwprop1" } } */
> +/* { dg-final { scan-tree-dump "MEM\\\[.*\] = _\[0-9\]+" "forwprop1" } } */
> \ No newline at end of file
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 43b1c9d696f..3ce94a737c6 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
>  typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
>
>  static bool forward_propagate_addr_expr (tree, tree, bool);
> +static bool fold_aggregate_assignment (gimple_stmt_iterator *);
>
>  /* Set to true if we delete dead edges during the optimization.  */
>  static bool cfg_changed;
> @@ -981,6 +982,141 @@ forward_propagate_addr_expr (tree name, tree rhs, bool parent_single_use_p)
>  }
>
>
> +/* Try to optimize aggregate assignments by converting them to scalar
> +   MEM_REF operations when profitable for vectorization.
> +   This applies the same folding as memcpy to aggregate assignments.  */
> +
> +static bool
> +fold_aggregate_assignment (gimple_stmt_iterator *gsi)
> +{
> +  gimple *stmt = gsi_stmt (*gsi);
> +
> +  if (!is_gimple_assign (stmt) || !gimple_assign_single_p (stmt))
> +    return false;
> +
> +  tree lhs = gimple_assign_lhs (stmt);
> +  tree rhs = gimple_assign_rhs1 (stmt);
> +
> +  /* Check if this is an aggregate assignment: *dest = *src
> +     where both sides are aggregate types (can be MEM_REF or indirection).  */
> +  bool lhs_is_indirect = (TREE_CODE (lhs) == INDIRECT_REF);
> +  bool rhs_is_indirect = (TREE_CODE (rhs) == INDIRECT_REF);
> +
> +  if ((TREE_CODE (lhs) != MEM_REF && !lhs_is_indirect)
> +      || (TREE_CODE (rhs) != MEM_REF && !rhs_is_indirect))
> +    return false;
> +
> +  tree lhs_type = TREE_TYPE (lhs);
> +  tree rhs_type = TREE_TYPE (rhs);
> +
> +  if (!AGGREGATE_TYPE_P (lhs_type) || !AGGREGATE_TYPE_P (rhs_type))
> +    return false;
> +
> +  if (!types_compatible_p (lhs_type, rhs_type))
> +    return false;
> +
> +  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (lhs_type)))
> +    return false;
> +
> +  unsigned HOST_WIDE_INT ilen = tree_to_uhwi (TYPE_SIZE_UNIT (lhs_type));
> +  if (!pow2p_hwi (ilen) || ilen > MOVE_MAX)
> +    return false;
> +
> +  tree lhs_base = TREE_OPERAND (lhs, 0);
> +  tree rhs_base = TREE_OPERAND (rhs, 0);
> +
> +  unsigned int lhs_align = get_pointer_alignment (lhs_base);
> +  unsigned int rhs_align = get_pointer_alignment (rhs_base);
> +
> +  scalar_int_mode imode;
> +  machine_mode mode;
> +  if (!int_mode_for_size (ilen * BITS_PER_UNIT, 0).exists (&imode)
> +      || !bitwise_mode_for_size (ilen * BITS_PER_UNIT).exists (&mode)
> +      || !known_eq (GET_MODE_BITSIZE (mode), ilen * BITS_PER_UNIT))
> +    return false;
> +
> +  if ((lhs_align < GET_MODE_ALIGNMENT (mode)
> +       && targetm.slow_unaligned_access (mode, lhs_align)
> +       && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
> +      || (rhs_align < GET_MODE_ALIGNMENT (mode)
> +         && targetm.slow_unaligned_access (mode, rhs_align)
> +         && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing))
> +    return false;
> +
> +  tree type = bitwise_type_for_mode (mode);
> +  tree srctype = type;
> +  tree desttype = type;
> +
> +  if (rhs_align < GET_MODE_ALIGNMENT (mode))
> +    srctype = build_aligned_type (type, rhs_align);
> +  if (lhs_align < GET_MODE_ALIGNMENT (mode))
> +    desttype = build_aligned_type (type, lhs_align);
> +
> +  tree off0 = build_int_cst (build_pointer_type_for_mode (char_type_node,
> +                                                         ptr_mode, true), 0);
> +
> +  tree srcmem, destmem;
> +
> +  if (rhs_is_indirect)
> +    {
> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, off0);
> +    }
> +  else
> +    {
> +      tree rhs_offset = TREE_OPERAND (rhs, 1);
> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, rhs_offset);
> +    }
> +
> +  if (lhs_is_indirect)
> +    {
> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, off0);
> +    }
> +  else
> +    {
> +      tree lhs_offset = TREE_OPERAND (lhs, 1);
> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, lhs_offset);
> +    }
> +  gimple *new_stmt;
> +  if (is_gimple_reg_type (srctype))
> +    {
> +      new_stmt = gimple_build_assign (NULL_TREE, srcmem);
> +      tree tmp_var = make_ssa_name (srctype, new_stmt);
> +      gimple_assign_set_lhs (new_stmt, tmp_var);
> +      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
> +      gimple_set_location (new_stmt, gimple_location (stmt));
> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +
> +      new_stmt = gimple_build_assign (destmem, tmp_var);
> +      gimple_move_vops (new_stmt, stmt);
> +      gimple_set_location (new_stmt, gimple_location (stmt));
> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +      gsi_remove (gsi, true);
> +    }
> +  else
> +    {
> +      new_stmt = gimple_build_assign (destmem, srcmem);
> +      gimple_move_vops (new_stmt, stmt);
> +      gimple_set_location (new_stmt, gimple_location (stmt));
> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +      gsi_remove (gsi, true);
> +    }
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file,
> +              "Converted aggregate assignment to scalar MEM_REF:\n");
> +      fprintf (dump_file, "  Original: ");
> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +      fprintf (dump_file, "  Size: %u bytes, Mode: %s\n",
> +              (unsigned)ilen, GET_MODE_NAME (mode));
> +    }
> +
> +  statistics_counter_event (cfun, "aggregate assignment to scalar MEM_REF", 1);
> +
> +  return true;
> +}
> +
> +
>  /* Helper function for simplify_gimple_switch.  Remove case labels that
>     have values outside the range of the new type.  */
>
> @@ -4477,6 +4613,10 @@ pass_forwprop::execute (function *fun)
>           if (TREE_CODE (lhs) != SSA_NAME
>               || has_zero_uses (lhs))
>             {
> +             if (TREE_CODE (lhs) != SSA_NAME
> +                 && fold_aggregate_assignment (&gsi))
> +               continue;
> +
>               process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
>               gsi_next (&gsi);
>               continue;
> --
> 2.39.5
>
>
