On Thu, Apr 03, 2025 at 02:40:43PM +0100, Richard Sandiford wrote:
> > I see.  I understood Jakub thinks my change is too aggressive since
> > it does indeed miss combine attempts from changed log-links.
> >
> > I remember that when allowing added_links_insn to override the
> > i3 return for i2_unchanged_p, the bad compile time resurfaced.  So
> > now I wonder whether, if i2 is unchanged, distribute_links (i2links)
> > does (or should do?) anything?  I don't see anything that would avoid
> > adjusting links when there's still a use in newi2pat (IIRC
> > i0 and i1 are always gone?).

Note, I'd be fine even with the earlier patch you've posted.
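
(For context, a rough sketch, not the real code: for each link,
distribute_links walks forward from the insn that held the link and
reattaches the link to the first insn that mentions the linked
register.  A hypothetical, simplified model:

  /* Hypothetical, simplified model (not the real code) of the forward
     walk in distribute_links: reattach LINK to the first insn after
     its origin that mentions REG.  The real code also handles
     asm_noperands, REG_DEAD placement, basic block boundaries and
     sets that kill the register.  */
  static void
  distribute_one_link_sketch (struct insn_link *link, rtx reg)
  {
    for (rtx_insn *insn = NEXT_INSN (link->insn);
         insn != NULL;
         insn = NEXT_INSN (insn))
      if (NONDEBUG_INSN_P (insn)
          && reg_overlap_mentioned_p (reg, PATTERN (insn)))
        {
          link->next = LOG_LINKS (insn);
          LOG_LINKS (insn) = link;
          break;
        }
  }

So when i2 is kept unchanged, the walk for each i2 link typically just
finds i2 again and reattaches the link there; the scan is done even
though nothing moves, which is what passing i2 as the START hint in
the second patch below avoids.)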

> --- a/gcc/combine.cc
> +++ b/gcc/combine.cc
> @@ -4208,16 +4208,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
>        adjust_for_new_dest (i3);
>      }
>  
> -  /* If I2 didn't change, this is not a combination (but a simplification or
> -     canonicalisation with context), which should not be done here.  Doing
> -     it here explodes the algorithm.  Don't.  */
> -  if (rtx_equal_p (newi2pat, PATTERN (i2)))
> -    {
> -      if (dump_file)
> -     fprintf (dump_file, "i2 didn't change, not doing this\n");
> -      undo_all ();
> -      return 0;
> -    }
> +  bool only_i3_changed = !i0 && !i1 && rtx_equal_p (newi2pat, PATTERN (i2));
>  
>    /* We now know that we can do this combination.  Merge the insns and
>       update the status of registers and LOG_LINKS.  */
> @@ -4785,6 +4776,9 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
>    combine_successes++;
>    undo_commit ();
>  
> +  if (only_i3_changed)
> +    return i3;
> +
>    rtx_insn *ret = newi2pat ? i2 : i3;
>    if (added_links_insn && DF_INSN_LUID (added_links_insn) < DF_INSN_LUID (ret))
>      ret = added_links_insn;
> 
> so essentially your patch.  With it, I see an 11.9x improvement in
> combine's compile time on the original testcase for PR101523, going
> from 88% to 41% of total compile time.  Memory usage improves 110x,
> from 97% to 25%.  The output is unchanged.  Adding:

Though the above looks really nice.
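
(To spell out the case that flag covers: it is the old "simplification
or canonicalisation with context" situation.  A made-up example: i2 is
(set (reg 100) (const_int 5)), reg 100 stays live after i3, and
substituting into i3 merely turns (plus (reg 102) (reg 100)) into
(plus (reg 102) (const_int 5)); newi2pat is then identical to
PATTERN (i2), so only i3's pattern really changed and returning i3
is enough.)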

> --- a/gcc/combine.cc
> +++ b/gcc/combine.cc
> @@ -472,7 +472,7 @@ static void move_deaths (rtx, rtx, int, rtx_insn *, rtx *);
>  static bool reg_bitfield_target_p (rtx, rtx);
>  static void distribute_notes (rtx, rtx_insn *, rtx_insn *, rtx_insn *,
>                             rtx, rtx, rtx);
> -static void distribute_links (struct insn_link *);
> +static void distribute_links (struct insn_link *, rtx_insn * = nullptr);
>  static void mark_used_regs_combine (rtx);
>  static void record_promoted_value (rtx_insn *, rtx);
>  static bool unmentioned_reg_p (rtx, rtx);
> @@ -4209,6 +4209,8 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
>      }
>  
>    bool only_i3_changed = !i0 && !i1 && rtx_equal_p (newi2pat, PATTERN (i2));
> +  if (only_i3_changed)
> +    swap_i2i3 = false;
>  
>    /* We now know that we can do this combination.  Merge the insns and
>       update the status of registers and LOG_LINKS.  */
> @@ -4584,8 +4586,8 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
>                           NULL_RTX, NULL_RTX, NULL_RTX);
>        }
>  
> -    distribute_links (i3links);
> -    distribute_links (i2links);
> +    distribute_links (i3links, only_i3_changed ? i3 : nullptr);
> +    distribute_links (i2links, !i0 && !i1 ? i2 : nullptr);
>      distribute_links (i1links);
>      distribute_links (i0links);
>  
> @@ -14978,10 +14980,13 @@ distribute_notes (rtx notes, rtx_insn *from_insn, rtx_insn *i3, rtx_insn *i2,
>  
>  /* Similarly to above, distribute the LOG_LINKS that used to be present on
>     I3, I2, and I1 to new locations.  This is also called to add a link
> -   pointing at I3 when I3's destination is changed.  */
> +   pointing at I3 when I3's destination is changed.
> +
> +   If START is nonnull and not a note, we know that the next location
> +   for each link is no earlier than START.  */
>  
>  static void
> -distribute_links (struct insn_link *links)
> +distribute_links (struct insn_link *links, rtx_insn *start)
>  {
>    struct insn_link *link, *next_link;
>  
> @@ -15047,7 +15052,10 @@ distribute_links (struct insn_link *links)
>        I3 to I2.  Also note that not much searching is typically done here
>        since most links don't point very far away.  */
>  
> -      for (insn = NEXT_INSN (link->insn);
> +      insn = start;
> +      if (!insn || NOTE_P (insn))
> +     insn = NEXT_INSN (link->insn);
> +      for (;
>          (insn && (this_basic_block->next_bb == EXIT_BLOCK_PTR_FOR_FN (cfun)
>                    || BB_HEAD (this_basic_block->next_bb) != insn));
>          insn = NEXT_INSN (insn))
> 
> improves compile time a further 17.7%, going from 41% to 33% of
> compile time.  This still generates the same code for the testcase
> as the unpatched compiler, and I checked that a bootstrap and regression
> test succeeded with the (independent rather than cumulative) patch:

and this as well, so if the options are to revert again for GCC 15 or to
apply just the above two patches, I'd go for the latter.  33% we can live
with for GCC 15 and the limiting can be added later.  But we can also add
it for the only_i3_changed case now, so that it will only trigger in the
cases where we'd otherwise have punted during most of the GCC 15
development, and make the limiting for everything effective only in GCC 16.

        Jakub
