http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60438

--- Comment #32 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Ok, I've done an --enable-checking=yes,rtl bootstrap on x86_64-linux and
i686-linux, and collected statistics without your patch, but with:
--- gcc/combine-stack-adj.c.jj    2014-02-06 17:37:02.173062234 +0100
+++ gcc/combine-stack-adj.c    2014-03-11 12:43:26.472118796 +0100
@@ -82,7 +82,7 @@ static rtx single_set_for_csa (rtx);
 static void free_csa_reflist (struct csa_reflist *);
 static struct csa_reflist *record_one_stack_ref (rtx, rtx *,
                          struct csa_reflist *);
-static int try_apply_stack_adjustment (rtx, struct csa_reflist *,
+static int try_apply_stack_adjustment (rtx, rtx, struct csa_reflist *,
                        HOST_WIDE_INT, HOST_WIDE_INT);
 static void combine_stack_adjustments_for_block (basic_block);
 static int record_stack_refs (rtx *, void *);
@@ -194,7 +194,7 @@ record_one_stack_ref (rtx insn, rtx *ref
    on success.  */

 static int
-try_apply_stack_adjustment (rtx insn, struct csa_reflist *reflist,
+try_apply_stack_adjustment (rtx insn, rtx other, struct csa_reflist *reflist,
                 HOST_WIDE_INT new_adjust, HOST_WIDE_INT delta)
 {
   struct csa_reflist *ml;
@@ -230,6 +230,16 @@ try_apply_stack_adjustment (rtx insn, st
       for (ml = reflist; ml ; ml = ml->next)
     ml->sp_offset -= delta;

+if (RTX_FRAME_RELATED_P (insn) || RTX_FRAME_RELATED_P (other))
+{
+FILE *f = fopen ("/tmp/csa", "a");
+fprintf (f, "@@-@@ %s %d ", main_input_filename ? main_input_filename : "-",
(int) BITS_PER_WORD);
+print_inline_rtx (f, insn, 0);
+fprintf (f, "\n*** ");
+print_inline_rtx (f, other, 0);
+fprintf (f, "\n");
+fclose (f);
+}
       return 1;
     }
   else
@@ -487,7 +497,7 @@ combine_stack_adjustments_for_block (bas
           /* Combine an allocation into the first instruction.  */
           if (STACK_GROWS_DOWNWARD ? this_adjust <= 0 : this_adjust >= 0)
         {
-          if (try_apply_stack_adjustment (last_sp_set, reflist,
+          if (try_apply_stack_adjustment (last_sp_set, insn, reflist,
                           last_sp_adjust + this_adjust,
                           this_adjust))
             {
@@ -504,7 +514,7 @@ combine_stack_adjustments_for_block (bas
           else if (STACK_GROWS_DOWNWARD
                ? last_sp_adjust >= 0 : last_sp_adjust <= 0)
         {
-          if (try_apply_stack_adjustment (insn, reflist,
+          if (try_apply_stack_adjustment (insn, last_sp_set, reflist,
                           last_sp_adjust + this_adjust,
                           -last_sp_adjust))
             {
@@ -526,6 +536,16 @@ combine_stack_adjustments_for_block (bas
         {
           if (last_sp_adjust == 0)
             {
+if (RTX_FRAME_RELATED_P (insn) || RTX_FRAME_RELATED_P (last_sp_set))
+{
+FILE *f = fopen ("/tmp/csa", "a");
+fprintf (f, "@@-@@2 %s %d ", main_input_filename ? main_input_filename : "-",
(int) BITS_PER_WORD);
+print_inline_rtx (f, insn, 0);
+fprintf (f, "\n*** ");
+print_inline_rtx (f, last_sp_set, 0);
+fprintf (f, "\n");
+fclose (f);
+}
               maybe_move_args_size_note (insn, last_sp_set, true);
               delete_insn (last_sp_set);
             }
@@ -564,7 +584,7 @@ combine_stack_adjustments_for_block (bas
           && XEXP (XEXP (dest, 0), 0) == stack_pointer_rtx
           && !reg_mentioned_p (stack_pointer_rtx, src)
           && memory_address_p (GET_MODE (dest), stack_pointer_rtx)
-          && try_apply_stack_adjustment (insn, reflist, 0,
+          && try_apply_stack_adjustment (insn, last_sp_set, reflist, 0,
                          -last_sp_adjust))
         {
           if (last2_sp_set)
@@ -598,6 +618,16 @@ combine_stack_adjustments_for_block (bas
     {
       if (last_sp_set && last_sp_adjust == 0)
         {
+if (RTX_FRAME_RELATED_P (insn) || RTX_FRAME_RELATED_P (last_sp_set))
+{
+FILE *f = fopen ("/tmp/csa", "a");
+fprintf (f, "@@-@@3 %s %d ", main_input_filename ? main_input_filename : "-",
(int) BITS_PER_WORD);
+print_inline_rtx (f, insn, 0);
+fprintf (f, "\n*** ");
+print_inline_rtx (f, last_sp_set, 0);
+fprintf (f, "\n");
+fclose (f);
+}
           force_move_args_size_note (bb, last2_sp_set, last_sp_set);
           delete_insn (last_sp_set);
         }
@@ -611,6 +641,14 @@ combine_stack_adjustments_for_block (bas

   if (last_sp_set && last_sp_adjust == 0)
     {
+if (RTX_FRAME_RELATED_P (last_sp_set))
+{
+FILE *f = fopen ("/tmp/csa", "a");
+fprintf (f, "@@-@@4 %s %d ", main_input_filename ? main_input_filename : "-",
(int) BITS_PER_WORD);
+print_inline_rtx (f, last_sp_set, 0);
+fprintf (f, "\n");
+fclose (f);
+}
       force_move_args_size_note (bb, last2_sp_set, last_sp_set);
       delete_insn (last_sp_set);
     }

The resulting file is too large to attach (316MB), but some statistics from it:
$ grep REG_CFA csa | sort | uniq -c | sort -n
    491             (expr_list:REG_CFA_ADJUST_CFA (set (reg/f:DI 7 sp)
   1290             (expr_list:REG_CFA_ADJUST_CFA (set (reg/f:SI 7 sp)
   8071         (expr_list:REG_CFA_ADJUST_CFA (set (reg/f:DI 7 sp)
 270194         (expr_list:REG_CFA_ADJUST_CFA (set (reg/f:SI 7 sp)
$ grep '^@@-@@[2-4]' csa
@@-@@3 cb1010c.adb 64 (insn 102 21 23 2 (set (reg:DI 4 si [112])
$ grep '^@@-@@ [^(]* 32 ' csa | wc -l
325080
$ grep '^@@-@@ [^(]* 64 ' csa | wc -l
8636

So, while perhaps for -m64 it isn't extremely important, for -m32 it affects
most of the code, the numbers are actual failures to combine some instructions
together caused by the combine-stack-adj.c hunk.

Now, looking at the issue, I believe it would be the first time when we'd use
RTX_FRAME_RELATED_P insns outside of prologues/epilogues, from what I
understand we inhibit various optimizations for it.  For any other stack
adjustments within function body we use REG_ARGS_SIZE notes, and e.g. the most
common typical CSA merging of frame related REG_CFA_ADJUST_CFA with some
following stack adjustments (I think usually that is stack adjustment so that
further argument pushes result in properly aligned stack for call) is done by
keeping the original REG_CFA_ADJUST_CFA amount note and then adding
REG_ARGS_SIZE note on the same insn for the remaining adjustment.

Thus, I'd say we don't want REG_CFA_* notes nor RTX_FRAME_RELATED_P on the
ix86_force_to_memory/ix86_free_from_memory generated stack adjustments, much
better would be to add REG_ARGS_SIZE notes to those.

Now, the problem with that is that REG_ARGS_SIZE is absolute amount, and in the
splitters we actually don't know what is the current REG_ARGS_SIZE depth at
that point.  So, either we'd need to force REG_ARGS_SIZE note even on the insns
that might be split with ix86_{force_to,free_from}_memory, or perhaps we could
introduce REG_ARGS_SIZE_DELTA note that would represent just relative
adjustment of the current REG_ARGS_SIZE amount. 
ix86_{force_to,free_from}_memory could then emit those notes, guess
haifa-sched.c would need to treat those notes the same as REG_ARGS_SIZE,
combine-stack-adj.c either would need to handle both the note kinds, or compute
similarly to dwarf2 pass the absolute args_size depth and transform the
relative notes into absolute ones first.
For the former, REG_ARGS_SIZE note followed by REG_ARGS_SIZE_DELTA note on the
second insn being merged together would result in the first REG_ARG_SIZE with
size incremented by the delta, REG_ARGS_SIZE_DELTA followed by REG_ARGS_SIZE
would result in the second note being kept unmodified, etc.

Thoughts?

Reply via email to