http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60438
--- Comment #32 from Jakub Jelinek <jakub at gcc dot gnu.org> --- Ok, I've done an --enable-checking=yes,rtl bootstrap on x86_64-linux and i686-linux, and collected statistics without your patch, but with: --- gcc/combine-stack-adj.c.jj 2014-02-06 17:37:02.173062234 +0100 +++ gcc/combine-stack-adj.c 2014-03-11 12:43:26.472118796 +0100 @@ -82,7 +82,7 @@ static rtx single_set_for_csa (rtx); static void free_csa_reflist (struct csa_reflist *); static struct csa_reflist *record_one_stack_ref (rtx, rtx *, struct csa_reflist *); -static int try_apply_stack_adjustment (rtx, struct csa_reflist *, +static int try_apply_stack_adjustment (rtx, rtx, struct csa_reflist *, HOST_WIDE_INT, HOST_WIDE_INT); static void combine_stack_adjustments_for_block (basic_block); static int record_stack_refs (rtx *, void *); @@ -194,7 +194,7 @@ record_one_stack_ref (rtx insn, rtx *ref on success. */ static int -try_apply_stack_adjustment (rtx insn, struct csa_reflist *reflist, +try_apply_stack_adjustment (rtx insn, rtx other, struct csa_reflist *reflist, HOST_WIDE_INT new_adjust, HOST_WIDE_INT delta) { struct csa_reflist *ml; @@ -230,6 +230,16 @@ try_apply_stack_adjustment (rtx insn, st for (ml = reflist; ml ; ml = ml->next) ml->sp_offset -= delta; +if (RTX_FRAME_RELATED_P (insn) || RTX_FRAME_RELATED_P (other)) +{ +FILE *f = fopen ("/tmp/csa", "a"); +fprintf (f, "@@-@@ %s %d ", main_input_filename ? main_input_filename : "-", (int) BITS_PER_WORD); +print_inline_rtx (f, insn, 0); +fprintf (f, "\n*** "); +print_inline_rtx (f, other, 0); +fprintf (f, "\n"); +fclose (f); +} return 1; } else @@ -487,7 +497,7 @@ combine_stack_adjustments_for_block (bas /* Combine an allocation into the first instruction. */ if (STACK_GROWS_DOWNWARD ? this_adjust <= 0 : this_adjust >= 0) { - if (try_apply_stack_adjustment (last_sp_set, reflist, + if (try_apply_stack_adjustment (last_sp_set, insn, reflist, last_sp_adjust + this_adjust, this_adjust)) { @@ -504,7 +514,7 @@ combine_stack_adjustments_for_block (bas else if (STACK_GROWS_DOWNWARD ? last_sp_adjust >= 0 : last_sp_adjust <= 0) { - if (try_apply_stack_adjustment (insn, reflist, + if (try_apply_stack_adjustment (insn, last_sp_set, reflist, last_sp_adjust + this_adjust, -last_sp_adjust)) { @@ -526,6 +536,16 @@ combine_stack_adjustments_for_block (bas { if (last_sp_adjust == 0) { +if (RTX_FRAME_RELATED_P (insn) || RTX_FRAME_RELATED_P (last_sp_set)) +{ +FILE *f = fopen ("/tmp/csa", "a"); +fprintf (f, "@@-@@2 %s %d ", main_input_filename ? main_input_filename : "-", (int) BITS_PER_WORD); +print_inline_rtx (f, insn, 0); +fprintf (f, "\n*** "); +print_inline_rtx (f, last_sp_set, 0); +fprintf (f, "\n"); +fclose (f); +} maybe_move_args_size_note (insn, last_sp_set, true); delete_insn (last_sp_set); } @@ -564,7 +584,7 @@ combine_stack_adjustments_for_block (bas && XEXP (XEXP (dest, 0), 0) == stack_pointer_rtx && !reg_mentioned_p (stack_pointer_rtx, src) && memory_address_p (GET_MODE (dest), stack_pointer_rtx) - && try_apply_stack_adjustment (insn, reflist, 0, + && try_apply_stack_adjustment (insn, last_sp_set, reflist, 0, -last_sp_adjust)) { if (last2_sp_set) @@ -598,6 +618,16 @@ combine_stack_adjustments_for_block (bas { if (last_sp_set && last_sp_adjust == 0) { +if (RTX_FRAME_RELATED_P (insn) || RTX_FRAME_RELATED_P (last_sp_set)) +{ +FILE *f = fopen ("/tmp/csa", "a"); +fprintf (f, "@@-@@3 %s %d ", main_input_filename ? main_input_filename : "-", (int) BITS_PER_WORD); +print_inline_rtx (f, insn, 0); +fprintf (f, "\n*** "); +print_inline_rtx (f, last_sp_set, 0); +fprintf (f, "\n"); +fclose (f); +} force_move_args_size_note (bb, last2_sp_set, last_sp_set); delete_insn (last_sp_set); } @@ -611,6 +641,14 @@ combine_stack_adjustments_for_block (bas if (last_sp_set && last_sp_adjust == 0) { +if (RTX_FRAME_RELATED_P (last_sp_set)) +{ +FILE *f = fopen ("/tmp/csa", "a"); +fprintf (f, "@@-@@4 %s %d ", main_input_filename ? main_input_filename : "-", (int) BITS_PER_WORD); +print_inline_rtx (f, last_sp_set, 0); +fprintf (f, "\n"); +fclose (f); +} force_move_args_size_note (bb, last2_sp_set, last_sp_set); delete_insn (last_sp_set); } The resulting file is too large to attach (316MB), but some statistics from it: $ grep REG_CFA csa | sort | uniq -c | sort -n 491 (expr_list:REG_CFA_ADJUST_CFA (set (reg/f:DI 7 sp) 1290 (expr_list:REG_CFA_ADJUST_CFA (set (reg/f:SI 7 sp) 8071 (expr_list:REG_CFA_ADJUST_CFA (set (reg/f:DI 7 sp) 270194 (expr_list:REG_CFA_ADJUST_CFA (set (reg/f:SI 7 sp) $ grep '^@@-@@[2-4]' csa @@-@@3 cb1010c.adb 64 (insn 102 21 23 2 (set (reg:DI 4 si [112]) $ grep '^@@-@@ [^(]* 32 ' csa | wc -l 325080 $ grep '^@@-@@ [^(]* 64 ' csa | wc -l 8636 So, while perhaps for -m64 it isn't extremely important, for -m32 it affects most of the code, the numbers are actual failures to combine some instructions together caused by the combine-stack-adj.c hunk. Now, looking at the issue, I believe it would be the first time when we'd use RTX_FRAME_RELATED_P insns outside of prologues/epilogues, from what I understand we inhibit various optimizations for it. For any other stack adjustments within function body we use REG_ARGS_SIZE notes, and e.g. the most common typical CSA merging of frame related REG_CFA_ADJUST_CFA with some following stack adjustments (I think usually that is stack adjustment so that further argument pushes result in properly aligned stack for call) is done by keeping the original REG_CFA_ADJUST_CFA amount note and then adding REG_ARGS_SIZE note on the same insn for the remaining adjustment. Thus, I'd say we don't want REG_CFA_* notes nor RTX_FRAME_RELATED_P on the ix86_force_to_memory/ix86_free_from_memory generated stack adjustments, much better would be to add REG_ARGS_SIZE notes to those. Now, the problem with that is that REG_ARGS_SIZE is absolute amount, and in the splitters we actually don't know what is the current REG_ARGS_SIZE depth at that point. So, either we'd need to force REG_ARGS_SIZE note even on the insns that might be split with ix86_{force_to,free_from}_memory, or perhaps we could introduce REG_ARGS_SIZE_DELTA note that would represent just relative adjustment of the current REG_ARGS_SIZE amount. ix86_{force_to,free_from}_memory could then emit those notes, guess haifa-sched.c would need to treat those notes the same as REG_ARGS_SIZE, combine-stack-adj.c either would need to handle both the note kinds, or compute similarly to dwarf2 pass the absolute args_size depth and transform the relative notes into absolute ones first. For the former, REG_ARGS_SIZE note followed by REG_ARGS_SIZE_DELTA note on the second insn being merged together would result in the first REG_ARG_SIZE with size incremented by the delta, REG_ARGS_SIZE_DELTA followed by REG_ARGS_SIZE would result in the second note being kept unmodified, etc. Thoughts?