For PR62173, the ideal solution is to resolve the problem in the tree-level ivopts pass.
However, apart from the tree-level issue, PR 62173 also exposed two RTL-level issues. One of them is that we could improve RTL-level loop-invariant hoisting by re-shuffling insns. For Seb's testcase void bar(int i) { char A[10]; int d = 0; while (i > 0) A[d++] = i--; while (d > 0) foo(A[d--]); } the insn sequence that calculates the address of A[I] looks like: (insn 76 75 77 22 (set (reg/f:DI 109) (plus:DI (reg/f:DI 64 sfp) (reg:DI 108 [ i ]))) seb-pop.c:8 84 {*adddi3_aarch64} (expr_list:REG_DEAD (reg:DI 108 [ i ]) (nil))) (insn 77 76 78 22 (set (reg:SI 110 [ D.2633 ]) (zero_extend:SI (mem/j:QI (plus:DI (reg/f:DI 109) (const_int -16 [0xfffffffffffffff0])) [0 A S1 A8]))) seb-pop.c:8 76 {*zero_extendqisi2_aarch64} (expr_list:REG_DEAD (reg/f:DI 109) (nil))) For most RISC architectures, reg + reg addressing is typical, so we can re-shuffle the instruction sequence into the following: (insn 96 94 97 22 (set (reg/f:DI 129) (plus:DI (reg/f:DI 64 sfp) (const_int -16 [0xfffffffffffffff0]))) seb-pop.c:8 84 {*adddi3_aarch64} (nil)) (insn 97 96 98 22 (set (reg:DI 130 [ i ]) (sign_extend:DI (reg/v:SI 97 [ i ]))) seb-pop.c:8 70 {*extendsidi2_aarch64} (expr_list:REG_DEAD (reg/v:SI 97 [ i ]) (nil))) (insn 98 97 99 22 (set (reg:SI 131 [ D.2633 ]) (zero_extend:SI (mem/j:QI (plus:DI (reg/f:DI 129) (reg:DI 130 [ i ])) [0 A S1 A8]))) seb-pop.c:8 76 {*zero_extendqisi2_aarch64} (expr_list:REG_DEAD (reg:DI 130 [ i ]) (expr_list:REG_DEAD (reg/f:DI 129) (nil)))) This re-associates the constant immediate with the virtual frame pointer, i.e. it transforms RA <- fixed_reg + RC RD <- MEM (RA + const_offset) into: RA <- fixed_reg + const_offset RD <- MEM (RA + RC) Then RA <- fixed_reg + const_offset is actually loop invariant, so the later RTL GCSE PRE pass can catch it and do the hoisting, and thus ameliorate what the tree-level ivopts could not sort out. This patch only tries to re-shuffle instructions within a single basic block that forms an inner loop, which is performance critical. 
I am reusing the loop info in fwprop because loop info is available there and the pass runs before GCSE. Verified on aarch64 and mips64: the array base address is hoisted out of the loop. Bootstrap OK on x86-64 and aarch64. Comments? Thanks. gcc/ PR62173 fwprop.c (prepare_for_gcse_pre): New function. (fwprop_done): Call it.
diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index 377b33c..b2a5918 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -1399,6 +1399,157 @@ forward_propagate_into (df_ref use)
   return false;
 }
 
+/* Loop invariant hoisting for critical code has an important impact
+   on performance.
+
+   The RTL GCSE PRE pass can detect more hoisting opportunities if we
+   re-shuffle the instructions so that fixed registers are associated
+   with constants.
+
+   This function tries to transform
+
+     RA <- RB_fixed + RC
+     RD <- MEM (RA + const_offset)
+
+   into:
+
+     RA <- RB_fixed + const_offset
+     RD <- MEM (RA + RC)
+
+   when RA is dead after the second instruction, RA is neither
+   referenced nor set between the two instructions, and RC is not set
+   between them.  RB_fixed is the frame pointer, the stack pointer or
+   the virtual stack-vars pointer.  Only loops whose header block is
+   also their latch block are examined.
+
+   After this change, the first instruction is loop invariant.  */
+
+static void
+prepare_for_gcse_pre ()
+{
+  struct loop *loop;
+
+  if (! current_loops)
+    return;
+
+  FOR_EACH_LOOP (loop, LI_INCLUDE_ROOT)
+    {
+      /* Only consider loops made of a single basic block.  */
+      if (! loop->header || ! loop->latch
+          || loop->header->index != loop->latch->index)
+        continue;
+
+      basic_block bb = BASIC_BLOCK_FOR_FN (cfun, loop->header->index);
+      rtx_insn *insn;
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          if (! NONDEBUG_INSN_P (insn))
+            continue;
+
+          rtx single_set1 = single_set (insn);
+
+          if (! single_set1
+              || GET_CODE (SET_SRC (single_set1)) != PLUS)
+            continue;
+
+          rtx old_dest = SET_DEST (single_set1);
+          rtx op0 = XEXP (SET_SRC (single_set1), 0);
+          rtx op1 = XEXP (SET_SRC (single_set1), 1);
+
+          /* Canonicalize so that the fixed register, if any, is OP0.  */
+          if (op1 == frame_pointer_rtx
+              || op1 == stack_pointer_rtx
+              || op1 == virtual_stack_vars_rtx)
+            std::swap (op0, op1);
+
+          if (! (REG_P (old_dest) && REG_P (op0) && REG_P (op1)
+                 && (op0 == frame_pointer_rtx
+                     || op0 == stack_pointer_rtx
+                     || op0 == virtual_stack_vars_rtx)))
+            continue;
+
+          /* If RA overlaps RB_fixed or RC, the rewritten address
+             RA + RC would no longer compute the original value.  */
+          if (reg_overlap_mentioned_p (old_dest, op0)
+              || reg_overlap_mentioned_p (old_dest, op1))
+            continue;
+
+          /* Scan forward for the load through RA.  Testing the block in
+             the loop condition also covers the first candidate, which
+             may already belong to another basic block.  */
+          for (rtx_insn *next_insn = next_real_insn (insn);
+               next_insn && BLOCK_FOR_INSN (next_insn) == bb;
+               next_insn = next_real_insn (next_insn))
+            {
+              if (DEBUG_INSN_P (next_insn))
+                continue;
+
+              rtx single_set2 = single_set (next_insn);
+              rtx *mem_plus_loc = NULL;
+
+              /* Accept a register load from MEM (PLUS ...), possibly
+                 wrapped in an extension or truncation.  */
+              if (single_set2 && REG_P (SET_DEST (single_set2)))
+                {
+                  rtx inner = SET_SRC (single_set2);
+
+                  if (GET_CODE (inner) == ZERO_EXTEND
+                      || GET_CODE (inner) == SIGN_EXTEND
+                      || GET_CODE (inner) == TRUNCATE)
+                    inner = XEXP (inner, 0);
+
+                  if (MEM_P (inner)
+                      && GET_CODE (XEXP (inner, 0)) == PLUS)
+                    mem_plus_loc = &XEXP (inner, 0);
+                }
+
+              if (mem_plus_loc)
+                {
+                  rtx op0_ = XEXP (*mem_plus_loc, 0);
+                  rtx op1_ = XEXP (*mem_plus_loc, 1);
+
+                  if (REG_P (op0_) && CONST_INT_P (op1_)
+                      && rtx_equal_p (op0_, old_dest)
+                      && GET_MODE (op0_) == GET_MODE (op1))
+                    {
+                      if (find_regno_note (next_insn, REG_DEAD,
+                                           REGNO (old_dest)))
+                        {
+                          /* Build both replacements first, then queue
+                             them as one group so that either both
+                             insns change or neither does.  */
+                          rtx new_def
+                            = plus_constant (GET_MODE (op0), op0,
+                                             INTVAL (op1_));
+                          rtx new_addr
+                            = gen_rtx_PLUS (GET_MODE (op0_), op0_, op1);
+
+                          validate_change (insn, &SET_SRC (single_set1),
+                                           new_def, 1);
+                          validate_change (next_insn, mem_plus_loc,
+                                           new_addr, 1);
+                          if (apply_change_group () && dump_file)
+                            fprintf (dump_file,
+                                     "\nRe-associate insn %d and %d for later"
+                                     " RTL loop invariant hoisting.\n",
+                                     INSN_UID (insn), INSN_UID (next_insn));
+                        }
+                      break;
+                    }
+                }
+
+              /* Any other reference to RA, or a new value for RA or RC,
+                 before the load makes the rewrite unsafe.  */
+              if (reg_overlap_mentioned_p (old_dest, PATTERN (next_insn))
+                  || reg_set_p (old_dest, next_insn)
+                  || reg_set_p (op1, next_insn))
+                break;
+            }
+        }
+    }
+}
+
 
 static void
 fwprop_init (void)
@@ -1424,6 +1575,7 @@ fwprop_init (void)
 static void
 fwprop_done (void)
 {
+  prepare_for_gcse_pre ();
   loop_optimizer_finalize ();
 
   use_def_ref.release ();