https://gcc.gnu.org/g:2c30f828e4507863713cff44cd30c88aa7f27865
commit r16-1551-g2c30f828e4507863713cff44cd30c88aa7f27865 Author: Lili Cui <lili....@intel.com> Date: Tue Jun 17 21:39:38 2025 +0800 x86: Enable separate shrink wrapping This commit implements the target macros (TARGET_SHRINK_WRAP_*) that enable separate shrink wrapping for function prologues/epilogues in x86. When performing separate shrink wrapping, we choose to use mov instead of push/pop, because using push/pop is more complicated to handle rsp adjustment and may lose performance, so here we choose to use mov, which has a small impact on code size, but guarantees performance. Using mov means we need to use sub/add to maintain the stack frame. In some special cases, we need to use lea to prevent affecting EFlags. Avoid inserting sub between test-je-jle to change EFlags, lea should be used here. foo: xorl %eax, %eax testl %edi, %edi je .L11 sub $16, %rsp ------> leaq -16(%rsp), %rsp movq %r13, 8(%rsp) movl $1, %r13d jle .L4 Tested against SPEC CPU 2017, this change always has a net-positive effect on the dynamic instruction count. See the following table for the breakdown on how this reduces the number of dynamic instructions per workload on a like-for-like (with/without this commit): instruction count base with commit (commit-base)/commit 502.gcc_r 98666845943 96891561634 -1.80% 526.blender_r 6.21226E+11 6.12992E+11 -1.33% 520.omnetpp_r 1.1241E+11 1.11093E+11 -1.17% 500.perlbench_r 1271558717 1263268350 -0.65% 523.xalancbmk_r 2.20103E+11 2.18836E+11 -0.58% 531.deepsjeng_r 2.73591E+11 2.72114E+11 -0.54% 500.perlbench_r 64195557393 63881512409 -0.49% 541.leela_r 2.99097E+11 2.98245E+11 -0.29% 548.exchange2_r 1.27976E+11 1.27784E+11 -0.15% 527.cam4_r 88981458425 88887334679 -0.11% 554.roms_r 2.60072E+11 2.59809E+11 -0.10% Collected spec2017 performance on ZNVER5, EMR and ICELAKE. No performance regression was observed. For O2 multi-copy : 511.povray_r improved by 2.8% on ZNVER5. 511.povray_r improved by 4% on EMR 511.povray_r improved by 3.3 % ~ 4.6% on ICELAKE. gcc/ChangeLog: * config/i386/i386-protos.h (ix86_get_separate_components): New function. (ix86_components_for_bb): Likewise. (ix86_disqualify_components): Likewise. (ix86_emit_prologue_components): Likewise. (ix86_emit_epilogue_components): Likewise. (ix86_set_handled_components): Likewise. * config/i386/i386.cc (save_regs_using_push_pop): Split from ix86_compute_frame_layout. (ix86_compute_frame_layout): Use save_regs_using_push_pop. (pro_epilogue_adjust_stack): Use gen_pro_epilogue_adjust_stack_add_nocc. (ix86_expand_prologue): Add some assertions and adjust the stack frame at the beginning of the prolog for shrink wrapping separate. (ix86_emit_save_regs_using_mov): Skip registers that are wrapped separately. (ix86_emit_restore_regs_using_mov): Likewise. (ix86_expand_epilogue): Add some assertions and set restore_regs_via_mov to true for shrink wrapping separate. (ix86_get_separate_components): New function. (ix86_components_for_bb): Likewise. (ix86_disqualify_components): Likewise. (ix86_emit_prologue_components): Likewise. (ix86_emit_epilogue_components): Likewise. (ix86_set_handled_components): Likewise. (TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define. (TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise. (TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise. (TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise. (TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise. (TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise. * config/i386/i386.h (struct machine_function):Add reg_is_wrapped_separately array for register wrapping information. * config/i386/i386.md (@pro_epilogue_adjust_stack_add_nocc<mode>): New. gcc/testsuite/ChangeLog: * gcc.target/x86_64/abi/callabi/leaf-2.c: Adjust the test. * gcc.target/i386/interrupt-16.c: Likewise. * gfortran.dg/guality/arg1.f90: Likewise. * gcc.target/i386/avx10_2-comibf-1.c: Likewise. * g++.target/i386/shrink_wrap_separate.C: New test. * gcc.target/i386/shrink_wrap_separate_check_lea.c: Likewise. Co-authored-by: Michael Matz <m...@suse.de> Diff: --- gcc/config/i386/i386-protos.h | 7 + gcc/config/i386/i386.cc | 332 ++++++++++++++++++--- gcc/config/i386/i386.h | 4 + gcc/config/i386/i386.md | 22 ++ .../g++.target/i386/shrink_wrap_separate.C | 25 ++ gcc/testsuite/gcc.target/i386/avx10_2-comibf-1.c | 2 +- gcc/testsuite/gcc.target/i386/interrupt-16.c | 4 +- .../i386/shrink_wrap_separate_check_lea.c | 29 ++ .../gcc.target/x86_64/abi/callabi/leaf-2.c | 2 +- gcc/testsuite/gfortran.dg/guality/arg1.f90 | 2 +- 10 files changed, 379 insertions(+), 50 deletions(-) diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 10863ab9e9de..86194b3d319c 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -437,6 +437,13 @@ extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); extern bool ix86_has_no_direct_extern_access; extern bool ix86_rpad_gate (); +extern sbitmap ix86_get_separate_components (void); +extern sbitmap ix86_components_for_bb (basic_block); +extern void ix86_disqualify_components (sbitmap, edge, sbitmap, bool); +extern void ix86_emit_prologue_components (sbitmap); +extern void ix86_emit_epilogue_components (sbitmap); +extern void ix86_set_handled_components (sbitmap); + /* In i386-expand.cc. */ bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*, HOST_WIDE_INT*); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 20ee360dcb0f..9bf198c7416c 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -6905,6 +6905,26 @@ ix86_pro_and_epilogue_can_use_push2pop2 (int nregs) && (nregs + aligned) >= 3; } +/* Check if push/pop should be used to save/restore registers. */ +static bool +save_regs_using_push_pop (HOST_WIDE_INT to_allocate) +{ + return ((!to_allocate && cfun->machine->frame.nregs <= 1) + || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) + /* If static stack checking is enabled and done with probes, + the registers need to be saved before allocating the frame. */ + || flag_stack_check == STATIC_BUILTIN_STACK_CHECK + /* If stack clash probing needs a loop, then it needs a + scratch register. But the returned register is only guaranteed + to be safe to use after register saves are complete. So if + stack clash protections are enabled and the allocated frame is + larger than the probe interval, then use pushes to save + callee saved registers. */ + || (flag_stack_clash_protection + && !ix86_target_stack_probe () + && to_allocate > get_probe_interval ())); +} + /* Fill structure ix86_frame about frame of currently computed function. */ static void @@ -7189,20 +7209,7 @@ ix86_compute_frame_layout (void) /* Size prologue needs to allocate. */ to_allocate = offset - frame->sse_reg_save_offset; - if ((!to_allocate && frame->nregs <= 1) - || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) - /* If static stack checking is enabled and done with probes, - the registers need to be saved before allocating the frame. */ - || flag_stack_check == STATIC_BUILTIN_STACK_CHECK - /* If stack clash probing needs a loop, then it needs a - scratch register. But the returned register is only guaranteed - to be safe to use after register saves are complete. So if - stack clash protections are enabled and the allocated frame is - larger than the probe interval, then use pushes to save - callee saved registers. */ - || (flag_stack_clash_protection - && !ix86_target_stack_probe () - && to_allocate > get_probe_interval ())) + if (save_regs_using_push_pop (to_allocate)) frame->save_regs_using_mov = false; if (ix86_using_red_zone () @@ -7660,7 +7667,9 @@ ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset) for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) { - ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); + /* Skip registers, already processed by shrink wrap separate. */ + if (!cfun->machine->reg_is_wrapped_separately[regno]) + ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); cfa_offset -= UNITS_PER_WORD; } } @@ -7753,8 +7762,15 @@ pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, add_frame_related_expr = true; } - insn = emit_insn (gen_pro_epilogue_adjust_stack_add - (Pmode, dest, src, addend)); + /* Shrink wrap separate may insert prologue between TEST and JMP. In order + not to affect EFlags, emit add without reg clobbering. */ + if (crtl->shrink_wrapped_separate) + insn = emit_insn (gen_pro_epilogue_adjust_stack_add_nocc + (Pmode, dest, src, addend)); + else + insn = emit_insn (gen_pro_epilogue_adjust_stack_add + (Pmode, dest, src, addend)); + if (style >= 0) ix86_add_queued_cfa_restore_notes (insn); @@ -9218,13 +9234,30 @@ ix86_expand_prologue (void) the stack frame saving one cycle of the prologue. However, avoid doing this if we have to probe the stack; at least on x86_64 the stack probe can turn into a call that clobbers a red zone location. */ - else if (ix86_using_red_zone () - && (! TARGET_STACK_PROBE - || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) + else if ((ix86_using_red_zone () + && (! TARGET_STACK_PROBE + || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) + || crtl->shrink_wrapped_separate) { + HOST_WIDE_INT allocate_offset; + if (crtl->shrink_wrapped_separate) + { + allocate_offset = m->fs.sp_offset - frame.stack_pointer_offset; + + /* Adjust the total offset at the beginning of the function. */ + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (allocate_offset), -1, + m->fs.cfa_reg == stack_pointer_rtx); + m->fs.sp_offset = cfun->machine->frame.stack_pointer_offset; + } + ix86_emit_save_regs_using_mov (frame.reg_save_offset); - cfun->machine->red_zone_used = true; int_registers_saved = true; + + if (ix86_using_red_zone () + && (! TARGET_STACK_PROBE + || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) + cfun->machine->red_zone_used = true; } } @@ -9344,6 +9377,8 @@ ix86_expand_prologue (void) && flag_stack_clash_protection && !ix86_target_stack_probe ()) { + gcc_assert (!crtl->shrink_wrapped_separate); + ix86_adjust_stack_and_probe (allocate, int_registers_saved, false); allocate = 0; } @@ -9354,6 +9389,8 @@ ix86_expand_prologue (void) { const HOST_WIDE_INT probe_interval = get_probe_interval (); + gcc_assert (!crtl->shrink_wrapped_separate); + if (STACK_CHECK_MOVING_SP) { if (crtl->is_leaf @@ -9410,12 +9447,16 @@ ix86_expand_prologue (void) else if (!ix86_target_stack_probe () || frame.stack_pointer_offset < CHECK_STACK_LIMIT) { + gcc_assert (!crtl->shrink_wrapped_separate); + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, GEN_INT (-allocate), -1, m->fs.cfa_reg == stack_pointer_rtx); } else { + gcc_assert (!crtl->shrink_wrapped_separate); + rtx eax = gen_rtx_REG (Pmode, AX_REG); rtx r10 = NULL; const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx); @@ -9801,30 +9842,35 @@ ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset, for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) { - rtx reg = gen_rtx_REG (word_mode, regno); - rtx mem; - rtx_insn *insn; - - mem = choose_baseaddr (cfa_offset, NULL); - mem = gen_frame_mem (word_mode, mem); - insn = emit_move_insn (reg, mem); - if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg)) + /* Skip registers, already processed by shrink wrap separate. */ + if (!cfun->machine->reg_is_wrapped_separately[regno]) { - /* Previously we'd represented the CFA as an expression - like *(%ebp - 8). We've just popped that value from - the stack, which means we need to reset the CFA to - the drap register. This will remain until we restore - the stack pointer. */ - add_reg_note (insn, REG_CFA_DEF_CFA, reg); - RTX_FRAME_RELATED_P (insn) = 1; + rtx reg = gen_rtx_REG (word_mode, regno); + rtx mem; + rtx_insn *insn; - /* This means that the DRAP register is valid for addressing. */ - m->fs.drap_valid = true; - } - else - ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + mem = choose_baseaddr (cfa_offset, NULL); + mem = gen_frame_mem (word_mode, mem); + insn = emit_move_insn (reg, mem); + + if (m->fs.cfa_reg == crtl->drap_reg + && regno == REGNO (crtl->drap_reg)) + { + /* Previously we'd represented the CFA as an expression + like *(%ebp - 8). We've just popped that value from + the stack, which means we need to reset the CFA to + the drap register. This will remain until we restore + the stack pointer. */ + add_reg_note (insn, REG_CFA_DEF_CFA, reg); + RTX_FRAME_RELATED_P (insn) = 1; + /* DRAP register is valid for addressing. */ + m->fs.drap_valid = true; + } + else + ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + } cfa_offset -= UNITS_PER_WORD; } } @@ -10103,10 +10149,11 @@ ix86_expand_epilogue (int style) less work than reloading sp and popping the register. */ else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1) restore_regs_via_mov = true; - else if (TARGET_EPILOGUE_USING_MOVE - && cfun->machine->use_fast_prologue_epilogue - && (frame.nregs > 1 - || m->fs.sp_offset != reg_save_offset)) + else if (crtl->shrink_wrapped_separate + || (TARGET_EPILOGUE_USING_MOVE + && cfun->machine->use_fast_prologue_epilogue + && (frame.nregs > 1 + || m->fs.sp_offset != reg_save_offset))) restore_regs_via_mov = true; else if (frame_pointer_needed && !frame.nregs @@ -10120,6 +10167,9 @@ ix86_expand_epilogue (int style) else restore_regs_via_mov = false; + if (crtl->shrink_wrapped_separate) + gcc_assert (restore_regs_via_mov); + if (restore_regs_via_mov || frame.nsseregs) { /* Ensure that the entire register save area is addressable via @@ -10172,6 +10222,7 @@ ix86_expand_epilogue (int style) gcc_assert (m->fs.sp_offset == UNITS_PER_WORD); gcc_assert (!crtl->drap_reg); gcc_assert (!frame.nregs); + gcc_assert (!crtl->shrink_wrapped_separate); } else if (restore_regs_via_mov) { @@ -10186,6 +10237,8 @@ ix86_expand_epilogue (int style) rtx sa = EH_RETURN_STACKADJ_RTX; rtx_insn *insn; + gcc_assert (!crtl->shrink_wrapped_separate); + /* Stack realignment doesn't work with eh_return. */ if (crtl->stack_realign_needed) sorry ("Stack realignment not supported with " @@ -28076,6 +28129,195 @@ ix86_cannot_copy_insn_p (rtx_insn *insn) #undef TARGET_DOCUMENTATION_NAME #define TARGET_DOCUMENTATION_NAME "x86" +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */ +sbitmap +ix86_get_separate_components (void) +{ + HOST_WIDE_INT offset, to_allocate; + sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); + bitmap_clear (components); + struct machine_function *m = cfun->machine; + + offset = m->frame.stack_pointer_offset; + to_allocate = offset - m->frame.sse_reg_save_offset; + + /* Shrink wrap separate uses MOV, which means APX PPX cannot be used. + Experiments show that APX PPX can speed up the prologue. If the function + does not exit early during actual execution, then using APX PPX is faster. + If the function always exits early during actual execution, then shrink + wrap separate reduces the number of MOV (PUSH/POP) instructions actually + executed, thus speeding up execution. + foo: + movl $1, %eax + testq %rdi, %rdi + jne.L60 + ret ---> early return. + .L60: + subq $88, %rsp ---> belong to prologue. + xorl %eax, %eax + movq %rbx, 40 (%rsp) ---> belong to prologue. + movq 8 (%rdi), %rbx + movq %rbp, 48 (%rsp) ---> belong to prologue. + movq %rdi, %rbp + testq %rbx, %rbx + jne.L61 + movq 40 (%rsp), %rbx + movq 48 (%rsp), %rbp + addq $88, %rsp + ret + .L61: + movq %r12, 56 (%rsp) ---> belong to prologue. + movq %r13, 64 (%rsp) ---> belong to prologue. + movq %r14, 72 (%rsp) ---> belong to prologue. + ... ... + + Disable shrink wrap separate when PPX is enabled. */ + if ((TARGET_APX_PPX && !crtl->calls_eh_return) + || cfun->machine->func_type != TYPE_NORMAL + || TARGET_SEH + || crtl->stack_realign_needed + || m->call_ms2sysv) + return components; + + /* Since shrink wrapping separate uses MOV instead of PUSH/POP. + Disable shrink wrap separate when MOV is prohibited. */ + if (save_regs_using_push_pop (to_allocate)) + return components; + + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + /* Skip registers with large offsets, where a pseudo may be needed. */ + if (IN_RANGE (offset, -0x8000, 0x7fff)) + bitmap_set_bit (components, regno); + offset += UNITS_PER_WORD; + } + + /* Don't mess with the following registers. */ + if (frame_pointer_needed) + bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM); + + if (crtl->drap_reg) + bitmap_clear_bit (components, REGNO (crtl->drap_reg)); + + if (pic_offset_table_rtx) + bitmap_clear_bit (components, REAL_PIC_OFFSET_TABLE_REGNUM); + + return components; +} + +/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */ +sbitmap +ix86_components_for_bb (basic_block bb) +{ + bitmap in = DF_LIVE_IN (bb); + bitmap gen = &DF_LIVE_BB_INFO (bb)->gen; + bitmap kill = &DF_LIVE_BB_INFO (bb)->kill; + + sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); + bitmap_clear (components); + + function_abi_aggregator callee_abis; + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (CALL_P (insn)) + callee_abis.note_callee_abi (insn_callee_abi (insn)); + HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi); + + /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */ + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (!fixed_regs[regno] + && (TEST_HARD_REG_BIT (extra_caller_saves, regno) + || bitmap_bit_p (in, regno) + || bitmap_bit_p (gen, regno) + || bitmap_bit_p (kill, regno))) + bitmap_set_bit (components, regno); + + return components; +} + +/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. */ +void +ix86_disqualify_components (sbitmap, edge, sbitmap, bool) +{ + /* Nothing to do for x86. */ +} + +/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */ +void +ix86_emit_prologue_components (sbitmap components) +{ + HOST_WIDE_INT cfa_offset; + struct machine_function *m = cfun->machine; + + cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset + - m->frame.stack_pointer_offset; + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + if (bitmap_bit_p (components, regno)) + ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); + cfa_offset -= UNITS_PER_WORD; + } +} + +/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */ +void +ix86_emit_epilogue_components (sbitmap components) +{ + HOST_WIDE_INT cfa_offset; + struct machine_function *m = cfun->machine; + cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset + - m->frame.stack_pointer_offset; + + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + if (bitmap_bit_p (components, regno)) + { + rtx reg = gen_rtx_REG (word_mode, regno); + rtx mem; + rtx_insn *insn; + + mem = choose_baseaddr (cfa_offset, NULL); + mem = gen_frame_mem (word_mode, mem); + insn = emit_move_insn (reg, mem); + + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_CFA_RESTORE, reg); + } + cfa_offset -= UNITS_PER_WORD; + } +} + +/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */ +void +ix86_set_handled_components (sbitmap components) +{ + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (bitmap_bit_p (components, regno)) + { + cfun->machine->reg_is_wrapped_separately[regno] = true; + cfun->machine->use_fast_prologue_epilogue = true; + cfun->machine->frame.save_regs_using_mov = true; + } +} + +#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS +#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS ix86_get_separate_components +#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB +#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB ix86_components_for_bb +#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS +#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS ix86_disqualify_components +#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS +#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \ + ix86_emit_prologue_components +#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS +#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \ + ix86_emit_epilogue_components +#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS +#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS ix86_set_handled_components + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-i386.h" diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index d32d9ad997e6..7c16eac77001 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2821,6 +2821,10 @@ struct GTY(()) machine_function { /* Cached initial frame layout for the current function. */ struct ix86_frame frame; + /* The components already handled by separate shrink-wrapping, which should + not be considered by the prologue and epilogue. */ + bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER]; + /* For -fsplit-stack support: A stack local which holds a pointer to the stack arguments for a function with a variable number of arguments. This is set at the start of the function and is used diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 99f382497148..6bd557431f5c 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -27452,6 +27452,28 @@ (const_string "*"))) (set_attr "mode" "<MODE>")]) +(define_insn "@pro_epilogue_adjust_stack_add_nocc<mode>" + [(set (match_operand:P 0 "register_operand" "=r") + (plus:P (match_operand:P 1 "register_operand" "r") + (match_operand:P 2 "<nonmemory_operand>" "l<i>"))) + (clobber (mem:BLK (scratch)))] + "" +{ + if (operands[2] == CONST0_RTX (<MODE>mode)) + return "mov{<imodesuffix>}\t{%1, %0|%0, %1}"; + else + { + operands[2] = SET_SRC (XVECEXP (PATTERN (insn), 0, 0)); + return "lea{<imodesuffix>}\t{%E2, %0|%0, %E2}"; + } +} + [(set (attr "length_immediate") + (cond [(eq_attr "type" "imov") + (const_string "0") + ] + (const_string "*"))) + (set_attr "mode" "<MODE>")]) + (define_insn "@pro_epilogue_adjust_stack_sub_<mode>" [(set (match_operand:P 0 "register_operand" "=r") (minus:P (match_operand:P 1 "register_operand" "0") diff --git a/gcc/testsuite/g++.target/i386/shrink_wrap_separate.C b/gcc/testsuite/g++.target/i386/shrink_wrap_separate.C new file mode 100644 index 000000000000..294dccde5d31 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/shrink_wrap_separate.C @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-rtl-pro_and_epilogue" } */ +typedef struct a b; +typedef double c; +struct a { + b *d; + b *e; +}; +struct f { + c g; +}; +inline bool h(c i, b *m) { + b *j = m->e; + for (; m->e; j = j->d) + if (h(i, j)) + return 0; + return m; +} +bool k() { + f *l; + b *n; + return h(l->g, n); +} +/* { dg-final { scan-rtl-dump "The components we wrap separately are \\\[sep 3 4\\\]" "pro_and_epilogue" { target { ia32 } } } } */ +/* { dg-final { scan-rtl-dump "The components we wrap separately are \\\[sep 40 41 42 43\\\]" "pro_and_epilogue" { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-comibf-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-1.c index 3862f1e0d900..532a9a045add 100644 --- a/gcc/testsuite/gcc.target/i386/avx10_2-comibf-1.c +++ b/gcc/testsuite/gcc.target/i386/avx10_2-comibf-1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=x86-64-v3 -mavx10.2 -O2 -fno-trapping-math" } */ +/* { dg-options "-march=x86-64-v3 -mavx10.2 -O2 -fno-trapping-math -fno-shrink-wrap" } */ /* { dg-final { scan-assembler-times "vcomisbf16\[ \\t\]+\[^{}\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 6 } } */ /* { dg-final { scan-assembler-times {j[a-z]+\s} 6 } } */ diff --git a/gcc/testsuite/gcc.target/i386/interrupt-16.c b/gcc/testsuite/gcc.target/i386/interrupt-16.c index cb45ba54e3dc..ca4441b3aee6 100644 --- a/gcc/testsuite/gcc.target/i386/interrupt-16.c +++ b/gcc/testsuite/gcc.target/i386/interrupt-16.c @@ -18,5 +18,5 @@ foo (int i) /* { dg-final { scan-assembler-not "(push|pop)(l|q)\[\\t \]*%(r|e)bp" } } */ /* { dg-final { scan-assembler-not "(push|pop)l\[\\t \]*%edi" { target ia32 } } } */ /* { dg-final { scan-assembler-not "(push|pop)q\[\\t \]*%r\[0-9\]+" { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "pushq\[\\t \]*%rdi" 1 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "popq\[\\t \]*%rdi" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "(pushq.*%rdi|subq.*\\\$8,.*%rsp)" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "(popq.*%rdi|addq.*\\\$8,.*%rsp)" 1 { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/shrink_wrap_separate_check_lea.c b/gcc/testsuite/gcc.target/i386/shrink_wrap_separate_check_lea.c new file mode 100644 index 000000000000..0f2449f68b6e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/shrink_wrap_separate_check_lea.c @@ -0,0 +1,29 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -fdump-rtl-pro_and_epilogue" } */ + +/* Avoid inserting sub between test-je-jle to change EFlags, lea should be used here + xorl %eax, %eax + testl %edi, %edi + je .L11 + sub $16, %rsp ------> leaq -16(%rsp), %rsp + movq %r13, 8(%rsp) + movl $1, %r13d + jle .L4 +*/ +int foo (int num) +{ + if (!num) + return 0; + + register int r13 __asm ("r13") = 1; + + for ( int i = 0; i < num; i++) + { + register int r12 __asm ("r12") = 1; + asm volatile ("" : "+r" (r12), "+r" (r13)); + } + + return 1; +} +/* { dg-final { scan-rtl-dump "The components we wrap separately are \\\[sep 40\\\]" "pro_and_epilogue" } } */ +/* { dg-final { scan-assembler "leaq.*(%rsp)" } } */ diff --git a/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c b/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c index 5f3d3e166afb..46fc4648dbd7 100644 --- a/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c +++ b/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fno-tree-vectorize -mabi=sysv" } */ +/* { dg-options "-O2 -fno-tree-vectorize -mabi=sysv -fno-shrink-wrap-separate" } */ extern int glb1, gbl2, gbl3; diff --git a/gcc/testsuite/gfortran.dg/guality/arg1.f90 b/gcc/testsuite/gfortran.dg/guality/arg1.f90 index 332a4ed1d872..775b7bb304f1 100644 --- a/gcc/testsuite/gfortran.dg/guality/arg1.f90 +++ b/gcc/testsuite/gfortran.dg/guality/arg1.f90 @@ -1,5 +1,5 @@ ! { dg-do run } -! { dg-options "-g" } +! { dg-options "-fno-shrink-wrap -g" } integer :: a(10), b(12) call sub (a, 10) call sub (b, 12)