> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/x86_64/abi/callabi/leaf-2.c: Adjust the test.
> >         * gcc.target/i386/interrupt-16.c: Likewise.
> >         * g++.target/i386/shrink_wrap_separate.c: New test.
> 
> This one should have .C suffix.
> 

Done.

> Some comment fixes/clarifications inline.
> 
> Uros.
> 
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -6909,6 +6909,26 @@ ix86_pro_and_epilogue_can_use_push2pop2
> (int nregs)
> >          && (nregs + aligned) >= 3;
> >  }
> >
> > +/* Check if push pop should be used to save registers.  */
> 
> Check if push/pop should be used to save/restore registers.
> 
Done.

> >  static void
> > @@ -7193,20 +7213,7 @@ ix86_compute_frame_layout (void)
> >    /* Size prologue needs to allocate.  */
> >    to_allocate = offset - frame->sse_reg_save_offset;
> >
> > -  if ((!to_allocate && frame->nregs <= 1)
> > -      || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
> > -       /* If static stack checking is enabled and done with probes,
> > -         the registers need to be saved before allocating the frame.  */
> > -      || flag_stack_check == STATIC_BUILTIN_STACK_CHECK
> > -      /* If stack clash probing needs a loop, then it needs a
> > -        scratch register.  But the returned register is only guaranteed
> > -        to be safe to use after register saves are complete.  So if
> > -        stack clash protections are enabled and the allocated frame is
> > -        larger than the probe interval, then use pushes to save
> > -        callee saved registers.  */
> > -      || (flag_stack_clash_protection
> > -         && !ix86_target_stack_probe ()
> > -         && to_allocate > get_probe_interval ()))
> > +  if (save_regs_using_push_pop (to_allocate))
> >      frame->save_regs_using_mov = false;
> >
> >    if (ix86_using_red_zone ()
> > @@ -7664,7 +7671,9 @@ ix86_emit_save_regs_using_mov
> (HOST_WIDE_INT cfa_offset)
> >    for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> >      if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
> >        {
> > -        ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
> > +       /* Skip shrink warp separate already processed registers.  */
> 
> Skip  registers, already processed by shrink wrap separate.
> 
Done.

> > +       if (!cfun->machine->reg_is_wrapped_separately[regno])
> > +         ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
> >         cfa_offset -= UNITS_PER_WORD;
> >        }
> >  }
> > @@ -9227,6 +9236,18 @@ ix86_expand_prologue (void)
> >                && (! TARGET_STACK_PROBE
> >                    || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
> >         {
> > +         HOST_WIDE_INT allocate_offset;
> > +         /* If shrink wrap separate works, we will adjust the total offset 
> > of
> > +            rsp at the beginning.  */
> 
> Hm, the above should be rephrased.
> 
> > +         if (crtl->shrink_wrapped_separate)
> > +           {
> > +             allocate_offset = m->fs.sp_offset - 
> > frame.stack_pointer_offset;
> > +             pro_epilogue_adjust_stack (stack_pointer_rtx, 
> > stack_pointer_rtx,
> > +                                        GEN_INT (allocate_offset), -1,
> > +                                        m->fs.cfa_reg ==
> > + stack_pointer_rtx);
> 
> Maybe put the above comment here, saying:
> 
> Adjust the total offset at the beginning of the function.
> 

I put it before the call that emits the sub instruction.

              /* Adjust the total offset at the beginning of the function.  */
              pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                                         GEN_INT (allocate_offset), -1,
                                         m->fs.cfa_reg == stack_pointer_rtx);
              m->fs.sp_offset = cfun->machine->frame.stack_pointer_offset;

> > +             m->fs.sp_offset = cfun->machine->frame.stack_pointer_offset;
> > +           }
> > +
> >           ix86_emit_save_regs_using_mov (frame.reg_save_offset);
> >           cfun->machine->red_zone_used = true;
> >           int_registers_saved = true;
> > @@ -9806,30 +9827,36 @@ ix86_emit_restore_regs_using_mov
> (HOST_WIDE_INT cfa_offset,
> >    for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> >      if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno,
> maybe_eh_return, true))
> >        {
> > -       rtx reg = gen_rtx_REG (word_mode, regno);
> > -       rtx mem;
> > -       rtx_insn *insn;
> > -
> > -       mem = choose_baseaddr (cfa_offset, NULL);
> > -       mem = gen_frame_mem (word_mode, mem);
> > -       insn = emit_move_insn (reg, mem);
> >
> > -        if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO 
> > (crtl->drap_reg))
> > +       /* Skip shrink warp separate already processed registers.  */
> 
> Skip  registers, already processed by shrink wrap separate.
> 
Done.

> > +       if (!cfun->machine->reg_is_wrapped_separately[regno])
> >           {
> > -           /* Previously we'd represented the CFA as an expression
> > -              like *(%ebp - 8).  We've just popped that value from
> > -              the stack, which means we need to reset the CFA to
> > -              the drap register.  This will remain until we restore
> > -              the stack pointer.  */
> > -           add_reg_note (insn, REG_CFA_DEF_CFA, reg);
> > -           RTX_FRAME_RELATED_P (insn) = 1;
> > +           rtx reg = gen_rtx_REG (word_mode, regno);
> > +           rtx mem;
> > +           rtx_insn *insn;
> >
> > -           /* This means that the DRAP register is valid for addressing.  
> > */
> > -           m->fs.drap_valid = true;
> > -         }
> > -       else
> > -         ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
> > +           mem = choose_baseaddr (cfa_offset, NULL);
> > +           mem = gen_frame_mem (word_mode, mem);
> > +           insn = emit_move_insn (reg, mem);
> >
> > +           if (m->fs.cfa_reg == crtl->drap_reg
> > +               && regno == REGNO (crtl->drap_reg))
> > +             {
> > +               /* Previously we'd represented the CFA as an expression
> > +                  like *(%ebp - 8).  We've just popped that value from
> > +                  the stack, which means we need to reset the CFA to
> > +                  the drap register.  This will remain until we restore
> > +                  the stack pointer.  */
> > +               add_reg_note (insn, REG_CFA_DEF_CFA, reg);
> > +               RTX_FRAME_RELATED_P (insn) = 1;
> > +
> > +               /* This means that the DRAP register is valid for 
> > addressing.
> > +                */
> 
> Just: "DRAP register is valid for addressing."
> 
Done.

> > +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
> sbitmap
> > +ix86_get_separate_components (void) {
> > +  HOST_WIDE_INT offset, to_allocate;
> > +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> > +  bitmap_clear (components);
> > +
> > +  offset = cfun->machine->frame.stack_pointer_offset;
> > +  to_allocate = offset - cfun->machine->frame.sse_reg_save_offset;
> > +  /* When PPX is enabled, disable shrink wrap separate.
> > +    It is a trade-off.  */
> Mark a trade-off with "???".
> 
> ??? Disable shrink wrap separate when PPX is enabled.
> 

Added the following description.

  /* Shrink wrap separate uses MOV, which means APX PPX cannot be used.
     Experiments show that APX PPX can speed up the prologue.  If the function
     does not exit early during actual execution, then using APX PPX is faster.
     If the function always exits early during actual execution, then shrink
     wrap separate reduces the number of MOV (PUSH/POP) instructions actually
     executed, thus speeding up execution.  For example:
foo:
        movl    $1, %eax
        testq   %rdi, %rdi
        jne     .L60
        ret                    ---> early return.
.L60:
        subq    $88, %rsp      ---> belong to prologue.
        xorl    %eax, %eax
        movq    %rbx, 40(%rsp) ---> belong to prologue.
        movq    8(%rdi), %rbx
        movq    %rbp, 48(%rsp) ---> belong to prologue.
        movq    %rdi, %rbp
        testq   %rbx, %rbx
        jne     .L61
        movq    40(%rsp), %rbx
        movq    48(%rsp), %rbp
        addq    $88, %rsp
        ret
.L61:
        movq    %r12, 56(%rsp) ---> belong to prologue.
        movq    %r13, 64(%rsp) ---> belong to prologue.
        movq    %r14, 72(%rsp) ---> belong to prologue.
        ... ...

     It is a trade-off.  Disable shrink wrap separate when PPX is enabled.  */

> > +  if (TARGET_APX_PPX && !crtl->calls_eh_return)
> > +    return components;
> > +
> > +  /* Since shrink wrapping separate uses MOV instead of PUSH/POP
> > +     We need to disable shrink wrap separate when move is prohibited.
> > + */
> 
> Disable shrink wrap separate when MOV is prohibited.
> 
Done.

> > +  if (save_regs_using_push_pop (to_allocate))
> > +    return components;
> > +
> > +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> > +    if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
> > +      {
> > +       /* We can only wrap registers that have small operand offsets.
> > +          For large offsets a pseudo register might be needed which
> > +          cannot be created during the shrink wrapping pass.  */
> 
> Skip registers with large offsets, where a pseudo may be needed.
> 
Done.

> > +       if (IN_RANGE (offset, -0x8000, 0x7fff))
> > +         bitmap_set_bit (components, regno);
> > +       offset += UNITS_PER_WORD;
> > +      }
> > +
> > +  /* Don't mess with the following frame pointer.  */
> 
> Don't mess with the frame pointer..
Changed it to "Don't mess with the following registers." to describe the
three bitmap_clear_bit calls that follow.

> > +/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.  */
> void
> > +ix86_disqualify_components (sbitmap, edge, sbitmap, bool) {
> > +  /* Nothing to do for i386.  */
> 
> ... for x86.
> 
Done.

Thanks,
Lili.

Reply via email to