This patch fixes two stack alignment bugs in the split-stack support. The first is that I managed to miscalculate the stack alignment in the assembly code. I was treating that code as though it were a normal function. That is wrong, because __morestack is actually called without any stack adjustment in the caller, which means that when __morestack is entered the stack % 16 will == 8 in 32-bit mode and 0 in 64-bit mode, unlike the usual case of 12 and 8, respectively. This patch corrects the code to align the stack correctly when calling the C split-stack functions. Previously the stack was misaligned when calling those functions. Since those functions don't happen to use any vector registers, this did not matter except for performance. In any case, this patch fixes it.
The second bug is that the C functions were not aligning the returned stack. The result was coming back to an alignment determined by the parameter size. This is simply wrong. Since the C code doesn't know the required stack alignment, I simply made it always align to a 32-byte boundary. If split stack support is added for more processors, this may need to become processor dependent. Along the way I noticed that the 32-bit __morestack_non_split support was mishandling the return address when called by a varargs function, and I fixed that too. Bootstrapped and ran split-stack and Go tests on x86_64-unknown-linux-gnu, both 64-bit and 32-bit mode. Committed to mainline. Ian 2012-11-06 Ian Lance Taylor <i...@google.com> * generic-morestack.c (__generic_morestack): Align the returned stack pointer to a 32 byte boundary. * config/i386/morestack.S (__morestack_non_split) [32-bit]: Don't increment the return address until we have decided that we don't have a varargs function. (__morestack) [32-bit]: Align stack correctly when calling C functions. (__morestack) [64-bit]: Likewise.
Index: generic-morestack.c =================================================================== --- generic-morestack.c (revision 193263) +++ generic-morestack.c (working copy) @@ -549,6 +549,7 @@ __generic_morestack (size_t *pframe_size char *to; void *ret; size_t i; + size_t aligned; current = __morestack_current_segment; @@ -580,15 +581,19 @@ __generic_morestack (size_t *pframe_size *pframe_size = current->size - param_size; + /* Align the returned stack to a 32-byte boundary. */ + aligned = (param_size + 31) & ~ (size_t) 31; + #ifdef STACK_GROWS_DOWNWARD { char *bottom = (char *) (current + 1) + current->size; - to = bottom - param_size; - ret = bottom - param_size; + to = bottom - aligned; + ret = bottom - aligned; } #else to = current + 1; - ret = (char *) (current + 1) + param_size; + to += aligned - param_size; + ret = (char *) (current + 1) + aligned; #endif /* We don't call memcpy to avoid worrying about the dynamic linker Index: config/i386/morestack.S =================================================================== --- config/i386/morestack.S (revision 193263) +++ config/i386/morestack.S (working copy) @@ -200,18 +200,19 @@ __morestack_non_split: jb 2f # Get more space if we need it. - # This breaks call/return prediction, as described above. - incq 8(%rsp) # Increment the return address. - # If the instruction that we return to is # leaq 24(%rbp), %r11n # then we have been called by a varargs function that expects # %ebp to hold a real value. That can only work if we do the # full stack split routine. FIXME: This is fragile. movq 8(%rsp),%rax + incq %rax # Skip ret instruction in caller. cmpl $0x185d8d4c,(%rax) je 2f + # This breaks call/return prediction, as described above. + incq 8(%rsp) # Increment the return address. + popq %rax # Restore register. .cfi_adjust_cfa_offset -8 # Adjust for popped register. @@ -296,9 +297,13 @@ __morestack: # argument size is pushed then the new stack frame size is # pushed. - # Align stack to 16-byte boundary with enough space for saving - # registers and passing parameters to functions we call. - subl $40,%esp + # In the body of a non-leaf function, the stack pointer will + # be aligned to a 16-byte boundary. That is CFA + 12 in the + # stack picture above: (CFA + 12) % 16 == 0. At this point we + # have %esp == CFA - 8, so %esp % 16 == 12. We need some + # space for saving registers and passing parameters, and we + # need to wind up with %esp % 16 == 0. + subl $44,%esp # Because our cleanup code may need to clobber %ebx, we need # to save it here so the unwinder can restore the value used @@ -393,13 +398,15 @@ __morestack: movl %ebp,%esp # Restore stack pointer. + # As before, we now have %esp % 16 == 12. + pushl %eax # Push return value on old stack. pushl %edx - subl $8,%esp # Align stack to 16-byte boundary. + subl $4,%esp # Align stack to 16-byte boundary. call __morestack_unblock_signals - addl $8,%esp + addl $4,%esp popl %edx # Restore return value. popl %eax @@ -485,15 +492,21 @@ __morestack: pushq %r9 pushq %r11 - pushq $0 # For alignment. + + # We entered morestack with the stack pointer aligned to a + # 16-byte boundary (the call to morestack's caller used 8 + # bytes, and the call to morestack used 8 bytes). We have now + # pushed 10 registers, so we are still aligned to a 16-byte + # boundary. call __morestack_block_signals leaq -8(%rbp),%rdi # Address of new frame size. leaq 24(%rbp),%rsi # The caller's parameters. - addq $8,%rsp popq %rdx # The size of the parameters. + subq $8,%rsp # Align stack. + call __generic_morestack movq -8(%rbp),%r10 # Reload modified frame size @@ -564,6 +577,9 @@ __morestack: movq %rbp,%rsp # Restore stack pointer. + # Now (%rsp & 16) == 8. + + subq $8,%rsp # For alignment. pushq %rax # Push return value on old stack. pushq %rdx @@ -571,6 +587,7 @@ __morestack: popq %rdx # Restore return value. popq %rax + addq $8,%rsp .cfi_remember_state popq %rbp