Using RBP as a temporary register violates the frame pointer convention
and breaks stack traces when unwinding from an interrupt in the crypto
code.

Use R12 instead of RBP for the TBL register.  Since R12 is also used as
another temporary register (T1), it gets clobbered in each round of
computation, so the TBL value needs to be freshly reloaded into R12
each time it's used.  The authoritative copy of TBL lives on the stack
at the new frame_TBL offset, and it is also updated there as the loops
advance through the K512 table.
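
In other words, every use of TBL becomes a reload-then-use pair, as in
this excerpt from the change below:

        mov     frame_TBL(%rsp), TBL    # reload TBL; R12 (T1) was clobbered
        vpaddq  (TBL), Y_0, XFER        # index into K512 through the fresh copy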

Also remove the unused y4 variable.

Reported-by: Eric Biggers <ebigge...@gmail.com>
Reported-by: Peter Zijlstra <pet...@infradead.org>
Signed-off-by: Josh Poimboeuf <jpoim...@redhat.com>
---
 arch/x86/crypto/sha512-avx2-asm.S | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 7f5f6c6ec72e..37cfc2004abd 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -81,7 +81,7 @@ d           = %r8
 e           = %rdx
 y3          = %rsi
 
-TBL   = %rbp
+TBL   = %r12 # clobbered by T1
 
 a     = %rax
 b     = %rbx
@@ -96,11 +96,10 @@ y0    = %r13
 y1    = %r14
 y2    = %r15
 
-y4    = %r12
-
 # Local variables (stack frame)
 XFER_SIZE = 4*8
 SRND_SIZE = 1*8
+TBL_SIZE = 1*8
 INP_SIZE = 1*8
 INPEND_SIZE = 1*8
 RSPSAVE_SIZE = 1*8
@@ -108,7 +107,8 @@ GPRSAVE_SIZE = 6*8
 
 frame_XFER = 0
 frame_SRND = frame_XFER + XFER_SIZE
-frame_INP = frame_SRND + SRND_SIZE
+frame_TBL = frame_SRND + SRND_SIZE
+frame_INP = frame_TBL + TBL_SIZE
 frame_INPEND = frame_INP + INP_SIZE
 frame_RSPSAVE = frame_INPEND + INPEND_SIZE
 frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
@@ -601,7 +601,7 @@ ENTRY(sha512_transform_rorx)
        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
 
 loop0:
-       lea     K512(%rip), TBL
+       movq    $K512, frame_TBL(%rsp)
 
        ## byte swap first 16 dwords
        COPY_YMM_AND_BSWAP      Y_0, (INP), BYTE_FLIP_MASK
@@ -616,39 +616,46 @@ loop0:
 
 .align 16
 loop1:
+       mov     frame_TBL(%rsp), TBL
        vpaddq  (TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED
 
+       mov     frame_TBL(%rsp), TBL
        vpaddq  1*32(TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED
 
+       mov     frame_TBL(%rsp), TBL
        vpaddq  2*32(TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED
 
+       mov     frame_TBL(%rsp), TBL
        vpaddq  3*32(TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
-       add     $(4*32), TBL
        FOUR_ROUNDS_AND_SCHED
 
+       addq    $(4*32), frame_TBL(%rsp)
        subq    $1, frame_SRND(%rsp)
        jne     loop1
 
        movq    $2, frame_SRND(%rsp)
 loop2:
+       mov     frame_TBL(%rsp), TBL
        vpaddq  (TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        DO_4ROUNDS
+
+       mov     frame_TBL(%rsp), TBL
        vpaddq  1*32(TBL), Y_1, XFER
        vmovdqa XFER, frame_XFER(%rsp)
-       add     $(2*32), TBL
        DO_4ROUNDS
 
        vmovdqa Y_2, Y_0
        vmovdqa Y_3, Y_1
 
+       add     $(2*32), frame_TBL(%rsp)
        subq    $1, frame_SRND(%rsp)
        jne     loop2
 
-- 
2.13.5
