The x86 glue helper module is starting to show its age: - It relies heavily on function pointers to invoke asm helper functions that operate on fixed input sizes that are relatively small. This means the performance is severely impacted by retpolines. - It goes to great lengths to amortize the cost of kernel_fpu_begin()/end() over as much work as possible, which is no longer necessary now that FPU save/restore is done lazily, and doing so may cause unbounded scheduling blackouts due to the fact that enabling the FPU in kernel mode disables preemption. - The CBC mode decryption helper makes backward strides through the input, in order to avoid a single block size memcpy() between chunks. Consuming the input in this manner is highly likely to defeat any hardware prefetchers, so it is better to go through the data linearly, and perform the extra memcpy() where needed (which is turned into direct loads and stores by the compiler anyway). Note that benchmarks won't show this effect, given that the memory they use is always cache hot. - It implements blockwise XOR in terms of le128 pointers, which imply an alignment that is not guaranteed by the API, violating the C standard.
GCC does not seem to be smart enough to elide the indirect calls when the function pointers are passed as arguments to static inline helper routines modeled after the existing ones. So instead, let's create some CPP macros that encapsulate the core of the ECB and CBC processing, so we can wire them up for existing users of the glue helper module, i.e., Camellia, Serpent, Twofish and CAST6. Acked-by: Eric Biggers <ebigg...@google.com> Signed-off-by: Ard Biesheuvel <a...@kernel.org> --- arch/x86/crypto/ecb_cbc_helpers.h | 76 ++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h new file mode 100644 index 000000000000..eaa15c7b29d6 --- /dev/null +++ b/arch/x86/crypto/ecb_cbc_helpers.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _CRYPTO_ECB_CBC_HELPER_H +#define _CRYPTO_ECB_CBC_HELPER_H + +#include <crypto/internal/skcipher.h> +#include <asm/fpu/api.h> + +/* + * Mode helpers to instantiate parameterized skcipher ECB/CBC modes without + * having to rely on indirect calls and retpolines. + */ + +#define ECB_WALK_START(req, bsize, fpu_blocks) do { \ + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); \ + const int __bsize = (bsize); \ + struct skcipher_walk walk; \ + int err = skcipher_walk_virt(&walk, (req), false); \ + while (walk.nbytes > 0) { \ + unsigned int nbytes = walk.nbytes; \ + bool do_fpu = (fpu_blocks) != -1 && \ + nbytes >= (fpu_blocks) * __bsize; \ + const u8 *src = walk.src.virt.addr; \ + u8 *dst = walk.dst.virt.addr; \ + u8 __maybe_unused buf[(bsize)]; \ + if (do_fpu) kernel_fpu_begin() + +#define CBC_WALK_START(req, bsize, fpu_blocks) \ + ECB_WALK_START(req, bsize, fpu_blocks) + +#define ECB_WALK_ADVANCE(blocks) do { \ + dst += (blocks) * __bsize; \ + src += (blocks) * __bsize; \ + nbytes -= (blocks) * __bsize; \ +} while (0) + +#define ECB_BLOCK(blocks, func) do { \ + while (nbytes >= (blocks) * __bsize) { \ + (func)(ctx, dst, src); \ + ECB_WALK_ADVANCE(blocks); \ + } \ +} while (0) + +#define CBC_ENC_BLOCK(func) do { \ + const u8 *__iv = walk.iv; \ + while (nbytes >= __bsize) { \ + crypto_xor_cpy(dst, src, __iv, __bsize); \ + (func)(ctx, dst, dst); \ + __iv = dst; \ + ECB_WALK_ADVANCE(1); \ + } \ + memcpy(walk.iv, __iv, __bsize); \ +} while (0) + +#define CBC_DEC_BLOCK(blocks, func) do { \ + while (nbytes >= (blocks) * __bsize) { \ + const u8 *__iv = src + ((blocks) - 1) * __bsize; \ + if (dst == src) \ + __iv = memcpy(buf, __iv, __bsize); \ + (func)(ctx, dst, src); \ + crypto_xor(dst, walk.iv, __bsize); \ + memcpy(walk.iv, __iv, __bsize); \ + ECB_WALK_ADVANCE(blocks); \ + } \ +} while (0) + +#define ECB_WALK_END() \ + if (do_fpu) kernel_fpu_end(); \ + err = skcipher_walk_done(&walk, nbytes); \ + } \ + return err; \ +} while (0) + +#define CBC_WALK_END() ECB_WALK_END() + +#endif -- 2.17.1