[PATCH] crypto: api - Remove no-op exit_ops code
crypto_exit_cipher_ops() and crypto_exit_compress_ops() are no-ops and have been for a long time, so remove them. Signed-off-by: Eric Biggers --- crypto/api.c | 20 ++-- crypto/cipher.c | 4 crypto/compress.c | 4 crypto/internal.h | 3 --- 4 files changed, 2 insertions(+), 29 deletions(-) diff --git a/crypto/api.c b/crypto/api.c index bbc147c..a88729f 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -310,24 +310,8 @@ static void crypto_exit_ops(struct crypto_tfm *tfm) { const struct crypto_type *type = tfm->__crt_alg->cra_type; - if (type) { - if (tfm->exit) - tfm->exit(tfm); - return; - } - - switch (crypto_tfm_alg_type(tfm)) { - case CRYPTO_ALG_TYPE_CIPHER: - crypto_exit_cipher_ops(tfm); - break; - - case CRYPTO_ALG_TYPE_COMPRESS: - crypto_exit_compress_ops(tfm); - break; - - default: - BUG(); - } + if (type && tfm->exit) + tfm->exit(tfm); } static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask) diff --git a/crypto/cipher.c b/crypto/cipher.c index 39541e0..94fa355 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -116,7 +116,3 @@ int crypto_init_cipher_ops(struct crypto_tfm *tfm) return 0; } - -void crypto_exit_cipher_ops(struct crypto_tfm *tfm) -{ -} diff --git a/crypto/compress.c b/crypto/compress.c index c33f076..f2d5229 100644 --- a/crypto/compress.c +++ b/crypto/compress.c @@ -42,7 +42,3 @@ int crypto_init_compress_ops(struct crypto_tfm *tfm) return 0; } - -void crypto_exit_compress_ops(struct crypto_tfm *tfm) -{ -} diff --git a/crypto/internal.h b/crypto/internal.h index 7eefcdb..f073204 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -76,9 +76,6 @@ struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask); int crypto_init_cipher_ops(struct crypto_tfm *tfm); int crypto_init_compress_ops(struct crypto_tfm *tfm); -void crypto_exit_cipher_ops(struct crypto_tfm *tfm); -void crypto_exit_compress_ops(struct crypto_tfm *tfm); - struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask); void crypto_larval_kill(struct crypto_alg *alg); struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask); -- 2.8.0.rc3.226.g39d4020 -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] crypto: skcipher - Remove unused crypto_lookup_skcipher() declaration
The definition of crypto_lookup_skcipher() was already removed in commit 3a01d0ee2b99 ("crypto: skcipher - Remove top-level givcipher interface"). So the declaration should be removed too. Signed-off-by: Eric Biggers --- include/crypto/internal/skcipher.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index a21a95e..95d2a18 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -74,8 +74,6 @@ static inline int crypto_grab_skcipher2(struct crypto_skcipher_spawn *spawn, return crypto_grab_skcipher(spawn, name, type, mask); } -struct crypto_alg *crypto_lookup_skcipher(const char *name, u32 type, u32 mask); - static inline void crypto_drop_skcipher(struct crypto_skcipher_spawn *spawn) { crypto_drop_spawn(&spawn->base); -- 2.8.0.rc3.226.g39d4020 -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] crypto: cmac - return -EINVAL if block size is unsupported
cmac_create() previously returned 0 if a cipher with a block size other than 8 or 16 bytes was specified. It should return -EINVAL instead. Granted, this doesn't actually change any behavior because cryptomgr currently ignores any return value other than -EAGAIN from template ->create() functions. Signed-off-by: Eric Biggers --- crypto/cmac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/cmac.c b/crypto/cmac.c index 7a8bfbd..b6c4059 100644 --- a/crypto/cmac.c +++ b/crypto/cmac.c @@ -243,6 +243,7 @@ static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb) case 8: break; default: + err = -EINVAL; goto out_put_alg; } -- 2.8.0.rc3.226.g39d4020 -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] crypto: cmac - fix alignment of 'consts'
The per-transform 'consts' array is accessed as __be64 in crypto_cmac_digest_setkey() but was only guaranteed to be aligned to __alignof__(long). Fix this by aligning it to __alignof__(__be64). Signed-off-by: Eric Biggers --- crypto/cmac.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/crypto/cmac.c b/crypto/cmac.c index b6c4059..04080dc 100644 --- a/crypto/cmac.c +++ b/crypto/cmac.c @@ -57,7 +57,8 @@ static int crypto_cmac_digest_setkey(struct crypto_shash *parent, unsigned long alignmask = crypto_shash_alignmask(parent); struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent); unsigned int bs = crypto_shash_blocksize(parent); - __be64 *consts = PTR_ALIGN((void *)ctx->ctx, alignmask + 1); + __be64 *consts = PTR_ALIGN((void *)ctx->ctx, + (alignmask | (__alignof__(__be64) - 1)) + 1); u64 _const[2]; int i, err = 0; u8 msb_mask, gfmask; @@ -173,7 +174,8 @@ static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out) struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); - u8 *consts = PTR_ALIGN((void *)tctx->ctx, alignmask + 1); + u8 *consts = PTR_ALIGN((void *)tctx->ctx, + (alignmask | (__alignof__(__be64) - 1)) + 1); u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1); u8 *prev = odds + bs; unsigned int offset = 0; @@ -258,7 +260,8 @@ static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb) if (err) goto out_free_inst; - alignmask = alg->cra_alignmask | (sizeof(long) - 1); + /* We access the data as u32s when xoring. */ + alignmask = alg->cra_alignmask | (__alignof__(u32) - 1); inst->alg.base.cra_alignmask = alignmask; inst->alg.base.cra_priority = alg->cra_priority; inst->alg.base.cra_blocksize = alg->cra_blocksize; @@ -270,7 +273,9 @@ static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb) + alg->cra_blocksize * 2; inst->alg.base.cra_ctxsize = - ALIGN(sizeof(struct cmac_tfm_ctx), alignmask + 1) + ALIGN(sizeof(struct cmac_tfm_ctx), crypto_tfm_ctx_alignment()) + + ((alignmask | (__alignof__(__be64) - 1)) & + ~(crypto_tfm_ctx_alignment() - 1)) + alg->cra_blocksize * 2; inst->alg.base.cra_init = cmac_init_tfm; -- 2.8.0.rc3.226.g39d4020 -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] crypto: cmac - fix alignment of 'consts'
On Mon, Oct 10, 2016 at 10:29:55AM -0700, Joe Perches wrote: > On Mon, 2016-10-10 at 10:15 -0700, Eric Biggers wrote: > > The per-transform 'consts' array is accessed as __be64 in > > crypto_cmac_digest_setkey() but was only guaranteed to be aligned to > > __alignof__(long). Fix this by aligning it to __alignof__(__be64). > [] > > diff --git a/crypto/cmac.c b/crypto/cmac.c > [] > > @@ -57,7 +57,8 @@ static int crypto_cmac_digest_setkey(struct crypto_shash > > *parent, > > unsigned long alignmask = crypto_shash_alignmask(parent); > > struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent); > > unsigned int bs = crypto_shash_blocksize(parent); > > - __be64 *consts = PTR_ALIGN((void *)ctx->ctx, alignmask + 1); > > + __be64 *consts = PTR_ALIGN((void *)ctx->ctx, > > + (alignmask | (__alignof__(__be64) - 1)) + 1); > > Using a bitwise or looks very odd there. Perhaps: > > min(alignmask + 1, __alignof__(__be64)) > Alignment has to be a power of 2. From the code I've read, crypto drivers work with alignment a lot and use bitwise OR to mean "the more restrictive of these alignmasks". So I believe the way it's written is the preferred style. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
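A quick illustration of the mask arithmetic discussed above: because alignments are powers of two, OR-ing two alignmasks and adding one yields the stricter (larger) of the two alignments, which is exactly what the PTR_ALIGN() call needs. The standalone userspace sketch below demonstrates this; PTR_ALIGN_UP is a local stand-in for the kernel's PTR_ALIGN macro and the mask values are invented for the example.

#include <stdio.h>
#include <stdint.h>

/* Local stand-in for the kernel's PTR_ALIGN(): round p up to an a-byte boundary. */
#define PTR_ALIGN_UP(p, a) \
        ((void *)(((uintptr_t)(p) + ((uintptr_t)(a) - 1)) & ~((uintptr_t)(a) - 1)))

int main(void)
{
        unsigned long cipher_alignmask = 3;        /* example: cipher needs 4-byte alignment */
        unsigned long be64_alignmask = __alignof__(uint64_t) - 1;  /* 7 on most ABIs */
        unsigned long combined = (cipher_alignmask | be64_alignmask) + 1;
        char ctx[64];
        void *consts = PTR_ALIGN_UP(ctx + 1, combined);

        /* ORing the masks picks the stricter (larger) power-of-two alignment. */
        printf("combined alignment: %lu\n", combined);           /* prints 8 */
        printf("consts misalignment mod 8: %lu\n",
               (unsigned long)((uintptr_t)consts & 7));          /* prints 0 */
        return 0;
}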
Re: [PATCH] crypto: cmac - fix alignment of 'consts'
On Mon, Oct 10, 2016 at 10:51:14AM -0700, Joe Perches wrote: > > Hey Eric. > > I don't see any PTR_ALIGN uses in crypto/ or drivers/crypto/ that > use a bitwise or, just mask + 1, but I believe the effect is the > same. Anyway, your choice, but I think using min is clearer. > > cheers, Joe Usually the bitwise OR is used when setting cra_alignmask in the 'struct crypto_alg'. Indeed, the problem could be solved by setting inst->alg.base.cra_alignmask = alg->cra_alignmask | (__alignof__(__be64) - 1); I decided against that because it would always force 8-byte alignment for CMAC input buffers and keys, when in fact they don't need that level of alignment unless the underlying block cipher requires it. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] crypto: skcipher - Get rid of crypto_spawn_skcipher2()
Since commit 3a01d0ee2b99 ("crypto: skcipher - Remove top-level givcipher interface"), crypto_spawn_skcipher2() and crypto_spawn_skcipher() are equivalent. So switch callers of crypto_spawn_skcipher2() to crypto_spawn_skcipher() and remove it. Signed-off-by: Eric Biggers --- crypto/authenc.c | 2 +- crypto/authencesn.c| 2 +- crypto/ccm.c | 2 +- crypto/chacha20poly1305.c | 2 +- crypto/ctr.c | 2 +- crypto/cts.c | 2 +- crypto/gcm.c | 2 +- include/crypto/internal/skcipher.h | 6 -- 8 files changed, 7 insertions(+), 13 deletions(-) diff --git a/crypto/authenc.c b/crypto/authenc.c index 03d5edc..875470b 100644 --- a/crypto/authenc.c +++ b/crypto/authenc.c @@ -324,7 +324,7 @@ static int crypto_authenc_init_tfm(struct crypto_aead *tfm) if (IS_ERR(auth)) return PTR_ERR(auth); - enc = crypto_spawn_skcipher2(&ictx->enc); + enc = crypto_spawn_skcipher(&ictx->enc); err = PTR_ERR(enc); if (IS_ERR(enc)) goto err_free_ahash; diff --git a/crypto/authencesn.c b/crypto/authencesn.c index bad6ef4..6f8f6b8 100644 --- a/crypto/authencesn.c +++ b/crypto/authencesn.c @@ -342,7 +342,7 @@ static int crypto_authenc_esn_init_tfm(struct crypto_aead *tfm) if (IS_ERR(auth)) return PTR_ERR(auth); - enc = crypto_spawn_skcipher2(&ictx->enc); + enc = crypto_spawn_skcipher(&ictx->enc); err = PTR_ERR(enc); if (IS_ERR(enc)) goto err_free_ahash; diff --git a/crypto/ccm.c b/crypto/ccm.c index 67e3636..26b924d 100644 --- a/crypto/ccm.c +++ b/crypto/ccm.c @@ -462,7 +462,7 @@ static int crypto_ccm_init_tfm(struct crypto_aead *tfm) if (IS_ERR(cipher)) return PTR_ERR(cipher); - ctr = crypto_spawn_skcipher2(&ictx->ctr); + ctr = crypto_spawn_skcipher(&ictx->ctr); err = PTR_ERR(ctr); if (IS_ERR(ctr)) goto err_free_cipher; diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c index 66291d4..db1bc31 100644 --- a/crypto/chacha20poly1305.c +++ b/crypto/chacha20poly1305.c @@ -532,7 +532,7 @@ static int chachapoly_init(struct crypto_aead *tfm) if (IS_ERR(poly)) return PTR_ERR(poly); - chacha = crypto_spawn_skcipher2(&ictx->chacha); + chacha = crypto_spawn_skcipher(&ictx->chacha); if (IS_ERR(chacha)) { crypto_free_ahash(poly); return PTR_ERR(chacha); diff --git a/crypto/ctr.c b/crypto/ctr.c index 57114b1..a9a7a44 100644 --- a/crypto/ctr.c +++ b/crypto/ctr.c @@ -312,7 +312,7 @@ static int crypto_rfc3686_init_tfm(struct crypto_skcipher *tfm) unsigned long align; unsigned int reqsize; - cipher = crypto_spawn_skcipher2(spawn); + cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); diff --git a/crypto/cts.c b/crypto/cts.c index 8883b62..00254d7 100644 --- a/crypto/cts.c +++ b/crypto/cts.c @@ -290,7 +290,7 @@ static int crypto_cts_init_tfm(struct crypto_skcipher *tfm) unsigned bsize; unsigned align; - cipher = crypto_spawn_skcipher2(spawn); + cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); diff --git a/crypto/gcm.c b/crypto/gcm.c index 5f11b80..b7ad808 100644 --- a/crypto/gcm.c +++ b/crypto/gcm.c @@ -575,7 +575,7 @@ static int crypto_gcm_init_tfm(struct crypto_aead *tfm) if (IS_ERR(ghash)) return PTR_ERR(ghash); - ctr = crypto_spawn_skcipher2(&ictx->ctr); + ctr = crypto_spawn_skcipher(&ictx->ctr); err = PTR_ERR(ctr); if (IS_ERR(ctr)) goto err_free_hash; diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index 558f5c9..7a7e815 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -91,12 +91,6 @@ static inline struct crypto_skcipher *crypto_spawn_skcipher( return crypto_spawn_tfm2(&spawn->base); } -static 
inline struct crypto_skcipher *crypto_spawn_skcipher2( - struct crypto_skcipher_spawn *spawn) -{ - return crypto_spawn_skcipher(spawn); -} - static inline void crypto_skcipher_set_reqsize( struct crypto_skcipher *skcipher, unsigned int reqsize) { -- 2.8.0.rc3.226.g39d4020 -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] crypto: skcipher - Get rid of crypto_grab_skcipher2()
Since commit 3a01d0ee2b99 ("crypto: skcipher - Remove top-level givcipher interface"), crypto_grab_skcipher2() and crypto_grab_skcipher() are equivalent. So switch callers of crypto_grab_skcipher2() to crypto_grab_skcipher() and remove it. Signed-off-by: Eric Biggers --- crypto/authenc.c | 6 +++--- crypto/authencesn.c| 6 +++--- crypto/ccm.c | 6 +++--- crypto/chacha20poly1305.c | 6 +++--- crypto/ctr.c | 6 +++--- crypto/cts.c | 6 +++--- crypto/gcm.c | 6 +++--- include/crypto/internal/skcipher.h | 6 -- 8 files changed, 21 insertions(+), 27 deletions(-) diff --git a/crypto/authenc.c b/crypto/authenc.c index a7e1ac7..03d5edc 100644 --- a/crypto/authenc.c +++ b/crypto/authenc.c @@ -420,9 +420,9 @@ static int crypto_authenc_create(struct crypto_template *tmpl, goto err_free_inst; crypto_set_skcipher_spawn(&ctx->enc, aead_crypto_instance(inst)); - err = crypto_grab_skcipher2(&ctx->enc, enc_name, 0, - crypto_requires_sync(algt->type, -algt->mask)); + err = crypto_grab_skcipher(&ctx->enc, enc_name, 0, + crypto_requires_sync(algt->type, + algt->mask)); if (err) goto err_drop_auth; diff --git a/crypto/authencesn.c b/crypto/authencesn.c index 121010a..bad6ef4 100644 --- a/crypto/authencesn.c +++ b/crypto/authencesn.c @@ -441,9 +441,9 @@ static int crypto_authenc_esn_create(struct crypto_template *tmpl, goto err_free_inst; crypto_set_skcipher_spawn(&ctx->enc, aead_crypto_instance(inst)); - err = crypto_grab_skcipher2(&ctx->enc, enc_name, 0, - crypto_requires_sync(algt->type, -algt->mask)); + err = crypto_grab_skcipher(&ctx->enc, enc_name, 0, + crypto_requires_sync(algt->type, + algt->mask)); if (err) goto err_drop_auth; diff --git a/crypto/ccm.c b/crypto/ccm.c index 006d857..67e3636 100644 --- a/crypto/ccm.c +++ b/crypto/ccm.c @@ -544,9 +544,9 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl, goto err_free_inst; crypto_set_skcipher_spawn(&ictx->ctr, aead_crypto_instance(inst)); - err = crypto_grab_skcipher2(&ictx->ctr, ctr_name, 0, - crypto_requires_sync(algt->type, -algt->mask)); + err = crypto_grab_skcipher(&ictx->ctr, ctr_name, 0, + crypto_requires_sync(algt->type, + algt->mask)); if (err) goto err_drop_cipher; diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c index e899ef5..66291d4 100644 --- a/crypto/chacha20poly1305.c +++ b/crypto/chacha20poly1305.c @@ -625,9 +625,9 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb, goto err_free_inst; crypto_set_skcipher_spawn(&ctx->chacha, aead_crypto_instance(inst)); - err = crypto_grab_skcipher2(&ctx->chacha, chacha_name, 0, - crypto_requires_sync(algt->type, -algt->mask)); + err = crypto_grab_skcipher(&ctx->chacha, chacha_name, 0, + crypto_requires_sync(algt->type, + algt->mask)); if (err) goto err_drop_poly; diff --git a/crypto/ctr.c b/crypto/ctr.c index ff4d21e..57114b1 100644 --- a/crypto/ctr.c +++ b/crypto/ctr.c @@ -370,9 +370,9 @@ static int crypto_rfc3686_create(struct crypto_template *tmpl, spawn = skcipher_instance_ctx(inst); crypto_set_skcipher_spawn(spawn, skcipher_crypto_instance(inst)); - err = crypto_grab_skcipher2(spawn, cipher_name, 0, - crypto_requires_sync(algt->type, -algt->mask)); + err = crypto_grab_skcipher(spawn, cipher_name, 0, + crypto_requires_sync(algt->type, + algt->mask)); if (err) goto err_free_inst; diff --git a/crypto/cts.c b/crypto/cts.c index 5197618..8883b62 100644 --- a/crypto/cts.c +++ b/crypto/cts.c @@ -348,9 +348,9 @@ static int crypto_cts_create(struct crypto_template *tmpl,
Alignment of shash input buffers?
Hi, I'm having trouble understanding how alignment of shash input buffers is supposed to work. If you pass crypto_shash_update() a buffer that is not aligned to the shash algorithm's alignmask, it will call the underlying ->update() function twice, once with a temporary aligned buffer and once with the aligned remainder of the original input buffer. So far so good. The problem is that some (many?) hash algorithms actually deal with fixed size blocks, and the lengths of buffers which users might pass to crypto_shash_update() are not necessarily multiples of the block size. This can cause the input buffer to fall out of alignment. Let's use cmac(aes) as an example. It will copy up to 16 bytes to a temporary buffer, process (xor into the state and encrypt) that block if full, then directly process 16-byte blocks from the input buffer, then save any remainder. The middle step uses crypto_xor() directly on the input buffer, which assumes 4-byte alignment. However this alignment is not guaranteed. For example, if I pass two aligned buffers, one of length 15 and one of length 17 to crypto_shash_update(), crypto_xor() is called on a misaligned pointer. ghash is another example of an algorithm that has this problem, although that one seems even more broken since it doesn't even set an alignmask at all (?). I am wondering, is this intentional? If it's broken, do shash algorithms need to be fixed to assume no alignment, e.g. by using the get_unaligned_*() macros or by memcpy()-ing each block into a temporary buffer? Is there a better solution? Thanks, Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
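To make the failure mode above concrete, here is a standalone userspace simulation of the buffering pattern described (16-byte blocks, partial blocks buffered in the context, full blocks processed directly from the caller's pointer). fake_update() and process_block() are hypothetical stand-ins rather than the real cmac code; with 4-byte-aligned inputs of 15 and then 17 bytes, the directly processed block starts at offset 1 of the second buffer.

#include <stdio.h>
#include <stdint.h>

#define BLOCK_SIZE 16

static unsigned int buffered;   /* bytes sitting in the context's partial-block buffer */

static void process_block(const uint8_t *src)
{
        printf("processing block at pointer with addr %% 4 == %lu\n",
               (unsigned long)((uintptr_t)src & 3));
}

static void fake_update(const uint8_t *src, unsigned int len)
{
        /* Top up a previously buffered partial block first. */
        if (buffered) {
                unsigned int fill = BLOCK_SIZE - buffered;

                if (fill > len)
                        fill = len;
                /* (real code would memcpy src into the context buffer here) */
                buffered += fill;
                src += fill;
                len -= fill;
                if (buffered == BLOCK_SIZE)
                        buffered = 0;   /* (real code would process the completed buffered block) */
        }
        /* Then process whole blocks directly from the caller's pointer. */
        while (len >= BLOCK_SIZE) {
                process_block(src);
                src += BLOCK_SIZE;
                len -= BLOCK_SIZE;
        }
        buffered += len;        /* stash the remainder */
}

int main(void)
{
        static uint8_t a[15] __attribute__((aligned(4)));
        static uint8_t b[17] __attribute__((aligned(4)));

        fake_update(a, sizeof(a));      /* buffers 15 bytes */
        fake_update(b, sizeof(b));      /* b[0] completes the block; the next block
                                         * starts at b + 1, which is not 4-byte aligned */
        return 0;
}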
Re: [PATCH 1/16] crypto: skcipher - Add skcipher walk interface
Hi Herbert, just a few preliminary comments. I haven't made it through everything yet. On Wed, Nov 02, 2016 at 07:19:02AM +0800, Herbert Xu wrote: > +static int skcipher_walk_first(struct skcipher_walk *walk) > +{ > + if (WARN_ON_ONCE(in_irq())) > + return -EDEADLK; > + > + walk->nbytes = walk->total; > + if (unlikely(!walk->total)) > + return 0; > + > + walk->buffer = NULL; > + if (unlikely(((unsigned long)walk->iv & walk->alignmask))) { > + int err = skcipher_copy_iv(walk); > + if (err) > + return err; > + } > + > + walk->page = NULL; > + > + return skcipher_walk_next(walk); > +} I think the case where skcipher_copy_iv() fails may be handled incorrectly. Wouldn't it need to set walk.nbytes to 0 so as to not confuse callers which expect that behavior? Or maybe it should be calling skcipher_walk_done(). > +static int skcipher_walk_skcipher(struct skcipher_walk *walk, > + struct skcipher_request *req) > +{ > + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); > + > + scatterwalk_start(&walk->in, req->src); > + scatterwalk_start(&walk->out, req->dst); > + > + walk->in.sg = req->src; > + walk->out.sg = req->dst; Setting walk->in.sg and walk->out.sg is redundant with the scatterwalk_start() calls. > +int skcipher_walk_virt(struct skcipher_walk *walk, > +struct skcipher_request *req, bool atomic) > +{ > + int err; > + > + walk->flags &= ~SKCIPHER_WALK_PHYS; > + > + err = skcipher_walk_skcipher(walk, req); > + > + walk->flags &= atomic ? ~SKCIPHER_WALK_SLEEP : ~0; > + > + return err; > +} This gets called with uninitialized 'walk.flags'. This was somewhat of a theoretical problem with the old blkcipher_walk code but it looks like now it will interact badly with the new SKCIPHER_WALK_SLEEP flag. As far as I can see, whether the flag will end up set or not can depend on the uninitialized value. It would be nice if this problem could be avoided entirely by setting flags=0. I'm also wondering about the choice to not look at 'atomic' until after the call to skcipher_walk_skcipher(). Wouldn't this mean that the choice of 'atomic' would not be respected in e.g. the kmalloc() in skcipher_copy_iv()? > +int skcipher_walk_async(struct skcipher_walk *walk, > + struct skcipher_request *req) > +{ > + walk->flags |= SKCIPHER_WALK_PHYS; > + > + INIT_LIST_HEAD(&walk->buffers); > + > + return skcipher_walk_skcipher(walk, req); > +} > +EXPORT_SYMBOL_GPL(skcipher_walk_async); I don't see any users of the "async" walking being introduced; are some planned? Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
vmalloced stacks and scatterwalk_map_and_copy()
Hello, I hit the BUG_ON() in arch/x86/mm/physaddr.c:26 while testing some crypto code in an x86_64 kernel with CONFIG_DEBUG_VIRTUAL=y and CONFIG_VMAP_STACK=y: /* carry flag will be set if starting x was >= PAGE_OFFSET */ VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x)); The problem is the following code in scatterwalk_map_and_copy() in crypto/scatterwalk.c, which tries to determine if the buffer passed in aliases the physical memory of the first segment of the scatterlist: if (sg_page(sg) == virt_to_page(buf) && sg->offset == offset_in_page(buf)) return; If 'buf' is on the stack with CONFIG_VMAP_STACK=y it will be a vmalloc address. And virt_to_page(buf) does not have a meaningful behavior on vmalloc addresses. Hence the BUG. I don't think this should be fixed by forbidding passing a stack address to scatterwalk_map_and_copy(). Not only are there a number of callers that explicitly use stack addresses (e.g. poly_verify_tag() in crypto/chacha20poly1305.c) but it's also possible for a whole skcipher_request to be allocated on the stack with the SKCIPHER_REQUEST_ON_STACK() macro. Note that this has nothing to do with DMA per se. Another solution would be to make scatterwalk_map_and_copy() explicitly check for virt_addr_valid(). But this would make the behavior of scatterwalk_map_and_copy() confusing because it would detect aliasing but only under some circumstances, and it would depend on the kernel config. Currently I think the best solution would be to require that callers to scatterwalk_map_and_copy() do not alias their source and destination. Then the alias check could be removed. This check has only been there since v4.2 (commit 74412fd5d71b6), so I'd hope not many callers rely on the behavior. I'm not sure exactly which ones do, though. Thoughts on this? Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
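For reference, the virt_addr_valid() alternative mentioned above would look roughly like the sketch below inside scatterwalk_map_and_copy(); as noted, it was not chosen because the aliasing check would then silently apply only to lowmem buffers and its behavior would depend on the kernel config.

        /*
         * Sketch of the rejected alternative: only attempt the aliasing check
         * when 'buf' is a lowmem address that virt_to_page() can handle;
         * vmalloc/stack addresses skip the check entirely.
         */
        if (virt_addr_valid(buf) &&
            sg_page(sg) == virt_to_page(buf) &&
            sg->offset == offset_in_page(buf))
                return;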
Re: vmalloced stacks and scatterwalk_map_and_copy()
On Thu, Nov 03, 2016 at 01:30:49PM -0700, Andy Lutomirski wrote: > > Also, Herbert, it seems like the considerable majority of the crypto > code is acting on kernel virtual memory addresses and does software > processing. Would it perhaps make sense to add a kvec-based or > iov_iter-based interface to the crypto code? I bet it would be quite > a bit faster and it would make crypto on stack buffers work directly. I'd like to hear Herbert's opinion on this too, but as I understand it, if a symmetric cipher API operating on virtual addresses was added, similar to the existing "shash" API it would only allow software processing. Whereas with the current API you can request a transform and use it the same way regardless of whether the crypto framework has chosen a software or hardware implementation, or a combination thereof. If this wasn't a concern then I expect using virtual addresses would indeed simplify things a lot, at least for users not already working with physical memory (struct page). Either way, in the near term it looks like 4.9 will be released with the new behavior that encryption/decryption is not supported on stack buffers. Separately from the scatterwalk_map_and_copy() issue, today I've found two places in the filesystem-level encryption code that do encryption on stack buffers and therefore hit the 'BUG_ON(!virt_addr_valid(buf));' in sg_set_buf(). I will be sending patches to fix these, but I suspect there may be more crypto API users elsewhere that have this same problem. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
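For context, the helper being hit looks roughly like the following upstream (simplified from include/linux/scatterlist.h; compare the grsecurity variant quoted in a later message). With CONFIG_DEBUG_SG it BUGs on any buffer outside the direct mapping, which now includes stack buffers when CONFIG_VMAP_STACK=y.

static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
                              unsigned int buflen)
{
#ifdef CONFIG_DEBUG_SG
        BUG_ON(!virt_addr_valid(buf));  /* fires for vmalloc/stack addresses */
#endif
        sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
}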
[PATCH 1/2] fscrypto: don't use on-stack buffer for filename encryption
With the new (in 4.9) option to use a virtually-mapped stack (CONFIG_VMAP_STACK), stack buffers cannot be used as input/output for the scatterlist crypto API because they may not be directly mappable to struct page. For short filenames, fname_encrypt() was encrypting a stack buffer holding the padded filename. Fix it by encrypting the filename in-place in the output buffer, thereby making the temporary buffer unnecessary. This bug could most easily be observed in a CONFIG_DEBUG_SG kernel because this allowed the BUG in sg_set_buf() to be triggered. Signed-off-by: Eric Biggers --- fs/crypto/fname.c | 53 + 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9a28133..9b774f4 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -39,65 +39,54 @@ static void fname_crypt_complete(struct crypto_async_request *req, int res) static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) { - u32 ciphertext_len; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - struct scatterlist src_sg, dst_sg; + struct scatterlist sg; int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim; + unsigned int lim; + unsigned int cryptlen; lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = max(iname->len, (u32)FS_CRYPTO_BLOCK_SIZE); - ciphertext_len = round_up(ciphertext_len, padding); - ciphertext_len = min(ciphertext_len, lim); + /* +* Copy the filename to the output buffer for encrypting in-place and +* pad it with the needed number of NUL bytes. 
+*/ + cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); + cryptlen = round_up(cryptlen, padding); + cryptlen = min(cryptlen, lim); + memcpy(oname->name, iname->name, iname->len); + memset(oname->name + iname->len, 0, cryptlen - iname->len); - if (ciphertext_len <= sizeof(buf)) { - workbuf = buf; - } else { - alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); - if (!alloc_buf) - return -ENOMEM; - workbuf = alloc_buf; - } + /* Initialize the IV */ + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - /* Allocate request */ + /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", __func__); - kfree(alloc_buf); + "%s: skcipher_request_alloc() failed\n", __func__); return -ENOMEM; } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, fname_crypt_complete, &ecr); + sg_init_one(&sg, oname->name, cryptlen); + skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - if (iname->len < ciphertext_len) - memset(workbuf + iname->len, 0, ciphertext_len - iname->len); - - /* Initialize IV */ - memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - - /* Create encryption request */ - sg_init_one(&src_sg, workbuf, ciphertext_len); - sg_init_one(&dst_sg, oname->name, ciphertext_len); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + /* Do the encryption */ res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { + /* Request is being completed asynchronously; wait for it */ wait_for_completion(&ecr.completion); res = ecr.res; } - kfree(alloc_buf); skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR @@ -105,7 +94,7 @@ static int fname_encrypt(struct inode *inode, return res; } - oname->len = ciphertext_len; + oname->len = cryptlen; return 0; } -- 2.8.0.rc3.226.g39d4020 -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] fscrypto: don't use on-stack buffer for key derivation
With the new (in 4.9) option to use a virtually-mapped stack (CONFIG_VMAP_STACK), stack buffers cannot be used as input/output for the scatterlist crypto API because they may not be directly mappable to struct page. get_crypt_info() was using a stack buffer to hold the output from the encryption operation used to derive the per-file key. Fix it by using a heap buffer. This bug could most easily be observed in a CONFIG_DEBUG_SG kernel because this allowed the BUG in sg_set_buf() to be triggered. Signed-off-by: Eric Biggers --- fs/crypto/keyinfo.c | 16 +--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 82f0285..67fb6d8 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -185,7 +185,7 @@ int get_crypt_info(struct inode *inode) struct crypto_skcipher *ctfm; const char *cipher_str; int keysize; - u8 raw_key[FS_MAX_KEY_SIZE]; + u8 *raw_key = NULL; int res; res = fscrypt_initialize(); @@ -238,6 +238,15 @@ int get_crypt_info(struct inode *inode) if (res) goto out; + /* +* This cannot be a stack buffer because it is passed to the scatterlist +* crypto API as part of key derivation. +*/ + res = -ENOMEM; + raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS); + if (!raw_key) + goto out; + if (fscrypt_dummy_context_enabled(inode)) { memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; @@ -276,7 +285,8 @@ int get_crypt_info(struct inode *inode) if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); + raw_key = NULL; if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { put_crypt_info(crypt_info); goto retry; @@ -287,7 +297,7 @@ int get_crypt_info(struct inode *inode) if (res == -ENOKEY) res = 0; put_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); return res; } -- 2.8.0.rc3.226.g39d4020 -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: vmalloced stacks and scatterwalk_map_and_copy()
On Thu, Nov 03, 2016 at 02:12:07PM -0700, Eric Biggers wrote: > On Thu, Nov 03, 2016 at 01:30:49PM -0700, Andy Lutomirski wrote: > > > > Also, Herbert, it seems like the considerable majority of the crypto > > code is acting on kernel virtual memory addresses and does software > > processing. Would it perhaps make sense to add a kvec-based or > > iov_iter-based interface to the crypto code? I bet it would be quite > > a bit faster and it would make crypto on stack buffers work directly. > > I'd like to hear Herbert's opinion on this too, but as I understand it, if a > symmetric cipher API operating on virtual addresses was added, similar to the > existing "shash" API it would only allow software processing. Whereas with > the > current API you can request a transform and use it the same way regardless of > whether the crypto framework has chosen a software or hardware implementation, > or a combination thereof. If this wasn't a concern then I expect using > virtual > addresses would indeed simplify things a lot, at least for users not already > working with physical memory (struct page). > > Either way, in the near term it looks like 4.9 will be released with the new > behavior that encryption/decryption is not supported on stack buffers. > Separately from the scatterwalk_map_and_copy() issue, today I've found two > places in the filesystem-level encryption code that do encryption on stack > buffers and therefore hit the 'BUG_ON(!virt_addr_valid(buf));' in > sg_set_buf(). > I will be sending patches to fix these, but I suspect there may be more crypto > API users elsewhere that have this same problem. > > Eric [Added linux-mm to Cc] For what it's worth, grsecurity has a special case to allow a scatterlist entry to be created from a stack buffer: static inline void sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen) { const void *realbuf = buf; #ifdef CONFIG_GRKERNSEC_KSTACKOVERFLOW if (object_starts_on_stack(buf)) realbuf = buf - current->stack + current->lowmem_stack; #endif #ifdef CONFIG_DEBUG_SG BUG_ON(!virt_addr_valid(realbuf)); #endif sg_set_page(sg, virt_to_page(realbuf), buflen, offset_in_page(realbuf)); } It seems to maintain two virtual mappings for each stack, a physically contiguous one (task_struct.lowmem_stack) and a physically non-contiguous one (task_struct.stack). This seems different from upstream CONFIG_VMAP_STACK which just maintains a physically non-contiguous one. I don't know about all the relative merits of the two approaches. But one of the things that will need to be done with the currently upstream approach is that all callers of sg_set_buf() will need to be checked to make sure they aren't using stack addresses, and any that are will need to be updated to do otherwise, e.g. by using heap-allocated memory. I suppose this is already happening, but in the case of the crypto API it will probably take a while for all the users to be identified and updated. (And it's not always clear from the local context whether something can be stack memory or not, e.g. the memory for crypto request objects may be either.) Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: vmalloced stacks and scatterwalk_map_and_copy()
On Thu, Nov 03, 2016 at 08:57:49PM -0700, Andy Lutomirski wrote: > > The crypto request objects can live on the stack just fine. It's the > request buffers that need to live elsewhere (or the alternative > interfaces can be used, or the crypto core code can start using > something other than scatterlists). > There are cases where a crypto operation is done on a buffer embedded in a request object. The example I'm aware of is in the GCM implementation (crypto/gcm.c). Basically it needs to encrypt 16 zero bytes prepended with the actual data, so it fills a buffer in the request object (crypto_gcm_req_priv_ctx.auth_tag) with zeroes and builds a new scatterlist which covers both this buffer and the original data scatterlist. Granted, GCM provides the aead interface not the skcipher interface, and currently there is no AEAD_REQUEST_ON_STACK() macro like there is a SKCIPHER_REQUEST_ON_STACK() macro. So maybe no one is creating aead requests on the stack right now. But it's something to watch out for. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
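To illustrate the distinction being drawn here, below is a minimal sketch of the pattern that remains valid with vmalloced stacks: the request object itself on the stack via SKCIPHER_REQUEST_ON_STACK(), the data buffer on the heap so the scatterlist walk can map it. The function name is hypothetical, a synchronous transform is assumed, and handling of the async (-EINPROGRESS/-EBUSY) completion path is omitted.

/*
 * Minimal sketch: request on the stack is fine, the data buffer is not.
 */
static int example_encrypt_in_place(struct crypto_skcipher *tfm, u8 *iv,
                                    unsigned int len)
{
        SKCIPHER_REQUEST_ON_STACK(req, tfm);    /* request object on the stack: OK */
        struct scatterlist sg;
        u8 *buf;
        int err;

        buf = kmalloc(len, GFP_KERNEL);         /* data buffer must not be on the stack */
        if (!buf)
                return -ENOMEM;

        /* ... fill buf with the data to be encrypted in place ... */

        sg_init_one(&sg, buf, len);
        skcipher_request_set_tfm(req, tfm);
        skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
        skcipher_request_set_crypt(req, &sg, &sg, len, iv);

        err = crypto_skcipher_encrypt(req);

        skcipher_request_zero(req);
        kfree(buf);
        return err;
}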
Re: [PATCH] poly1305: generic C can be faster on chips with slow unaligned access
On Thu, Nov 03, 2016 at 11:20:08PM +0100, Jason A. Donenfeld wrote: > Hi David, > > On Thu, Nov 3, 2016 at 6:08 PM, David Miller wrote: > > In any event no piece of code should be doing 32-bit word reads from > > addresses like "x + 3" without, at a very minimum, going through the > > kernel unaligned access handlers. > > Excellent point. In otherwords, > > ctx->r[0] = (le32_to_cpuvp(key + 0) >> 0) & 0x3ff; > ctx->r[1] = (le32_to_cpuvp(key + 3) >> 2) & 0x303; > ctx->r[2] = (le32_to_cpuvp(key + 6) >> 4) & 0x3ffc0ff; > ctx->r[3] = (le32_to_cpuvp(key + 9) >> 6) & 0x3f03fff; > ctx->r[4] = (le32_to_cpuvp(key + 12) >> 8) & 0x00f; > > should change to: > > ctx->r[0] = (le32_to_cpuvp(key + 0) >> 0) & 0x3ff; > ctx->r[1] = (get_unaligned_le32(key + 3) >> 2) & 0x303; > ctx->r[2] = (get_unaligned_le32(key + 6) >> 4) & 0x3ffc0ff; > ctx->r[3] = (get_unaligned_le32(key + 9) >> 6) & 0x3f03fff; > ctx->r[4] = (le32_to_cpuvp(key + 12) >> 8) & 0x00f; > I agree, and the current code is wrong; but do note that this proposal is correct for poly1305_setrkey() but not for poly1305_setskey() and poly1305_blocks(). In the latter two cases, 4-byte alignment of the source buffer is *not* guaranteed. Although crypto_poly1305_update() will be called with a 4-byte aligned buffer due to the alignmask set on poly1305_alg, the algorithm operates on 16-byte blocks and therefore has to buffer partial blocks. If some number of bytes that is not 0 mod 4 is buffered, then the buffer will fall out of alignment on the next update call. Hence, get_unaligned_le32() is actually needed on all the loads, since the buffer will, in general, be of unknown alignment. Note: some other shash algorithms have this problem too and do not handle it correctly. It seems to be a common mistake. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
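For readers unfamiliar with the helper being proposed: get_unaligned_le32() performs a 32-bit little-endian load that is valid at any byte offset. A portable userspace equivalent, shown below under a hypothetical name, is simply a memcpy() into a local variable plus a byte swap on big-endian hosts; the key + 3 access mirrors the pattern from the patch.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint32_t load_le32_unaligned(const uint8_t *p)
{
        uint32_t v;

        memcpy(&v, p, sizeof(v));       /* no alignment requirement */
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        v = __builtin_bswap32(v);       /* interpret the bytes as little-endian */
#endif
        return v;
}

int main(void)
{
        uint8_t key[20] = { 1, 2, 3, 4, 5, 6, 7, 8 };

        /* Mirrors the "key + 3" access pattern from the patch above. */
        printf("0x%08x\n", load_le32_unaligned(key + 3));       /* prints 0x07060504 */
        return 0;
}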
Re: [PATCH] poly1305: generic C can be faster on chips with slow unaligned access
On Mon, Nov 07, 2016 at 07:08:22PM +0100, Jason A. Donenfeld wrote: > Hmm... The general data flow that strikes me as most pertinent is > something like: > > struct sk_buff *skb = get_it_from_somewhere(); > skb = skb_share_check(skb, GFP_ATOMIC); > num_frags = skb_cow_data(skb, ..., ...); > struct scatterlist sg[num_frags]; > sg_init_table(sg, num_frags); > skb_to_sgvec(skb, sg, ..., ...); > blkcipher_walk_init(&walk, sg, sg, len); > blkcipher_walk_virt_block(&desc, &walk, BLOCK_SIZE); > while (walk.nbytes >= BLOCK_SIZE) { > size_t chunk_len = rounddown(walk.nbytes, BLOCK_SIZE); > poly1305_update(&poly1305_state, walk.src.virt.addr, chunk_len); > blkcipher_walk_done(&desc, &walk, walk.nbytes % BLOCK_SIZE); > } > if (walk.nbytes) { > poly1305_update(&poly1305_state, walk.src.virt.addr, walk.nbytes); > blkcipher_walk_done(&desc, &walk, 0); > } > > Is your suggestion that that in the final if block, walk.src.virt.addr > might be unaligned? Like in the case of the last fragment being 67 > bytes long? I was not referring to any users in particular, only what users could do. As an example, if you did crypto_shash_update() with 32, 15, then 17 bytes, and the underlying algorithm is poly1305-generic, the last block would end up misaligned. This doesn't appear possible with your pseudocode because it only passes in multiples of the block size until the very end. However I don't see it claimed anywhere that shash API users have to do that. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] poly1305: generic C can be faster on chips with slow unaligned access
On Mon, Nov 07, 2016 at 08:02:35PM +0100, Jason A. Donenfeld wrote: > On Mon, Nov 7, 2016 at 7:26 PM, Eric Biggers wrote: > > > > I was not referring to any users in particular, only what users could do. > > As an > > example, if you did crypto_shash_update() with 32, 15, then 17 bytes, and > > the > > underlying algorithm is poly1305-generic, the last block would end up > > misaligned. This doesn't appear possible with your pseudocode because it > > only > > passes in multiples of the block size until the very end. However I don't > > see > > it claimed anywhere that shash API users have to do that. > > Actually it appears that crypto/poly1305_generic.c already buffers > incoming blocks to a buffer that definitely looks aligned, to prevent > this condition! > No it does *not* buffer all incoming blocks, which is why the source pointer can fall out of alignment. Yes, I actually tested this. In fact this situation is even hit, in both possible places, in the self-tests. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] poly1305: generic C can be faster on chips with slow unaligned access
On Mon, Nov 07, 2016 at 08:47:09PM +0100, Jason A. Donenfeld wrote: > By using the unaligned access helpers, we drastically improve > performance on small MIPS routers that have to go through the exception > fix-up handler for these unaligned accesses. > > Signed-off-by: Jason A. Donenfeld > --- Reviewed-by: Eric Biggers Nit: the subject line is a little unclear about what was changed. "make generic C faster on chips with slow unaligned access" would be better. -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] poly1305: generic C can be faster on chips with slow unaligned access
On Tue, Nov 08, 2016 at 08:52:39AM +0100, Martin Willi wrote: > > > Not sure what the exact alignment rules for key/iv are, but maybe we > want to replace the same function in chacha20_generic.c as well? > > Martin chacha20-generic provides a blkcipher API and sets an alignmask of sizeof(u32) - 1. This applies to the key buffer for ->setkey() and to the mapped addresses for the input/output buffers and IV during the blkcipher walk. So it does not appear to have the problems that poly1305 has. I do however see one small problem which is that 'u8 stream[CHACHA20_BLOCK_SIZE];' is, strictly speaking, not guaranteed to be aligned to u32. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
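If that last point turns out to matter, one minimal way to get the guarantee would be an explicit alignment annotation on the local keystream buffer, as sketched below (untested; declaring it as a u32 array and casting to u8 * where a byte pointer is needed would work equally well).

        /* Untested sketch: force u32 alignment of the on-stack keystream buffer. */
        u8 stream[CHACHA20_BLOCK_SIZE] __aligned(sizeof(u32));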
Re: [v2 PATCH 1/16] crypto: skcipher - Add skcipher walk interface
Hi Herbert, On Sun, Nov 13, 2016 at 07:45:32PM +0800, Herbert Xu wrote: > +int skcipher_walk_done(struct skcipher_walk *walk, int err) > +{ > + unsigned int nbytes = 0; > + unsigned int n = 0; > + > + if (likely(err >= 0)) { > + n = walk->nbytes - err; > + nbytes = walk->total - n; > + } > + > + if (likely(!(walk->flags & (SKCIPHER_WALK_PHYS | > + SKCIPHER_WALK_SLOW | > + SKCIPHER_WALK_COPY | > + SKCIPHER_WALK_DIFF { > +unmap_src: > + skcipher_unmap_src(walk); Isn't it wrong to unmap the buffers when err < 0? They won't have been mapped yet, e.g. if the 'skcipher_walk_done(walk, -EINVAL)' case is hit. > + /* Short-circuit for the common/fast path. */ > + if (!(((unsigned long)walk->iv ^ (unsigned long)walk->oiv) | > + ((unsigned long)walk->buffer ^ (unsigned long)walk->page) | > + (unsigned long)walk->page)) > + goto out; This is really saying that the IV wasn't copied and that walk->buffer and walk->page are both NULL. But if walk->buffer is NULL then the IV wasn't copied. So this can be simplified to: if (!((unsigned long)walk->buffer | (unsigned long)walk->page)) goto out; > +int skcipher_walk_next(struct skcipher_walk *walk) > +{ Shouldn't this be static? There's even a static declaration earlier in the file. > +static int skcipher_next_slow(struct skcipher_walk *walk) > +{ > + bool phys = walk->flags & SKCIPHER_WALK_PHYS; > + unsigned alignmask = walk->alignmask; > + unsigned bsize = walk->chunksize; ... > + walk->nbytes = bsize; skcipher_next_slow() is always setting up 'chunksize' bytes to be processed. Isn't this wrong in the 'blocksize < chunksize' case because fewer than 'chunksize' bytes may need to be processed? > +static inline void *skcipher_map(struct scatter_walk *walk) > +{ > + struct page *page = scatterwalk_page(walk); > + > + return (PageHighMem(page) ? kmap_atomic(page) : page_address(page)) + > +offset_in_page(walk->offset); > +} Is the PageHighMem() optimization really worthwhile? It will skip kmap_atomic() and hence preempt_disable() for non-highmem pages, so it may make it harder to find bugs where users are sleeping when they shouldn't be. It also seems inconsistent that skcipher_map() will do this optimization but scatterwalk_map() won't. > + if (!walk->page) { > + gfp_t gfp = skcipher_walk_gfp(walk); > + > + walk->page = (void *)__get_free_page(gfp); > + if (!walk->page) > + goto slow_path; > + } > + > + walk->nbytes = min_t(unsigned, n, > + PAGE_SIZE - offset_in_page(walk->page)); walk->page will be page-aligned, so there's no need for offset_in_page(). Also, 'n' has already been clamped to at most one page. So AFAICS this can simply be 'walk->nbytes = n;'. > +int skcipher_walk_done(struct skcipher_walk *walk, int err); ... > +void skcipher_walk_complete(struct skcipher_walk *walk, int err); The naming is confusing: "done" vs. "complete". They can easily be mixed up. Would it make more sense to have something with "async" in the name for the second one, like skcipher_walk_async_done()? Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [v2 PATCH 6/16] crypto: cryptd - Add support for skcipher
On Sun, Nov 13, 2016 at 07:45:37PM +0800, Herbert Xu wrote: > +static void cryptd_skcipher_encrypt(struct crypto_async_request *base, > + int err) > +{ > + struct skcipher_request *req = skcipher_request_cast(base); > + struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); > + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); > + struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); > + struct crypto_skcipher *child = ctx->child; > + SKCIPHER_REQUEST_ON_STACK(subreq, child); > + > + if (unlikely(err == -EINPROGRESS)) > + goto out; > + > + skcipher_request_set_tfm(subreq, child); > + skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, > + NULL, NULL); > + skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, > +req->iv); > + > + err = crypto_skcipher_encrypt(subreq); > + skcipher_request_zero(subreq); > + > + req->base.complete = rctx->complete; > + > +out: > + cryptd_skcipher_complete(req, err); > +} > + > +static void cryptd_skcipher_decrypt(struct crypto_async_request *base, > + int err) > +{ > + struct skcipher_request *req = skcipher_request_cast(base); > + struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); > + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); > + struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); > + struct crypto_skcipher *child = ctx->child; > + SKCIPHER_REQUEST_ON_STACK(subreq, child); > + > + if (unlikely(err == -EINPROGRESS)) > + goto out; > + > + skcipher_request_set_tfm(subreq, child); > + skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, > + NULL, NULL); > + skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, > +req->iv); > + > + err = crypto_skcipher_decrypt(subreq); > + skcipher_request_zero(subreq); > + > + req->base.complete = rctx->complete; > + > +out: > + cryptd_skcipher_complete(req, err); > +} cryptd_skcipher_encrypt() and cryptd_skcipher_decrypt() are identical except for whether crypto_skcipher_encrypt() or crypto_skcipher_decrypt() is used. I suggest having them wrap a function cryptd_skcipher_crypt() that takes in a function pointer to crypto_skcipher_encrypt or crypto_skcipher_decrypt. Alternatively a bool would be okay too. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
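Concretely, the suggested refactor could look something like the untested sketch below, assembled from the two functions quoted above: one shared helper taking the encrypt/decrypt operation as a function pointer, with thin wrappers keeping the existing entry points.

static void cryptd_skcipher_crypt(struct skcipher_request *req, int err,
                                  int (*crypt)(struct skcipher_request *req))
{
        struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req);
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
        struct crypto_skcipher *child = ctx->child;
        SKCIPHER_REQUEST_ON_STACK(subreq, child);

        if (unlikely(err == -EINPROGRESS))
                goto out;

        skcipher_request_set_tfm(subreq, child);
        skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP,
                                      NULL, NULL);
        skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
                                   req->iv);

        err = crypt(subreq);            /* crypto_skcipher_encrypt or _decrypt */
        skcipher_request_zero(subreq);

        req->base.complete = rctx->complete;

out:
        cryptd_skcipher_complete(req, err);
}

static void cryptd_skcipher_encrypt(struct crypto_async_request *base, int err)
{
        cryptd_skcipher_crypt(skcipher_request_cast(base), err,
                              crypto_skcipher_encrypt);
}

static void cryptd_skcipher_decrypt(struct crypto_async_request *base, int err)
{
        cryptd_skcipher_crypt(skcipher_request_cast(base), err,
                              crypto_skcipher_decrypt);
}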
Re: [v2 PATCH 4/16] crypto: xts - Convert to skcipher
On Sun, Nov 13, 2016 at 07:45:35PM +0800, Herbert Xu wrote: > +static int do_encrypt(struct skcipher_request *req, int err) > +{ > + struct rctx *rctx = skcipher_request_ctx(req); > + struct skcipher_request *subreq; > + > + subreq = &rctx->subreq; > + > + while (!err && rctx->left) { > + err = pre_crypt(req) ?: > + crypto_skcipher_encrypt(subreq) ?: > + post_crypt(req); > + > + if (err == -EINPROGRESS || > + (err == -EBUSY && > + req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) > + return err; > + } > + > + exit_crypt(req); > + return err; > +} > + > +static void encrypt_done(struct crypto_async_request *areq, int err) > +{ > + struct skcipher_request *req = areq->data; > + struct skcipher_request *subreq; > + struct rctx *rctx; > + > + rctx = skcipher_request_ctx(req); > + subreq = &rctx->subreq; > + subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG; > + > + err = do_encrypt(req, err ?: post_crypt(req)); > + if (rctx->left) > + return; > + > + skcipher_request_complete(req, err); > +} > + > +static int encrypt(struct skcipher_request *req) > +{ > + return do_encrypt(req, init_crypt(req, encrypt_done)); > +} > + > +static int do_decrypt(struct skcipher_request *req, int err) > +{ > + struct rctx *rctx = skcipher_request_ctx(req); > + struct skcipher_request *subreq; > + > + subreq = &rctx->subreq; > + > + while (!err && rctx->left) { > + err = pre_crypt(req) ?: > + crypto_skcipher_decrypt(subreq) ?: > + post_crypt(req); > + > + if (err == -EINPROGRESS || > + (err == -EBUSY && > + req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) > + return err; > + } > + > + exit_crypt(req); > + return err; > +} > + > +static void decrypt_done(struct crypto_async_request *areq, int err) > +{ > + struct skcipher_request *req = areq->data; > + struct skcipher_request *subreq; > + struct rctx *rctx; > + > + rctx = skcipher_request_ctx(req); > + subreq = &rctx->subreq; > + subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG; > + > + err = do_decrypt(req, err ?: post_crypt(req)); > + if (rctx->left) > + return; > + > + skcipher_request_complete(req, err); > +} > + > +static int decrypt(struct skcipher_request *req) > +{ > + return do_decrypt(req, init_crypt(req, decrypt_done)); > } There's duplicated code for encryption and decryption here. AFAICS, the only difference between XTS encryption and decryption is whether the block cipher is used in encryption or decryption mode for the ECB step. So I suggest storing a function pointer in 'struct rctx' to either crypto_skcipher_encrypt or crypto_skcipher_decrypt, then calling through it for the ECB step. Then this code can be shared. In other words I'd like the top-level functions to look like this: static int encrypt(struct skcipher_request *req) { struct rctx *rctx = skcipher_request_ctx(req); rctx->crypt = crypto_skcipher_encrypt; return do_crypt(req); } static int decrypt(struct skcipher_request *req) { struct rctx *rctx = skcipher_request_ctx(req); rctx->crypt = crypto_skcipher_decrypt; return do_crypt(req); } I'm also wondering about the line which does 'subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;'. Is the real intent of that to clear the CRYPTO_TFM_REQ_MAY_SLEEP flag because the completion callback may be called in an atomic context, and if so shouldn't it just clear that flag only, rather than all flags except CRYPTO_TFM_REQ_MAY_BACKLOG? > + if (req->cryptlen > XTS_BUFFER_SIZE) { > + subreq->cryptlen = min(req->cryptlen, (unsigned)PAGE_SIZE); > + rctx->ext = kmalloc(subreq->cryptlen, gfp); > + } There's no check for failure to allocate the 'rctx->ext' memory. 
> + /* Alas we screwed up the naming so we have to mangle the > + * cipher name. > + */ > + if (!strncmp(cipher_name, "ecb(", 4)) { > + unsigned len; > > - inst->alg.cra_ctxsize = sizeof(struct priv); > + len = strlcpy(ctx->name, cipher_name + 4, sizeof(ctx->name)); > + if (len < 2 || len >= sizeof(ctx->name)) > + goto err_drop_spawn; > > - inst->alg.cra_init = init_tfm; > - inst->alg.cra_exit = exit_tfm; > + if (ctx->name[len - 1] != ')') > + goto err_drop_spawn; > > - inst->alg.cra_blkcipher.setkey = setkey; > - inst->alg.cra_blkcipher.encrypt = encrypt; > - inst->alg.cra_blkcipher.decrypt = decrypt; > + ctx->name[len - 1] = 0; > > -out_put_alg: > - crypto_mod_put(alg); > - return inst; > -} > + if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, > +
Re: [v2 PATCH 7/16] crypto: simd - Add simd skcipher helper
On Sun, Nov 13, 2016 at 07:45:38PM +0800, Herbert Xu wrote: > This patch adds the simd skcipher helper which is meant to be > a replacement for ablk helper. It replaces the underlying blkcipher > interface with skcipher, and also presents the top-level algorithm > as an skcipher. I assume this means it's planned for all users of ablk_helper to be migrated to crypto_simd, and ablk_helper will be removed? > + salg = kzalloc(sizeof(*alg), GFP_KERNEL); > + if (!salg) { > + salg = ERR_PTR(-ENOMEM); > + goto out_put_tfm; > + } Shouldn't this be 'sizeof(*salg)'? > + tfm = crypto_alloc_skcipher(basename, CRYPTO_ALG_INTERNAL, > + CRYPTO_ALG_INTERNAL | CRYPTO_ALG_ASYNC); > + if (IS_ERR(tfm)) > + return ERR_CAST(tfm); > + > + ialg = crypto_skcipher_alg(tfm); It seems this really just needs an algorithm and not a transform. Perhaps it should be calling crypto_find_alg() directly? > + err = -ENAMETOOLONG; > + if (snprintf(alg->base.cra_name, CRYPTO_MAX_ALG_NAME, "%s", algname) >= > + CRYPTO_MAX_ALG_NAME) > + goto out_free_salg; > + > + if (snprintf(alg->base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", > + drvname) >= CRYPTO_MAX_ALG_NAME) > + goto out_free_salg; Could use strscpy() or strlcpy() here. > +static int simd_skcipher_encrypt(struct skcipher_request *req) > +{ > + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); > + struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); > + struct skcipher_request *subreq; > + struct crypto_skcipher *child; > + > + subreq = skcipher_request_ctx(req); > + *subreq = *req; > + > + if (!may_use_simd() || > + (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm))) > + child = &ctx->cryptd_tfm->base; > + else > + child = cryptd_skcipher_child(ctx->cryptd_tfm); > + > + skcipher_request_set_tfm(subreq, child); > + > + return crypto_skcipher_encrypt(subreq); > +} > + > +static int simd_skcipher_decrypt(struct skcipher_request *req) > +{ > + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); > + struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); > + struct skcipher_request *subreq; > + struct crypto_skcipher *child; > + > + subreq = skcipher_request_ctx(req); > + *subreq = *req; > + > + if (!may_use_simd() || > + (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm))) > + child = &ctx->cryptd_tfm->base; > + else > + child = cryptd_skcipher_child(ctx->cryptd_tfm); > + > + skcipher_request_set_tfm(subreq, child); > + > + return crypto_skcipher_decrypt(subreq); > +} These are the same except for the crypto_skcipher_encrypt/crypto_skcipher_decrypt at the end, so they could be mostly shared. -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
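As with the cryptd patch earlier in the series, the shared body could look something like the untested sketch below, assembled from the two functions quoted above.

static int simd_skcipher_crypt(struct skcipher_request *req,
                               int (*crypt)(struct skcipher_request *req))
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm);
        struct skcipher_request *subreq;
        struct crypto_skcipher *child;

        subreq = skcipher_request_ctx(req);
        *subreq = *req;

        /* Pick the child transform exactly as the two functions above do. */
        if (!may_use_simd() ||
            (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm)))
                child = &ctx->cryptd_tfm->base;
        else
                child = cryptd_skcipher_child(ctx->cryptd_tfm);

        skcipher_request_set_tfm(subreq, child);

        return crypt(subreq);           /* crypto_skcipher_encrypt or _decrypt */
}

static int simd_skcipher_encrypt(struct skcipher_request *req)
{
        return simd_skcipher_crypt(req, crypto_skcipher_encrypt);
}

static int simd_skcipher_decrypt(struct skcipher_request *req)
{
        return simd_skcipher_crypt(req, crypto_skcipher_decrypt);
}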
Re: [PATCH 2/2] fscrypto: don't use on-stack buffer for key derivation
On Tue, Nov 15, 2016 at 11:47:04AM -0500, Theodore Ts'o wrote: > On Thu, Nov 03, 2016 at 03:03:02PM -0700, Eric Biggers wrote: > > With the new (in 4.9) option to use a virtually-mapped stack > > (CONFIG_VMAP_STACK), stack buffers cannot be used as input/output for > > the scatterlist crypto API because they may not be directly mappable to > > struct page. get_crypt_info() was using a stack buffer to hold the > > output from the encryption operation used to derive the per-file key. > > Fix it by using a heap buffer. > > > > This bug could most easily be observed in a CONFIG_DEBUG_SG kernel > > because this allowed the BUG in sg_set_buf() to be triggered. > > > > Signed-off-by: Eric Biggers > > This commit is on the fscrypt and dev branches on ext4.git. > > - Ted Hi Ted, Would it make any sense to send these two patches to Linus for v4.9-rc6, given that they fix bugs introduced in 4.9 with the virtually-mapped stack feature? Or would you prefer to wait and have them go to 4.9 via stable? Note that CONFIG_VMAP_STACK defaults to y on x86_64. Thanks, Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: vmalloced stacks and scatterwalk_map_and_copy()
On Mon, Nov 21, 2016 at 04:26:19PM +0800, Herbert Xu wrote: > crypto: scatterwalk - Remove unnecessary aliasing check in map_and_copy > > The aliasing check in map_and_copy is no longer necessary because > the IPsec ESP code no longer provides an IV that points into the > actual request data. As this check is now triggering BUG checks > due to the vmalloced stack code, I'm removing it. > > Reported-by: Eric Biggers > Signed-off-by: Herbert Xu > > diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c > index 52ce17a..c16c94f8 100644 > --- a/crypto/scatterwalk.c > +++ b/crypto/scatterwalk.c > @@ -68,10 +68,6 @@ void scatterwalk_map_and_copy(void *buf, struct > scatterlist *sg, > > sg = scatterwalk_ffwd(tmp, sg, start); > > - if (sg_page(sg) == virt_to_page(buf) && > - sg->offset == offset_in_page(buf)) > - return; > - > scatterwalk_start(&walk, sg); > scatterwalk_copychunks(buf, &walk, nbytes, out); > scatterwalk_done(&walk, out, 0); This looks fine to me if you're confident that the aliasing check is indeed no longer necessary. Another idea I had was to replace memcpy() with memmove(). But I don't want to be in a situation where we're stuck with memmove() forever because of users who probably don't even exist. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] crypto: acomp - don't use stack buffer in test_acomp()
With virtually-mapped stacks (CONFIG_VMAP_STACK=y), using the scatterlist crypto API with stack buffers is not allowed, and with appropriate debugging options will cause the 'BUG_ON(!virt_addr_valid(buf));' in sg_set_buf() to be triggered. Use a heap buffer instead. Fixes: d7db7a882deb ("crypto: acomp - update testmgr with support for acomp") Signed-off-by: Eric Biggers --- crypto/testmgr.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index ded50b6..aca1b7b 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1448,17 +1448,21 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate, { const char *algo = crypto_tfm_alg_driver_name(crypto_acomp_tfm(tfm)); unsigned int i; - char output[COMP_BUF_SIZE]; + char *output; int ret; struct scatterlist src, dst; struct acomp_req *req; struct tcrypt_result result; + output = kmalloc(COMP_BUF_SIZE, GFP_KERNEL); + if (!output) + return -ENOMEM; + for (i = 0; i < ctcount; i++) { unsigned int dlen = COMP_BUF_SIZE; int ilen = ctemplate[i].inlen; - memset(output, 0, sizeof(output)); + memset(output, 0, dlen); init_completion(&result.completion); sg_init_one(&src, ctemplate[i].input, ilen); sg_init_one(&dst, output, dlen); @@ -1507,7 +1511,7 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate, unsigned int dlen = COMP_BUF_SIZE; int ilen = dtemplate[i].inlen; - memset(output, 0, sizeof(output)); + memset(output, 0, dlen); init_completion(&result.completion); sg_init_one(&src, dtemplate[i].input, ilen); sg_init_one(&dst, output, dlen); @@ -1555,6 +1559,7 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate, ret = 0; out: + kfree(output); return ret; } -- 2.8.0.rc3.226.g39d4020 -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Crash in crypto mcryptd
On Thu, Dec 01, 2016 at 05:47:02PM -0800, Tim Chen wrote: > On Thu, 2016-12-01 at 19:00 -0500, Mikulas Patocka wrote: > > Hi > > > > There is a bug in mcryptd initialization. > > > > This is a test module that tries various hash algorithms. When you load > > the module with "insmod test.ko 'alg=mcryptd(md5)'", the machine crashes. > > I don't think your test setup is right. The mcryptd supports only > multi-buffer > algorithm. I don't think there is such an implementation for md5. > > Please refer to arch/x86/crypto/sha1-mb > multi-buffer implementation of sha1 to see the proper > setup and usage with mcryptd. You can also run tcrypt test to > exercise this code. > > Tim No, mcryptd must not crash the kernel if it's passed the wrong algorithm. Users can try to instantiate it with any algorithm using AF_ALG, for example: struct sockaddr_alg addr = { .salg_type = "hash", .salg_name = "mcryptd(md5)", }; int fd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(fd, (struct sockaddr *)&addr, sizeof(addr)); Currently, this instantly crashes the kernel. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] crypto: testmgr - avoid overlap in chunked tests
On Mon, Dec 05, 2016 at 06:42:23PM +, Ard Biesheuvel wrote: > The IDXn offsets are chosen such that tap values (which may go up to > 255) end up overlapping in the xbuf allocation. In particular, IDX1 > and IDX3 are too close together, so update IDX3 to avoid this issue. > Hi Ard, This patch is causing the self-tests for "xts(ecb(aes-asm))" to fail. This is on x86. Any idea why? Here's what I see in the log: alg: skcipher: Chunk test 1 failed on encryption at page 0 for xts(ecb(aes-asm)) : 1c 3b 3a 10 2f 77 03 86 e4 83 6c 99 e3 70 cf 9b 0010: ea 00 80 3f 5e 48 23 57 a4 ae 12 d4 14 a3 e6 3b 0020: 5d 31 e2 76 f8 fe 4a 8d 66 b3 17 f9 ac 68 3f 44 0030: 68 0a 86 ac 35 ad fc 33 45 be fe cb 4b b1 88 fd 0040: 57 76 92 6c 49 a3 09 5e b1 08 fd 10 98 ba ec 70 0050: aa a6 69 99 a7 2a 82 f2 7d 84 8b 21 d4 a7 41 b0 0060: c5 cd 4d 5f ff 9d ac 89 ae ba 12 29 61 d0 3a 75 0070: 71 23 e9 87 0f 8a cf 10 00 02 08 87 89 14 29 ca 0080: 2a 3e 7a 7d 7d f7 b1 03 55 16 5c 8b 9a 6d 0a 7d 0090: e8 b0 62 c4 50 0d c4 cd 12 0c 0f 74 18 da e3 d0 00a0: b5 78 1c 34 80 3f a7 54 21 c7 90 df e1 de 18 34 00b0: f2 80 d7 66 7b 32 7f 6c 8c d7 55 7e 12 ac 3a 0f 00c0: 93 ec 05 c5 2e 04 93 ef 31 a1 2d 3d 92 60 f7 9a 00d0: 28 9d 6a 37 9b c7 0c 50 84 14 73 d1 a8 cc 81 ec 00e0: 58 3e 96 45 e0 7b 8d 96 70 65 5b a5 bb cf ec c6 00f0: dc 39 66 38 0a d8 fe cb 17 b6 ba 02 46 9a 02 0a 0100: 84 e1 8e 8f 84 25 20 70 c1 3e 9f 1f 28 9b e5 4f 0110: bc 48 14 57 77 8f 61 60 15 e1 32 7a 02 b1 40 f1 0120: 50 5e b3 09 32 6d 68 37 8f 83 74 59 5c 84 9d 84 0130: f4 c3 33 ec 44 23 88 51 43 cb 47 bd 71 c5 ed ae 0140: 9b e6 9a 2f fe ce b1 be c9 de 24 4f be 15 99 2b 0150: 11 b7 7c 04 0f 12 bd 8f 6a 97 5a 44 a0 f9 0c 29 0160: a9 ab c3 d4 d8 93 92 72 84 c5 87 54 cc e2 94 52 0170: 9f 86 14 dc d2 ab a9 91 92 5f ed c4 ae 74 ff ac 0180: 6e 33 3b 93 eb 4a ff 04 79 da 9a 41 0e 44 50 e0 0190: dd 7a e4 c6 e2 91 09 00 57 5d a4 01 fc 07 05 9f 01a0: 64 5e 8b 7e 9b fd ef 33 94 30 54 ff 84 01 14 93 01b0: c2 7b 34 29 ea ed b4 ed 53 76 44 1a 77 ed 43 85 01c0: 1a d7 7f 16 f5 41 df d2 69 d5 0d 6a 5f 14 fb 0a 01d0: 1e 2a 8f 42 61 9e 5e c2 59 bd 96 d0 e5 cc 23 1f 01e0: fb 84 ed 15 a8 eb 66 07 31 6b f6 ef Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] crypto: testmgr - avoid overlap in chunked tests
On Wed, Dec 07, 2016 at 07:53:51PM +, Ard Biesheuvel wrote: > Does this help at all? > > diff --git a/crypto/testmgr.c b/crypto/testmgr.c > index 670893bcf361..59e67f5b544b 100644 > --- a/crypto/testmgr.c > +++ b/crypto/testmgr.c > @@ -63,7 +63,7 @@ int alg_test(const char *driver, const char *alg, > u32 type, u32 mask) > */ > #define IDX1 32 > #define IDX2 32400 > -#define IDX3 511 > +#define IDX3 1511 > #define IDX4 8193 > #define IDX5 2 > #define IDX6 17101 > Yes, with that change made the self-tests pass. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Remaining crypto API regressions with CONFIG_VMAP_STACK
In the 4.9 kernel, virtually-mapped stacks will be supported and enabled by default on x86_64. This has been exposing a number of problems in which on-stack buffers are being passed into the crypto API, which to support crypto accelerators operates on 'struct page' rather than on virtual memory. Some of these problems have already been fixed, but I was wondering how many problems remain, so I briefly looked through all the callers of sg_set_buf() and sg_init_one(). Overall I found quite a few remaining problems, detailed below. The following crypto drivers initialize a scatterlist to point into an ahash_request, which may have been allocated on the stack with AHASH_REQUEST_ON_STACK(): drivers/crypto/bfin_crc.c:351 drivers/crypto/qce/sha.c:299 drivers/crypto/sahara.c:973,988 drivers/crypto/talitos.c:1910 drivers/crypto/ccp/ccp-crypto-aes-cmac.c:105,119,142 drivers/crypto/ccp/ccp-crypto-sha.c:95,109,124 drivers/crypto/qce/sha.c:325 The following crypto drivers initialize a scatterlist to point into an ablkcipher_request, which may have been allocated on the stack with SKCIPHER_REQUEST_ON_STACK(): drivers/crypto/ccp/ccp-crypto-aes-xts.c:162 drivers/crypto/ccp/ccp-crypto-aes.c:94 And these other places do crypto operations on buffers clearly on the stack: drivers/net/wireless/intersil/orinoco/mic.c:72 drivers/usb/wusbcore/crypto.c:264 net/ceph/crypto.c:182 net/rxrpc/rxkad.c:737,1000 security/keys/encrypted-keys/encrypted.c:500 fs/cifs/smbencrypt.c:96 Note: I almost certainly missed some, since I excluded places where the use of a stack buffer was not obvious to me. I also excluded AEAD algorithms since there isn't an AEAD_REQUEST_ON_STACK() macro (yet). The "good" news with these bugs is that on x86_64 without CONFIG_DEBUG_SG=y or CONFIG_DEBUG_VIRTUAL=y, you can still do virt_to_page() and then page_address() on a vmalloc address and get back the same address, even though you aren't *supposed* to be able to do this. This will make things still work for most people. The bad news is that if you happen to have consumed just about 1 page (or N pages) of your stack at the time you call the crypto API, your stack buffer may actually span physically non-contiguous pages, so the crypto algorithm will scribble over some unrelated page. Also, hardware crypto drivers which actually do operate on physical memory will break too. So I am wondering: is the best solution really to make all these crypto API algorithms and users use heap buffers, as opposed to something like maintaining a lowmem alias for the stack, or introducing a more general function to convert buffers (possibly in the vmalloc space) into scatterlists? And if the current solution is desired, who is going to fix all of these bugs and when? Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
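To make the pattern concrete, here is a schematic example (my own illustration, not taken from any of the drivers listed above). The broken form points a scatterlist at a stack array; the usual fix is a heap buffer, which is always in the kernel direct mapping:

/*
 * Broken with CONFIG_VMAP_STACK=y:
 *
 *	u8 buf[SHA256_DIGEST_SIZE];
 *	struct scatterlist sg;
 *
 *	sg_init_one(&sg, buf, sizeof(buf));	<-- buf may be vmalloc-backed
 *
 * One possible fix:
 */
static int example_op(void)
{
	struct scatterlist sg;
	u8 *buf = kmalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	sg_init_one(&sg, buf, SHA256_DIGEST_SIZE);
	/* ... hand 'sg' to the crypto API here ... */
	kfree(buf);
	return 0;
}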
Re: Remaining crypto API regressions with CONFIG_VMAP_STACK
On Fri, Dec 09, 2016 at 09:25:38PM -0800, Andy Lutomirski wrote: > > The following crypto drivers initialize a scatterlist to point into an > > ahash_request, which may have been allocated on the stack with > > AHASH_REQUEST_ON_STACK(): > > > > drivers/crypto/bfin_crc.c:351 > > drivers/crypto/qce/sha.c:299 > > drivers/crypto/sahara.c:973,988 > > drivers/crypto/talitos.c:1910 > > This are impossible or highly unlikely on x86. > > > drivers/crypto/ccp/ccp-crypto-aes-cmac.c:105,119,142 > > drivers/crypto/ccp/ccp-crypto-sha.c:95,109,124 > > These > > > drivers/crypto/qce/sha.c:325 > > This is impossible on x86. > Thanks for looking into these. I didn't investigate who/what is likely to be using each driver. Of course I would not be surprised to see people want to start supporting virtually mapped stacks on other architectures too. > > > > The "good" news with these bugs is that on x86_64 without CONFIG_DEBUG_SG=y > > or > > CONFIG_DEBUG_VIRTUAL=y, you can still do virt_to_page() and then > > page_address() > > on a vmalloc address and get back the same address, even though you aren't > > *supposed* to be able to do this. This will make things still work for most > > people. The bad news is that if you happen to have consumed just about 1 > > page > > (or N pages) of your stack at the time you call the crypto API, your stack > > buffer may actually span physically non-contiguous pages, so the crypto > > algorithm will scribble over some unrelated page. > > Are you sure? If it round-trips to the same virtual address, it > doesn't matter if the buffer is contiguous. You may be right, I didn't test this. The hash_walk and blkcipher_walk code do go page by page, but I suppose on x86_64 it would just step from one bogus "struct page" to the adjacent one and still map it to the original virtual address. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kernel-hardening] Re: Remaining crypto API regressions with CONFIG_VMAP_STACK
On Sat, Dec 10, 2016 at 01:32:08PM +0800, Herbert Xu wrote: > On Fri, Dec 09, 2016 at 09:25:38PM -0800, Andy Lutomirski wrote: > > > > > The following crypto drivers initialize a scatterlist to point into an > > > ablkcipher_request, which may have been allocated on the stack with > > > SKCIPHER_REQUEST_ON_STACK(): > > > > > > drivers/crypto/ccp/ccp-crypto-aes-xts.c:162 > > > drivers/crypto/ccp/ccp-crypto-aes.c:94 > > > > These are real, and I wish I'd known about them sooner. > > Are you sure? Any instance of *_ON_STACK must only be used with > sync algorithms and most drivers under drivers/crypto declare > themselves as async. > Why exactly is that? Obviously, it wouldn't work if you returned from the stack frame before the request completed, but does anything stop someone from using an *_ON_STACK() request and then waiting for the request to complete before returning from the stack frame? Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kernel-hardening] Re: Remaining crypto API regressions with CONFIG_VMAP_STACK
On Sat, Dec 10, 2016 at 01:37:12PM +0800, Herbert Xu wrote: > On Fri, Dec 09, 2016 at 09:25:38PM -0800, Andy Lutomirski wrote: > > > > Herbert, how hard would it be to teach the crypto code to use a more > > sensible data structure than scatterlist and to use coccinelle fix > > this stuff for real? > > First of all we already have a sync non-SG hash interface, it's > called shash. > > If we had enough sync-only users of skcipher then I'll consider > adding an interface for it. However, at this point in time it > appears to more sense to convert such users over to the async > interface rather than the other way around. > > As for AEAD we never had a sync interface to begin with and I > don't think I'm going to add one. > Isn't the question of "should the API use physical or virtual addresses" independent of the question of "should the API support asynchronous requests"? You can already choose, via the flags and mask arguments when allocating a crypto transform, whether you want it to be synchronous or asynchronous or whether you don't care. I don't see what that says about whether the API should take in physical memory (e.g. scatterlists or struct pages) or virtual memory (e.g. iov_iters or just regular pointers). And while it's true that asynchronous algorithms are often provided by hardware drivers that operate on physical memory, it's not always the case. For example, some of the AES-NI algorithms are asynchronous only because they use the SSE registers, which aren't always available to kernel code, so the request may need to be processed by another thread. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining crypto API regressions with CONFIG_VMAP_STACK
On Sat, Dec 10, 2016 at 04:16:43PM +0800, Herbert Xu wrote: > Why did you drop me from the CC list when you were replying to > my email? > Sorry --- this thread is Cc'ed to the kernel-hardening mailing list (which was somewhat recently revived), and I replied to the email that reached me from there. It looks like it currently behaves a little differently from the vger mailing lists, in that it replaces "Reply-To" with the address of the mailing list itself rather than the sender. So that's how you got dropped. It also seems to add a prefix to the subject... I > >> Are you sure? Any instance of *_ON_STACK must only be used with > >> sync algorithms and most drivers under drivers/crypto declare > >> themselves as async. > > > > Why exactly is that? Obviously, it wouldn't work if you returned from the > > stack > > frame before the request completed, but does anything stop someone from > > using an > > *_ON_STACK() request and then waiting for the request to complete before > > returning from the stack frame? > > The *_ON_STACK variants (except SHASH of course) were simply hacks > to help legacy crypto API users to cope with the new async interface. > In general we should avoid using the sync interface when possible. > > It's a bad idea for the obvious reason that most of our async > algorithms want to DMA and that doesn't work very well when you're > using memory from the stack. Sure, I just feel that the idea of "is this algorithm asynchronous?" is being conflated with the idea of "does this algorithm operate on physical memory?". Also, if *_ON_STACK are really not allowed with asynchronous algorithms can there at least be a comment or a WARN_ON() to express this? Thanks, Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Remaining crypto API regressions with CONFIG_VMAP_STACK
On Sun, Dec 11, 2016 at 11:13:55AM -0800, Andy Lutomirski wrote: > On Fri, Dec 9, 2016 at 3:08 PM, Eric Biggers wrote: > > In the 4.9 kernel, virtually-mapped stacks will be supported and enabled by > > default on x86_64. This has been exposing a number of problems in which > > on-stack buffers are being passed into the crypto API, which to support > > crypto > > accelerators operates on 'struct page' rather than on virtual memory. > > > > > fs/cifs/smbencrypt.c:96 > > This should use crypto_cipher_encrypt_one(), I think. > > --Andy Yes, I believe that's correct. It encrypts 8 bytes with ecb(des) which is equivalent to simply encrypting one block with DES. Maybe try the following (untested): static int smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) { unsigned char key2[8]; struct crypto_cipher *cipher; str_to_key(key, key2); cipher = crypto_alloc_cipher("des", 0, 0); if (IS_ERR(cipher)) { cifs_dbg(VFS, "could not allocate des cipher\n"); return PTR_ERR(cipher); } crypto_cipher_setkey(cipher, key2, 8); crypto_cipher_encrypt_one(cipher, out, in); crypto_free_cipher(cipher); return 0; } - Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] siphash: add cryptographically secure hashtable function
On Mon, Dec 12, 2016 at 04:48:17AM +0100, Jason A. Donenfeld wrote: > > diff --git a/lib/Makefile b/lib/Makefile > index 50144a3aeebd..71d398b04a74 100644 > --- a/lib/Makefile > +++ b/lib/Makefile > @@ -22,7 +22,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ >sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ >flex_proportions.o ratelimit.o show_mem.o \ >is_single_threaded.o plist.o decompress.o kobject_uevent.o \ > - earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o > + earlycpio.o seq_buf.o siphash.o \ > + nmi_backtrace.o nodemask.o win_minmax.o > > lib-$(CONFIG_MMU) += ioremap.o > lib-$(CONFIG_SMP) += cpumask.o > @@ -44,7 +45,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o > obj-y += kstrtox.o > obj-$(CONFIG_TEST_BPF) += test_bpf.o > obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o > -obj-$(CONFIG_TEST_HASH) += test_hash.o > +obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o Maybe add to the help text for CONFIG_TEST_HASH that it now tests siphash too? > +static inline u64 le64_to_cpuvp(const void *p) > +{ > + return le64_to_cpup(p); > +} This assumes the key and message buffers are aligned to __alignof__(u64). Unless that's going to be a clearly documented requirement for callers, you should use get_unaligned_le64() instead. And you can pass a 'u8 *' directly to get_unaligned_le64(), no need for a helper function. > + b = (v0 ^ v1) ^ (v2 ^ v3); > + return (__force u64)cpu_to_le64(b); > +} It makes sense for this to return a u64, but that means the cpu_to_le64() is wrong, since u64 indicates CPU endianness. It should just return 'b'. > +++ b/lib/test_siphash.c > @@ -0,0 +1,116 @@ > +/* Test cases for siphash.c > + * > + * Copyright (C) 2015-2016 Jason A. Donenfeld > + * > + * This file is provided under a dual BSD/GPLv2 license. > + * > + * SipHash: a fast short-input PRF > + * https://131002.net/siphash/ > + */ > + > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > + > +#include > +#include > +#include > +#include > +#include > + > +static const u8 test_vectors[64][8] = { > + { 0x31, 0x0e, 0x0e, 0xdd, 0x47, 0xdb, 0x6f, 0x72 }, Can you mention in a comment where the test vectors came from? > + if (memcmp(&out, test_vectors[i], 8)) { > + pr_info("self-test %u: FAIL\n", i + 1); > + ret = -EINVAL; > + } If you make the output really be CPU-endian like I'm suggesting then this will need to be something like: if (out != get_unaligned_le64(test_vectors[i])) { Or else make the test vectors be an array of u64. - Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] orinoco: Use shash instead of ahash for MIC calculations
On Mon, Dec 12, 2016 at 12:55:55PM -0800, Andy Lutomirski wrote: > +int orinoco_mic(struct crypto_shash *tfm_michael, u8 *key, > u8 *da, u8 *sa, u8 priority, > u8 *data, size_t data_len, u8 *mic) > { > - AHASH_REQUEST_ON_STACK(req, tfm_michael); > - struct scatterlist sg[2]; > + SHASH_DESC_ON_STACK(desc, tfm_michael); > u8 hdr[ETH_HLEN + 2]; /* size of header + padding */ > int err; > > @@ -67,18 +66,27 @@ int orinoco_mic(struct crypto_ahash *tfm_michael, u8 *key, > hdr[ETH_ALEN * 2 + 2] = 0; > hdr[ETH_ALEN * 2 + 3] = 0; > > - /* Use scatter gather to MIC header and data in one go */ > - sg_init_table(sg, 2); > - sg_set_buf(&sg[0], hdr, sizeof(hdr)); > - sg_set_buf(&sg[1], data, data_len); > + desc->tfm = tfm_michael; > + desc->flags = 0; > > - if (crypto_ahash_setkey(tfm_michael, key, MIC_KEYLEN)) > - return -1; > + err = crypto_shash_setkey(tfm_michael, key, MIC_KEYLEN); > + if (err) > + return err; > + > + err = crypto_shash_init(desc); > + if (err) > + return err; > + > + err = crypto_shash_update(desc, hdr, sizeof(hdr)); > + if (err) > + return err; > + > + err = crypto_shash_update(desc, data, data_len); > + if (err) > + return err; > + > + err = crypto_shash_final(desc, mic); > + shash_desc_zero(desc); > > - ahash_request_set_tfm(req, tfm_michael); > - ahash_request_set_callback(req, 0, NULL, NULL); > - ahash_request_set_crypt(req, sg, mic, data_len + sizeof(hdr)); > - err = crypto_ahash_digest(req); > - ahash_request_zero(req); > return err; It's probably a good idea to always do shash_desc_zero(), even when something above it fails. Otherwise this looks fine. Thanks for sending these patches! Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
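Concretely, something along these lines (just a sketch on top of the posted patch) would clear the descriptor on all exit paths:

	err = crypto_shash_setkey(tfm_michael, key, MIC_KEYLEN);
	if (err)
		goto out;

	err = crypto_shash_init(desc);
	if (err)
		goto out;

	err = crypto_shash_update(desc, hdr, sizeof(hdr));
	if (err)
		goto out;

	err = crypto_shash_update(desc, data, data_len);
	if (err)
		goto out;

	err = crypto_shash_final(desc, mic);
out:
	shash_desc_zero(desc);
	return err;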
Re: [PATCH v3] siphash: add cryptographically secure hashtable function
On Mon, Dec 12, 2016 at 11:18:32PM +0100, Jason A. Donenfeld wrote: > + for (; data != end; data += sizeof(u64)) { > + m = get_unaligned_le64(data); > + v3 ^= m; > + SIPROUND; > + SIPROUND; > + v0 ^= m; > + } > +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 > + b |= le64_to_cpu(load_unaligned_zeropad(data) & > bytemask_from_count(left)); > +#else Hmm, I don't think you can really do load_unaligned_zeropad() without first checking for 'left != 0'. The fixup section for load_unaligned_zeropad() assumes that rounding the pointer down to a word boundary will produce an address from which an 'unsigned long' can be loaded. But if 'left = 0' and we happen to be on a page boundary with the next page unmapped, then this will not be true and the second load will still fault. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
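In other words, the tail load would presumably need to be guarded on 'left', roughly like this (untested sketch against the posted patch):

	if (left)
		b |= le64_to_cpu(load_unaligned_zeropad(data) &
				 bytemask_from_count(left));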
Re: [PATCH v2] keys/encrypted: Fix two crypto-on-the-stack bugs
On Wed, Dec 14, 2016 at 01:04:04PM +0800, Herbert Xu wrote: > On Tue, Dec 13, 2016 at 06:53:03PM -0800, Andy Lutomirski wrote: > > On Tue, Dec 13, 2016 at 6:48 PM, Andy Lutomirski wrote: > > > The driver put a constant buffer of all zeros on the stack and > > > pointed a scatterlist entry at it in two places. This doesn't work > > > with virtual stacks. Use ZERO_PAGE instead. > > > > Wait a second... > > > > > - sg_set_buf(&sg_out[1], pad, sizeof pad); > > > + sg_set_buf(&sg_out[1], empty_zero_page, 16); > > > > My fix here is obviously bogus (I meant to use ZERO_PAGE(0)), but what > > exactly is the code trying to do? The old code makes no sense. It's > > setting the *output* buffer to zeroed padding. > > It's decrypting so I presume it just needs to the extra space for > the padding and the result will be thrown away. > It looks like the data is zero-padded to a 16-byte AES block boundary before being encrypted with CBC, so the encrypted data may be longer than the original data. (I don't know why it doesn't use CTS mode instead, which would make the lengths the same.) Since the crypto API can do in-place operations, when *encrypting* I suggest copying the decrypted data to the output buffer, padding it with 0's, and encrypting it in-place. This eliminates the need for the stack buffer. But when *decrypting* there will be up to 15 extra throwaway bytes of output produced by the decryption. There must be space made for these, and since it's output it can't be empty_zero_page. I guess either check whether space can be made for it in-place, or allocate a temporary buffer... Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
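As a rough sketch of the in-place idea for the encrypt path (illustrative only, not a drop-in patch for encrypted-keys; the helper name is made up): copy the plaintext into the output buffer, zero the padding, then encrypt that buffer with the source and destination scatterlists pointing at the same memory:

/*
 * Returns the padded length; the caller then does sg_init_one() on 'out'
 * and encrypts in place (src == dst), so no separate pad buffer is needed.
 */
static unsigned int cbc_pad_in_place(u8 *out, const u8 *in, unsigned int len)
{
	unsigned int padded_len = round_up(len, AES_BLOCK_SIZE);

	memcpy(out, in, len);
	memset(out + len, 0, padded_len - len);
	return padded_len;
}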
[PATCH] crypto: testmgr - use kmemdup instead of kmalloc+memcpy
From: Eric Biggers It's recommended to use kmemdup instead of kmalloc followed by memcpy. Signed-off-by: Eric Biggers --- crypto/testmgr.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 44e888b0b041..881176ebd8a8 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1463,13 +1463,12 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate, int ilen = ctemplate[i].inlen; void *input_vec; - input_vec = kmalloc(ilen, GFP_KERNEL); + input_vec = kmemdup(ctemplate[i].input, ilen, GFP_KERNEL); if (!input_vec) { ret = -ENOMEM; goto out; } - memcpy(input_vec, ctemplate[i].input, ilen); memset(output, 0, dlen); init_completion(&result.completion); sg_init_one(&src, input_vec, ilen); @@ -1525,13 +1524,12 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate, int ilen = dtemplate[i].inlen; void *input_vec; - input_vec = kmalloc(ilen, GFP_KERNEL); + input_vec = kmemdup(dtemplate[i].input, ilen, GFP_KERNEL); if (!input_vec) { ret = -ENOMEM; goto out; } - memcpy(input_vec, dtemplate[i].input, ilen); memset(output, 0, dlen); init_completion(&result.completion); sg_init_one(&src, input_vec, ilen); -- 2.11.0 -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 5/6] crypto: aesni-intel - Add bulk request support
On Thu, Jan 12, 2017 at 01:59:57PM +0100, Ondrej Mosnacek wrote: > This patch implements bulk request handling in the AES-NI crypto drivers. > The major advantage of this is that with bulk requests, the kernel_fpu_* > functions (which are usually quite slow) are now called only once for the > whole > request. > Hi Ondrej, To what extent does the performance benefit of this patchset result from just the reduced numbers of calls to kernel_fpu_begin() and kernel_fpu_end()? If it's most of the benefit, would it make any sense to optimize kernel_fpu_begin() and kernel_fpu_end() instead? And if there are other examples besides kernel_fpu_begin/kernel_fpu_end where the bulk API would provide a significant performance boost, can you mention them? Interestingly, the arm64 equivalent to kernel_fpu_begin() (kernel_neon_begin_partial() in arch/arm64/kernel/fpsimd.c) appears to have an optimization where the SIMD registers aren't saved if they were already saved. I wonder why something similar isn't done on x86. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 0/5] Update LZ4 compressor module
On Thu, Jan 26, 2017 at 08:57:30AM +0100, Sven Schmidt wrote: > > This patchset is for updating the LZ4 compression module to a version based > on LZ4 v1.7.3 allowing to use the fast compression algorithm aka LZ4 fast > which provides an "acceleration" parameter as a tradeoff between > high compression ratio and high compression speed. > > We want to use LZ4 fast in order to support compression in lustre > and (mostly, based on that) investigate data reduction techniques in behalf of > storage systems. > > Also, it will be useful for other users of LZ4 compression, as with LZ4 fast > it is possible to enable applications to use fast and/or high compression > depending on the usecase. > For instance, ZRAM is offering a LZ4 backend and could benefit from an updated > LZ4 in the kernel. > Hi Sven, [For some reason I didn't receive patch 1/5 and had to get it from patchwork... I'm not sure why. I'm subscribed to linux-crypto but not linux-kernel.] The proposed patch defines LZ4_MEMORY_USAGE to 10 which means that LZ4 compression will use a hash table of only 1024 bytes, containing only 256 entries, to find matches. This differs from upstream LZ4 1.7.3, which uses LZ4_MEMORY_USAGE of 14, as well as the previous LZ4 included in the Linux kernel, both of which specify the hash table size to be 16384 bytes, containing 4096 entries. Given that varying the hash table size is a trade-off between memory usage, speed, and compression ratio, is this an intentional difference and has it been benchmarked? Also, in lz4defs.h: > #if defined(__x86_64__) >typedef U64 reg_t; /* 64-bits in x32 mode */ > #else >typedef size_t reg_t;/* 32-bits in x32 mode */ > #endif Are you sure this really needed over just always using size_t? > #if LZ4_ARCH64 > #ifdef __BIG_ENDIAN__ > #define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) > #else > #define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) > #endif > #else > #ifdef __BIG_ENDIAN__ > #define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) > #else > #define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) > #endif > #endif LZ4_NBCOMMONBYTES() is defined incorrectly for 64-bit little endian; it should be using __builtin_ctzll(). Nit: can you also clean up the weird indentation (e.g. double tabs) in lz4defs.h? Thanks, Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
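Regarding the LZ4_NBCOMMONBYTES() issue above: presumably the 64-bit little-endian case was meant to use __builtin_ctzll(), i.e. (sketch):

#if LZ4_ARCH64
#ifdef __BIG_ENDIAN__
#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3)
#else
#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3)
#endif
#else
#ifdef __BIG_ENDIAN__
#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3)
#else
#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3)
#endif
#endif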
Re: 4.10 aesni-intel no longer having lrw/ablk_helper dependencies?
On Sun, Jan 29, 2017 at 10:31:59PM +0100, Arkadiusz Miskiewicz wrote: > Hi. > > [arekm@xps ~]$ modinfo --set-version 4.9.6 aesni-intel | grep depends > depends:glue_helper,aes-x86_64,lrw,cryptd,ablk_helper > > [arekm@xps ~]$ modinfo --set-version 4.10.0-rc5-00161-gfd694aaa46c7 aesni- > intel |grep depends > depends:glue_helper,aes-x86_64,crypto_simd,cryptd > > With 4.10.0 lrw and ablk_helper are no longer in dependencies while > aesni-intel seem to still need these. My luks encrypted rootfs fails to > unlock. Initrd generation script only installs modules based on dependencies > and that seems to be a reason for luks unlock failure with 4.10rc - some > missing modules. > > Failure looks like that: > $ insmod path/to/aesni-intel.ko > dmesg gets logged: > "AVX2 version of gcm_enc/dec engaged > AES CTR mode by8 optimization enabled." > and insmod reports that it cannot insert aesni-intel because of unresolved > symbol or unknown option but nothing more is logged in dmesg. > > 4.9.x works fine. > > What did change/how to figure out deps now? > > Both modules exist: Hi Arkadiusz, First, aesni-intel no longer includes an LRW implementation itself. Instead, the generic LRW module must be selected. Internally it will use the aesni-intel accelerated ECB algorithm if available. So you need to make sure that the "lrw" module is included in the initrd if it's not already. But I think the bigger problem is that aesni-intel couldn't be insmod'ed at all, which shouldn't happen. The problem might actually be related to the "pcbc" algorithm. Upon initialization, aesni-intel now tries to wrap "pcbc(__aes-aesni)" with the "fpu" template. This will fail if the "pcbc" module hasn't been inserted. I think this wasn't a problem before because the old code using ablk_helper instead of crypto_simd didn't try to find "pcbc" until someone asked for it, while now aesni-intel will try to find it immediately. And since aesni-intel has no direct dependency on pcbc, I'm guessing what happened is that pcbc didn't end up in your initrd even though it may have been built. (You can verify this by adding pcbc to your initrd and seeing if that works around the problem.) Herbert, would it make any sense to solve this by creating a real dependency of aesni-intel on pcbc, by making aesni-intel reference an exported symbol in pcbc if IS_ENABLED(CONFIG_PCBC)? Or do you / did you have something else in mind? I also think it's kind of weird that aesni-intel has to do anything with pcbc at all since it doesn't actually implement pcbc specifically; it's just wrapping it in the "fpu" template to avoid so many calls to kernel_fpu_begin/kernel_fpu_end. I wonder if there is a better solution to this, maybe even optimizing kernel_fpu_begin/kernel_fpu_end so that the wrapping isn't needed. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] dm: switch dm-verity to async hash crypto API
On Sun, Jan 29, 2017 at 09:39:20AM +0200, Gilad Ben-Yossef wrote: > Hi Odrej, > > On Thu, Jan 26, 2017 at 1:34 PM, Ondrej Mosnáček > wrote: > > Hi Gilad, > > > > 2017-01-24 15:38 GMT+01:00 Gilad Ben-Yossef : > >> - v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); > >> + v->tfm = crypto_alloc_ahash(v->alg_name, 0, CRYPTO_ALG_ASYNC); > > > > I believe you should pass zero as the mask here. When flags == 0 and > > mask == CRYPTO_ALG_ASYNC, you are basically saying "I want only algs > > that have flags & CRYPTO_ALG_ASYNC == 0", which means you should only > > get ahash tfms that are always synchronous (see [1]). However, since > > you set a non-NULL callback in verity_hash_init, I don't think this > > was your intention. By setting the mask to zero, you should be able to > > get also an actual async tfm. > > > > Thanks, > > Ondrej > > > > [1] https://lkml.org/lkml/2016/12/13/904 > > Thank you very much for the review. > > I am the first to admit that I find the Crypto API very cryptic (pun > intended)... :-) > > I actually followed the example in Documentation/crypto/api-intro.txt. > I see now the example is > not doing what I though it was doing. Thank you for clarifying this. I > will send out a 2nd version. > > The thing I find puzzling in this is that I saw a difference in > latency when a async algorythm > provider driver with high priority (3000) was loaded vs. not. Based on > your description I would > expect the performance not to change. I will retest with the change > and will publish the results. > > Thanks! > Gilad Hi Gilad, One thing to keep in mind is that the crypto API somewhat conflates the concept of "is the algorithm asynchronous?" with "does this algorithm operate on virtual memory or physical memory?". The shash API is both synchronous and operates on virtual memory, and when using it you only have access to shash algorithms. The ahash API on the other hand operates on physical memory and can be used either synchronously or asynchronously. In addition, both shash and ahash algorithms may be accessed through the ahash API, and for shash algorithms the API will handle translating physical addresses to virtual addresses. So simply by using the ahash API instead of the shash API, even still requiring a "synchronous" algorithm, you may gain access to a different implementation of the algorithm, thereby changing the performance. So whenever doing any benchmarks I suggest including the actual driver name (cra_driver_name) of the algorithms used; otherwise it may be unclear why the performance changed. As for the patch, I haven't looked at it in detail, but I agree that if dm-verity is indeed operating on sufficiently large buffers in physically contiguous memory, then the ahash API would be better than the shash API. Note, that it also could be useful look into supporting having multiple async requests issued and pending at the same time, similar to what dm-crypt does. Eric -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
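As a practical note for the benchmarking point above: right after allocating the transform, the selected implementation can be logged, e.g. (sketch; the exact log helper is the driver's choice):

	pr_info("dm-verity: using hash implementation \"%s\"\n",
		crypto_tfm_alg_driver_name(crypto_ahash_tfm(v->tfm)));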
Re: [RFC PATCH] crypto: algapi - make crypto_xor() and crypto_inc() alignment agnostic
On Mon, Jan 30, 2017 at 02:11:29PM +, Ard Biesheuvel wrote: > Instead of unconditionally forcing 4 byte alignment for all generic > chaining modes that rely on crypto_xor() or crypto_inc() (which may > result in unnecessary copying of data when the underlying hardware > can perform unaligned accesses efficiently), make those functions > deal with unaligned input explicitly, but only if the Kconfig symbol > HAVE_EFFICIENT_UNALIGNED_ACCESS is set. This will allow us to drop > the alignmasks from the CBC, CMAC, CTR, CTS, PCBC and SEQIV drivers. > > For crypto_inc(), this simply involves making the 4-byte stride > conditional on HAVE_EFFICIENT_UNALIGNED_ACCESS being set, given that > it typically operates on 16 byte buffers. > > For crypto_xor(), an algorithm is implemented that simply runs through > the input using the largest strides possible if unaligned accesses are > allowed. If they are not, an optimal sequence of memory accesses is > emitted that takes the relative alignment of the input buffers into > account, e.g., if the relative misalignment of dst and src is 4 bytes, > the entire xor operation will be completed using 4 byte loads and stores > (modulo unaligned bits at the start and end). Note that all expressions > involving startalign and misalign are simply eliminated by the compiler > if HAVE_EFFICIENT_UNALIGNED_ACCESS is defined. > Hi Ard, This is a good idea, and I think it was error-prone to be requiring 4-byte alignment always, and also inefficient on many architectures. The new crypto_inc() looks fine, but the new crypto_xor() is quite complicated. I'm wondering whether it has to be that way, especially since it seems to most commonly be used on very small input buffers, e.g. 8 or 16-byte blocks. There are a couple trivial ways it could be simplified, e.g. using 'dst' and 'src' directly instead of 'a' and 'b' (which also seems to improve code generation by getting rid of the '+= len & ~mask' parts), or using sizeof(long) directly instead of 'size' and 'mask'. But also when I tried testing the proposed crypto_xor() on MIPS, it didn't work correctly on a misaligned buffer. With startalign=1, it did one iteration of the following loop and then exited with startalign=0 and entered the "unsigned long at a time" loop, which is incorrect since at that point the buffers were not yet fully aligned: > do { > if (len < sizeof(u8)) > break; > > if (len >= size && !(startalign & 1) && !(misalign & 1)) > break; > > *dst++ ^= *src++; > len -= sizeof(u8); > startalign &= ~sizeof(u8); > } while (misalign & 1); I think it would need to do instead: startalign += sizeof(u8); startalign %= sizeof(unsigned long); But I am wondering whether you considered something simpler, using the get_unaligned/put_unaligned helpers, maybe even using a switch statement for the last (sizeof(long) - 1) bytes so it can be compiled as a jump table. 
Something like this: #define xor_unaligned(dst, src) \ put_unaligned(get_unaligned(dst) ^ get_unaligned(src), (dst)) void crypto_xor(u8 *dst, const u8 *src, unsigned int len) { while (len >= sizeof(unsigned long)) { xor_unaligned((unsigned long *)dst, (unsigned long *)src); dst += sizeof(unsigned long); src += sizeof(unsigned long); len -= sizeof(unsigned long); } switch (len) { #ifdef CONFIG_64BIT case 7: dst[6] ^= src[6]; /* fall through */ case 6: xor_unaligned((u16 *)&dst[4], (u16 *)&src[4]); goto len_4; case 5: dst[4] ^= src[4]; /* fall through */ case 4: len_4: xor_unaligned((u32 *)dst, (u32 *)src); break; #endif case 3: dst[2] ^= src[2]; /* fall through */ case 2: xor_unaligned((u16 *)dst, (u16 *)src); break; case 1: dst[0] ^= src[0]; break; } } That would seem like a better choice for small buffers, which seems to be the more common case. It should generate slightly faster code on architectures with fast unaligned access like x86_64, while still being sufficient on architectures without --- perhaps even faster, since it wouldn't have as many branches. Eric
Re: [RFC PATCH v2 4/4] crypto: aes - add generic time invariant AES for CTR/CCM/GCM
Hi Ard, On Sat, Jan 28, 2017 at 11:33:33PM +, Ard Biesheuvel wrote: > > Note that this only implements AES encryption, which is all we need > for CTR and CBC-MAC. AES decryption can easily be implemented in a > similar way, but is significantly more costly. Is the expectation of decryption being more costly the only reason why "aes-ti" couldn't be a "cipher" algorithm, allowing it to automatically be used by the existing templates for CTR, CBC-MAC, CBC, ECB, XTS, CMAC, etc.? It doesn't seem to do anything expensive on a per-block basis like loading SSE registers, so it seems it would fit better as a "cipher" algorithm if at all possible. Then there would be no need to implement all these modes yet again. Also, what would be the feasibility of simply replacing aes-generic with the time-invariant implementation, rather than offering two implementations and requiring users to choose one, usually without the needed expertise? > + > +/* > + * Emit the sbox as __weak with external linkage to prevent the compiler > + * from doing constant folding on sbox references involving fixed indexes. > + */ > +__weak const u8 __cacheline_aligned __aesti_sbox[] = { Did you consider marking it 'volatile' instead? > +static int aesti_set_key(struct aes_ti_ctx *ctx, const u8 *in_key, > + unsigned int key_len) > +{ > + struct crypto_aes_ctx rk; > + int err; > + > + err = crypto_aes_expand_key(&rk, in_key, key_len); > + if (err) > + return err; crypto_aes_expand_key() assumes that the key is u32-aligned; I don't think that's guaranteed here. Eric
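Regarding the alignment point above, one possible way to avoid the assumption (sketch only, untested) is to copy the key into a suitably aligned buffer before expanding it:

	u8 aligned_key[AES_MAX_KEY_SIZE] __aligned(__alignof__(u32));
	struct crypto_aes_ctx rk;
	int err;

	if (key_len > sizeof(aligned_key))
		return -EINVAL;
	memcpy(aligned_key, in_key, key_len);

	err = crypto_aes_expand_key(&rk, aligned_key, key_len);
	if (err)
		return err;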
Re: [PATCH v2] crypto: algapi - make crypto_xor() and crypto_inc() alignment agnostic
Hi Ard, On Thu, Feb 02, 2017 at 03:56:28PM +, Ard Biesheuvel wrote: > + const int size = sizeof(unsigned long); > + int delta = ((unsigned long)dst ^ (unsigned long)src) & (size - 1); > + int misalign = 0; > + > + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && delta) { > + misalign = 1 << __ffs(delta); > + > + /* > + * If we care about alignment, process as many bytes as > + * needed to advance dst and src to values whose alignments > + * equal their relative misalignment. This will allow us to > + * process the remainder of the input using optimal strides. > + */ > + while (((unsigned long)dst & (misalign - 1)) && len > 0) { > + *dst++ ^= *src++; > + len--; > + } > + } > > + while (len >= size && !misalign) { > + *(unsigned long *)dst ^= *(unsigned long *)src; > + dst += size; > + src += size; > + len -= size; > + } > Unfortunately this is still broken, for two different reasons. First, if the pointers have the same relative misalignment, then 'delta' and 'misalign' will be set to 0 and long accesses will be used, even though the pointers may actually be misaligned, e.g. 0x8001 and 0x9001. Second, if the pointers have a relative misalignent that is not a power-of-2, then 'misalign' will be set to the wrong value. For example, with delta=3, it's actually only safe to do byte accesses, but the code will set misalign=2 and do u16 accesses. I kind of liked the version with put_unaligned/get_unaligned (and it seems to perform okay on MIPS, though not on ARM which is probably more important). But if the various cases with !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS are going to be handled/optimized I think they will just need to be separated out, maybe something like this: void crypto_xor(u8 *dst, const u8 *src, unsigned int len) { #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS unsigned long delta; if (((unsigned long)dst | (unsigned long)src | len) % sizeof(unsigned long) == 0) { /* fast path: everything is aligned, including len */ while (len >= sizeof(unsigned long)) { *(unsigned long *)dst ^= *(unsigned long *)src; dst += sizeof(unsigned long); src += sizeof(unsigned long); len -= sizeof(unsigned long); } return; } /* handle relative misalignment */ delta = (unsigned long)dst ^ (unsigned long)src; if (delta % sizeof(unsigned long)) { /* 1-byte relative misalignment: do byte accesses */ if (delta & 1) { while (len--) *dst++ ^= *src++; return; } /* 2-byte relative misalignment: do u16 accesses */ if ((delta & 2) || sizeof(unsigned long) == 4) { if ((unsigned long)dst % 2 && len) { *dst++ ^= *src++; len--; } while (len >= 2) { *(u16 *)dst ^= *(u16 *)src; dst += 2, src += 2, len -= 2; } if (len) *dst ^= *src; return; } /* 4-byte relative misalignment: do u32 accesses */ while ((unsigned long)dst % 4 && len) { *dst++ ^= *src++; len--; } while (len >= 4) { *(u32 *)dst ^= *(u32 *)src; dst += 4, src += 4, len -= 4; } while (len--) *dst++ ^= *src++; return; } /* handle absolute misalignment */ while ((unsigned long)dst % sizeof(unsigned long) && len) { *dst++ ^= *src++; len--; } #endif /* !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ while (len >= sizeof(unsigned long)) { *(unsigned long *)dst ^= *(unsigned long *)src; dst += sizeof(unsigned long); src += sizeof(unsigned long); len -= sizeof(unsigned long); } while (len--) *dst++ ^= *src++; }
Re: [PATCH v2] crypto: algapi - make crypto_xor() and crypto_inc() alignment agnostic
On Sat, Feb 04, 2017 at 01:20:38PM -0800, Eric Biggers wrote: > Unfortunately this is still broken, for two different reasons. First, if the > pointers have the same relative misalignment, then 'delta' and 'misalign' will > be set to 0 and long accesses will be used, even though the pointers may > actually be misaligned, e.g. 0x8001 and 0x9001. Second, if the > pointers > have a relative misalignent that is not a power-of-2, then 'misalign' will be > set to the wrong value. For example, with delta=3, it's actually only safe to > do byte accesses, but the code will set misalign=2 and do u16 accesses. > Correction: for the second issue I think I mixed up ffs and fls, so that part of the code was right. But it may still be a good idea to separate out the different cases. Eric
Re: [RFC PATCH] crypto: algapi - make crypto_xor() and crypto_inc() alignment agnostic
On Sun, Feb 05, 2017 at 12:10:53AM +0100, Jason A. Donenfeld wrote: > Another thing that might be helpful is that you can let gcc decide on > the alignment, and then optimize appropriately. Check out what we do > with siphash: > > https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/tree/include/linux/siphash.h#n76 > > static inline u64 siphash(const void *data, size_t len, const > siphash_key_t *key) > { > #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS > if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) > return __siphash_unaligned(data, len, key); > #endif > return ___siphash_aligned(data, len, key); > } > > With this trick, we fall through to the fast alignment-assuming code, > if gcc can prove that the address is inlined. This is often the case > when passing structs, or when passing buffers that have > __aligned(BLOCKSIZE). It proves to be a very useful optimization on > some platforms. Yes, this is a good idea. Though it seems that usually at least one of the two pointers passed to crypto_xor() will have alignment unknown to the compiler, sometimes the length is constant which inlining can help a lot for. For example, if someone does crypto_xor(foo, bar, 16) on x86_64 or ARM64, we'd really like it to turn into just a few instructions like this: mov(%rsi),%rax xor%rax,(%rdi) mov0x8(%rsi),%rax xor%rax,0x8(%rdi) So how about inlining crypto_xor() if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS or the pointers are long-aligned, otherwise calling an out-of-line function __crypto_xor_unaligned() that handles all the cases with weird alignment. Something like the following patch: (Note: exactly how __crypto_xor_unaligned() is implemented is still debatable; it could be more similar to Ard's proposal, or it could use the unaligned access helpers.) diff --git a/crypto/algapi.c b/crypto/algapi.c index df939b54b09f..a0591db3f13a 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -972,23 +972,69 @@ void crypto_inc(u8 *a, unsigned int size) } EXPORT_SYMBOL_GPL(crypto_inc); -static inline void crypto_xor_byte(u8 *a, const u8 *b, unsigned int size) +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +void __crypto_xor_unaligned(u8 *dst, const u8 *src, unsigned int len) { - for (; size; size--) - *a++ ^= *b++; -} + unsigned long delta = (unsigned long)dst ^ (unsigned long)src; -void crypto_xor(u8 *dst, const u8 *src, unsigned int size) -{ - u32 *a = (u32 *)dst; - u32 *b = (u32 *)src; + /* Handle relative misalignment */ + if (delta % sizeof(unsigned long)) { + + /* 1-byte relative misalignment? */ + if (delta & 1) { + while (len--) + *dst++ ^= *src++; + return; + } - for (; size >= 4; size -= 4) - *a++ ^= *b++; + /* 2-byte relative misalignment? */ + if ((delta & 2) || sizeof(unsigned long) == 4) { + if ((unsigned long)dst % __alignof__(u16) && len) { + *dst++ ^= *src++; + len--; + } + while (len >= 2) { + *(u16 *)dst ^= *(u16 *)src; + dst += 2, src += 2, len -= 2; + } + if (len) + *dst ^= *src; + return; + } + + /* 4-byte relative misalignment? 
*/ + while ((unsigned long)dst % __alignof__(u32) && len) { + *dst++ ^= *src++; + len--; + } + while (len >= 4) { + *(u32 *)dst ^= *(u32 *)src; + dst += 4, src += 4, len -= 4; + } + while (len--) + *dst++ ^= *src++; + return; + } + + /* No relative misalignment; use word accesses */ + + while ((unsigned long)dst % __alignof__(unsigned long) && len) { + *dst++ ^= *src++; + len--; + } + + while (len >= sizeof(unsigned long)) { + *(unsigned long *)dst ^= *(unsigned long *)src; + dst += sizeof(unsigned long); + src += sizeof(unsigned long); + len -= sizeof(unsigned long); + } - crypto_xor_byte((u8 *)a, (u8 *)b, size); + while (len--) + *dst++ ^= *src++; } -EXPORT_SYMBOL_GPL(crypto_xor); +EXPORT_SYMBOL_GPL(__crypto_xor_unaligned); +#endif /* !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ unsigned int crypto_alg_extsize(struct crypto_alg *alg) { diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 404e9558e879..718145c5eaca 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -191,9 +191,29
Re: [PATCH v7 0/5] Update LZ4 compressor module
On Thu, Feb 09, 2017 at 08:31:21AM +0900, Minchan Kim wrote: > > Today, I did zram-lz4 performance test with fio in current mmotm and > found it makes regression about 20%. > This may or may not be the cause of the specific regression you're observing, but I just noticed that the proposed patch drops a lot of FORCEINLINE annotations from upstream LZ4. The FORCEINLINE's are there for a reason, especially for the main decompression and compression functions which are basically "templates" that take in different sets of constant parameters, and should be left in. We should #define FORCEINLINE to __always_inline somewhere, or just do a s/FORCEINLINE/__always_inline/g. Note that the upstream LZ4 code is very carefully optimized, so we should not, in general, be changing things like when functions are force-inlined, what the hash table size is, etc. [Also, for some reason linux-crypto is apparently still not receiving patch 1/5 in the series. It's missing from the linux-crypto archive at http://www.spinics.net/lists/linux-crypto/, so it's not just me.] Thanks! Eric
Re: [PATCH v7 0/5] Update LZ4 compressor module
Also I noticed another bug, this time in LZ4_count(): > #if defined(CONFIG_64BIT) > #define LZ4_ARCH64 1 > #else > #define LZ4_ARCH64 0 > #endif ... > #ifdef LZ4_ARCH64 >if ((pIn < (pInLimit-3)) >&& (LZ4_read32(pMatch) == LZ4_read32(pIn))) { >pIn += 4; pMatch += 4; >} > #endif Because of how LZ4_ARCH64 is defined, it needs to be '#if LZ4_ARCH64'. But I also think the way upstream LZ4 does 64-bit detection could have just been left as-is; it has a function which gets inlined: static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } Eric
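That is, with the fix applied the block would read:

#if LZ4_ARCH64
	if ((pIn < (pInLimit-3))
	    && (LZ4_read32(pMatch) == LZ4_read32(pIn))) {
		pIn += 4;
		pMatch += 4;
	}
#endif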
Re: [PATCH v7 0/5] Update LZ4 compressor module
On Thu, Feb 09, 2017 at 12:05:40PM +0100, Sven Schmidt wrote: > > Because of how LZ4_ARCH64 is defined, it needs to be '#if LZ4_ARCH64'. > > > > But I also think the way upstream LZ4 does 64-bit detection could have just been > > left as-is; it has a function which gets inlined: > > > > static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } > > > > Eric > > does this apply for LZ4_isLittleEndian() as well? As a reminder: > > static unsigned LZ4_isLittleEndian(void) > { > /* don't use static : performance detrimental */ > const union { U32 u; BYTE c[4]; } one = { 1 }; > > return one.c[0]; > } > > It is surely easier to read and understand using these functions in favor of the macros. Yes, it makes sense to retain LZ4_isLittleEndian() too, mainly so that the call sites don't need to be changed and we have less chance of introducing bugs. I also believe the *implementation* of LZ4_isLittleEndian() using the union "hack" to detect endianness works fine and will get optimized correctly, though we could replace it with an #ifdef __LITTLE_ENDIAN__ if we wanted to. (I am sure that upstream LZ4 would do that if it was guaranteed to work, but upstream LZ4 needs to detect endianness reliably across many more different compilers, compiler versions, and platforms than the Linux kernel does, and there is no standard preprocessor macro for doing so.) Eric
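If the union ever did need to go away, a kernel-only version could simply key off the kernel's byteorder macros instead, e.g. (sketch; this uses the kernel's __LITTLE_ENDIAN from <asm/byteorder.h>, not the compiler-provided __LITTLE_ENDIAN__):

static unsigned int LZ4_isLittleEndian(void)
{
#ifdef __LITTLE_ENDIAN
	return 1;
#else
	return 0;
#endif
}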
Re: [PATCH v7 0/5] Update LZ4 compressor module
On Thu, Feb 09, 2017 at 12:02:11PM +0100, Sven Schmidt wrote: > > > > [Also, for some reason linux-crypto is apparently still not receiving patch > > 1/5 > > in the series. It's missing from the linux-crypto archive at > > http://www.spinics.net/lists/linux-crypto/, so it's not just me.] > > > > I don't really know what to do about this. I think the matter is the size of > the E-Mail. > Are there filters or something like that? Since in linux-kernel the patch > seems to get delivered. > I could otherwise CC you if you wish. > If I'm not mistaken, David Miller is the admin of the mail server on vger.kernel.org, and he already happens to be Cc'ed on this thread, so maybe he can answer as to why linux-crypto may be configured differently? Anyway, since the patch did make it to linux-kernel anyone can still download it from patchwork if they know where to look: https://patchwork.kernel.org/patch/9556271/ Eric
Re: [PATCH] lz4: fix performance regressions
Hi Sven, On Sun, Feb 12, 2017 at 12:16:18PM +0100, Sven Schmidt wrote: > /*- > * Reading and writing into memory > **/ > +typedef union { > + U16 u16; > + U32 u32; > + size_t uArch; > +} __packed unalign; > > -static inline U16 LZ4_read16(const void *memPtr) > +static FORCE_INLINE __maybe_unused U16 LZ4_read16(const void *ptr) > { > - U16 val; > - > - memcpy(&val, memPtr, sizeof(val)); > - > - return val; > + return ((const unalign *)ptr)->u16; > } > > -static inline U32 LZ4_read32(const void *memPtr) > +static FORCE_INLINE __maybe_unused U32 LZ4_read32(const void *ptr) > { > - U32 val; > - > - memcpy(&val, memPtr, sizeof(val)); > - > - return val; > + return ((const unalign *)ptr)->u32; > } > > -static inline size_t LZ4_read_ARCH(const void *memPtr) > +static FORCE_INLINE __maybe_unused size_t LZ4_read_ARCH(const void *ptr) > { > - size_t val; > - > - memcpy(&val, memPtr, sizeof(val)); > - > - return val; > + return ((const unalign *)ptr)->uArch; > } > > -static inline void LZ4_write16(void *memPtr, U16 value) > +static FORCE_INLINE __maybe_unused void LZ4_write16(void *memPtr, U16 value) > { > - memcpy(memPtr, &value, sizeof(value)); > + ((unalign *)memPtr)->u16 = value; > } > > -static inline void LZ4_write32(void *memPtr, U32 value) > -{ > - memcpy(memPtr, &value, sizeof(value)); > +static FORCE_INLINE __maybe_unused void LZ4_write32(void *memPtr, U32 value) > { > + ((unalign *)memPtr)->u32 = value; > } > > -static inline U16 LZ4_readLE16(const void *memPtr) > +static FORCE_INLINE __maybe_unused U16 LZ4_readLE16(const void *memPtr) > { > -#ifdef __LITTLE_ENDIAN__ > +#if LZ4_LITTLE_ENDIAN > return LZ4_read16(memPtr); > #else > const BYTE *p = (const BYTE *)memPtr; > @@ -137,19 +143,19 @@ static inline U16 LZ4_readLE16(const void *memPtr) > #endif > } Since upstream LZ4 is intended to be compiled at -O3, this may allow it to get away with using memcpy() for unaligned memory accesses. The reason it uses memcpy() is that, other than a byte-by-byte copy, it is the only portable way to express unaligned memory accesses. But the Linux kernel is sometimes compiled optimized for size (-Os), and I wouldn't be *too* surprised if some of the memcpy()'s don't always get inlined then, which could be causing the performance regression being observed. (Of course, this could be verified by checking whether CONFIG_CC_OPTIMIZE_FOR_SIZE=y is set, then reading the assembly.) But I don't think accessing a __packed structure directly is the right alternative. Instead, Linux already includes macros for unaligned memory accesses which have been optimized for every supported architecture. Those should just be used instead, e.g. 
like this: static FORCE_INLINE U16 LZ4_read16(const void *ptr) { return get_unaligned((const u16 *)ptr); } static FORCE_INLINE U32 LZ4_read32(const void *ptr) { return get_unaligned((const u32 *)ptr); } static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr) { return get_unaligned((const size_t *)ptr); } static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value) { put_unaligned(value, (u16 *)memPtr); } static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value) { put_unaligned(value, (u32 *)memPtr); } static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr) { return get_unaligned_le16(memPtr); } static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value) { return put_unaligned_le16(value, memPtr); } static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) { if (LZ4_64bits()) { u64 a = get_unaligned((const u64 *)src); put_unaligned(a, (u64 *)dst); } else { u32 a = get_unaligned((const u32 *)src); u32 b = get_unaligned((const u32 *)src + 1); put_unaligned(a, (u32 *)dst); put_unaligned(b, (u32 *)dst + 1); } } Note that I dropped __maybe_unused as it's not needed on inline functions. That should be done everywhere else the patch proposes to add it too. > -#if LZ4_ARCH64 > -#ifdef __BIG_ENDIAN__ > -#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) > +static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val) > +{ > +#if LZ4_LITTLE_ENDIAN > +#if LZ4_ARCH64 /* 64 Bits Little Endian */ > +#if defined(LZ4_FORCE_SW_BITCOUNT) > + static const int DeBruijnBytePos[64] = { > + 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, > + 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, > + 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, > + 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 > + }; > + > + return DeBruijnBytePos[((U64)((val & -(long long)val) > + * 0x0218A392CDABBD3FULL)) >> 58]; > #else > -#define LZ4_NBCOMMONBYTES(val) (__builtin_
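To make the tradeoff concrete, here is a minimal standalone sketch (plain userspace C, not the actual lz4 sources; read32_memcpy and read32_packed are made-up names) contrasting the portable memcpy-based accessor with the __packed-union accessor proposed in the patch. The kernel's get_unaligned()/put_unaligned() helpers from <asm/unaligned.h> give the best of both: they compile to a single load or store on architectures with efficient unaligned access and fall back to a safe byte-wise form elsewhere.

#include <stdint.h>
#include <string.h>

/* Portable form: correct everywhere, but depends on the compiler
 * inlining the memcpy(), which is less certain at -Os. */
static inline uint32_t read32_memcpy(const void *p)
{
	uint32_t v;

	memcpy(&v, p, sizeof(v));
	return v;
}

/* __packed-union form from the patch: GCC and Clang emit an unaligned
 * access, but this relies on compiler-specific behaviour instead of
 * the arch-optimized helpers. */
typedef union {
	uint16_t u16;
	uint32_t u32;
	size_t uArch;
} __attribute__((packed)) unalign;

static inline uint32_t read32_packed(const void *p)
{
	return ((const unalign *)p)->u32;
}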
[PATCH 3/4] crypto: gf128mul - rename the byte overflow tables
Though the GF(2^128) byte overflow tables were named the "lle" and "bbe" tables, they are not actually tied to these element formats specifically, but rather to particular a "bit endianness". For example, the bbe table is actually used for both bbe and ble multiplication. Therefore, rename the tables to "le" and "be" and update the comment to explain this. Cc: Alex Cope Signed-off-by: Eric Biggers --- crypto/gf128mul.c | 49 - 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index c050cf6f5aa9..1fde1c79ffa5 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c @@ -88,31 +88,46 @@ q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \ } -/* Given the value i in 0..255 as the byte overflow when a field element -in GHASH is multiplied by x^8, this function will return the values that -are generated in the lo 16-bit word of the field value by applying the -modular polynomial. The values lo_byte and hi_byte are returned via the -macro xp_fun(lo_byte, hi_byte) so that the values can be assembled into -memory as required by a suitable definition of this macro operating on -the table above -*/ +/* + * Given a value i in 0..255 as the byte overflow when a field element + * in GF(2^128) is multiplied by x^8, the following macro returns the + * 16-bit value that must be XOR-ed into the low-degree end of the + * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1. + * + * There are two versions of the macro, and hence two tables: one for + * the "be" convention where the highest-order bit is the coefficient of + * the highest-degree polynomial term, and one for the "le" convention + * where the highest-order bit is the coefficient of the lowest-degree + * polynomial term. In both cases the values are stored in CPU byte + * endianness such that the coefficients are ordered consistently across + * bytes, i.e. in the "be" table bits 15..0 of the stored value + * correspond to the coefficients of x^15..x^0, and in the "le" table + * bits 15..0 correspond to the coefficients of x^0..x^15. + * + * Therefore, provided that the appropriate byte endianness conversions + * are done by the multiplication functions (and these must be in place + * anyway to support both little endian and big endian CPUs), the "be" + * table can be used for multiplications of both "bbe" and "ble" + * elements, and the "le" table can be used for multiplications of both + * "lle" and "lbe" elements. + */ -#define xda_bbe(i) ( \ +#define xda_be(i) ( \ (i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \ (i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \ (i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \ (i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \ ) -#define xda_lle(i) ( \ +#define xda_le(i) ( \ (i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \ (i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \ (i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \ (i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 
0x01c2 : 0) \ ) -static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle); -static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe); +static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le); +static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be); /* * The following functions multiply a field element by x or by x^8 in @@ -125,7 +140,7 @@ static void gf128mul_x_lle(be128 *r, const be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); - u64 _tt = gf128mul_table_lle[(b << 7) & 0xff]; + u64 _tt = gf128mul_table_le[(b << 7) & 0xff]; r->b = cpu_to_be64((b >> 1) | (a << 63)); r->a = cpu_to_be64((a >> 1) ^ (_tt << 48)); @@ -135,7 +150,7 @@ static void gf128mul_x_bbe(be128 *r, const be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); - u64 _tt = gf128mul_table_bbe[a >> 63]; + u64 _tt = gf128mul_table_be[a >> 63]; r->a = cpu_to_be64((a << 1) | (b >> 63)); r->b = cpu_to_be64((b << 1) ^ _tt); @@ -145,7 +160,7 @@ void gf128mul_x_ble(be128 *r, const be128 *x) { u64 a = le64_to_cpu(x->a); u64 b = le64_to_cpu(x->b); - u64 _tt = gf128mul_table_bbe[b >> 63]; + u64 _tt = gf128mul_table_be[b >> 63]; r->a = cpu_to_le64((a << 1) ^ _tt); r->b = cpu_to_le64((b << 1) | (a >> 63)); @@ -156,7 +171,7 @@ static void gf128mul_x8_lle(be128 *x) { u64 a = be64_to_
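As a quick sanity check on the renamed tables, the reduction values can be recomputed directly from the field polynomial x^128 + x^7 + x^2 + x + 1 given in the updated comment. The standalone sketch below (userspace C; xda_be_ref is a made-up name) should reproduce the constants hard-coded in xda_be(); the "le" table can be checked the same way by using 0xe100 >> b in place of 0x87 << b.

#include <stdint.h>
#include <stdio.h>

/*
 * For each overflow byte i (the coefficients of x^128..x^135 produced
 * when a field element is multiplied by x^8 in the "be" convention),
 * compute the 16-bit value to XOR into the low-degree end.  A set bit
 * b contributes the reduction of x^(128+b), i.e. (x^7 + x^2 + x + 1)
 * shifted up by b, which is 0x87 << b in "be" bit order.
 */
static uint16_t xda_be_ref(unsigned int i)
{
	uint16_t r = 0;
	int b;

	for (b = 0; b < 8; b++)
		if (i & (1u << b))
			r ^= (uint16_t)(0x87 << b);
	return r;
}

int main(void)
{
	/* Spot-check a couple of entries against the macro's constants. */
	printf("0x01 -> 0x%04x (xda_be gives 0x0087)\n", xda_be_ref(0x01));
	printf("0x80 -> 0x%04x (xda_be gives 0x4380)\n", xda_be_ref(0x80));
	return 0;
}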
[PATCH 0/4] crypto: gf128mul cleanups
This patchset makes a few cleanups to the generic GF(2^128) multiplication code to make it slightly easier to understand and modify. No functional changes are intended. Eric Biggers (4): crypto: gf128mul - fix some comments crypto: gf128mul - remove xx() macro crypto: gf128mul - rename the byte overflow tables crypto: gf128mul - constify 4k and 64k multiplication tables crypto/gf128mul.c | 86 +++ include/crypto/gf128mul.h | 32 +- 2 files changed, 67 insertions(+), 51 deletions(-) -- 2.11.0.483.g087da7b7c-goog
[PATCH 4/4] crypto: gf128mul - constify 4k and 64k multiplication tables
Constify the multiplication tables passed to the 4k and 64k multiplication functions, as they are not modified by these functions. Cc: Alex Cope Signed-off-by: Eric Biggers --- crypto/gf128mul.c | 6 +++--- include/crypto/gf128mul.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index 1fde1c79ffa5..04facc0690aa 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c @@ -329,7 +329,7 @@ void gf128mul_free_64k(struct gf128mul_64k *t) } EXPORT_SYMBOL(gf128mul_free_64k); -void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t) +void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t) { u8 *ap = (u8 *)a; be128 r[1]; @@ -402,7 +402,7 @@ struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g) } EXPORT_SYMBOL(gf128mul_init_4k_bbe); -void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t) +void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t) { u8 *ap = (u8 *)a; be128 r[1]; @@ -417,7 +417,7 @@ void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t) } EXPORT_SYMBOL(gf128mul_4k_lle); -void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t) +void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t) { u8 *ap = (u8 *)a; be128 r[1]; diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h index 9662c4538873..0bc9b5f1c45e 100644 --- a/include/crypto/gf128mul.h +++ b/include/crypto/gf128mul.h @@ -174,8 +174,8 @@ struct gf128mul_4k { struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g); struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g); -void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t); -void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t); +void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t); +void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t); static inline void gf128mul_free_4k(struct gf128mul_4k *t) { @@ -196,6 +196,6 @@ struct gf128mul_64k { */ struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g); void gf128mul_free_64k(struct gf128mul_64k *t); -void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t); +void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t); #endif /* _CRYPTO_GF128MUL_H */ -- 2.11.0.483.g087da7b7c-goog
[PATCH 2/4] crypto: gf128mul - remove xx() macro
The xx() macro serves no purpose and can be removed. Cc: Alex Cope Signed-off-by: Eric Biggers --- crypto/gf128mul.c | 18 -- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index d9e3eecc218a..c050cf6f5aa9 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c @@ -97,20 +97,18 @@ the table above */ -#define xx(p, q) 0x##p##q - #define xda_bbe(i) ( \ - (i & 0x80 ? xx(43, 80) : 0) ^ (i & 0x40 ? xx(21, c0) : 0) ^ \ - (i & 0x20 ? xx(10, e0) : 0) ^ (i & 0x10 ? xx(08, 70) : 0) ^ \ - (i & 0x08 ? xx(04, 38) : 0) ^ (i & 0x04 ? xx(02, 1c) : 0) ^ \ - (i & 0x02 ? xx(01, 0e) : 0) ^ (i & 0x01 ? xx(00, 87) : 0) \ + (i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \ + (i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \ + (i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \ + (i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \ ) #define xda_lle(i) ( \ - (i & 0x80 ? xx(e1, 00) : 0) ^ (i & 0x40 ? xx(70, 80) : 0) ^ \ - (i & 0x20 ? xx(38, 40) : 0) ^ (i & 0x10 ? xx(1c, 20) : 0) ^ \ - (i & 0x08 ? xx(0e, 10) : 0) ^ (i & 0x04 ? xx(07, 08) : 0) ^ \ - (i & 0x02 ? xx(03, 84) : 0) ^ (i & 0x01 ? xx(01, c2) : 0) \ + (i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \ + (i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \ + (i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \ + (i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \ ) static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle); -- 2.11.0.483.g087da7b7c-goog
[PATCH 1/4] crypto: gf128mul - fix some comments
Fix incorrect references to GF(128) instead of GF(2^128), as these are two entirely different fields, and fix a few other incorrect comments. Cc: Alex Cope Signed-off-by: Eric Biggers --- crypto/gf128mul.c | 13 +++-- include/crypto/gf128mul.h | 26 ++ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index 72015fee533d..d9e3eecc218a 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c @@ -44,7 +44,7 @@ --- Issue 31/01/2006 - This file provides fast multiplication in GF(128) as required by several + This file provides fast multiplication in GF(2^128) as required by several cryptographic authentication modes */ @@ -116,9 +116,10 @@ static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle); static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe); -/* These functions multiply a field element by x, by x^4 and by x^8 - * in the polynomial field representation. It uses 32-bit word operations - * to gain speed but compensates for machine endianess and hence works +/* + * The following functions multiply a field element by x or by x^8 in + * the polynomial field representation. They use 64-bit word operations + * to gain speed but compensate for machine endianness and hence work * correctly on both styles of machine. */ @@ -251,7 +252,7 @@ EXPORT_SYMBOL(gf128mul_bbe); /* This version uses 64k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key -value in GF(128). If we consider a GF(128) value in +value in GF(2^128). If we consider a GF(2^128) value in the buffer's lowest byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. But we also @@ -330,7 +331,7 @@ EXPORT_SYMBOL(gf128mul_64k_bbe); /* This version uses 4k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key -value in GF(128). If we consider a GF(128) value in a +value in GF(2^128). If we consider a GF(2^128) value in a single byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. If we take the highest byte in diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h index 592d47e565a8..9662c4538873 100644 --- a/include/crypto/gf128mul.h +++ b/include/crypto/gf128mul.h @@ -43,7 +43,7 @@ --- Issue Date: 31/01/2006 - An implementation of field multiplication in Galois Field GF(128) + An implementation of field multiplication in Galois Field GF(2^128) */ #ifndef _CRYPTO_GF128MUL_H @@ -65,7 +65,7 @@ * are left and the lsb's are right. char b[16] is an array and b[0] is * the first octet. * - * 8000 + * 1000 * b[0] b[1] b[2] b[3] b[13]b[14]b[15] * * Every bit is a coefficient of some power of X. We can store the bits @@ -85,15 +85,17 @@ * Both of the above formats are easy to implement on big-endian * machines. * - * EME (which is patent encumbered) uses the ble format (bits are stored - * in big endian order and the bytes in little endian). The above buffer - * represents X^7 in this case and the primitive polynomial is b[0] = 0x87. + * XTS and EME (the latter of which is patent encumbered) use the ble + * format (bits are stored in big endian order and the bytes in little + * endian). The above buffer represents X^7 in this case and the + * primitive polynomial is b[0] = 0x87. * * The common machine word-size is smaller than 128 bits, so to make * an efficient implementation we must split into machine word sizes. - * This file uses one 32bit for the moment. 
Machine endianness comes into - * play. The lle format in relation to machine endianness is discussed - * below by the original author of gf128mul Dr Brian Gladman. + * This implementation uses 64-bit words for the moment. Machine + * endianness comes into play. The lle format in relation to machine + * endianness is discussed below by the original author of gf128mul Dr + * Brian Gladman. * * Let's look at the bbe and ble format on a little endian machine. * @@ -127,10 +129,10 @@ * machines this will automatically aligned to wordsize and on a 64-bit * machine also. */ -/* Multiply a GF128 field element by x. Field elements are held in arrays -of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower -indexed bits placed in the more numerically significant bit positions -within bytes. +/* Multiply a GF(2^128) field element by x. Field elements are +held in arrays of bytes in which field bits 8n..8n
[PATCH 0/2] crypto: constify test vectors
From: Eric Biggers These two patches mark all the cryptographic test vectors as 'const'. This has several potential advantages and moves a large amount of data from .data to .rodata when the tests are enabled. The second patch does the real work; the first just prepares for it by updating a function to take a const buffer argument. Eric Biggers (2): crypto: kpp - constify buffer passed to crypto_kpp_set_secret() crypto: testmgr - constify all test vectors crypto/dh.c | 3 +- crypto/ecdh.c | 3 +- crypto/testmgr.c | 71 ++-- crypto/testmgr.h | 512 +- drivers/crypto/qat/qat_common/qat_asym_algs.c | 2 +- include/crypto/kpp.h | 6 +- 6 files changed, 305 insertions(+), 292 deletions(-) -- 2.11.0.483.g087da7b7c-goog
[PATCH 1/2] crypto: kpp - constify buffer passed to crypto_kpp_set_secret()
From: Eric Biggers Constify the buffer passed to crypto_kpp_set_secret() and kpp_alg.set_secret, since it is never modified. Signed-off-by: Eric Biggers --- crypto/dh.c | 3 ++- crypto/ecdh.c | 3 ++- drivers/crypto/qat/qat_common/qat_asym_algs.c | 2 +- include/crypto/kpp.h | 6 +++--- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/crypto/dh.c b/crypto/dh.c index ddcb528ab2cc..87e3542cf1b8 100644 --- a/crypto/dh.c +++ b/crypto/dh.c @@ -79,7 +79,8 @@ static int dh_set_params(struct dh_ctx *ctx, struct dh *params) return 0; } -static int dh_set_secret(struct crypto_kpp *tfm, void *buf, unsigned int len) +static int dh_set_secret(struct crypto_kpp *tfm, const void *buf, +unsigned int len) { struct dh_ctx *ctx = dh_get_ctx(tfm); struct dh params; diff --git a/crypto/ecdh.c b/crypto/ecdh.c index 3de289806d67..63ca33771e4e 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -38,7 +38,8 @@ static unsigned int ecdh_supported_curve(unsigned int curve_id) } } -static int ecdh_set_secret(struct crypto_kpp *tfm, void *buf, unsigned int len) +static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf, + unsigned int len) { struct ecdh_ctx *ctx = ecdh_get_ctx(tfm); struct ecdh params; diff --git a/drivers/crypto/qat/qat_common/qat_asym_algs.c b/drivers/crypto/qat/qat_common/qat_asym_algs.c index 0d35dca2e925..2aab80bc241f 100644 --- a/drivers/crypto/qat/qat_common/qat_asym_algs.c +++ b/drivers/crypto/qat/qat_common/qat_asym_algs.c @@ -491,7 +491,7 @@ static void qat_dh_clear_ctx(struct device *dev, struct qat_dh_ctx *ctx) ctx->g2 = false; } -static int qat_dh_set_secret(struct crypto_kpp *tfm, void *buf, +static int qat_dh_set_secret(struct crypto_kpp *tfm, const void *buf, unsigned int len) { struct qat_dh_ctx *ctx = kpp_tfm_ctx(tfm); diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h index 4307a2f2365f..ce8e1f79374b 100644 --- a/include/crypto/kpp.h +++ b/include/crypto/kpp.h @@ -74,7 +74,7 @@ struct crypto_kpp { * @base: Common crypto API algorithm data structure */ struct kpp_alg { - int (*set_secret)(struct crypto_kpp *tfm, void *buffer, + int (*set_secret)(struct crypto_kpp *tfm, const void *buffer, unsigned int len); int (*generate_public_key)(struct kpp_request *req); int (*compute_shared_secret)(struct kpp_request *req); @@ -273,8 +273,8 @@ struct kpp_secret { * * Return: zero on success; error code in case of error */ -static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm, void *buffer, - unsigned int len) +static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm, + const void *buffer, unsigned int len) { struct kpp_alg *alg = crypto_kpp_alg(tfm); -- 2.11.0.483.g087da7b7c-goog
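For out-of-tree or new kpp drivers, the change simply means the set_secret callback must treat the secret blob as read-only. Here is a hedged sketch of a callback with the new prototype (example_dh_set_secret is a hypothetical name; crypto_dh_decode_key() and struct dh are the existing helpers used by crypto/dh.c):

#include <crypto/kpp.h>
#include <crypto/dh.h>

static int example_dh_set_secret(struct crypto_kpp *tfm, const void *buf,
				 unsigned int len)
{
	struct dh params;

	/* The secret blob is only parsed here, never written, hence const. */
	if (crypto_dh_decode_key(buf, len, &params) < 0)
		return -EINVAL;

	/*
	 * A real driver would now import params.p, params.g and params.key
	 * into its tfm context, as dh.c and qat_asym_algs.c do above.
	 */
	return 0;
}

The callback would be hooked up through the .set_secret member of struct kpp_alg, whose prototype this patch updates.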
[PATCH 2/2] crypto: testmgr - constify all test vectors
From: Eric Biggers Cryptographic test vectors should never be modified, so constify them to enforce this at both compile-time and run-time. This moves a significant amount of data from .data to .rodata when the crypto tests are enabled. Signed-off-by: Eric Biggers --- crypto/testmgr.c | 71 crypto/testmgr.h | 512 +++ 2 files changed, 297 insertions(+), 286 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index f9c378af3907..89f1dd1f4b13 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -83,47 +83,47 @@ struct tcrypt_result { struct aead_test_suite { struct { - struct aead_testvec *vecs; + const struct aead_testvec *vecs; unsigned int count; } enc, dec; }; struct cipher_test_suite { struct { - struct cipher_testvec *vecs; + const struct cipher_testvec *vecs; unsigned int count; } enc, dec; }; struct comp_test_suite { struct { - struct comp_testvec *vecs; + const struct comp_testvec *vecs; unsigned int count; } comp, decomp; }; struct hash_test_suite { - struct hash_testvec *vecs; + const struct hash_testvec *vecs; unsigned int count; }; struct cprng_test_suite { - struct cprng_testvec *vecs; + const struct cprng_testvec *vecs; unsigned int count; }; struct drbg_test_suite { - struct drbg_testvec *vecs; + const struct drbg_testvec *vecs; unsigned int count; }; struct akcipher_test_suite { - struct akcipher_testvec *vecs; + const struct akcipher_testvec *vecs; unsigned int count; }; struct kpp_test_suite { - struct kpp_testvec *vecs; + const struct kpp_testvec *vecs; unsigned int count; }; @@ -145,7 +145,8 @@ struct alg_test_desc { } suite; }; -static unsigned int IDX[8] = { IDX1, IDX2, IDX3, IDX4, IDX5, IDX6, IDX7, IDX8 }; +static const unsigned int IDX[8] = { + IDX1, IDX2, IDX3, IDX4, IDX5, IDX6, IDX7, IDX8 }; static void hexdump(unsigned char *buf, unsigned int len) { @@ -203,7 +204,7 @@ static int wait_async_op(struct tcrypt_result *tr, int ret) } static int ahash_partial_update(struct ahash_request **preq, - struct crypto_ahash *tfm, struct hash_testvec *template, + struct crypto_ahash *tfm, const struct hash_testvec *template, void *hash_buff, int k, int temp, struct scatterlist *sg, const char *algo, char *result, struct tcrypt_result *tresult) { @@ -260,9 +261,9 @@ static int ahash_partial_update(struct ahash_request **preq, return ret; } -static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, - unsigned int tcount, bool use_digest, - const int align_offset) +static int __test_hash(struct crypto_ahash *tfm, + const struct hash_testvec *template, unsigned int tcount, + bool use_digest, const int align_offset) { const char *algo = crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); size_t digest_size = crypto_ahash_digestsize(tfm); @@ -538,7 +539,8 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, return ret; } -static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, +static int test_hash(struct crypto_ahash *tfm, +const struct hash_testvec *template, unsigned int tcount, bool use_digest) { unsigned int alignmask; @@ -566,7 +568,7 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, } static int __test_aead(struct crypto_aead *tfm, int enc, - struct aead_testvec *template, unsigned int tcount, + const struct aead_testvec *template, unsigned int tcount, const bool diff_dst, const int align_offset) { const char *algo = crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm)); @@ -957,7 +959,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc, } static int 
test_aead(struct crypto_aead *tfm, int enc, -struct aead_testvec *template, unsigned int tcount) +const struct aead_testvec *template, unsigned int tcount) { unsigned int alignmask; int ret; @@ -990,7 +992,8 @@ static int test_aead(struct crypto_aead *tfm, int enc, } static int test_cipher(struct crypto_cipher *tfm, int enc, - struct cipher_testvec *template, unsigned int tcount) + const struct cipher_testvec *template, + unsigned int tcount) { const char *algo = crypto_tfm_alg_driver_name(crypto_cipher_tfm(tfm)); unsigned int i, j, k; @@ -1068,7 +1071,8 @@ static int test_cipher(struct crypto_cipher *tfm
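To illustrate what the testmgr.h half of the diff looks like in practice, here is a hypothetical constified test vector (example_md5_tv_template is a made-up name; the field names follow struct hash_testvec, and the digest is the well-known MD5 of "abc" from RFC 1321). With both the array and the struct members const, the data ends up in .rodata:

static const struct hash_testvec example_md5_tv_template[] = {
	{
		.plaintext	= "abc",
		.psize		= 3,
		.digest		= "\x90\x01\x50\x98\x3c\xd2\x4f\xb0"
				  "\xd6\x96\x3f\x7d\x28\xe1\x7f\x72",
	},
};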
Re: [PATCH v4 0/8] crypto: aes - retire table based generic AES
On Mon, Jul 24, 2017 at 07:59:43AM +0100, Ard Biesheuvel wrote: > On 18 July 2017 at 13:06, Ard Biesheuvel wrote: > > The generic AES driver uses 16 lookup tables of 1 KB each, and has > > encryption and decryption routines that are fully unrolled. Given how > > the dependencies between this code and other drivers are declared in > > Kconfig files, this code is always pulled into the core kernel, even > > if it is usually superseded at runtime by accelerated drivers that > > exist for many architectures. > > > > This leaves us with 25 KB of dead code in the kernel, which is negligible > > in typical environments, but which is actually a big deal for the IoT > > domain, where every kilobyte counts. > > > > Also, the scalar, table based AES routines that exist for ARM, arm64, i586 > > and x86_64 share the lookup tables with AES generic, and may be invoked > > occasionally when the time-invariant AES-NI or other special instruction > > drivers are called in interrupt context, at which time the SIMD register > > file cannot be used. Pulling 16 KB of code and 9 KB of instructions into > > the L1s (and evicting what was already there) when a softirq happens to > > be handled in the context of an interrupt taken from kernel mode (which > > means no SIMD on x86) is also something that we may like to avoid, by > > falling back to a much smaller and moderately less performant driver. > > (Note that arm64 will be updated shortly to supply fallbacks for all > > SIMD based AES implementations, which will be based on the core routines) > > > > For the reasons above, this series refactors the way the various AES > > implementations are wired up, to allow the generic version in > > crypto/aes_generic.c to be omitted from the build entirely. > > > > Patch #1 removes some bogus 'select CRYPTO_AES' statement. > > > > Patch #2 factors out aes-generic's lookup tables, which are shared with > > arch-specific implementations in arch/x86, arch/arm and arch/arm64. > > > > Patch #3 replaces the table based aes-generic.o with a new aes.o based on > > the fixed time cipher, and uses it to fulfil dependencies on CRYPTO_AES. > > > > Patch #4 switches the fallback in the AES-NI code to the new, generic > > encrypt > > and decrypt routines so it no longer depends on the x86 scalar code or > > [transitively] on AES-generic. > > > > Patch #5 tweaks the ARM table based code to only use 2 KB + 256 bytes worth > > of lookup tables instead of 4 KB. > > > > Patch #6 does the same for arm64 > > > > Patch #7 removes the local copy of the AES sboxes from the arm64 NEON > > driver, > > and switches to the ones exposed by the new AES core module instead. > > > > Patch #8 updates the Kconfig help text to be more descriptive of what they > > actually control, rather than duplicating AES's wikipedia entry a number of > > times. > > > > v4: - remove aes-generic altogether instead of allow a preference to be set > > Actually, after benchmarking the x86_64 asm AES code, I am not so sure > we should remove AES_GENERIC at all, since it turns out to be faster. > Interestingly, I found a remark by Eric in the git log stating the > same, so if we want to cut down on AES variants, we should probably > start by deleting the x86 code instead. > > So please disregard this for now: I will rework the other stuff I have > so it no longer depends on this, and repost, because it is much more > important for me that that makes it into v4.14. 
This can wait for > v4.15, as far as I am concerned, and I will benchmark a bit more to > see if we can get rid of the i586 code as well. > Yes I did notice that aes-generic was actually faster. Probably the x86_64-asm implementation should be removed, but it may be worthwhile to try a few different gcc versions to see how well they compile aes-generic. I expect that x86_64-asm used to be faster but gcc has gotten smarter. Also x86_64-asm is only really useful on older CPUs, so ideally it should be benchmarked on an older CPU. Eric
[PATCH v2 1/7] fscrypt: add v2 encryption context and policy
From: Eric Biggers Currently, the fscrypt_context (i.e. the encryption xattr) does not contain a cryptographically secure identifier for the master key's payload. Therefore it's not possible to verify that the correct key was supplied, which is problematic in multi-user scenarios. To make this possible, define a new fscrypt_context version (v2) which includes a key_hash field, and allow userspace to opt-in to it when setting an encryption policy by setting fscrypt_policy.version to 2. For now just zero the new field; a later patch will start setting it for real. Even though we aren't changing the layout of struct fscrypt_policy (i.e. the struct used by the ioctls), the new context version still has to be "opt-in" because old kernels will not recognize it, and the keyring key will now need to be available when setting an encryption policy, which is an API change. We'll also be taking the opportunity to make another API change (dropping support for the filesystem-specific key prefixes). Previously, the version numbers were 0 in the fscrypt_policy and 1 in the fscrypt_context. Rather than incrementing them to 1 and 2, make them both 2 to be consistent with each other. It's not required that these numbers match, but it should make things less confusing. An alternative to adding a key_hash field would have been to reuse master_key_descriptor. However, master_key_descriptor is only 8 bytes, which is too short to be a cryptographically secure hash. Thus, master_key_descriptor would have needed to be lengthened to 16+ bytes, which would have required defining a fscrypt_policy_v2 structure and adding a FS_IOC_GET_ENCRYPTION_POLICY_V2 ioctl. It also would have required userspace to start using a specific hash algorithm to create the key descriptors, which would have made the API harder to use. Perhaps it should have been done that way originally, but at this point it seems better to keep the API simpler. Acked-by: Michael Halcrow Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h| 84 ++ fs/crypto/keyinfo.c| 14 +++ fs/crypto/policy.c | 73 ++-- include/linux/fscrypt_common.h | 2 +- include/uapi/linux/fs.h| 6 +++ 5 files changed, 135 insertions(+), 44 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index a1d5021c31ef..8329fb905ac6 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -24,40 +24,92 @@ #define FS_AES_256_CTS_KEY_SIZE32 #define FS_AES_256_XTS_KEY_SIZE64 -#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FSCRYPT_KEY_HASH_SIZE 16 /** - * Encryption context for inode + * fscrypt_context - the encryption context for an inode * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce + * Filesystems usually store this in an extended attribute. It identifies the + * encryption algorithm and key with which the file is encrypted. + * + * Since this is stored on-disk, be careful not to reorder fields or add any + * implicit padding bytes! */ struct fscrypt_context { - u8 format; + /* v1+ */ + + /* Version of this structure */ + u8 version; + + /* Encryption mode for the contents of regular files */ u8 contents_encryption_mode; + + /* Encryption mode for filenames in directories and symlink targets */ u8 filenames_encryption_mode; + + /* Options that affect how encryption is done (e.g. 
padding amount) */ u8 flags; + + /* Descriptor for this file's master key in the keyring */ u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + + /* +* A unique value used in combination with the master key to derive the +* file's actual encryption key +*/ u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; -} __packed; -#define FS_ENCRYPTION_CONTEXT_FORMAT_V11 + /* v2+ */ + + /* Cryptographically secure hash of the master key */ + u8 key_hash[FSCRYPT_KEY_HASH_SIZE]; +}; + +#define FSCRYPT_CONTEXT_V1 1 +#define FSCRYPT_CONTEXT_V1_SIZEoffsetof(struct fscrypt_context, key_hash) + +#define FSCRYPT_CONTEXT_V2 2 +#define FSCRYPT_CONTEXT_V2_SIZEsizeof(struct fscrypt_context) + +static inline int fscrypt_context_size(const struct fscrypt_context *ctx) +{ + switch (ctx->version) { + case FSCRYPT_CONTEXT_V1: + return FSCRYPT_CONTEXT_V1_SIZE; + case FSCRYPT_CONTEXT_V2: + return FSCRYPT_CONTEXT_V2_SIZE; + } + return 0; +} + +static inline bool +fscrypt_valid_context_fo
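Since every field in the context is a byte or a byte array, there is no implicit padding, and the sizes work out to 28 bytes for a v1 context and 44 bytes for a v2 context. A throwaway userspace check of that arithmetic (the constants simply mirror the ones in the patch) might look like:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define FS_KEY_DESCRIPTOR_SIZE		8
#define FS_KEY_DERIVATION_NONCE_SIZE	16
#define FSCRYPT_KEY_HASH_SIZE		16

struct fscrypt_context {
	uint8_t version;
	uint8_t contents_encryption_mode;
	uint8_t filenames_encryption_mode;
	uint8_t flags;
	uint8_t master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
	uint8_t nonce[FS_KEY_DERIVATION_NONCE_SIZE];
	uint8_t key_hash[FSCRYPT_KEY_HASH_SIZE];
};

int main(void)
{
	printf("v1 size = %zu\n", offsetof(struct fscrypt_context, key_hash));
	printf("v2 size = %zu\n", sizeof(struct fscrypt_context));
	return 0;
}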
[PATCH v2 0/7] fscrypt: key verification and KDF improvement
From: Eric Biggers This patch series solves two major problems which filesystem-level encryption has currently. First, the user-supplied master keys are not verified, which means a malicious user can provide the wrong key for another user's file and cause a DOS attack or worse. This flaw has been criticized in the past [1]. Second, the KDF (Key Derivation Function) used to derive per-file keys is ad-hoc and nonstandard. While it meets the primary security requirement, it's inflexible and is missing some useful properties such as non-reversibility, which is important under some threat models. This weakness was noted by Unterluggauer and Mangard (2016) [2] who also demonstrated an EM attack against the current AES-based KDF. These problems are solved together by introducing a new encryption policy version where the KDF is changed to HKDF-SHA512, i.e. RFC-5869 [3] with SHA-512 as the underlying hash function. HKDF is used to derive the per-file keys as well as to generate a "key hash" which is stored on-disk to allow key verification. The HMAC transform for each master key is pre-keyed and cached, which in practice makes the new KDF about as fast or even faster than the old one which did not use the crypto API efficiently. Please give special consideration to the choice and usage of crypto algorithms and any other on-disk format and API changes, since we will be locked into these once merged. As a sanity check, I have verified the new on-disk format on ext4 by dumping the encryption xattr and contents of an encrypted regular file using debugfs, then reproducing the key_hash field and the encrypted contents in userspace using the RFC-6234 example code and OpenSSL for HKDF-SHA512 and AES-256-XTS respectively --- given the master key and unencrypted file contents. All these changes are independent of filesystem and encryption mode, i.e. the "v2" encryption policies can be used on any fscrypt-capable filesystem (ext4, f2fs, or ubifs currently) and with any of the supported encryption modes. Changes since v1: - Validate encryption modes and flags earlier in fscrypt_ioctl_set_policy(), since the modes are used in fscrypt_compute_key_hash(). - Various cleanups - Base on v4.13-rc2 References: [1] https://blog.quarkslab.com/a-glimpse-of-ext4-filesystem-level-encryption.html [2] Unterluggauer and Mangard (2016). "Exploiting the Physical Disparity: Side-Channel Attacks on Memory Encryption". https://eprint.iacr.org/2016/473.pdf [3] RFC 5869. "HMAC-based Extract-and-Expand Key Derivation Function (HKDF)". https://tools.ietf.org/html/rfc5869 Eric Biggers (7): fscrypt: add v2 encryption context and policy fscrypt: rename ->ci_master_key to ->ci_master_key_descriptor fscrypt: use HKDF-SHA512 to derive the per-inode encryption keys fscrypt: validate modes and flags earlier when setting policy fscrypt: verify that the correct master key was supplied fscrypt: cache the HMAC transform for each master key fscrypt: for v2 policies, support "fscrypt:" key prefix only fs/crypto/Kconfig | 2 + fs/crypto/fscrypt_private.h| 113 +-- fs/crypto/keyinfo.c| 668 ++--- fs/crypto/policy.c | 138 ++--- fs/super.c | 4 + include/linux/fs.h | 5 + include/linux/fscrypt_common.h | 2 +- include/uapi/linux/fs.h| 6 + 8 files changed, 779 insertions(+), 159 deletions(-) -- 2.14.0.rc0.400.g1c36432dff-goog
[PATCH v2 5/7] fscrypt: verify that the correct master key was supplied
From: Eric Biggers Currently, while a fscrypt master key is required to have a certain description in the keyring, its payload is never verified to be correct. While sufficient for well-behaved userspace, this is insecure in a multi-user system where a user has been given only read-only access to an encrypted file or directory. Specifically, if an encrypted file or directory does not yet have its key cached by the kernel, the first user who accesses it can provide an arbitrary key in their own keyring, which the kernel will then associate with the inode and use for read(), write(), readdir(), etc. by other users as well. Consequently, it's trivial for a user with *read-only* access to an encrypted file or directory to make it appear as garbage to everyone. Creative users might be able to accomplish more sophisticated attacks by careful choice of the key, e.g. choosing a key causes certain bytes of file contents to have certain values or that causes filenames containing the '/' character to appear. Solve the problem for v2 encryption policies by storing a "hash" of the master encryption key in the encryption xattr and verifying it before accepting the user-provided key. We generate the "hash" using HKDF-SHA512 by passing a distinct application-specific info string. This produces a value which is cryptographically isolated and can be stored in the clear without leaking any information about the master key or any other derived keys (in a computational sense). Reusing HKDF is better than doing e.g. SHA-512(master_key) because it avoids passing the same key into different cryptographic primitives. We make the hash field 16 bytes long, as this should provide sufficient collision and preimage resistance while not wasting too much space for the encryption xattr. Acked-by: Michael Halcrow Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 4 fs/crypto/keyinfo.c | 46 + fs/crypto/policy.c | 55 - 3 files changed, 95 insertions(+), 10 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index b5881cb26e8f..f9469a71cc29 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -94,6 +94,7 @@ fscrypt_valid_context_format(const struct fscrypt_context *ctx, int size) */ struct fscrypt_master_key { struct crypto_shash *mk_hmac; + u8 mk_hash[FSCRYPT_KEY_HASH_SIZE]; }; /* @@ -157,6 +158,9 @@ extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); /* keyinfo.c */ +extern int fscrypt_compute_key_hash(const struct inode *inode, + const struct fscrypt_policy *policy, + u8 hash[FSCRYPT_KEY_HASH_SIZE]); extern void __exit fscrypt_essiv_cleanup(void); #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 3164e1c74196..a9514078cf9d 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -39,8 +39,11 @@ static struct crypto_shash *essiv_hash_tfm; * * Keys derived with different info strings are cryptographically isolated from * each other --- knowledge of one derived key doesn't reveal any others. + * (This property is particularly important for the derived key used as the + * "key hash", as that is stored in the clear.) 
*/ #define HKDF_CONTEXT_PER_FILE_KEY 1 +#define HKDF_CONTEXT_KEY_HASH 2 /* * HKDF consists of two steps: @@ -211,6 +214,12 @@ alloc_master_key(const struct fscrypt_key *payload) err = crypto_shash_setkey(k->mk_hmac, prk, sizeof(prk)); if (err) goto fail; + + /* Calculate the "key hash" */ + err = hkdf_expand(k->mk_hmac, HKDF_CONTEXT_KEY_HASH, NULL, 0, + k->mk_hash, FSCRYPT_KEY_HASH_SIZE); + if (err) + goto fail; out: memzero_explicit(prk, sizeof(prk)); return k; @@ -536,6 +545,31 @@ void __exit fscrypt_essiv_cleanup(void) crypto_free_shash(essiv_hash_tfm); } +int fscrypt_compute_key_hash(const struct inode *inode, +const struct fscrypt_policy *policy, +u8 hash[FSCRYPT_KEY_HASH_SIZE]) +{ + struct fscrypt_master_key *k; + unsigned int min_keysize; + + /* +* Require that the master key be long enough for both the +* contents and filenames encryption modes. +*/ + min_keysize = + max(available_modes[policy->contents_encryption_mode].keysize, + available_modes[policy->filenames_encryption_mode].keysize); + + k = load_master_key_from_keyring(inode, policy->master_key_descriptor, +min_keysize); + if (I
[PATCH v2 7/7] fscrypt: for v2 policies, support "fscrypt:" key prefix only
From: Eric Biggers Since v2 encryption policies are opt-in, take the opportunity to also drop support for the legacy filesystem-specific key description prefixes "ext4:", "f2fs:", and "ubifs:", instead requiring the generic prefix "fscrypt:". The generic prefix is preferred since it works for all filesystems. Also there is a performance benefit from not having to search the keyrings twice. The old prefixes remain supported for v1 encryption policies. Reviewed-by: Michael Halcrow Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 3 +-- fs/crypto/keyinfo.c | 16 fs/crypto/policy.c | 2 +- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 4221c5b23882..c9de15d4b5b0 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -169,8 +169,7 @@ extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); /* keyinfo.c */ -extern int fscrypt_compute_key_hash(const struct inode *inode, - const struct fscrypt_policy *policy, +extern int fscrypt_compute_key_hash(const struct fscrypt_policy *policy, u8 hash[FSCRYPT_KEY_HASH_SIZE]); extern void __exit fscrypt_essiv_cleanup(void); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 2fca72826768..81b08d4a7efe 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -384,8 +384,7 @@ find_and_lock_keyring_key(const char *prefix, } static struct fscrypt_master_key * -load_master_key_from_keyring(const struct inode *inode, -const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE], +load_master_key_from_keyring(const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE], unsigned int min_keysize) { struct key *keyring_key; @@ -394,11 +393,6 @@ load_master_key_from_keyring(const struct inode *inode, keyring_key = find_and_lock_keyring_key(FS_KEY_DESC_PREFIX, descriptor, min_keysize, &payload); - if (keyring_key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) { - keyring_key = find_and_lock_keyring_key( - inode->i_sb->s_cop->key_prefix, - descriptor, min_keysize, &payload); - } if (IS_ERR(keyring_key)) return ERR_CAST(keyring_key); @@ -440,8 +434,7 @@ find_or_create_master_key(const struct inode *inode, /* * The needed master key isn't in memory yet. Load it from the keyring. 
*/ - master_key = load_master_key_from_keyring(inode, - ctx->master_key_descriptor, + master_key = load_master_key_from_keyring(ctx->master_key_descriptor, min_keysize); if (IS_ERR(master_key)) return master_key; @@ -675,8 +668,7 @@ void __exit fscrypt_essiv_cleanup(void) crypto_free_shash(essiv_hash_tfm); } -int fscrypt_compute_key_hash(const struct inode *inode, -const struct fscrypt_policy *policy, +int fscrypt_compute_key_hash(const struct fscrypt_policy *policy, u8 hash[FSCRYPT_KEY_HASH_SIZE]) { struct fscrypt_master_key *k; @@ -690,7 +682,7 @@ int fscrypt_compute_key_hash(const struct inode *inode, max(available_modes[policy->contents_encryption_mode].keysize, available_modes[policy->filenames_encryption_mode].keysize); - k = load_master_key_from_keyring(inode, policy->master_key_descriptor, + k = load_master_key_from_keyring(policy->master_key_descriptor, min_keysize); if (IS_ERR(k)) return PTR_ERR(k); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ece55350cee8..fa5599005a72 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -119,7 +119,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) pr_warn_once("%s (pid %d) is setting less secure v0 encryption policy; recommend upgrading to v2.\n", current->comm, current->pid); } else { - ret = fscrypt_compute_key_hash(inode, &policy, key_hash); + ret = fscrypt_compute_key_hash(&policy, key_hash); if (ret) return ret; } -- 2.14.0.rc0.400.g1c36432dff-goog
[PATCH v2 6/7] fscrypt: cache the HMAC transform for each master key
From: Eric Biggers Now that we have a key_hash field which securely identifies a master key payload, introduce a cache of the HMAC transforms for the master keys currently in use for inodes using v2+ encryption policies. The entries in this cache are called 'struct fscrypt_master_key' and are identified by key_hash. The cache is per-superblock. (It could be global, but making it per-superblock should reduce the lock contention a bit, and we may need to keep track of keys on a per-superblock basis for other reasons later, e.g. to implement an ioctl for evicting keys.) This results in a large efficiency gain: we now only have to allocate and key an "hmac(sha512)" transformation, execute HKDF-Extract, and compute key_hash once per master key rather than once per inode. Note that this optimization can't easily be applied to the original AES-based KDF because that uses a different AES key for each KDF execution. In practice, this difference makes the HKDF per-inode encryption key derivation performance comparable to or even faster than the old KDF, which typically spends more time allocating an "ecb(aes)" transformation from the crypto API than doing actual crypto work. Note that it would have been possible to make the mapping be from raw_key => fscrypt_master_key (where raw_key denotes the actual bytes of the master key) rather than from key_hash => fscrypt_master_key. However, an advantage of doing lookups by key_hash is that it replaces the keyring lookup in most cases, which opens up the future possibilities of not even having the master key in memory following an initial provisioning step (if the HMAC-SHA512 implementation is hardware-backed), or of introducing an ioctl to provision a key to the filesystem directly, avoiding keyrings and their visibility problems entirely. Also, because key_hash is public information while raw_key is secret information, it would have been very difficult to use raw_key as a map key in a way that would prevent timing attacks while still being scalable to a large number of entries. Reviewed-by: Michael Halcrow Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 11 fs/crypto/keyinfo.c | 134 +++- fs/crypto/policy.c | 5 +- fs/super.c | 4 ++ include/linux/fs.h | 5 ++ 5 files changed, 152 insertions(+), 7 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index f9469a71cc29..4221c5b23882 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -91,10 +91,21 @@ fscrypt_valid_context_format(const struct fscrypt_context *ctx, int size) /* * fscrypt_master_key - an in-use master key + * + * This is referenced from each in-core inode that has been "unlocked" using a + * particular master key. It's primarily used to cache the HMAC transform so + * that the per-inode encryption keys can be derived efficiently with HKDF. It + * is securely erased once all inodes referencing it have been evicted. + * + * If the same master key is used on different filesystems (unusual, but + * possible), we'll create one of these structs for each filesystem. 
*/ struct fscrypt_master_key { struct crypto_shash *mk_hmac; u8 mk_hash[FSCRYPT_KEY_HASH_SIZE]; + refcount_t mk_refcount; + struct rb_node mk_node; + struct super_block *mk_sb; }; /* diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index a9514078cf9d..2fca72826768 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -176,6 +176,14 @@ static void put_master_key(struct fscrypt_master_key *k) if (!k) return; + if (refcount_read(&k->mk_refcount) != 0) { /* in ->s_master_keys? */ + if (!refcount_dec_and_lock(&k->mk_refcount, + &k->mk_sb->s_master_keys_lock)) + return; + rb_erase(&k->mk_node, &k->mk_sb->s_master_keys); + spin_unlock(&k->mk_sb->s_master_keys_lock); + } + crypto_free_shash(k->mk_hmac); kzfree(k); } @@ -230,6 +238,87 @@ alloc_master_key(const struct fscrypt_key *payload) goto out; } +/* + * ->s_master_keys is a map of master keys currently in use by in-core inodes on + * a given filesystem, identified by key_hash which is a cryptographically + * secure identifier for an actual key payload. + * + * Note that master_key_descriptor cannot be used to identify the keys because + * master_key_descriptor only identifies the "location" of a key in the keyring, + * not the actual key payload --- i.e., buggy or malicious userspace may provide + * different keys with the same master_key_descriptor. + */ + +/* + * Search ->s_master_keys for the f
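The map described above is a plain rbtree keyed by mk_hash. Since the lookup half of keyinfo.c is cut off in this hunk, here is a hedged sketch of the search pattern (find_master_key is a made-up name; the caller is assumed to hold ->s_master_keys_lock):

static struct fscrypt_master_key *
find_master_key(struct super_block *sb, const u8 hash[FSCRYPT_KEY_HASH_SIZE])
{
	struct rb_node *node = sb->s_master_keys.rb_node;

	while (node) {
		struct fscrypt_master_key *k =
			rb_entry(node, struct fscrypt_master_key, mk_node);
		int cmp = memcmp(hash, k->mk_hash, FSCRYPT_KEY_HASH_SIZE);

		if (cmp < 0)
			node = node->rb_left;
		else if (cmp > 0)
			node = node->rb_right;
		else
			return k;
	}
	return NULL;
}

Insertion is the usual mirror image with rb_link_node() and rb_insert_color(), taking an extra reference on an existing entry when the same hash is already present.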
[PATCH v2 3/7] fscrypt: use HKDF-SHA512 to derive the per-inode encryption keys
From: Eric Biggers By design, the keys which userspace provides in the keyring are not used to encrypt data directly. Instead, a KDF (Key Derivation Function) is used to derive a unique encryption key for each inode, given a "master" key and a nonce. The current KDF encrypts the master key with AES-128-ECB using the nonce as the AES key. This KDF is ad-hoc and is not specified in any standard. While it does generate unique derived keys with sufficient entropy, it has several disadvantages: - It's reversible: an attacker who compromises a derived key, e.g. using a side channel attack, can "decrypt" it to get back to the master key. - It's not very extensible because it cannot easily be used to derive other key material that may be needed and it ties the length of the derived key closely to the length of the master key. - It doesn't evenly distribute the entropy from the master key. For example, the first 16 bytes of each derived key depend only on the first 16 bytes of the master key. - It uses a public value as an AES key, which is unusual. Ciphers are rarely evaluated under a threat model where the keys are public and the messages are secret. Solve all these problems for v2 encryption policies by changing the KDF to HKDF with SHA-512 as the underlying hash function. To derive each inode's encryption key, HKDF is executed with the master key as the input key material, a fixed salt, and the per-inode nonce prefixed with a context byte as the application-specific information string. Unlike the current KDF, HKDF has been formally published and standardized [1][2], is nonreversible, can be used to derive any number and length of secret and/or non-secret keys, and evenly distributes the entropy from the master key (excepting limits inherent to not using a random salt). Note that this introduces a dependency on the security and implementation of SHA-512, whereas before we were using only AES for both key derivation and encryption. However, by using HMAC rather than the hash function directly, HKDF is designed to remain secure even if various classes of attacks, e.g. collision attacks, are found against the underlying unkeyed hash function. Even HMAC-MD5 is still considered secure in practice, despite MD5 itself having been heavily compromised. We *could* avoid introducing a hash function by instantiating HKDF-Expand with CMAC-AES256 as the pseudorandom function rather than HMAC-SHA512. This would work; however, the HKDF specification doesn't explicitly allow a non-HMAC pseudorandom function, so it would be less standard. It would also require skipping HKDF-Extract and changing the API to accept only 32-byte master keys (since otherwise HKDF-Extract using CMAC-AES would produce a pseudorandom key only 16 bytes long which is only enough for AES-128, not AES-256). HKDF-SHA512 can require more "crypto work" per key derivation when compared to the current KDF. However, later in this series, we'll start caching the HMAC transform for each master key, which will actually make the real-world performance about the same or even significantly better than the AES-based KDF as currently implemented. Also, each KDF can actually be executed on the order of 1 million times per second, so KDF performance probably isn't actually the bottleneck in practice anyway. References: [1] Krawczyk (2010). "Cryptographic Extraction and Key Derivation: The HKDF Scheme". https://eprint.iacr.org/2010/264.pdf [2] RFC 5869. "HMAC-based Extract-and-Expand Key Derivation Function (HKDF)". 
https://tools.ietf.org/html/rfc5869 Signed-off-by: Eric Biggers --- fs/crypto/Kconfig | 2 + fs/crypto/fscrypt_private.h | 13 ++ fs/crypto/keyinfo.c | 484 +++- 3 files changed, 403 insertions(+), 96 deletions(-) diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 02b7d91c9231..bbd4e38b293c 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -8,6 +8,8 @@ config FS_ENCRYPTION select CRYPTO_CTS select CRYPTO_CTR select CRYPTO_SHA256 + select CRYPTO_SHA512 + select CRYPTO_HMAC select KEYS help Enable encryption of files and directories. This diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 4b0205accae9..b5881cb26e8f 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -89,6 +89,13 @@ fscrypt_valid_context_format(const struct fscrypt_context *ctx, int size) return size >= 1 && size == fscrypt_context_size(ctx); } +/* + * fscrypt_master_key - an in-use master key + */ +struct fscrypt_master_key { + struct crypto_shash *mk_hmac; +}; + /* * fscrypt_info - the "encryption key" for an inode * @@ -102,6 +109,12 @@ struct fscrypt_info { struct crypto_skcipher *ci_ctfm; stru
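Because the hunk above is truncated before the interesting code, here is a compact userspace sketch of the HKDF-SHA512 construction itself (RFC 5869), written against OpenSSL's HMAC API purely for illustration; hkdf_extract and hkdf_expand here are stand-ins, and the actual salt and application-specific info strings (a context byte plus the per-file nonce) are the ones defined in the patch:

#include <string.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>

#define HASHLEN 64	/* SHA-512 digest size */

/* HKDF-Extract: PRK = HMAC-SHA512(salt, input keying material) */
static void hkdf_extract(const unsigned char *salt, size_t salt_len,
			 const unsigned char *ikm, size_t ikm_len,
			 unsigned char prk[HASHLEN])
{
	unsigned int len = HASHLEN;

	HMAC(EVP_sha512(), salt, (int)salt_len, ikm, ikm_len, prk, &len);
}

/* HKDF-Expand: OKM = first okm_len bytes of T(1) || T(2) || ...,
 * where T(i) = HMAC-SHA512(PRK, T(i-1) || info || i). */
static void hkdf_expand(const unsigned char prk[HASHLEN],
			const unsigned char *info, size_t info_len,
			unsigned char *okm, size_t okm_len)
{
	unsigned char t[HASHLEN];
	size_t tlen = 0, done = 0;
	unsigned char counter = 1;

	while (done < okm_len) {
		HMAC_CTX *ctx = HMAC_CTX_new();
		unsigned int len = HASHLEN;
		size_t n;

		HMAC_Init_ex(ctx, prk, HASHLEN, EVP_sha512(), NULL);
		HMAC_Update(ctx, t, tlen);	/* T(i-1); empty when i == 1 */
		HMAC_Update(ctx, info, info_len);
		HMAC_Update(ctx, &counter, 1);
		HMAC_Final(ctx, t, &len);
		HMAC_CTX_free(ctx);

		n = okm_len - done;
		if (n > HASHLEN)
			n = HASHLEN;
		memcpy(okm + done, t, n);
		done += n;
		tlen = HASHLEN;
		counter++;
	}
}

In the kernel patch the Extract step runs once per master key (the resulting PRK keys an "hmac(sha512)" transform, which a later patch in the series caches per master key), and the Expand step then runs per inode with the nonce folded into the info string.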
[PATCH v2 4/7] fscrypt: validate modes and flags earlier when setting policy
From: Eric Biggers For FS_IOC_SET_ENCRYPTION_POLICY, currently the encryption modes and flags are only validated when a new encryption policy is being set, not when an existing policy is being compared to the one specified. However, we're going to start needing to compute the key_hash in both cases, and for this it's helpful to validate that the master key has the minimum length required by the specified encryption modes. Therefore, move the modes and flags validation earlier in the ioctl, next to where we validate the policy version. Signed-off-by: Eric Biggers --- fs/crypto/policy.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index fe525da9e79c..d1e58798da3c 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -60,13 +60,6 @@ static int create_encryption_context_from_policy(struct inode *inode, { struct fscrypt_context ctx; - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, -policy->filenames_encryption_mode)) - return -EINVAL; - - if (policy->flags & ~FS_POLICY_FLAGS_VALID) - return -EINVAL; - ctx.version = context_version_for_policy(policy); ctx.contents_encryption_mode = policy->contents_encryption_mode; ctx.filenames_encryption_mode = policy->filenames_encryption_mode; @@ -100,6 +93,13 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) policy.version != FS_POLICY_VERSION_HKDF) return -EINVAL; + if (!fscrypt_valid_enc_modes(policy.contents_encryption_mode, +policy.filenames_encryption_mode)) + return -EINVAL; + + if (policy.flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; -- 2.14.0.rc0.400.g1c36432dff-goog
[PATCH v2 2/7] fscrypt: rename ->ci_master_key to ->ci_master_key_descriptor
From: Eric Biggers In struct fscrypt_info, ->ci_master_key is the master key descriptor, not the master key itself. In preparation for introducing a struct fscrypt_master_key and making ->ci_master_key point to it, rename the existing ->ci_master_key to ->ci_master_key_descriptor. Acked-by: Michael Halcrow Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 2 +- fs/crypto/keyinfo.c | 4 ++-- fs/crypto/policy.c | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 8329fb905ac6..4b0205accae9 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -110,7 +110,7 @@ struct fscrypt_info { u8 ci_data_mode; u8 ci_filename_mode; u8 ci_flags; - u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; + u8 ci_master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; }; typedef enum { diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 7e664a11340a..5591fd24e4b2 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -293,8 +293,8 @@ int fscrypt_get_encryption_info(struct inode *inode) crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_flags = ctx.flags; - memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, - sizeof(crypt_info->ci_master_key)); + memcpy(crypt_info->ci_master_key_descriptor, ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); if (res) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index d6beb48f08fc..fe525da9e79c 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -226,7 +226,8 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) child_ci = child->i_crypt_info; if (parent_ci && child_ci) { - return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + return memcmp(parent_ci->ci_master_key_descriptor, + child_ci->ci_master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE) == 0 && (parent_ci->ci_context_version == child_ci->ci_context_version) && @@ -284,7 +285,7 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, ctx.contents_encryption_mode = ci->ci_data_mode; ctx.filenames_encryption_mode = ci->ci_filename_mode; ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, + memcpy(ctx.master_key_descriptor, ci->ci_master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); if (ctx.version != FSCRYPT_CONTEXT_V1) -- 2.14.0.rc0.400.g1c36432dff-goog
Re: [PATCH v5 2/5] lib: Add zstd modules
On Wed, Aug 09, 2017 at 07:35:53PM -0700, Nick Terrell wrote: > > It can compress at speeds approaching lz4, and quality approaching lzma. Well, for a very loose definition of "approaching", and certainly not at the same time. I doubt there's a use case for using the highest compression levels in kernel mode --- especially the ones using zstd_opt.h. > > The code was ported from the upstream zstd source repository. What version? > `linux/zstd.h` header was modified to match linux kernel style. > The cross-platform and allocation code was stripped out. Instead zstd > requires the caller to pass a preallocated workspace. The source files > were clang-formatted [1] to match the Linux Kernel style as much as > possible. It would be easier to compare to the upstream version if it was not all reformatted. There is a chance that bugs were introduced by Linux-specific changes, and it would be nice if they could be easily reviewed. (Also I don't know what clang-format settings you used, but there are still a lot of differences from the Linux coding style.) > > I benchmarked zstd compression as a special character device. I ran zstd > and zlib compression at several levels, as well as performing no > compression, which measure the time spent copying the data to kernel space. > Data is passed to the compresser 4096 B at a time. The benchmark file is > located in the upstream zstd source repository under > `contrib/linux-kernel/zstd_compress_test.c` [2]. > > I ran the benchmarks on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM. > The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor, > 16 GB of RAM, and a SSD. I benchmarked using `silesia.tar` [3], which is > 211,988,480 B large. Run the following commands for the benchmark: > > sudo modprobe zstd_compress_test > sudo mknod zstd_compress_test c 245 0 > sudo cp silesia.tar zstd_compress_test > > The time is reported by the time of the userland `cp`. > The MB/s is computed with > > 1,536,217,008 B / time(buffer size, hash) > > which includes the time to copy from userland. > The Adjusted MB/s is computed with > > 1,536,217,088 B / (time(buffer size, hash) - time(buffer size, none)). > > The memory reported is the amount of memory the compressor requests. > > | Method | Size (B) | Time (s) | Ratio | MB/s| Adj MB/s | Mem (MB) | > |--|--|--|---|-|--|--| > | none | 11988480 |0.100 | 1 | 2119.88 |- |- | > | zstd -1 | 73645762 |1.044 | 2.878 | 203.05 | 224.56 | 1.23 | > | zstd -3 | 66988878 |1.761 | 3.165 | 120.38 | 127.63 | 2.47 | > | zstd -5 | 65001259 |2.563 | 3.261 | 82.71 |86.07 | 2.86 | > | zstd -10 | 60165346 | 13.242 | 3.523 | 16.01 |16.13 |13.22 | > | zstd -15 | 58009756 | 47.601 | 3.654 |4.45 | 4.46 |21.61 | > | zstd -19 | 54014593 | 102.835 | 3.925 |2.06 | 2.06 |60.15 | > | zlib -1 | 77260026 |2.895 | 2.744 | 73.23 |75.85 | 0.27 | > | zlib -3 | 72972206 |4.116 | 2.905 | 51.50 |52.79 | 0.27 | > | zlib -6 | 68190360 |9.633 | 3.109 | 22.01 |22.24 | 0.27 | > | zlib -9 | 67613382 | 22.554 | 3.135 |9.40 | 9.44 | 0.27 | > Theses benchmarks are misleading because they compress the whole file as a single stream without resetting the dictionary, which isn't how data will typically be compressed in kernel mode. With filesystem compression the data has to be divided into small chunks that can each be decompressed independently. That eliminates one of the primary advantages of Zstandard (support for large dictionary sizes). Eric
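To spell out the difference being criticized: a filesystem must restart the compressor for every block so that each block stays independently decompressible, roughly as in the sketch below (userspace, upstream libzstd's one-shot API; compress_blockwise is a made-up helper). The benchmark above instead pushes the whole ~200 MB silesia.tar through a single stream, which lets matches reach far back into the window and flatters the larger-memory settings in a way kernel users generally won't see.

#include <stddef.h>
#include <zstd.h>

/*
 * Compress src in independent chunks of block_size bytes, as a
 * filesystem would.  Each ZSTD_compress() call starts with an empty
 * history, so no match can refer back into an earlier block.
 * Returns the total compressed size, or 0 on error.
 */
static size_t compress_blockwise(void *dst, size_t dst_cap,
				 const void *src, size_t src_size,
				 size_t block_size, int level)
{
	size_t in = 0, out = 0;

	while (in < src_size) {
		size_t n = src_size - in;
		size_t ret;

		if (n > block_size)
			n = block_size;
		ret = ZSTD_compress((char *)dst + out, dst_cap - out,
				    (const char *)src + in, n, level);
		if (ZSTD_isError(ret))
			return 0;
		out += ret;
		in += n;
	}
	return out;
}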
Re: [PATCH v5 2/5] lib: Add zstd modules
On Thu, Aug 10, 2017 at 07:32:18AM -0400, Austin S. Hemmelgarn wrote: > On 2017-08-10 04:30, Eric Biggers wrote: > >On Wed, Aug 09, 2017 at 07:35:53PM -0700, Nick Terrell wrote: > >> > >>It can compress at speeds approaching lz4, and quality approaching lzma. > > > >Well, for a very loose definition of "approaching", and certainly not at the > >same time. I doubt there's a use case for using the highest compression > >levels > >in kernel mode --- especially the ones using zstd_opt.h. > Large data-sets with WORM access patterns and infrequent writes > immediately come to mind as a use case for the highest compression > level. > > As a more specific example, the company I work for has a very large > amount of documentation, and we keep all old versions. This is all > stored on a file server which is currently using BTRFS. Once a > document is written, it's almost never rewritten, so write > performance only matters for the first write. However, they're read > back pretty frequently, so we need good read performance. As of > right now, the system is set to use LZO compression by default, and > then when a new document is added, the previous version of that > document gets re-compressed using zlib compression, which actually > results in pretty significant space savings most of the time. I > would absolutely love to use zstd compression with this system with > the highest compression level, because most people don't care how > long it takes to write the file out, but they do care how long it > takes to read a file (even if it's an older version). This may be a reasonable use case, but note this cannot just be the regular "zstd" compression setting, since filesystem compression by default must provide reasonable performance for many different access patterns. See the patch in this series which actually adds zstd compression to btrfs; it only uses level 1. I do not see a patch which adds a higher compression mode. It would need to be a special setting like "zstdhc" that users could opt-in to on specific directories. It also would need to be compared to simply compressing in userspace. In many cases compressing in userspace is probably the better solution for the use case in question because it works on any filesystem, allows using any compression algorithm, and if random access is not needed it is possible to compress each file as a single stream (like a .xz file), which produces a much better compression ratio than the block-by-block compression that filesystems have to use. Note also that LZ4HC is in the kernel source tree currently but no one is using it vs. the regular LZ4. I think it is the kind of thing that sounded useful originally, but at the end of the day no one really wants to use it in kernel mode. I'd certainly be interested in actual patches, though. Eric
Re: [PATCH v5 2/5] lib: Add zstd modules
On Thu, Aug 10, 2017 at 10:57:01AM -0400, Austin S. Hemmelgarn wrote: > Also didn't think to mention this, but I could see the max level > being very popular for use with SquashFS root filesystems used in > LiveCD's. Currently, they have to decide between read performance > and image size, while zstd would provide both. The high compression levels of Zstandard are indeed a great fit for SquashFS, but SquashFS images are created in userspace by squashfs-tools. The kernel only needs to be able to decompress them. (Also, while Zstandard provides very good tradeoffs and will probably become the preferred algorithm for SquashFS, it's misleading to imply that users won't have to make decisions anymore. It does not compress as well as XZ or decompress as fast as LZ4, except maybe in very carefully crafted benchmarks.) Eric
Re: [PATCH v5 2/5] lib: Add zstd modules
On Thu, Aug 10, 2017 at 01:41:21PM -0400, Chris Mason wrote:
> On 08/10/2017 04:30 AM, Eric Biggers wrote:
> >On Wed, Aug 09, 2017 at 07:35:53PM -0700, Nick Terrell wrote:
> >
> >>The memory reported is the amount of memory the compressor requests.
> >>
> >>| Method   | Size (B)  | Time (s) | Ratio | MB/s    | Adj MB/s | Mem (MB) |
> >>|----------|-----------|----------|-------|---------|----------|----------|
> >>| none     | 211988480 |    0.100 |     1 | 2119.88 |        - |        - |
> >>| zstd -1  |  73645762 |    1.044 | 2.878 |  203.05 |   224.56 |     1.23 |
> >>| zstd -3  |  66988878 |    1.761 | 3.165 |  120.38 |   127.63 |     2.47 |
> >>| zstd -5  |  65001259 |    2.563 | 3.261 |   82.71 |    86.07 |     2.86 |
> >>| zstd -10 |  60165346 |   13.242 | 3.523 |   16.01 |    16.13 |    13.22 |
> >>| zstd -15 |  58009756 |   47.601 | 3.654 |    4.45 |     4.46 |    21.61 |
> >>| zstd -19 |  54014593 |  102.835 | 3.925 |    2.06 |     2.06 |    60.15 |
> >>| zlib -1  |  77260026 |    2.895 | 2.744 |   73.23 |    75.85 |     0.27 |
> >>| zlib -3  |  72972206 |    4.116 | 2.905 |   51.50 |    52.79 |     0.27 |
> >>| zlib -6  |  68190360 |    9.633 | 3.109 |   22.01 |    22.24 |     0.27 |
> >>| zlib -9  |  67613382 |   22.554 | 3.135 |    9.40 |     9.44 |     0.27 |
> >>
> >
> >These benchmarks are misleading because they compress the whole file as a
> >single stream without resetting the dictionary, which isn't how data will
> >typically be compressed in kernel mode. With filesystem compression the data
> >has to be divided into small chunks that can each be decompressed
> >independently.
> >That eliminates one of the primary advantages of Zstandard (support for large
> >dictionary sizes).
>
> I did btrfs benchmarks of kernel trees and other normal data sets as
> well. The numbers were in line with what Nick is posting here.
> zstd is a big win over both lzo and zlib from a btrfs point of view.
>
> It's true Nick's patches only support a single compression level in
> btrfs, but that's because btrfs doesn't have a way to pass in the
> compression ratio. It could easily be a mount option, it was just
> outside the scope of Nick's initial work.
>

I am not surprised --- Zstandard is closer to the state of the art, both
format-wise and implementation-wise, than the other choices in BTRFS.

My point is that benchmarks need to account for how much data is compressed
at a time. This is a common mistake when comparing different compression
algorithms; the algorithm name and compression level do not tell the whole
story. The dictionary size is extremely significant.

No one is going to compress or decompress a 200 MB file as a single stream
in kernel mode, so it does not make sense to justify adding Zstandard *to
the kernel* based on such a benchmark. It is going to be divided into
chunks. How big are the chunks in BTRFS? I thought that it compressed only
one page (4 KiB) at a time, but I hope that has been, or is being, improved;
32 KiB - 128 KiB should be a better amount. (And if the amount of data
compressed at a time happens to be different between the different
algorithms, note that BTRFS benchmarks are likely to be measuring that as
much as the algorithms themselves.)

Eric
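A quick way to see how much the chunk size alone matters is to sweep it in userspace with ordinary libzstd. The sketch below is only an illustration against synthetic data (the buffer contents, sizes, and level 1 are arbitrary), not a measurement of btrfs or of the kernel port, but it shows the ratio climbing as the independently compressed chunks grow from 4 KiB toward 128 KiB and beyond.

/* Illustration: effective compression ratio vs. size of independently
 * compressed chunks, using userspace libzstd (link with -lzstd). */
#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

static size_t chunked_size(const char *src, size_t len, size_t chunk, int level)
{
	char *dst = malloc(ZSTD_compressBound(chunk));
	size_t total = 0;

	for (size_t off = 0; off < len; off += chunk) {
		size_t n = len - off < chunk ? len - off : chunk;
		size_t c = ZSTD_compress(dst, ZSTD_compressBound(chunk),
					 src + off, n, level);
		if (ZSTD_isError(c))
			abort();
		total += c;
	}
	free(dst);
	return total;
}

int main(void)
{
	size_t len = 8 << 20;
	size_t chunks[] = { 4096, 32768, 131072, len };
	char *src = malloc(len);

	for (size_t i = 0; i < len; i++)	/* dummy compressible data */
		src[i] = (i * 31 / 1000) % 64;

	for (int i = 0; i < 4; i++)
		printf("chunk %8zu: ratio %.3f\n", chunks[i],
		       (double)len / chunked_size(src, len, chunks[i], 1));
	free(src);
	return 0;
}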
Re: [PATCH 00/12] x86/crypto: Fix RBP usage in several crypto .S files
Hi Josh, On Tue, Aug 29, 2017 at 01:05:33PM -0500, Josh Poimboeuf wrote: > Many of the x86 crypto functions use RBP as a temporary register. This > breaks frame pointer convention, and breaks stack traces when unwinding > from an interrupt in the crypto code. > > Convert most* of them to leave RBP alone. > > These pass the crypto boot tests for me. Any further testing would be > appreciated! > > [*] There are still a few crypto files left that need fixing, but the > fixes weren't trivial and nobody reported unwinder warnings about > them yet, so I'm skipping them for now. > > Josh Poimboeuf (12): > x86/crypto: Fix RBP usage in blowfish-x86_64-asm_64.S > x86/crypto: Fix RBP usage in camellia-x86_64-asm_64.S > x86/crypto: Fix RBP usage in cast5-avx-x86_64-asm_64.S > x86/crypto: Fix RBP usage in cast6-avx-x86_64-asm_64.S > x86/crypto: Fix RBP usage in des3_ede-asm_64.S > x86/crypto: Fix RBP usage in sha1_avx2_x86_64_asm.S > x86/crypto: Fix RBP usage in sha1_ssse3_asm.S > x86/crypto: Fix RBP usage in sha256-avx-asm.S > x86/crypto: Fix RBP usage in sha256-avx2-asm.S > x86/crypto: Fix RBP usage in sha256-ssse3-asm.S > x86/crypto: Fix RBP usage in sha512-avx2-asm.S > x86/crypto: Fix RBP usage in twofish-avx-x86_64-asm_64.S > Thanks for fixing these! I don't have time to review these in detail, but I ran the crypto self-tests on the affected algorithms, and they all pass. I also benchmarked them before and after; the only noticeable performance difference was that sha256-avx2 and sha512-avx2 became a few percent slower. I don't suppose there is a way around that? Otherwise it's probably not a big deal. For reference, this was the list of affected algorithms I used: shash: sha1-ssse3, sha1-avx, sha1-avx2, sha256-ssse3, sha256-avx, sha256-avx2, sha512-ssse3, sha512-avx, sha512-avx2 cipher: blowfish-asm, camellia-asm, des3_ede-asm skcipher: ecb-blowfish-asm, cbc-blowfish-asm, ctr-blowfish-asm, ecb-cast5-avx, cbc-cast5-avx, ctr-cast5-avx, ecb-cast6-avx, cbc-cast6-avx, ctr-cast6-avx, lrw-cast6-avx, xts-cast6-avx, ecb-camellia-asm, cbc-camellia-asm, ctr-camellia-asm, lrw-camellia-asm, xts-camellia-asm, ecb-des3_ede-asm, cbc-des3_ede-asm, ctr-des3_ede-asm, ecb-twofish-avx, cbc-twofish-avx, ctr-twofish-avx, lrw-twofish-avx, xts-twofish-avx Eric
Re: [PATCH 00/12] x86/crypto: Fix RBP usage in several crypto .S files
On Thu, Sep 07, 2017 at 09:15:34AM +0200, Ingo Molnar wrote: > > * Eric Biggers wrote: > > > Thanks for fixing these! I don't have time to review these in detail, but > > I ran > > the crypto self-tests on the affected algorithms, and they all pass. I also > > benchmarked them before and after; the only noticable performance > > difference was > > that sha256-avx2 and sha512-avx2 became a few percent slower. I don't > > suppose > > there is a way around that? Otherwise it's probably not a big deal. > > Which CPU model did you use for the test? > > Thanks, > > Ingo This was on Haswell, "Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz". Eric
Re: [PATCH 00/12] x86/crypto: Fix RBP usage in several crypto .S files
On Thu, Sep 07, 2017 at 11:26:47PM +0200, Ingo Molnar wrote: > > * Eric Biggers wrote: > > > On Thu, Sep 07, 2017 at 09:15:34AM +0200, Ingo Molnar wrote: > > > > > > * Eric Biggers wrote: > > > > > > > Thanks for fixing these! I don't have time to review these in detail, > > > > but I ran > > > > the crypto self-tests on the affected algorithms, and they all pass. I > > > > also > > > > benchmarked them before and after; the only noticeable performance > > > > difference was > > > > that sha256-avx2 and sha512-avx2 became a few percent slower. I don't > > > > suppose > > > > there is a way around that? Otherwise it's probably not a big deal. > > > > > > Which CPU model did you use for the test? > > > > > > Thanks, > > > > > > Ingo > > > > This was on Haswell, "Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz". > > Any chance to test this with the latest microarchitecture - any Skylake > derivative > Intel CPU you have access to? > > Thanks, > > Ingo

Tested with Skylake, "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz". The results were as follows, and they seemed a bit worse than on Haswell:

sha256-avx2 became 3.5% slower
sha512-avx2 became 7.5% slower

Note: it's tricky to benchmark this, especially with just a few percent difference, so don't read too much into the exact numbers.

Eric
Re: [PATCH 00/12] x86/crypto: Fix RBP usage in several crypto .S files
Hi Josh, On Wed, Sep 13, 2017 at 05:33:03PM -0500, Josh Poimboeuf wrote: > And here's v2 of the sha512-avx2 patch. It should hopefully gain back > most of the performance lost by v1. > > From: Josh Poimboeuf > Subject: [PATCH] x86/crypto: Fix RBP usage in sha512-avx2-asm.S > > Using RBP as a temporary register breaks frame pointer convention and > breaks stack traces when unwinding from an interrupt in the crypto code. > > Mix things up a little bit to get rid of the RBP usage, without > destroying performance. Use RDI instead of RBP for the TBL pointer. > That will clobber CTX, so save CTX on the stack and use RDI as CTX > before it gets clobbered, and R12 as CTX after it gets clobbered. > > Also remove the unused y4 variable. > I tested the v2 patches for both sha256-avx2 and sha512-avx2 on Skylake. They both pass the crypto self-tests, and there was no noticeable performance difference compared to the unpatched versions. Thanks! Eric
Re: [PATCH 00/12] x86/crypto: Fix RBP usage in several crypto .S files
On Fri, Sep 15, 2017 at 07:34:31AM +0200, Ingo Molnar wrote: > > * Eric Biggers wrote: > > > Hi Josh, > > > > On Wed, Sep 13, 2017 at 05:33:03PM -0500, Josh Poimboeuf wrote: > > > And here's v2 of the sha512-avx2 patch. It should hopefully gain back > > > most of the performance lost by v1. > > > > > > From: Josh Poimboeuf > > > Subject: [PATCH] x86/crypto: Fix RBP usage in sha512-avx2-asm.S > > > > > > Using RBP as a temporary register breaks frame pointer convention and > > > breaks stack traces when unwinding from an interrupt in the crypto code. > > > > > > Mix things up a little bit to get rid of the RBP usage, without > > > destroying performance. Use RDI instead of RBP for the TBL pointer. > > > That will clobber CTX, so save CTX on the stack and use RDI as CTX > > > before it gets clobbered, and R12 as CTX after it gets clobbered. > > > > > > Also remove the unused y4 variable. > > > > > > > I tested the v2 patches for both sha256-avx2 and sha512-avx2 on Skylake. > > They > > both pass the crypto self-tests, and there was no noticable performance > > difference compared to the unpatched versions. Thanks! > > Cool, thanks for review and the testing! Can we add your Tested-by + Acked-by > tags > to the patches? > Yes, that's fine for all the patches in the series. Will these patches go in through the crypto tree or through the x86 tree? Eric
Re: [GIT PULL] KEYS: Fixes and crypto fixes
On Thu, Sep 28, 2017 at 09:14:58AM +1000, James Morris wrote: > On Wed, 27 Sep 2017, David Howells wrote: > > > (2) Fixing big_key to use safe crypto from Jason A. Donenfeld. > > > > I'm concerned about the lack of crypto review mentioned by Jason -- I > wonder if we can get this rewrite any more review from crypto folk. > > Also, are there any tests for this code? If not, it would be good to make > some. > There is a test for the big_key key type in the keyutils test suite. I also manually tested Jason's change. And as far as I can tell there isn't actually a whole lot to test besides adding a big_key larger than BIG_KEY_FILE_THRESHOLD bytes, reading it back, and verifying that the data is unchanged --- since that covers the code that was changed. An earlier version of the patch produced a warning with CONFIG_DEBUG_SG=y since it put the aead_request on the stack, but that's been fixed. It would be great if someone else would comment on the crypto too, but for what it's worth I'm satisfied with the crypto changes. GCM is a much better choice than ECB as long as we don't repeat (key, IV) pairs --- which we don't. And in any case ECB mode makes no sense in this context; you'd need a *very* good reason to actually choose to encrypt something with ECB mode. Unfortunately it tends to be a favorite of people who don't understand encryption modes... Plus, getting all the randomness at boot time didn't make sense because that's when entropy is the most scarce. Eric
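For reference, the round-trip check described above is easy to script outside the keyutils test suite as well. This is a minimal sketch using libkeyutils (link with -lkeyutils), assuming CONFIG_BIG_KEYS=y; the 512 KiB payload size is an arbitrary value chosen to be comfortably above the threshold at which big_key switches to encrypted, file-backed storage.

/* Minimal sketch: add a big_key large enough to be file-backed, read
 * it back, and verify the payload is unchanged.  Link with -lkeyutils.
 * The 512 KiB size is an arbitrary choice above the threshold. */
#include <keyutils.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	size_t len = 512 * 1024;
	unsigned char *payload = malloc(len);
	unsigned char *readback = malloc(len);
	key_serial_t key;
	long n;

	for (size_t i = 0; i < len; i++)
		payload[i] = rand();

	key = add_key("big_key", "test:big_key", payload, len,
		      KEY_SPEC_SESSION_KEYRING);
	if (key < 0) {
		perror("add_key");
		return 1;
	}

	n = keyctl_read(key, (char *)readback, len);
	if (n != (long)len || memcmp(payload, readback, len) != 0) {
		fprintf(stderr, "big_key round-trip FAILED\n");
		return 1;
	}
	printf("big_key round-trip OK (%zu bytes)\n", len);
	return 0;
}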
[PATCH 3/4] crypto: qat - fix double free of ctx->p
From: Eric Biggers When setting the secret with the "qat-dh" Diffie-Hellman implementation, if allocating 'g' failed, then 'p' was freed twice: once immediately, and once later when the crypto_kpp tfm was destroyed. Fix it by using qat_dh_clear_ctx() in the error paths, as that sets the pointers to NULL. Fixes: c9839143ebbf ("crypto: qat - Add DH support") Cc: # v4.8+ Signed-off-by: Eric Biggers --- drivers/crypto/qat/qat_common/qat_asym_algs.c | 15 --- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_asym_algs.c b/drivers/crypto/qat/qat_common/qat_asym_algs.c index 6f5dd68449c6..7655fdb499de 100644 --- a/drivers/crypto/qat/qat_common/qat_asym_algs.c +++ b/drivers/crypto/qat/qat_common/qat_asym_algs.c @@ -462,11 +462,8 @@ static int qat_dh_set_params(struct qat_dh_ctx *ctx, struct dh *params) } ctx->g = dma_zalloc_coherent(dev, ctx->p_size, &ctx->dma_g, GFP_KERNEL); - if (!ctx->g) { - dma_free_coherent(dev, ctx->p_size, ctx->p, ctx->dma_p); - ctx->p = NULL; + if (!ctx->g) return -ENOMEM; - } memcpy(ctx->g + (ctx->p_size - params->g_size), params->g, params->g_size); @@ -507,18 +504,22 @@ static int qat_dh_set_secret(struct crypto_kpp *tfm, const void *buf, ret = qat_dh_set_params(ctx, ¶ms); if (ret < 0) - return ret; + goto err_clear_ctx; ctx->xa = dma_zalloc_coherent(dev, ctx->p_size, &ctx->dma_xa, GFP_KERNEL); if (!ctx->xa) { - qat_dh_clear_ctx(dev, ctx); - return -ENOMEM; + ret = -ENOMEM; + goto err_clear_ctx; } memcpy(ctx->xa + (ctx->p_size - params.key_size), params.key, params.key_size); return 0; + +err_clear_ctx: + qat_dh_clear_ctx(dev, ctx); + return ret; } static unsigned int qat_dh_max_size(struct crypto_kpp *tfm) -- 2.15.0.403.gc27cc4dac6-goog
[PATCH 4/4] crypto: dh - don't permit 'key' or 'g' size longer than 'p'
From: Eric Biggers The "qat-dh" DH implementation assumes that 'key' and 'g' can be copied into a buffer with size 'p_size'. However it was never checked that that was actually the case, which allowed users to cause a buffer underflow via KEYCTL_DH_COMPUTE. Fix this by updating crypto_dh_decode_key() to verify this precondition for all DH implementations. Fixes: c9839143ebbf ("crypto: qat - Add DH support") Cc: # v4.8+ Signed-off-by: Eric Biggers --- crypto/dh_helper.c | 8 1 file changed, 8 insertions(+) diff --git a/crypto/dh_helper.c b/crypto/dh_helper.c index 708ae20d2d3c..7f00c771fe8d 100644 --- a/crypto/dh_helper.c +++ b/crypto/dh_helper.c @@ -83,6 +83,14 @@ int crypto_dh_decode_key(const char *buf, unsigned int len, struct dh *params) if (secret.len != crypto_dh_key_len(params)) return -EINVAL; + /* +* Don't permit the buffer for 'key' or 'g' to be larger than 'p', since +* some drivers assume otherwise. +*/ + if (params->key_size > params->p_size || + params->g_size > params->p_size) + return -EINVAL; + /* Don't allocate memory. Set pointers to data within * the given buffer */ -- 2.15.0.403.gc27cc4dac6-goog
[PATCH 2/4] crypto: dh - don't permit 'p' to be 0
From: Eric Biggers If 'p' is 0 for the software Diffie-Hellman implementation, then dh_max_size() returns 0. In the case of KEYCTL_DH_COMPUTE, this causes ZERO_SIZE_POINTER to be passed to sg_init_one(), which with CONFIG_DEBUG_SG=y triggers the 'BUG_ON(!virt_addr_valid(buf));' in sg_set_buf(). Fix this by making crypto_dh_decode_key() reject 0 for 'p'. p=0 makes no sense for any DH implementation because 'p' is supposed to be a prime number. Moreover, 'mod 0' is not mathematically defined. Bug report: kernel BUG at ./include/linux/scatterlist.h:140! invalid opcode: [#1] SMP KASAN CPU: 0 PID: 27112 Comm: syz-executor2 Not tainted 4.14.0-rc7-00010-gf5dbb5d0ce32-dirty #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.3-20171021_125229-anatol 04/01/2014 task: 88006caac0c0 task.stack: 88006c7c8000 RIP: 0010:sg_set_buf include/linux/scatterlist.h:140 [inline] RIP: 0010:sg_init_one+0x1b3/0x240 lib/scatterlist.c:156 RSP: 0018:88006c7cfb08 EFLAGS: 00010216 RAX: 0001 RBX: 88006c7cfe30 RCX: 64ee RDX: 81cf64c3 RSI: c9d72000 RDI: 92e937e0 RBP: 88006c7cfb30 R08: ed000d8f9fab R09: 88006c7cfd30 R10: 0005 R11: ed000d8f9faa R12: 88006c7cfd30 R13: R14: 0010 R15: 88006c7cfc50 FS: 7fce190fa700() GS:88003ea0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7fffc6b33db8 CR3: 3cf64000 CR4: 06f0 Call Trace: __keyctl_dh_compute+0xa95/0x19b0 security/keys/dh.c:360 keyctl_dh_compute+0xac/0x100 security/keys/dh.c:434 SYSC_keyctl security/keys/keyctl.c:1745 [inline] SyS_keyctl+0x72/0x2c0 security/keys/keyctl.c:1641 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4585c9 RSP: 002b:7fce190f9bd8 EFLAGS: 0216 ORIG_RAX: 00fa RAX: ffda RBX: 00738020 RCX: 004585c9 RDX: 2000d000 RSI: 2ff4 RDI: 0017 RBP: 0046 R08: 20008000 R09: R10: R11: 0216 R12: 7fff6e610cde R13: 7fff6e610cdf R14: 7fce190fa700 R15: Code: 03 0f b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 04 84 d2 75 33 5b 45 89 6c 24 14 41 5c 41 5d 41 5e 41 5f 5d c3 e8 fd 8f 68 ff <0f> 0b e8 f6 8f 68 ff 0f 0b e8 ef 8f 68 ff 0f 0b e8 e8 8f 68 ff 20 RIP: sg_set_buf include/linux/scatterlist.h:140 [inline] RSP: 88006c7cfb08 RIP: sg_init_one+0x1b3/0x240 lib/scatterlist.c:156 RSP: 88006c7cfb08 Fixes: 802c7f1c84e4 ("crypto: dh - Add DH software implementation") Cc: # v4.8+ Signed-off-by: Eric Biggers --- crypto/dh_helper.c | 8 1 file changed, 8 insertions(+) diff --git a/crypto/dh_helper.c b/crypto/dh_helper.c index 8ba8a3f82620..708ae20d2d3c 100644 --- a/crypto/dh_helper.c +++ b/crypto/dh_helper.c @@ -90,6 +90,14 @@ int crypto_dh_decode_key(const char *buf, unsigned int len, struct dh *params) params->p = (void *)(ptr + params->key_size); params->g = (void *)(ptr + params->key_size + params->p_size); + /* +* Don't permit 'p' to be 0. It's not a prime number, and it's subject +* to corner cases such as 'mod 0' being undefined or +* crypto_kpp_maxsize() returning 0. +*/ + if (memchr_inv(params->p, 0, params->p_size) == NULL) + return -EINVAL; + return 0; } EXPORT_SYMBOL_GPL(crypto_dh_decode_key); -- 2.15.0.403.gc27cc4dac6-goog
[PATCH 1/4] crypto: dh - fix double free of ctx->p
From: Eric Biggers When setting the secret with the software Diffie-Hellman implementation, if allocating 'g' failed (e.g. if it was longer than MAX_EXTERN_MPI_BITS), then 'p' was freed twice: once immediately, and once later when the crypto_kpp tfm was destroyed. Fix it by using dh_free_ctx() in the error paths, as that sets the pointers to NULL. KASAN report: MPI: mpi too large (32760 bits) == BUG: KASAN: use-after-free in mpi_free+0x131/0x170 Read of size 4 at addr 88006c7cdf90 by task reproduce_doubl/367 CPU: 1 PID: 367 Comm: reproduce_doubl Not tainted 4.14.0-rc7-00040-g05298abde6fe #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0xb3/0x10b ? mpi_free+0x131/0x170 print_address_description+0x79/0x2a0 ? mpi_free+0x131/0x170 kasan_report+0x236/0x340 ? akcipher_register_instance+0x90/0x90 __asan_report_load4_noabort+0x14/0x20 mpi_free+0x131/0x170 ? akcipher_register_instance+0x90/0x90 dh_exit_tfm+0x3d/0x140 crypto_kpp_exit_tfm+0x52/0x70 crypto_destroy_tfm+0xb3/0x250 __keyctl_dh_compute+0x640/0xe90 ? kasan_slab_free+0x12f/0x180 ? dh_data_from_key+0x240/0x240 ? key_create_or_update+0x1ee/0xb20 ? key_instantiate_and_link+0x440/0x440 ? lock_contended+0xee0/0xee0 ? kfree+0xcf/0x210 ? SyS_add_key+0x268/0x340 keyctl_dh_compute+0xb3/0xf1 ? __keyctl_dh_compute+0xe90/0xe90 ? SyS_add_key+0x26d/0x340 ? entry_SYSCALL_64_fastpath+0x5/0xbe ? trace_hardirqs_on_caller+0x3f4/0x560 SyS_keyctl+0x72/0x2c0 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x43ccf9 RSP: 002b:7ffeeec96158 EFLAGS: 0246 ORIG_RAX: 00fa RAX: ffda RBX: 0248b9b9 RCX: 0043ccf9 RDX: 7ffeeec96170 RSI: 7ffeeec96160 RDI: 0017 RBP: 0046 R08: R09: 0248b9b9143dc936 R10: 1000 R11: 0246 R12: R13: 00409670 R14: 00409700 R15: Allocated by task 367: save_stack_trace+0x16/0x20 kasan_kmalloc+0xeb/0x180 kmem_cache_alloc_trace+0x114/0x300 mpi_alloc+0x4b/0x230 mpi_read_raw_data+0xbe/0x360 dh_set_secret+0x1dc/0x460 __keyctl_dh_compute+0x623/0xe90 keyctl_dh_compute+0xb3/0xf1 SyS_keyctl+0x72/0x2c0 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 367: save_stack_trace+0x16/0x20 kasan_slab_free+0xab/0x180 kfree+0xb5/0x210 mpi_free+0xcb/0x170 dh_set_secret+0x2d7/0x460 __keyctl_dh_compute+0x623/0xe90 keyctl_dh_compute+0xb3/0xf1 SyS_keyctl+0x72/0x2c0 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: 802c7f1c84e4 ("crypto: dh - Add DH software implementation") Cc: # v4.8+ Signed-off-by: Eric Biggers --- crypto/dh.c | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/crypto/dh.c b/crypto/dh.c index b1032a5c1bfa..b488f1782ced 100644 --- a/crypto/dh.c +++ b/crypto/dh.c @@ -71,10 +71,8 @@ static int dh_set_params(struct dh_ctx *ctx, struct dh *params) return -EINVAL; ctx->g = mpi_read_raw_data(params->g, params->g_size); - if (!ctx->g) { - mpi_free(ctx->p); + if (!ctx->g) return -EINVAL; - } return 0; } @@ -89,18 +87,20 @@ static int dh_set_secret(struct crypto_kpp *tfm, const void *buf, dh_free_ctx(ctx); if (crypto_dh_decode_key(buf, len, ¶ms) < 0) - return -EINVAL; + goto err_free_ctx; if (dh_set_params(ctx, ¶ms) < 0) - return -EINVAL; + goto err_free_ctx; ctx->xa = mpi_read_raw_data(params.key, params.key_size); - if (!ctx->xa) { - dh_clear_params(ctx); - return -EINVAL; - } + if (!ctx->xa) + goto err_free_ctx; return 0; + +err_free_ctx: + dh_free_ctx(ctx); + return -EINVAL; } static int dh_compute_value(struct kpp_request *req) -- 2.15.0.403.gc27cc4dac6-goog
[PATCH 0/4] crypto: dh - input validation fixes
From: Eric Biggers This series fixes several corner cases in the Diffie-Hellman key exchange implementations: - With CONFIG_DEBUG_SG=y and the software DH implementation, setting 'p' to 0 caused a BUG_ON(). - Both the software and QAT DH implementations had a double-free bug in the case where 'g' could not be allocated. - With the QAT DH implementation, setting 'g' or 'key' larger than 'p' caused a buffer underflow. Note that in kernels configured with CONFIG_KEY_DH_OPERATIONS=y, these bugs are reachable by unprivileged users via KEYCTL_DH_COMPUTE. Eric Biggers (4): crypto: dh - fix double free of ctx->p crypto: dh - don't permit 'p' to be 0 crypto: qat - fix double free of ctx->p crypto: dh - don't permit 'key' or 'g' size longer than 'p' crypto/dh.c | 18 +- crypto/dh_helper.c| 16 drivers/crypto/qat/qat_common/qat_asym_algs.c | 15 --- 3 files changed, 33 insertions(+), 16 deletions(-) -- 2.15.0.403.gc27cc4dac6-goog
Re: invalid opcode: 0000 [#1] SMP [aesni_intel]
On Mon, Oct 23, 2017 at 07:01:32PM +0300, SviMik wrote: > Hi! > > Got the following kernel panic: > > invalid opcode: [#1] SMP > CPU: 0 PID: 1449 Comm: openvpn Not tainted 4.8.13-1.el6.elrepo.x86_64 #1 > cut > Call Trace: > > [] ? enqueue_entity+0x45e/0x6f0 > [] ? aesni_gcm_enc_avx+0x95/0xd0 [aesni_intel] > [] helper_rfc4106_encrypt+0x167/0x2f0 [aesni_intel] > [] rfc4106_encrypt+0x5b/0x90 [aesni_intel] > cut > > The detailed bug report with full oops dump can be found here: > https://bugzilla.kernel.org/show_bug.cgi?id=197363 > > Could not trigger this bug intentionally, but it happened three times > already (all three dumps are available). Well, the program counter was at the 'vmovdqu' instruction, which is the first AVX instruction in the function. Is it possible you're running the kernel on a broken hypervisor that claims AVX instructions are supported but really they aren't? Note that AVX is different from AES-NI; the code being executed has both types of instructions. (And as a side note, 4.8 is not being maintained as a long term support kernel, so you really should switch to a different version.) Eric
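One quick sanity check of what the guest actually advertises is a tiny userspace program built on the compiler's CPU feature builtins. This is only a rough illustration (it assumes GCC or clang and queries CPUID from userspace, which is not exactly the same thing as the kernel's cpufeature handling), but a guest where AVX is missing or masked would show up here immediately.

/* Rough userspace check of advertised CPU features.  Assumes GCC or
 * clang; __builtin_cpu_supports() consults CPUID at runtime. */
#include <stdio.h>

int main(void)
{
	__builtin_cpu_init();
	printf("avx  : %s\n", __builtin_cpu_supports("avx")  ? "yes" : "no");
	printf("avx2 : %s\n", __builtin_cpu_supports("avx2") ? "yes" : "no");
	return 0;
}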
Re: [PATCH 1/4] crypto: dh - fix double free of ctx->p
Hi Tudor, On Thu, Nov 02, 2017 at 12:55:56PM +0200, Tudor Ambarus wrote: > Hi, Eric, > > On 11/02/2017 12:25 AM, Eric Biggers wrote: > >When setting the secret with the software Diffie-Hellman implementation, > >if allocating 'g' failed (e.g. if it was longer than > >MAX_EXTERN_MPI_BITS), then 'p' was freed twice: once immediately, and > >once later when the crypto_kpp tfm was destroyed. Fix it by using > >dh_free_ctx() in the error paths, as that sets the pointers to NULL. > > Other solution would be to have just an one-line patch that sets p to > NULL after freeing it. The benefit of just setting p to NULL and not > using dh_free_ctx() is that you'll spare some cpu cycles. If g fails, > g and a will already be NULL, so freeing them and set them to NULL is > redundant. > > However, if you decide to always use dh_free_ctx(), you'll have to get > rid of dh_clear_params(), because it will be used just in dh_free_ctx(). > You can copy dh_clear_params() body to dh_free_ctx(). > This is on an error path, so a few cycles don't matter. I would much rather have the simpler code, with less chance to introduce exploitable bugs. Yes, I should inline dh_clear_params() into dh_free_ctx(). Eric
Re: [PATCH 2/4] crypto: dh - don't permit 'p' to be 0
On Thu, Nov 02, 2017 at 01:40:51PM +0200, Tudor Ambarus wrote: > Hi, Eric, > > On 11/02/2017 12:25 AM, Eric Biggers wrote: > >If 'p' is 0 for the software Diffie-Hellman implementation, then > >dh_max_size() returns 0. > > dh_set_secret() returns -EINVAL if p_len < 1536, see > dh_check_params_length(). What am I missing? > > Cheers, > ta You pass a buffer containing 0's, not a buffer of length 0. The buffer is interpreted as an arbitrary precision integer, so any length buffer filled with 0's has the mathematical value 0. Eric
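For anyone who wants to poke at this path, the values reach the kernel as the payloads of "user"-type keys, so a reproducer needs only a few libkeyutils calls. This is a sketch, assuming CONFIG_KEY_DH_OPERATIONS=y and arbitrary buffer sizes; the all-zero prime is exactly the "buffer of 0's" case discussed above, and on an unpatched kernel with CONFIG_DEBUG_SG=y it is what triggers the crash.

/* Sketch of how KEYCTL_DH_COMPUTE is reached from userspace: the
 * private key, prime, and base are payloads of "user" keys.  An
 * all-zero prime buffer exercises the p = 0 case.  Link with
 * -lkeyutils; requires CONFIG_KEY_DH_OPERATIONS=y. */
#include <keyutils.h>
#include <stdio.h>
#include <string.h>

static key_serial_t add_user_key(const char *desc, const void *data,
				 size_t len)
{
	return add_key("user", desc, data, len, KEY_SPEC_SESSION_KEYRING);
}

int main(void)
{
	unsigned char prime[256], base[256], priv[256], out[256];
	key_serial_t kpriv, kprime, kbase;
	long ret;

	memset(prime, 0, sizeof(prime));	/* "p" is all zero bytes */
	memset(base, 2, sizeof(base));
	memset(priv, 3, sizeof(priv));

	kpriv = add_user_key("dh:priv", priv, sizeof(priv));
	kprime = add_user_key("dh:prime", prime, sizeof(prime));
	kbase = add_user_key("dh:base", base, sizeof(base));

	ret = keyctl_dh_compute(kpriv, kprime, kbase, (char *)out, sizeof(out));
	printf("keyctl_dh_compute returned %ld\n", ret);
	return 0;
}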
Re: [PATCH 3/4] crypto: qat - fix double free of ctx->p
On Wed, Nov 01, 2017 at 03:25:16PM -0700, Eric Biggers wrote: > From: Eric Biggers > > When setting the secret with the "qat-dh" Diffie-Hellman implementation, > if allocating 'g' failed, then 'p' was freed twice: once immediately, > and once later when the crypto_kpp tfm was destroyed. Fix it by using > qat_dh_clear_ctx() in the error paths, as that sets the pointers to > NULL. > > Fixes: c9839143ebbf ("crypto: qat - Add DH support") > Cc: # v4.8+ > Signed-off-by: Eric Biggers > --- > drivers/crypto/qat/qat_common/qat_asym_algs.c | 15 --- > 1 file changed, 8 insertions(+), 7 deletions(-) > > diff --git a/drivers/crypto/qat/qat_common/qat_asym_algs.c > b/drivers/crypto/qat/qat_common/qat_asym_algs.c > index 6f5dd68449c6..7655fdb499de 100644 > --- a/drivers/crypto/qat/qat_common/qat_asym_algs.c > +++ b/drivers/crypto/qat/qat_common/qat_asym_algs.c > @@ -462,11 +462,8 @@ static int qat_dh_set_params(struct qat_dh_ctx *ctx, > struct dh *params) > } > > ctx->g = dma_zalloc_coherent(dev, ctx->p_size, &ctx->dma_g, GFP_KERNEL); > - if (!ctx->g) { > - dma_free_coherent(dev, ctx->p_size, ctx->p, ctx->dma_p); > - ctx->p = NULL; > + if (!ctx->g) Sorry, I misread this code (and I didn't have the hardware to test this driver); there is actually no bug here because it sets ctx->p to NULL. I think we should still do this patch to simplify the code, but I'll update the description to reflect that it's not actually fixing anything. Eric
[PATCH v2 5/5] crypto: dh - Remove pointless checks for NULL 'p' and 'g'
From: Eric Biggers Neither 'p' nor 'g' can be NULL, as they were unpacked using crypto_dh_decode_key(). And it makes no sense for them to be optional. So remove the NULL checks that were copy-and-pasted into both modules. Signed-off-by: Eric Biggers --- crypto/dh.c | 3 --- drivers/crypto/qat/qat_common/qat_asym_algs.c | 3 --- 2 files changed, 6 deletions(-) diff --git a/crypto/dh.c b/crypto/dh.c index aadaf36fb56f..5659fe7f446d 100644 --- a/crypto/dh.c +++ b/crypto/dh.c @@ -53,9 +53,6 @@ static int dh_check_params_length(unsigned int p_len) static int dh_set_params(struct dh_ctx *ctx, struct dh *params) { - if (unlikely(!params->p || !params->g)) - return -EINVAL; - if (dh_check_params_length(params->p_size << 3)) return -EINVAL; diff --git a/drivers/crypto/qat/qat_common/qat_asym_algs.c b/drivers/crypto/qat/qat_common/qat_asym_algs.c index 7655fdb499de..13c52d6bf630 100644 --- a/drivers/crypto/qat/qat_common/qat_asym_algs.c +++ b/drivers/crypto/qat/qat_common/qat_asym_algs.c @@ -443,9 +443,6 @@ static int qat_dh_set_params(struct qat_dh_ctx *ctx, struct dh *params) struct qat_crypto_instance *inst = ctx->inst; struct device *dev = &GET_DEV(inst->accel_dev); - if (unlikely(!params->p || !params->g)) - return -EINVAL; - if (qat_dh_check_params_length(params->p_size << 3)) return -EINVAL; -- 2.15.0
[PATCH v2 0/5] crypto: dh - input validation fixes
This series fixes several corner cases in the Diffie-Hellman key exchange implementations: 1. With the software DH implementation, using a large buffer for 'g' caused a double free. 2. With CONFIG_DEBUG_SG=y and the software DH implementation, setting 'p' to 0 caused a BUG_ON(). 3. With the QAT DH implementation, setting 'key' or 'g' larger than 'p' caused a buffer underflow. Note that in kernels configured with CONFIG_KEY_DH_OPERATIONS=y, these bugs are reachable by unprivileged users via KEYCTL_DH_COMPUTE. Patches 4 and 5 are cleanup only. Eric Biggers (5): crypto: dh - Fix double free of ctx->p crypto: dh - Don't permit 'p' to be 0 crypto: dh - Don't permit 'key' or 'g' size longer than 'p' crypto: qat - Clean up error handling in qat_dh_set_secret() crypto: dh - Remove pointless checks for NULL 'p' and 'g' crypto/dh.c | 36 ++- crypto/dh_helper.c| 16 drivers/crypto/qat/qat_common/qat_asym_algs.c | 18 ++ 3 files changed, 37 insertions(+), 33 deletions(-) -- 2.15.0
[PATCH v2 4/5] crypto: qat - Clean up error handling in qat_dh_set_secret()
From: Eric Biggers Update the error handling in qat_dh_set_secret() to mirror dh_set_secret(). The new version is less error-prone because freeing memory and setting the pointers to NULL is now only done in one place. Signed-off-by: Eric Biggers --- drivers/crypto/qat/qat_common/qat_asym_algs.c | 15 --- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/crypto/qat/qat_common/qat_asym_algs.c b/drivers/crypto/qat/qat_common/qat_asym_algs.c index 6f5dd68449c6..7655fdb499de 100644 --- a/drivers/crypto/qat/qat_common/qat_asym_algs.c +++ b/drivers/crypto/qat/qat_common/qat_asym_algs.c @@ -462,11 +462,8 @@ static int qat_dh_set_params(struct qat_dh_ctx *ctx, struct dh *params) } ctx->g = dma_zalloc_coherent(dev, ctx->p_size, &ctx->dma_g, GFP_KERNEL); - if (!ctx->g) { - dma_free_coherent(dev, ctx->p_size, ctx->p, ctx->dma_p); - ctx->p = NULL; + if (!ctx->g) return -ENOMEM; - } memcpy(ctx->g + (ctx->p_size - params->g_size), params->g, params->g_size); @@ -507,18 +504,22 @@ static int qat_dh_set_secret(struct crypto_kpp *tfm, const void *buf, ret = qat_dh_set_params(ctx, &params); if (ret < 0) - return ret; + goto err_clear_ctx; ctx->xa = dma_zalloc_coherent(dev, ctx->p_size, &ctx->dma_xa, GFP_KERNEL); if (!ctx->xa) { - qat_dh_clear_ctx(dev, ctx); - return -ENOMEM; + ret = -ENOMEM; + goto err_clear_ctx; } memcpy(ctx->xa + (ctx->p_size - params.key_size), params.key, params.key_size); return 0; + +err_clear_ctx: + qat_dh_clear_ctx(dev, ctx); + return ret; } static unsigned int qat_dh_max_size(struct crypto_kpp *tfm) -- 2.15.0
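The shape this moves to is the usual centralized-unwind idiom. As a generic illustration (plain userspace C, not the driver code itself): every failure path funnels through one routine that frees everything and then zeroes the context, so nothing can be freed twice by a later cleanup.

/* Generic sketch of the error-handling shape adopted above: one
 * cleanup routine frees all resources and zeroes the context, and
 * every failure path jumps to it. */
#include <stdlib.h>
#include <string.h>

struct example_ctx {
	void *p;
	void *g;
	void *xa;
};

static void example_clear_ctx(struct example_ctx *ctx)
{
	free(ctx->p);
	free(ctx->g);
	free(ctx->xa);
	memset(ctx, 0, sizeof(*ctx));	/* a later clear sees only NULLs */
}

static int example_set_secret(struct example_ctx *ctx, size_t n)
{
	example_clear_ctx(ctx);

	ctx->p = malloc(n);
	if (!ctx->p)
		goto err_clear_ctx;
	ctx->g = malloc(n);
	if (!ctx->g)
		goto err_clear_ctx;
	ctx->xa = malloc(n);
	if (!ctx->xa)
		goto err_clear_ctx;
	return 0;

err_clear_ctx:
	example_clear_ctx(ctx);
	return -1;
}

int main(void)
{
	struct example_ctx ctx = { 0 };

	return example_set_secret(&ctx, 16);
}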
[PATCH v2 3/5] crypto: dh - Don't permit 'key' or 'g' size longer than 'p'
From: Eric Biggers The "qat-dh" DH implementation assumes that 'key' and 'g' can be copied into a buffer with size 'p_size'. However it was never checked that that was actually the case, which most likely allowed users to cause a buffer underflow via KEYCTL_DH_COMPUTE. Fix this by updating crypto_dh_decode_key() to verify this precondition for all DH implementations. Fixes: c9839143ebbf ("crypto: qat - Add DH support") Cc: # v4.8+ Signed-off-by: Eric Biggers --- crypto/dh_helper.c | 8 1 file changed, 8 insertions(+) diff --git a/crypto/dh_helper.c b/crypto/dh_helper.c index 708ae20d2d3c..7f00c771fe8d 100644 --- a/crypto/dh_helper.c +++ b/crypto/dh_helper.c @@ -83,6 +83,14 @@ int crypto_dh_decode_key(const char *buf, unsigned int len, struct dh *params) if (secret.len != crypto_dh_key_len(params)) return -EINVAL; + /* +* Don't permit the buffer for 'key' or 'g' to be larger than 'p', since +* some drivers assume otherwise. +*/ + if (params->key_size > params->p_size || + params->g_size > params->p_size) + return -EINVAL; + /* Don't allocate memory. Set pointers to data within * the given buffer */ -- 2.15.0
[PATCH v2 2/5] crypto: dh - Don't permit 'p' to be 0
From: Eric Biggers If 'p' is 0 for the software Diffie-Hellman implementation, then dh_max_size() returns 0. In the case of KEYCTL_DH_COMPUTE, this causes ZERO_SIZE_PTR to be passed to sg_init_one(), which with CONFIG_DEBUG_SG=y triggers the 'BUG_ON(!virt_addr_valid(buf));' in sg_set_buf(). Fix this by making crypto_dh_decode_key() reject 0 for 'p'. p=0 makes no sense for any DH implementation because 'p' is supposed to be a prime number. Moreover, 'mod 0' is not mathematically defined. Bug report: kernel BUG at ./include/linux/scatterlist.h:140! invalid opcode: [#1] SMP KASAN CPU: 0 PID: 27112 Comm: syz-executor2 Not tainted 4.14.0-rc7-00010-gf5dbb5d0ce32-dirty #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.3-20171021_125229-anatol 04/01/2014 task: 88006caac0c0 task.stack: 88006c7c8000 RIP: 0010:sg_set_buf include/linux/scatterlist.h:140 [inline] RIP: 0010:sg_init_one+0x1b3/0x240 lib/scatterlist.c:156 RSP: 0018:88006c7cfb08 EFLAGS: 00010216 RAX: 0001 RBX: 88006c7cfe30 RCX: 64ee RDX: 81cf64c3 RSI: c9d72000 RDI: 92e937e0 RBP: 88006c7cfb30 R08: ed000d8f9fab R09: 88006c7cfd30 R10: 0005 R11: ed000d8f9faa R12: 88006c7cfd30 R13: R14: 0010 R15: 88006c7cfc50 FS: 7fce190fa700() GS:88003ea0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7fffc6b33db8 CR3: 3cf64000 CR4: 06f0 Call Trace: __keyctl_dh_compute+0xa95/0x19b0 security/keys/dh.c:360 keyctl_dh_compute+0xac/0x100 security/keys/dh.c:434 SYSC_keyctl security/keys/keyctl.c:1745 [inline] SyS_keyctl+0x72/0x2c0 security/keys/keyctl.c:1641 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4585c9 RSP: 002b:7fce190f9bd8 EFLAGS: 0216 ORIG_RAX: 00fa RAX: ffda RBX: 00738020 RCX: 004585c9 RDX: 2000d000 RSI: 2ff4 RDI: 0017 RBP: 0046 R08: 20008000 R09: R10: R11: 0216 R12: 7fff6e610cde R13: 7fff6e610cdf R14: 7fce190fa700 R15: Code: 03 0f b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 04 84 d2 75 33 5b 45 89 6c 24 14 41 5c 41 5d 41 5e 41 5f 5d c3 e8 fd 8f 68 ff <0f> 0b e8 f6 8f 68 ff 0f 0b e8 ef 8f 68 ff 0f 0b e8 e8 8f 68 ff 20 RIP: sg_set_buf include/linux/scatterlist.h:140 [inline] RSP: 88006c7cfb08 RIP: sg_init_one+0x1b3/0x240 lib/scatterlist.c:156 RSP: 88006c7cfb08 Fixes: 802c7f1c84e4 ("crypto: dh - Add DH software implementation") Cc: # v4.8+ Reviewed-by: Tudor Ambarus Signed-off-by: Eric Biggers --- crypto/dh_helper.c | 8 1 file changed, 8 insertions(+) diff --git a/crypto/dh_helper.c b/crypto/dh_helper.c index 8ba8a3f82620..708ae20d2d3c 100644 --- a/crypto/dh_helper.c +++ b/crypto/dh_helper.c @@ -90,6 +90,14 @@ int crypto_dh_decode_key(const char *buf, unsigned int len, struct dh *params) params->p = (void *)(ptr + params->key_size); params->g = (void *)(ptr + params->key_size + params->p_size); + /* +* Don't permit 'p' to be 0. It's not a prime number, and it's subject +* to corner cases such as 'mod 0' being undefined or +* crypto_kpp_maxsize() returning 0. +*/ + if (memchr_inv(params->p, 0, params->p_size) == NULL) + return -EINVAL; + return 0; } EXPORT_SYMBOL_GPL(crypto_dh_decode_key); -- 2.15.0
[PATCH v2 1/5] crypto: dh - Fix double free of ctx->p
From: Eric Biggers When setting the secret with the software Diffie-Hellman implementation, if allocating 'g' failed (e.g. if it was longer than MAX_EXTERN_MPI_BITS), then 'p' was freed twice: once immediately, and once later when the crypto_kpp tfm was destroyed. Fix it by using dh_free_ctx() (renamed to dh_clear_ctx()) in the error paths, as that correctly sets the pointers to NULL. KASAN report: MPI: mpi too large (32760 bits) == BUG: KASAN: use-after-free in mpi_free+0x131/0x170 Read of size 4 at addr 88006c7cdf90 by task reproduce_doubl/367 CPU: 1 PID: 367 Comm: reproduce_doubl Not tainted 4.14.0-rc7-00040-g05298abde6fe #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0xb3/0x10b ? mpi_free+0x131/0x170 print_address_description+0x79/0x2a0 ? mpi_free+0x131/0x170 kasan_report+0x236/0x340 ? akcipher_register_instance+0x90/0x90 __asan_report_load4_noabort+0x14/0x20 mpi_free+0x131/0x170 ? akcipher_register_instance+0x90/0x90 dh_exit_tfm+0x3d/0x140 crypto_kpp_exit_tfm+0x52/0x70 crypto_destroy_tfm+0xb3/0x250 __keyctl_dh_compute+0x640/0xe90 ? kasan_slab_free+0x12f/0x180 ? dh_data_from_key+0x240/0x240 ? key_create_or_update+0x1ee/0xb20 ? key_instantiate_and_link+0x440/0x440 ? lock_contended+0xee0/0xee0 ? kfree+0xcf/0x210 ? SyS_add_key+0x268/0x340 keyctl_dh_compute+0xb3/0xf1 ? __keyctl_dh_compute+0xe90/0xe90 ? SyS_add_key+0x26d/0x340 ? entry_SYSCALL_64_fastpath+0x5/0xbe ? trace_hardirqs_on_caller+0x3f4/0x560 SyS_keyctl+0x72/0x2c0 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x43ccf9 RSP: 002b:7ffeeec96158 EFLAGS: 0246 ORIG_RAX: 00fa RAX: ffda RBX: 0248b9b9 RCX: 0043ccf9 RDX: 7ffeeec96170 RSI: 7ffeeec96160 RDI: 0017 RBP: 0046 R08: R09: 0248b9b9143dc936 R10: 1000 R11: 0246 R12: R13: 00409670 R14: 00409700 R15: Allocated by task 367: save_stack_trace+0x16/0x20 kasan_kmalloc+0xeb/0x180 kmem_cache_alloc_trace+0x114/0x300 mpi_alloc+0x4b/0x230 mpi_read_raw_data+0xbe/0x360 dh_set_secret+0x1dc/0x460 __keyctl_dh_compute+0x623/0xe90 keyctl_dh_compute+0xb3/0xf1 SyS_keyctl+0x72/0x2c0 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 367: save_stack_trace+0x16/0x20 kasan_slab_free+0xab/0x180 kfree+0xb5/0x210 mpi_free+0xcb/0x170 dh_set_secret+0x2d7/0x460 __keyctl_dh_compute+0x623/0xe90 keyctl_dh_compute+0xb3/0xf1 SyS_keyctl+0x72/0x2c0 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: 802c7f1c84e4 ("crypto: dh - Add DH software implementation") Cc: # v4.8+ Signed-off-by: Eric Biggers --- crypto/dh.c | 33 + 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/crypto/dh.c b/crypto/dh.c index b1032a5c1bfa..aadaf36fb56f 100644 --- a/crypto/dh.c +++ b/crypto/dh.c @@ -21,19 +21,12 @@ struct dh_ctx { MPI xa; }; -static inline void dh_clear_params(struct dh_ctx *ctx) +static void dh_clear_ctx(struct dh_ctx *ctx) { mpi_free(ctx->p); mpi_free(ctx->g); - ctx->p = NULL; - ctx->g = NULL; -} - -static void dh_free_ctx(struct dh_ctx *ctx) -{ - dh_clear_params(ctx); mpi_free(ctx->xa); - ctx->xa = NULL; + memset(ctx, 0, sizeof(*ctx)); } /* @@ -71,10 +64,8 @@ static int dh_set_params(struct dh_ctx *ctx, struct dh *params) return -EINVAL; ctx->g = mpi_read_raw_data(params->g, params->g_size); - if (!ctx->g) { - mpi_free(ctx->p); + if (!ctx->g) return -EINVAL; - } return 0; } @@ -86,21 +77,23 @@ static int dh_set_secret(struct crypto_kpp *tfm, const void *buf, struct dh params; /* Free the old MPI key if any */ - dh_free_ctx(ctx); + dh_clear_ctx(ctx); if (crypto_dh_decode_key(buf, len, ¶ms) < 0) - return -EINVAL; + goto err_clear_ctx; if (dh_set_params(ctx, ¶ms) < 0) - 
return -EINVAL; + goto err_clear_ctx; ctx->xa = mpi_read_raw_data(params.key, params.key_size); - if (!ctx->xa) { - dh_clear_params(ctx); - return -EINVAL; - } + if (!ctx->xa) + goto err_clear_ctx; return 0; + +err_clear_ctx: + dh_clear_ctx(ctx); + return -EINVAL; } static int dh_compute_value(struct kpp_request *req) @@ -158,7 +151,7 @@ static void dh_exit_tfm(struct crypto_kpp *tfm) { struct dh_ctx *ctx = dh_
Re: general protection fault in asn1_ber_decoder
On Mon, Nov 06, 2017 at 10:36:00AM -0800, syzbot wrote: > kasan: GPF could be caused by NULL-ptr deref or user memory access > general protection fault: [#1] SMP KASAN > Dumping ftrace buffer: >(ftrace buffer empty) > Modules linked in: > CPU: 3 PID: 2984 Comm: syzkaller229187 Not tainted > 4.14.0-rc7-next-20171103+ #10 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 > task: 88003ab66000 task.stack: 880039d58000 > RIP: 0010:asn1_ber_decoder+0x41e/0x1af0 lib/asn1_decoder.c:233 > RSP: 0018:880039d5f8d0 EFLAGS: 00010246 > RAX: RBX: RCX: > RDX: RSI: 88006981bf00 RDI: 853f1920 > RBP: 880039d5fb88 R08: 0060 R09: > R10: R11: R12: > R13: R14: 880039d5fb60 R15: dc00 > FS: 010ac880() GS:88006df0() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: 20f49ffb CR3: 6aeb8000 CR4: 06e0 > DR0: DR1: DR2: > DR3: DR6: fffe0ff0 DR7: 0400 > Call Trace: > pkcs7_parse_message+0x2b3/0x730 crypto/asymmetric_keys/pkcs7_parser.c:139 > verify_pkcs7_signature+0x8d/0x290 certs/system_keyring.c:216 > pkcs7_preparse+0x7b/0xc0 crypto/asymmetric_keys/pkcs7_key_type.c:63 > key_create_or_update+0x533/0x1040 security/keys/key.c:855 > SYSC_add_key security/keys/keyctl.c:122 [inline] > SyS_add_key+0x18a/0x340 security/keys/keyctl.c:62 > entry_SYSCALL_64_fastpath+0x1f/0xbe > RIP: 0033:0x434f39 > RSP: 002b:7fff51fda138 EFLAGS: 0286 ORIG_RAX: 00f8 > RAX: ffda RBX: 004002b0 RCX: 00434f39 > RDX: 2000 RSI: 20f49ffb RDI: 20f4a000 > RBP: 0086 R08: R09: > R10: R11: 0286 R12: > R13: 004018b0 R14: 00401940 R15: > Code: 19 ff 48 8d 43 01 49 89 86 80 fe ff ff 48 89 85 a8 fd ff ff 48 > 8b 85 c0 fd ff ff 48 01 d8 48 89 c2 48 89 c1 48 c1 ea 03 83 e1 07 > <42> 0f b6 14 3a 38 ca 7f 08 84 d2 0f 85 cd 0f 00 00 0f b6 00 88 > RIP: asn1_ber_decoder+0x41e/0x1af0 lib/asn1_decoder.c:233 RSP: > 880039d5f8d0 I'll be sending a fix for this. The bug is an integer underflow in the expression 'if (unlikely(dp >= datalen - 1))', which causes a NULL pointer dereference if you try to add a "pkcs7_test" key with an empty payload (requires CONFIG_PKCS7_TEST_KEY=y). Eric
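The underflow is easy to demonstrate in isolation. The snippet below is not the kernel code itself, just a standalone illustration with the same unsigned types: with an empty payload the existing check wraps around and never fires, while checking the remaining length instead stays well defined (given the decoder's invariant that dp never exceeds datalen).

/* Standalone illustration of the unsigned-underflow pitfall in a
 * bounds check like the one in asn1_ber_decoder().  With datalen == 0,
 * "datalen - 1" wraps to SIZE_MAX, so the broken check never fires. */
#include <stdio.h>
#include <stddef.h>

static int overrun_broken(size_t dp, size_t datalen)
{
	return dp >= datalen - 1;	/* wraps when datalen == 0 */
}

static int overrun_fixed(size_t dp, size_t datalen)
{
	return datalen - dp < 2;	/* safe: dp <= datalen always holds */
}

int main(void)
{
	printf("broken check flags overrun: %d (should be 1)\n",
	       overrun_broken(0, 0));
	printf("fixed check flags overrun:  %d\n", overrun_fixed(0, 0));
	return 0;
}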
Re: general protection fault in asn1_ber_decoder
On Mon, Nov 06, 2017 at 10:05:45PM +, David Howells wrote: > diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c > index fef5d2e114be..048de2c20ae9 100644 > --- a/lib/asn1_decoder.c > +++ b/lib/asn1_decoder.c > @@ -201,6 +201,13 @@ int asn1_ber_decoder(const struct asn1_decoder *decoder, > if (datalen > 65535) > return -EMSGSIZE; > > + /* We don't currently support 0-length messages - the underrun checks > + * will fail if datalen is 0 because we check against datalen - 1 with > + * unsigned arithmetic. > + */ > + if (datalen == 0) > + return -EBADMSG; > + Hi David, you just beat me to it, but I don't think this is the best way to fix the problem. The length check just needs to be rewritten to not overflow. Also it seems there is another broken length check later in the function. How about this: commit 8bbe85872c660c891eb978a037f590198319e3b2 Author: Eric Biggers Date: Mon Nov 6 10:06:32 2017 -0800 KEYS: fix NULL pointer dereference during ASN.1 parsing syzkaller reported a NULL pointer dereference in asn1_ber_decoder(). It can be reproduced by the following command, assuming CONFIG_PKCS7_TEST_KEY=y: keyctl add pkcs7_test desc '' @s The bug is that if the data buffer is empty, an integer underflow occurs in the following check: if (unlikely(dp >= datalen - 1)) goto data_overrun_error; This results in the NULL data pointer being dereferenced. Fix it by checking for 'datalen - dp < 2' instead. Also fix the similar check for 'dp >= datalen - n' later in the same function. That one possibly could result in a buffer overread. The NULL pointer dereference was reproducible using the "pkcs7_test" key type but not the "asymmetric" key type because the "asymmetric" key type checks for a 0-length payload before calling into the ASN.1 decoder but the "pkcs7_test" key type does not. 
The bug report was: BUG: unable to handle kernel NULL pointer dereference at (null) IP: asn1_ber_decoder+0x17f/0xe60 lib/asn1_decoder.c:233 PGD 7b708067 P4D 7b708067 PUD 7b6ee067 PMD 0 Oops: [#1] SMP Modules linked in: CPU: 0 PID: 522 Comm: syz-executor1 Not tainted 4.14.0-rc8 #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.3-20171021_125229-anatol 04/01/2014 task: 9b6b3798c040 task.stack: 9b6b3797 RIP: 0010:asn1_ber_decoder+0x17f/0xe60 lib/asn1_decoder.c:233 RSP: 0018:9b6b37973c78 EFLAGS: 00010216 RAX: RBX: RCX: 021c RDX: 814a04ed RSI: b1524066e000 RDI: 910759e0 RBP: 9b6b37973d60 R08: 0001 R09: 9b6b3caa4180 R10: R11: R12: 0002 R13: R14: R15: FS: 7f10ed1f2700() GS:9b6b3ea0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 7b6f3000 CR4: 06f0 Call Trace: pkcs7_parse_message+0xee/0x240 crypto/asymmetric_keys/pkcs7_parser.c:139 verify_pkcs7_signature+0x33/0x180 certs/system_keyring.c:216 pkcs7_preparse+0x41/0x70 crypto/asymmetric_keys/pkcs7_key_type.c:63 key_create_or_update+0x180/0x530 security/keys/key.c:855 SYSC_add_key security/keys/keyctl.c:122 [inline] SyS_add_key+0xbf/0x250 security/keys/keyctl.c:62 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4585c9 RSP: 002b:7f10ed1f1bd8 EFLAGS: 0216 ORIG_RAX: 00f8 RAX: ffda RBX: 7f10ed1f2700 RCX: 004585c9 RDX: 2000 RSI: 20008ffb RDI: 20008000 RBP: R08: R09: R10: R11: 0216 R12: 7fff1b2260ae R13: 7fff1b2260af R14: 7f10ed1f2700 R15: Code: dd ca ff 48 8b 45 88 48 83 e8 01 4c 39 f0 0f 86 a8 07 00 00 e8 53 dd ca ff 49 8d 46 01 48 89 85 58 ff ff ff 48 8b 85 60 ff ff ff <42> 0f b6 0c 30 89 c8 88 8d 75 ff ff ff 83 e0 1f 89 8d 28 ff ff RIP: asn1_ber_decoder+0x17f/0xe60 lib/asn1_decoder.c:233 RSP: 9b6b37973c78 CR2: 0000 Fixes: 42d5ec27f873 ("X.509: Add an ASN.1 decoder") Reported-by: syzbot Cc: # v3.7+ Signed-off-by: Eric Biggers diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index fef5d2e114be..1ef0cec38d78 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -228,7 +228,7 @@ int asn1_