There are two parts to accelerating AES-GCM with special instructions on amd64: AESNI for AES and PCLMUL for GMAC. Currently we link those two: if a machine has AESNI, we use both AESNI and PCLMUL; otherwise we do everything in software.
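Nothing architectural forces the two to come as a pair: they are advertised by separate CPUID bits (CPUID.01H:ECX bit 25 for AESNI, bit 1 for PCLMULQDQ). Purely as an illustration, not the identifycpu() code, a userland sketch that checks the two bits independently:

    /* Illustrative only: AESNI and PCLMULQDQ have separate CPUID bits. */
    #include <cpuid.h>
    #include <stdio.h>

    #define ECX_PCLMUL  (1U << 1)   /* CPUID.01H:ECX PCLMULQDQ */
    #define ECX_AESNI   (1U << 25)  /* CPUID.01H:ECX AESNI */

    int
    main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                    return 1;
            printf("AESNI: %s, PCLMUL: %s\n",
                (ecx & ECX_AESNI) ? "yes" : "no",
                (ecx & ECX_PCLMUL) ? "yes" : "no");
            return 0;
    }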
For the last few years, all low-end Intel CPUs (Celeron, Pentium G, i3) have included PCLMUL even if they don't have AESNI. Here's a patch that uses the optimized PCLMUL code on these CPUs as well.

A quick check with tcpbench(1) suggests that this about doubles the throughput for an aes-128-gcm security association.

To allow testing on well-endowed CPUs, I have unhooked the AESNI path in identifycpu(). That hunk will obviously not be part of the final commit.

This version includes early feedback from mikeb@ that we need fpu_kernel_enter() before calling aesni_gmac_update().
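The idea in miniature, as a standalone sketch with stubbed-out functions rather than the kernel code in the patch below: ghash_update becomes a function pointer that defaults to the generic MI implementation and is repointed at boot when the CPU has PCLMUL. The optimized routine touches SSE state, which is why the kernel wrapper has to bracket it with fpu_kernel_enter()/fpu_kernel_exit(); here that is only indicated in a comment, and the GHASH_CTX stand-in is made up for the example.

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint8_t S[16]; } GHASH_CTX;      /* stand-in only */

    static void
    ghash_update_mi(GHASH_CTX *ctx, uint8_t *src, size_t len)
    {
            (void)ctx; (void)src;
            printf("generic ghash_update: %zu bytes\n", len);
    }

    static void
    ghash_update_pclmul(GHASH_CTX *ctx, uint8_t *src, size_t len)
    {
            /*
             * In the kernel this is
             *      fpu_kernel_enter();
             *      aesni_gmac_update(ctx, src, len);
             *      fpu_kernel_exit();
             * because the PCLMUL code clobbers SSE registers.
             */
            (void)ctx; (void)src;
            printf("pclmul ghash_update: %zu bytes\n", len);
    }

    /* MI default; MD setup code may override it. */
    static void (*ghash_update)(GHASH_CTX *, uint8_t *, size_t) =
        ghash_update_mi;

    int
    main(void)
    {
            GHASH_CTX ctx = { { 0 } };
            uint8_t blk[16] = { 0 };
            int has_pclmul = 1;     /* pretend identifycpu() found the bit */

            if (has_pclmul)
                    ghash_update = ghash_update_pclmul;  /* pclmul_setup() */
            (*ghash_update)(&ctx, blk, sizeof(blk));
            return 0;
    }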
Index: crypto/gmac.c
===================================================================
RCS file: /cvs/src/sys/crypto/gmac.c,v
retrieving revision 1.4
diff -u -p -r1.4 gmac.c
--- crypto/gmac.c	12 Nov 2014 17:52:02 -0000	1.4
+++ crypto/gmac.c	2 Nov 2015 22:29:39 -0000
@@ -29,7 +29,10 @@
 #include <crypto/gmac.h>
 
 void	ghash_gfmul(uint32_t *, uint32_t *, uint32_t *);
-void	ghash_update(GHASH_CTX *, uint8_t *, size_t);
+void	ghash_update_mi(GHASH_CTX *, uint8_t *, size_t);
+
+/* Allow overriding with optimized MD function */
+void	(*ghash_update)(GHASH_CTX *, uint8_t *, size_t) = ghash_update_mi;
 
 /* Computes a block multiplication in the GF(2^128) */
 void
@@ -70,7 +73,7 @@ ghash_gfmul(uint32_t *X, uint32_t *Y, ui
 }
 
 void
-ghash_update(GHASH_CTX *ctx, uint8_t *X, size_t len)
+ghash_update_mi(GHASH_CTX *ctx, uint8_t *X, size_t len)
 {
 	uint32_t	*x = (uint32_t *)X;
 	uint32_t	*s = (uint32_t *)ctx->S;
@@ -131,11 +134,12 @@ AES_GMAC_Update(AES_GMAC_CTX *ctx, const
 	if (len > 0) {
 		plen = len % GMAC_BLOCK_LEN;
 		if (len >= GMAC_BLOCK_LEN)
-			ghash_update(&ctx->ghash, (uint8_t *)data, len - plen);
+			(*ghash_update)(&ctx->ghash, (uint8_t *)data,
+			    len - plen);
 		if (plen) {
 			bcopy((uint8_t *)data + (len - plen), (uint8_t *)blk,
 			    plen);
-			ghash_update(&ctx->ghash, (uint8_t *)blk,
+			(*ghash_update)(&ctx->ghash, (uint8_t *)blk,
 			    GMAC_BLOCK_LEN);
 		}
 	}
Index: crypto/gmac.h
===================================================================
RCS file: /cvs/src/sys/crypto/gmac.h,v
retrieving revision 1.2
diff -u -p -r1.2 gmac.h
--- crypto/gmac.h	5 Dec 2012 23:20:15 -0000	1.2
+++ crypto/gmac.h	3 Nov 2015 15:24:52 -0000
@@ -38,6 +38,8 @@ typedef struct _AES_GMAC_CTX {
 } AES_GMAC_CTX;
 
 __BEGIN_DECLS
+extern void	(*ghash_update)(GHASH_CTX *, uint8_t *, size_t);
+
 void	AES_GMAC_Init(AES_GMAC_CTX *);
 void	AES_GMAC_Setkey(AES_GMAC_CTX *, const uint8_t *, uint16_t);
 void	AES_GMAC_Reinit(AES_GMAC_CTX *, const uint8_t *, uint16_t);
Index: arch/amd64/amd64/aesni.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/aesni.c,v
retrieving revision 1.35
diff -u -p -r1.35 aesni.c
--- arch/amd64/amd64/aesni.c	28 Aug 2015 19:59:36 -0000	1.35
+++ arch/amd64/amd64/aesni.c	3 Nov 2015 15:33:42 -0000
@@ -120,6 +120,9 @@ int	aesni_swauth(struct cryptop *, struc
 int	aesni_encdec(struct cryptop *, struct cryptodesc *,
 	    struct cryptodesc *, struct aesni_session *);
 
+void	pclmul_setup(void);
+void	ghash_update_pclmul(GHASH_CTX *, uint8_t *, size_t);
+
 void
 aesni_setup(void)
 {
@@ -662,4 +665,18 @@ out:
 	crp->crp_etype = err;
 	crypto_done(crp);
 	return (err);
+}
+
+void
+pclmul_setup(void)
+{
+	ghash_update = ghash_update_pclmul;
+}
+
+void
+ghash_update_pclmul(GHASH_CTX *ghash, uint8_t *src, size_t len)
+{
+	fpu_kernel_enter();
+	aesni_gmac_update(ghash, src, len);
+	fpu_kernel_exit();
 }
Index: arch/amd64/amd64/autoconf.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/autoconf.c,v
retrieving revision 1.43
diff -u -p -r1.43 autoconf.c
--- arch/amd64/amd64/autoconf.c	17 Jul 2015 21:53:56 -0000	1.43
+++ arch/amd64/amd64/autoconf.c	2 Nov 2015 22:57:14 -0000
@@ -97,6 +97,9 @@ void	rdrand(void *);
 void	viac3_crypto_setup(void);
 extern int	amd64_has_xcrypt;
 
+void	pclmul_setup(void);
+extern int	amd64_has_pclmul;
+
 void	aesni_setup(void);
 extern int	amd64_has_aesni;
 #endif
@@ -145,6 +148,9 @@ cpu_configure(void)
 	 */
 	if (amd64_has_xcrypt)
 		viac3_crypto_setup();
+
+	if (amd64_has_pclmul)
+		pclmul_setup();
 
 	if (amd64_has_aesni)
 		aesni_setup();
Index: arch/amd64/amd64/identcpu.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v
retrieving revision 1.64
diff -u -p -r1.64 identcpu.c
--- arch/amd64/amd64/identcpu.c	12 Aug 2015 05:31:41 -0000	1.64
+++ arch/amd64/amd64/identcpu.c	3 Nov 2015 15:25:49 -0000
@@ -52,6 +52,7 @@ int cpuspeed;
 int amd64_has_xcrypt;
 #ifdef CRYPTO
+int amd64_has_pclmul;
 int amd64_has_aesni;
 #endif
 int has_rdrand;
 
@@ -560,8 +561,11 @@ identifycpu(struct cpu_info *ci)
 		setperf_setup = est_init;
 
 #ifdef CRYPTO
+	if (cpu_ecxfeature & CPUIDECX_PCLMUL)
+		amd64_has_pclmul = 1;
+
 	if (cpu_ecxfeature & CPUIDECX_AES)
-		amd64_has_aesni = 1;
+		/*amd64_has_aesni = 1*/;
 #endif
 
 	if (cpu_ecxfeature & CPUIDECX_RDRAND)
-- 
Christian "naddy" Weisgerber                          na...@mips.inka.de