Here's a cleaned-up diff.  Briefly tested on amd64 & sparc64.  I'll
do some more testing tomorrow.  This already has mikeb@'s blessing.

Index: regress/sys/crypto/gmac/Makefile
===================================================================
RCS file: /cvs/src/regress/sys/crypto/gmac/Makefile,v
retrieving revision 1.2
diff -u -p -r1.2 Makefile
--- regress/sys/crypto/gmac/Makefile    18 Jan 2014 05:54:52 -0000      1.2
+++ regress/sys/crypto/gmac/Makefile    12 Oct 2014 19:05:35 -0000
@@ -3,7 +3,7 @@
 DIR=${.CURDIR}/../../../../sys
 
 PROG=  gmac_test
-SRCS+= rijndael.c gmac.c gmac_test.c
+SRCS+= rijndael.c gfmult.c gmac.c gmac_test.c
 CDIAGFLAGS=    -Wall
 CDIAGFLAGS+=   -Werror
 CDIAGFLAGS+=   -Wpointer-arith
Index: sys/crypto/gfmult.c
===================================================================
RCS file: sys/crypto/gfmult.c
diff -N sys/crypto/gfmult.c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ sys/crypto/gfmult.c 12 Oct 2014 17:28:42 -0000
@@ -0,0 +1,275 @@
+/*-
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by John-Mark Gurney under
+ * the sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.  Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ * 2.  Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     $FreeBSD$
+ *
+ */
+
+#include <crypto/gfmult.h>
+
+#define REV_POLY_REDUCT        0xe1    /* 0x87 bit reversed */
+
+/* reverse the bits of a nibble */
+static const uint8_t nib_rev[] = {
+       0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+       0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf,
+};
+
+/* calculate v * 2 */
+static inline struct gf128
+gf128_mulalpha(struct gf128 v)
+{
+       uint64_t mask;
+
+       mask = !!(v.v[1] & 1);
+       mask = ~(mask - 1);
+       v.v[1] = (v.v[1] >> 1) | ((v.v[0] & 1) << 63);
+       v.v[0] = (v.v[0] >> 1) ^ ((mask & REV_POLY_REDUCT) << 56);
+
+       return v;
+}
+
+/*
+ * Generate a table for 0..15 * h.  Store the results in the table w/ indexes
+ * bit reversed, and the words striped across the values.
+ */
+void
+gf128_genmultable(struct gf128 h, struct gf128table *t)
+{
+       struct gf128 tbl[16];
+       int i;
+
+       tbl[0] = MAKE_GF128(0, 0);
+       tbl[1] = h;
+
+       for (i = 2; i < 16; i += 2) {
+               tbl[i] = gf128_mulalpha(tbl[i / 2]);
+               tbl[i + 1] = gf128_add(tbl[i], h);
+       }
+
+       for (i = 0; i < 16; i++) {
+               t->a[nib_rev[i]] = tbl[i].v[0] >> 32;
+               t->b[nib_rev[i]] = tbl[i].v[0];
+               t->c[nib_rev[i]] = tbl[i].v[1] >> 32;
+               t->d[nib_rev[i]] = tbl[i].v[1];
+       }
+}
+
+/*
+ * Generate tables containing h, h^2, h^3 and h^4, stored in tbls[0..3].
+ */
+void
+gf128_genmultable4(struct gf128 h, struct gf128table4 *t)
+{
+       struct gf128 h2, h3, h4;
+
+       gf128_genmultable(h, &t->tbls[0]);
+
+       h2 = gf128_mul(h, &t->tbls[0]);
+
+       gf128_genmultable(h2, &t->tbls[1]);
+
+       h3 = gf128_mul(h, &t->tbls[1]);
+       gf128_genmultable(h3, &t->tbls[2]);
+
+       h4 = gf128_mul(h2, &t->tbls[1]);
+       gf128_genmultable(h4, &t->tbls[3]);
+}
+
+/*
+ * Read a row from the table.
+ */
+static inline struct gf128
+readrow(struct gf128table *tbl, unsigned bits)
+{
+       struct gf128 r;
+
+       bits = bits % 16;
+
+       r.v[0] = ((uint64_t)tbl->a[bits] << 32) | tbl->b[bits];
+       r.v[1] = ((uint64_t)tbl->c[bits] << 32) | tbl->d[bits];
+
+       return r;
+}
+
+/*
+ * These are the reduction values.  Since we are dealing with bit reversed
+ * version, the values need to be bit reversed, AND the indexes are also
+ * bit reversed to make lookups quicker.
+ */
+static uint16_t reduction[] = {
+       0x0000, 0x1c20, 0x3840, 0x2460, 0x7080, 0x6ca0, 0x48c0, 0x54e0,
+       0xe100, 0xfd20, 0xd940, 0xc560, 0x9180, 0x8da0, 0xa9c0, 0xb5e0,
+};
+
+/*
+ * Calculate:
+ * (x*2^4 + word[3,0]*h) *
+ * 2^4 + word[7,4]*h) *
+ * ...
+ * 2^4 + word[63,60]*h
+ */
+static struct gf128
+gfmultword(uint64_t word, struct gf128 x, struct gf128table *tbl)
+{
+       struct gf128 row;
+       unsigned bits;
+       unsigned redbits;
+       int i;
+
+       for (i = 0; i < 64; i += 4) {
+               bits = word % 16;
+
+               /* fetch row */
+               row = readrow(tbl, bits);
+
+               /* x * 2^4 */
+               redbits = x.v[1] % 16;
+               x.v[1] = (x.v[1] >> 4) | (x.v[0] % 16) << 60;
+               x.v[0] >>= 4;
+               x.v[0] ^= (uint64_t)reduction[redbits] << (64 - 16);
+
+               word >>= 4;
+
+               x = gf128_add(x, row);
+       }
+
+       return x;
+}
+
+/*
+ * Calculate
+ * (x*2^4 + worda[3,0]*h^4+wordb[3,0]*h^3+...+wordd[3,0]*h) *
+ * ...
+ * 2^4 + worda[63,60]*h^4+ ... + wordd[63,60]*h
+ *
+ * Passing/returning struct is .5% faster than passing in via pointer on
+ * amd64.
+ */
+static struct gf128
+gfmultword4(uint64_t worda, uint64_t wordb, uint64_t wordc, uint64_t wordd,
+    struct gf128 x, struct gf128table4 *tbl)
+{
+       struct gf128 rowa, rowb, rowc, rowd;
+       unsigned bitsa, bitsb, bitsc, bitsd;
+       unsigned redbits;
+       int i;
+
+       /*
+        * XXX - nibble reverse words to save a shift? probably not as
+        * nibble reverse would take 20 ops (5 * 4) versus 16
+        */
+
+       for (i = 0; i < 64; i += 4) {
+               bitsa = worda % 16;
+               bitsb = wordb % 16;
+               bitsc = wordc % 16;
+               bitsd = wordd % 16;
+
+               /* fetch row */
+               rowa = readrow(&tbl->tbls[3], bitsa);
+               rowb = readrow(&tbl->tbls[2], bitsb);
+               rowc = readrow(&tbl->tbls[1], bitsc);
+               rowd = readrow(&tbl->tbls[0], bitsd);
+
+               /* x * 2^4 */
+               redbits = x.v[1] % 16;
+               x.v[1] = (x.v[1] >> 4) | (x.v[0] % 16) << 60;
+               x.v[0] >>= 4;
+               x.v[0] ^= (uint64_t)reduction[redbits] << (64 - 16);
+
+               worda >>= 4;
+               wordb >>= 4;
+               wordc >>= 4;
+               wordd >>= 4;
+
+               x = gf128_add(x, gf128_add(rowa, gf128_add(rowb,
+                   gf128_add(rowc, rowd))));
+       }
+
+       return x;
+}
+
+struct gf128
+gf128_mul(struct gf128 v, struct gf128table *tbl)
+{
+       struct gf128 ret;
+
+       ret = MAKE_GF128(0, 0);
+
+       ret = gfmultword(v.v[1], ret, tbl);
+       ret = gfmultword(v.v[0], ret, tbl);
+
+       return ret;
+}
+
+/*
+ * Calculate a*h^4 + b*h^3 + c*h^2 + d*h, or:
+ * (((a*h+b)*h+c)*h+d)*h
+ */
+struct gf128
+gf128_mul4(struct gf128 a, struct gf128 b, struct gf128 c, struct gf128 d,
+    struct gf128table4 *tbl)
+{
+       struct gf128 tmp;
+
+       tmp = MAKE_GF128(0, 0);
+
+       tmp = gfmultword4(a.v[1], b.v[1], c.v[1], d.v[1], tmp, tbl);
+       tmp = gfmultword4(a.v[0], b.v[0], c.v[0], d.v[0], tmp, tbl);
+
+       return tmp;
+}
+
+/*
+ * a = data[0..15] + r
+ * b = data[16..31]
+ * c = data[32..47]
+ * d = data[48..63]
+ *
+ * Calculate a*h^4 + b*h^3 + c*h^2 + d*h, or:
+ * (((a*h+b)*h+c)*h+d)*h
+ */
+struct gf128
+gf128_mul4b(struct gf128 r, const uint8_t *v, struct gf128table4 *tbl)
+{
+       struct gf128 a, b, c, d;
+       struct gf128 tmp;
+
+       tmp = MAKE_GF128(0, 0);
+
+       a = gf128_add(r, gf128_read(&v[0*16]));
+       b = gf128_read(&v[1*16]);
+       c = gf128_read(&v[2*16]);
+       d = gf128_read(&v[3*16]);
+
+       tmp = gfmultword4(a.v[1], b.v[1], c.v[1], d.v[1], tmp, tbl);
+       tmp = gfmultword4(a.v[0], b.v[0], c.v[0], d.v[0], tmp, tbl);
+
+       return tmp;
+}
Index: sys/crypto/gfmult.h
===================================================================
RCS file: sys/crypto/gfmult.h
diff -N sys/crypto/gfmult.h
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ sys/crypto/gfmult.h 12 Oct 2014 19:54:03 -0000
@@ -0,0 +1,125 @@
+/*-
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by John-Mark Gurney under
+ * the sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.  Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ * 2.  Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     $FreeBSD$
+ *
+ */
+
+#ifndef _GFMULT_H_
+#define _GFMULT_H_
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#define be64dec(buf)   bemtoh64(buf)
+#define be64enc(buf, x)        htobem64(buf, x)
+#else
+#include <endian.h>
+#define be64dec(buf)   be64toh(*(uint64_t *)buf)
+#define be64enc(buf, x)        (*(uint64_t *)buf = htobe64(x))
+#endif
+
+/* XXX GCC 4.2 cannot align stack variables to 64 */
+#define REQ_ALIGN      (16/* * 4*/)
+/*
+ * The rows are striped across cache lines.  Note that the indexes
+ * are bit reversed to make accesses quicker.
+ */
+struct gf128table {
+       uint32_t a[16] __aligned(REQ_ALIGN);    /* bits   0 - 31 */
+       uint32_t b[16] __aligned(REQ_ALIGN);    /* bits  63 - 32 */
+       uint32_t c[16] __aligned(REQ_ALIGN);    /* bits  95 - 64 */
+       uint32_t d[16] __aligned(REQ_ALIGN);    /* bits 127 - 96 */
+} __aligned(REQ_ALIGN);
+
+/*
+ * A set of tables that contain h, h^2, h^3, h^4.  To be used w/ gf128_mul4.
+ */
+struct gf128table4 {
+       struct gf128table       tbls[4];
+};
+
+/*
+ * GCM per spec is bit reversed in memory.  So byte 0 is really bit reversed
+ * and contains bits 0-7.  We can deal w/ this by using right shifts and
+ * related math instead of having to bit reverse everything.  This means that
+ * the low bits are in v[0] (bits 0-63) and reverse order, while the high
+ * bits are in v[1] (bits 64-127) and reverse order.  The high bit of v[0] is
+ * bit 0, and the low bit of v[1] is bit 127.
+ */
+struct gf128 {
+       uint64_t v[2];
+};
+
+/* Note that we don't bit reverse in MAKE_GF128. */
+#define MAKE_GF128(a, b)       ((struct gf128){.v = { (a), (b) } })
+#define GF128_EQ(a, b)         ((((a).v[0] ^ (b).v[0]) | \
+                                   ((a).v[1] ^ (b).v[1])) == 0)
+
+static inline struct gf128
+gf128_read(const uint8_t *buf)
+{
+       struct gf128 r;
+
+       r.v[0] = be64dec(buf);
+       buf += sizeof(uint64_t);
+
+       r.v[1] = be64dec(buf);
+
+       return r;
+}
+
+static inline void
+gf128_write(struct gf128 v, uint8_t *buf)
+{
+       uint64_t tmp;
+
+       be64enc(buf, v.v[0]);
+       buf += sizeof tmp;
+
+       be64enc(buf, v.v[1]);
+}
+
+static inline struct gf128 __pure /* XXX - __pure2 instead */
+gf128_add(struct gf128 a, struct gf128 b)
+{
+       a.v[0] ^= b.v[0];
+       a.v[1] ^= b.v[1];
+
+       return a;
+}
+
+void gf128_genmultable(struct gf128 h, struct gf128table *t);
+void gf128_genmultable4(struct gf128 h, struct gf128table4 *t);
+struct gf128 gf128_mul(struct gf128 v, struct gf128table *tbl);
+struct gf128 gf128_mul4(struct gf128 a, struct gf128 b, struct gf128 c,
+    struct gf128 d, struct gf128table4 *tbl);
+struct gf128 gf128_mul4b(struct gf128 r, const uint8_t *v,
+    struct gf128table4 *tbl);
+
+#endif /* _GFMULT_H_ */
Index: sys/crypto/gmac.c
===================================================================
RCS file: /cvs/src/sys/crypto/gmac.c,v
retrieving revision 1.3
diff -u -p -r1.3 gmac.c
--- sys/crypto/gmac.c   11 Jan 2011 15:44:23 -0000      1.3
+++ sys/crypto/gmac.c   12 Oct 2014 20:09:20 -0000
@@ -16,6 +16,35 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+/*-
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by John-Mark Gurney under
+ * the sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.  Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ * 2.  Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
 /*
  * This code implements the Message Authentication part of the
  * Galois/Counter Mode (as being described in the RFC 4543) using
@@ -28,91 +57,33 @@
 #include <crypto/rijndael.h>
 #include <crypto/gmac.h>
 
-void   ghash_gfmul(uint32_t *, uint32_t *, uint32_t *);
-void   ghash_update(GHASH_CTX *, uint8_t *, size_t);
-
-/* Computes a block multiplication in the GF(2^128) */
-void
-ghash_gfmul(uint32_t *X, uint32_t *Y, uint32_t *product)
-{
-       uint32_t        v[4];
-       uint32_t        z[4] = { 0, 0, 0, 0};
-       uint8_t         *x = (uint8_t *)X;
-       uint32_t        mul;
-       int             i;
-
-       v[0] = betoh32(Y[0]);
-       v[1] = betoh32(Y[1]);
-       v[2] = betoh32(Y[2]);
-       v[3] = betoh32(Y[3]);
-
-       for (i = 0; i < GMAC_BLOCK_LEN * 8; i++) {
-               /* update Z */
-               if (x[i >> 3] & (1 << (~i & 7))) {
-                       z[0] ^= v[0];
-                       z[1] ^= v[1];
-                       z[2] ^= v[2];
-                       z[3] ^= v[3];
-               } /* else: we preserve old values */
-
-               /* update V */
-               mul = v[3] & 1;
-               v[3] = (v[2] << 31) | (v[3] >> 1);
-               v[2] = (v[1] << 31) | (v[2] >> 1);
-               v[1] = (v[0] << 31) | (v[1] >> 1);
-               v[0] = (v[0] >> 1) ^ (0xe1000000 * mul);
-       }
-
-       product[0] = htobe32(z[0]);
-       product[1] = htobe32(z[1]);
-       product[2] = htobe32(z[2]);
-       product[3] = htobe32(z[3]);
-}
-
-void
-ghash_update(GHASH_CTX *ctx, uint8_t *X, size_t len)
-{
-       uint32_t        *x = (uint32_t *)X;
-       uint32_t        *s = (uint32_t *)ctx->S;
-       uint32_t        *y = (uint32_t *)ctx->Z;
-       int             i;
-
-       for (i = 0; i < len / GMAC_BLOCK_LEN; i++) {
-               s[0] = y[0] ^ x[0];
-               s[1] = y[1] ^ x[1];
-               s[2] = y[2] ^ x[2];
-               s[3] = y[3] ^ x[3];
-
-               ghash_gfmul((uint32_t *)ctx->S, (uint32_t *)ctx->H,
-                   (uint32_t *)ctx->S);
-
-               y = s;
-               x += 4;
-       }
-
-       bcopy(ctx->S, ctx->Z, GMAC_BLOCK_LEN);
-}
-
 #define AESCTR_NONCESIZE       4
 
 void
 AES_GMAC_Init(AES_GMAC_CTX *ctx)
 {
-       bzero(ctx->ghash.H, GMAC_BLOCK_LEN);
-       bzero(ctx->ghash.S, GMAC_BLOCK_LEN);
-       bzero(ctx->ghash.Z, GMAC_BLOCK_LEN);
-       bzero(ctx->J, GMAC_BLOCK_LEN);
+       bzero(ctx, sizeof(*ctx));
 }
 
 void
 AES_GMAC_Setkey(AES_GMAC_CTX *ctx, const uint8_t *key, uint16_t klen)
 {
+       const uint8_t   zeros[GMAC_BLOCK_LEN] = {};
+       struct gf128    h;
+       uint8_t         hbuf[GMAC_BLOCK_LEN];
+
        ctx->rounds = rijndaelKeySetupEnc(ctx->K, (u_char *)key,
            (klen - AESCTR_NONCESIZE) * 8);
        /* copy out salt to the counter block */
        bcopy(key + klen - AESCTR_NONCESIZE, ctx->J, AESCTR_NONCESIZE);
        /* prepare a hash subkey */
-       rijndaelEncrypt(ctx->K, ctx->rounds, ctx->ghash.H, ctx->ghash.H);
+       rijndaelEncrypt(ctx->K, ctx->rounds, zeros, hbuf);
+
+       h = gf128_read(hbuf);
+       gf128_genmultable4(h, &ctx->ghashtbl);
+
+       explicit_bzero(&h, sizeof(h));
+       explicit_bzero(hbuf, sizeof(hbuf));
 }
 
 void
@@ -125,20 +96,34 @@ AES_GMAC_Reinit(AES_GMAC_CTX *ctx, const
 int
 AES_GMAC_Update(AES_GMAC_CTX *ctx, const uint8_t *data, uint16_t len)
 {
-       uint32_t        blk[4] = { 0, 0, 0, 0 };
-       int             plen;
+       struct gf128    v;
+       uint8_t         buf[GMAC_BLOCK_LEN] = {};
+       int             i;
+
+       v = ctx->hash;
 
-       if (len > 0) {
-               plen = len % GMAC_BLOCK_LEN;
-               if (len >= GMAC_BLOCK_LEN)
-                       ghash_update(&ctx->ghash, (uint8_t *)data, len - plen);
-               if (plen) {
-                       bcopy((uint8_t *)data + (len - plen), (uint8_t *)blk,
-                           plen);
-                       ghash_update(&ctx->ghash, (uint8_t *)blk,
-                           GMAC_BLOCK_LEN);
+       while (len > 0) {
+               if (len >= 4*GMAC_BLOCK_LEN) {
+                       i = 4*GMAC_BLOCK_LEN;
+                       v = gf128_mul4b(v, data, &ctx->ghashtbl);
+               } else if (len >= GMAC_BLOCK_LEN) {
+                       i = GMAC_BLOCK_LEN;
+                       v = gf128_add(v, gf128_read(data));
+                       v = gf128_mul(v, &ctx->ghashtbl.tbls[0]);
+               } else {
+                       i = len;
+                       bcopy(data, buf, i);
+                       v = gf128_add(v, gf128_read(&buf[0]));
+                       v = gf128_mul(v, &ctx->ghashtbl.tbls[0]);
+                       explicit_bzero(buf, sizeof buf);
                }
+               len -= i;
+               data += i;
        }
+
+       ctx->hash = v;
+       explicit_bzero(&v, sizeof v);
+
        return (0);
 }
 
@@ -146,12 +131,12 @@ void
 AES_GMAC_Final(uint8_t digest[GMAC_DIGEST_LEN], AES_GMAC_CTX *ctx)
 {
        uint8_t         keystream[GMAC_BLOCK_LEN];
-       int             i;
+       struct gf128    a;
 
        /* do one round of GCTR */
        ctx->J[GMAC_BLOCK_LEN - 1] = 1;
        rijndaelEncrypt(ctx->K, ctx->rounds, ctx->J, keystream);
-       for (i = 0; i < GMAC_DIGEST_LEN; i++)
-               digest[i] = ctx->ghash.S[i] ^ keystream[i];
+       a = gf128_add(ctx->hash, gf128_read(keystream));
+       gf128_write(a, digest);
        explicit_bzero(keystream, sizeof(keystream));
 }
Index: sys/crypto/gmac.h
===================================================================
RCS file: /cvs/src/sys/crypto/gmac.h,v
retrieving revision 1.2
diff -u -p -r1.2 gmac.h
--- sys/crypto/gmac.h   5 Dec 2012 23:20:15 -0000       1.2
+++ sys/crypto/gmac.h   12 Oct 2014 17:40:58 -0000
@@ -19,6 +19,7 @@
 #ifndef _GMAC_H_
 #define _GMAC_H_
 
+#include <crypto/gfmult.h>
 #include <crypto/rijndael.h>
 
 #define GMAC_BLOCK_LEN         16
@@ -31,7 +32,8 @@ typedef struct _GHASH_CTX {
 } GHASH_CTX;
 
 typedef struct _AES_GMAC_CTX {
-       GHASH_CTX       ghash;
+       struct gf128table4 ghashtbl;
+       struct gf128    hash;
        uint32_t        K[4*(AES_MAXROUNDS + 1)];
        uint8_t         J[GMAC_BLOCK_LEN];              /* counter block */
        int             rounds;
-- 
Christian "naddy" Weisgerber                          na...@mips.inka.de

Reply via email to