From: Declan Murphy <declan.mur...@intel.com>

Add support for the Hashing Control Unit (HCU) included in the Offload
Crypto Subsystem (OCS) of the Intel Keem Bay SoC, thus enabling
hardware-accelerated hashing on the Keem Bay SoC for the following
algorithms:
- sha224 and hmac(sha224)
- sha256 and hmac(sha256)
- sha384 and hmac(sha384)
- sha512 and hmac(sha512)
- sm3    and hmac(sm3)

The driver passes crypto manager self-tests, including the extra tests
(CRYPTO_MANAGER_EXTRA_TESTS=y).

Signed-off-by: Declan Murphy <declan.mur...@intel.com>
Co-developed-by: Daniele Alessandrelli <daniele.alessandre...@intel.com>
Signed-off-by: Daniele Alessandrelli <daniele.alessandre...@intel.com>
Acked-by: Mark Gross <mgr...@linux.intel.com>
---
 drivers/crypto/Kconfig                        |    2 +
 drivers/crypto/Makefile                       |    1 +
 drivers/crypto/keembay/Kconfig                |   20 +
 drivers/crypto/keembay/Makefile               |    5 +
 drivers/crypto/keembay/keembay-ocs-hcu-core.c | 1484 +++++++++++++++++
 drivers/crypto/keembay/ocs-hcu.c              |  593 +++++++
 drivers/crypto/keembay/ocs-hcu.h              |  115 ++
 7 files changed, 2220 insertions(+)
 create mode 100644 drivers/crypto/keembay/Kconfig
 create mode 100644 drivers/crypto/keembay/Makefile
 create mode 100644 drivers/crypto/keembay/keembay-ocs-hcu-core.c
 create mode 100644 drivers/crypto/keembay/ocs-hcu.c
 create mode 100644 drivers/crypto/keembay/ocs-hcu.h

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index c2950127def6..e636be70b18d 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -881,4 +881,6 @@ config CRYPTO_DEV_SA2UL
          used for crypto offload.  Select this if you want to use hardware
          acceleration for cryptographic algorithms on these devices.
 
+source "drivers/crypto/keembay/Kconfig"
+
 endif # CRYPTO_HW
diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile
index 53fc115cf459..fff9a70348e1 100644
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
@@ -51,3 +51,4 @@ obj-$(CONFIG_CRYPTO_DEV_ARTPEC6) += axis/
 obj-$(CONFIG_CRYPTO_DEV_ZYNQMP_AES) += xilinx/
 obj-y += hisilicon/
 obj-$(CONFIG_CRYPTO_DEV_AMLOGIC_GXL) += amlogic/
+obj-y += keembay/
diff --git a/drivers/crypto/keembay/Kconfig b/drivers/crypto/keembay/Kconfig
new file mode 100644
index 000000000000..db8954dcd057
--- /dev/null
+++ b/drivers/crypto/keembay/Kconfig
@@ -0,0 +1,20 @@
+config CRYPTO_DEV_KEEMBAY_OCS_HCU
+       tristate "Support for Intel Keem Bay OCS HCU HW acceleration"
+       select CRYPTO_ENGINE
+       depends on OF || COMPILE_TEST
+       help
+         Support for Intel Keem Bay Offload and Crypto Subsystem (OCS) Hash
+         Control Unit (HCU) hardware acceleration for use with Crypto API.
+
+         Provides OCS HCU hardware acceleration of sha256, sha384, sha512, and
+         sm3, as well as the HMAC variant of these algorithms.
+
+config CRYPTO_DEV_KEEMBAY_OCS_HCU_HMAC_SHA224
+       bool "Enable sha224 and hmac(sha224) support in Intel Keem Bay OCS HCU"
+       depends on CRYPTO_DEV_KEEMBAY_OCS_HCU
+       help
+         Enables support for sha224 and hmac(sha224) algorithms in the Intel
+         Keem Bay OCS HCU driver. Intel recommends not to use these
+         algorithms.
+
+         Provides OCS HCU hardware acceleration of sha224 and hmac(sha224).
diff --git a/drivers/crypto/keembay/Makefile b/drivers/crypto/keembay/Makefile
new file mode 100644
index 000000000000..2aa2647fab1b
--- /dev/null
+++ b/drivers/crypto/keembay/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for Keem Bay OCS Crypto API Linux drivers
+#
+obj-$(CONFIG_CRYPTO_DEV_KEEMBAY_OCS_HCU) += keembay-ocs-hcu.o
+keembay-ocs-hcu-objs := keembay-ocs-hcu-core.o ocs-hcu.o
diff --git a/drivers/crypto/keembay/keembay-ocs-hcu-core.c b/drivers/crypto/keembay/keembay-ocs-hcu-core.c
new file mode 100644
index 000000000000..2c8f8f331501
--- /dev/null
+++ b/drivers/crypto/keembay/keembay-ocs-hcu-core.c
@@ -0,0 +1,1484 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel Keem Bay OCS HCU Crypto Driver.
+ *
+ * Copyright (C) 2018-2020 Intel Corporation
+ */
+
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+
+#include <crypto/engine.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/sm3.h>
+#include <crypto/hmac.h>
+#include <crypto/internal/hash.h>
+
+#include "ocs-hcu.h"
+
+/* Driver name string. */
+#define DRV_NAME       "keembay-ocs-hcu"
+
+/*
+ * Request status flags. These are bit flags kept in the 'flags' field of
+ * struct ocs_hcu_rctx.
+ */
+
+/* Flag marking a final request. */
+#define REQ_FINAL                      BIT(0)
+/* Flag set when there is data to be processed by the HW. */
+#define REQ_FLAGS_DO_DATA              BIT(1)
+/* Flag set when there is intermediate data from a previous request. */
+#define REQ_FLAGS_INTERMEDIATE_DATA    BIT(2)
+/* Flag marking a HMAC request. */
+#define REQ_FLAGS_HMAC                 BIT(3)
+/* Flag set when HW HMAC is being used. */
+#define REQ_FLAGS_HMAC_HW              BIT(4)
+/* Flag set when SW HMAC is being used. */
+#define REQ_FLAGS_HMAC_SW              BIT(5)
+/* Flag set when HW HMAC key has been already set (only used in HW HMAC). */
+#define REQ_FLAGS_HMAC_HW_KEY_SET      BIT(6)
+/* Flag set when OPAD computing is pending (only used in SW HMAC). */
+#define REQ_FLAGS_HMAC_SW_DO_OPAD      BIT(7)
+
+/*
+ * Alignment value and the mask derived from it.
+ * NOTE(review): neither is referenced in the part of the file visible here;
+ * presumably used as the alignmask when the algorithms are registered -
+ * confirm.
+ */
+#define KMB_OCS_HCU_ALIGNMENT          8
+#define KMB_OCS_HCU_ALIGN_MASK         (KMB_OCS_HCU_ALIGNMENT - 1)
+
+/**
+ * struct ocs_hcu_ctx: OCS HCU Transform context.
+ * @engine_ctx:  Crypto Engine context.
+ * @hcu_dev:     The OCS HCU device used by the transformation; filled in by
+ *               kmb_ocs_hcu_find_dev() the first time a device is looked up.
+ * @key:         The key (used only for HMAC transformations). The array is
+ *               one SHA512 block long and is zero-padded in place before the
+ *               key is used (see kmb_ocs_hcu_set_hw() and prepare_ipad()).
+ * @key_len:     The length of the key. prepare_ipad() sets it to the block
+ *               size after padding the key.
+ * @is_sm3_tfm:  Whether or not this is an SM3 transformation; used by
+ *               kmb_ocs_hcu_init() to tell SM3 from SHA256, which share the
+ *               same digest size.
+ * @is_hmac_tfm: Whether or not this is a HMAC transformation.
+ */
+struct ocs_hcu_ctx {
+       struct crypto_engine_ctx engine_ctx;
+       struct ocs_hcu_dev *hcu_dev;
+       u8 key[SHA512_BLOCK_SIZE];
+       size_t key_len;
+       bool is_sm3_tfm;
+       bool is_hmac_tfm;
+};
+
+/**
+ * struct dma_buf - DMA-able buffer.
+ * @dma_addr:  The DMA address of the buffer.
+ * @vaddr:     The Kernel virtual address of the buffer; NULL when no buffer is
+ *             allocated.
+ * @size:      The size of the buffer.
+ *
+ * The buffer is allocated with dma_alloc_coherent() in
+ * kmb_ocs_init_dma_list() and is zeroed and released in
+ * kmb_ocs_free_dma_list().
+ *
+ * NOTE(review): the tag 'dma_buf' is also used by <linux/dma-buf.h>; that
+ * header is not included here, but confirm the name cannot clash.
+ */
+struct dma_buf {
+       dma_addr_t dma_addr;
+       u8         *vaddr;
+       size_t     size;
+};
+
+/**
+ * struct ocs_dma_list - OCS-specific DMA linked list.
+ * @head:      The head of the list (points to the array backing the list).
+ * @tail:      The current tail of the list; NULL if the list is empty.
+ * @dma_addr:  The DMA address of @head (i.e., the DMA address of the backing
+ *             array).
+ * @nents:     Maximum number of entries in the list (i.e., number of elements
+ *             in the backing array).
+ *
+ * The OCS DMA list is an array-backed list of OCS DMA descriptors. The array
+ * backing the list is allocated with dma_alloc_coherent() and pointed by
+ * @head.
+ *
+ * Descriptors are appended with kmb_ocs_add_dma_tail(): the first one is
+ * stored at @head and each following one in the array element right after the
+ * previous tail.
+ */
+struct ocs_dma_list {
+       struct ocs_dma_desc     *head;
+       struct ocs_dma_desc     *tail;
+       dma_addr_t              dma_addr;
+       size_t                  nents;
+};
+
+/**
+ * struct ocs_hcu_rctx - Context for the request.
+ * @flags:          Flags tracking request status (REQ_FINAL and REQ_FLAGS_*
+ *                  bits).
+ * @algo:           Algorithm to use for the request.
+ * @blk_sz:         Block size of the transformation / request.
+ * @dig_sz:         Digest size of the transformation / request.
+ * @dma_list:       OCS DMA linked list.
+ * @hcu_dev:        OCS HCU device to be used to service the request.
+ * @idata:          Intermediate data (used to store intermediate results).
+ * @buffer:         Buffer to store: partial block of data and SW HMAC
+ *                  artifacts (ipad, opad, etc.).
+ * @buf_cnt:        Number of bytes currently stored in the buffer.
+ * @dma_buf:        DMA-able copy of buffer.
+ * @sg:             Head of the scatterlist entries containing data.
+ * @sg_data_total:  Total data in the SG list at any time.
+ * @sg_data_offset: Offset into the data of the current individual SG node.
+ * @sg_dma_nents:   Number of sg entries mapped in dma_list.
+ *
+ * The whole structure is zeroed by kmb_ocs_hcu_init() at the start of each
+ * hash operation.
+ */
+struct ocs_hcu_rctx {
+       u32                     flags;
+       u32                     algo;
+       size_t                  blk_sz;
+       size_t                  dig_sz;
+       struct ocs_dma_list     dma_list;
+       struct ocs_hcu_dev      *hcu_dev;
+       struct ocs_hcu_idata    idata;
+       /*
+        * Buffer is double the block size because we need space for SW HMAC
+        * artifacts, i.e:
+        * - ipad (1 block) + a possible partial block of data.
+        * - opad (1 block) + digest of H(k ^ ipad || m)
+        */
+       u8                      buffer[2 * SHA512_BLOCK_SIZE];
+       size_t                  buf_cnt;
+       struct dma_buf          dma_buf;
+       struct scatterlist      *sg;
+       unsigned int            sg_data_total;
+       unsigned int            sg_data_offset;
+       unsigned int            sg_dma_nents;
+};
+
+/**
+ * struct ocs_hcu_drv - Driver data
+ * @dev_list:  The list of HCU devices; kmb_ocs_hcu_find_dev() picks the first
+ *             entry of this list.
+ * @lock:      The lock protecting dev_list (taken with spin_lock_bh() by
+ *             kmb_ocs_hcu_find_dev()).
+ */
+struct ocs_hcu_drv {
+       struct list_head dev_list;
+       spinlock_t lock; /* Protects dev_list. */
+};
+
+/*
+ * The driver-wide data instance, statically initialized with an empty device
+ * list and an unlocked spinlock.
+ */
+static struct ocs_hcu_drv ocs_hcu = {
+       .dev_list = LIST_HEAD_INIT(ocs_hcu.dev_list),
+       .lock = __SPIN_LOCK_UNLOCKED(ocs_hcu.lock),
+};
+
+/*
+ * Compute how much data the request currently holds overall: the bytes
+ * sitting in the request buffer plus the bytes still pending in the sg list.
+ */
+static inline unsigned int kmb_get_total_data(struct ocs_hcu_rctx *rctx)
+{
+       size_t total = rctx->buf_cnt;
+
+       total += rctx->sg_data_total;
+
+       return total;
+}
+
+/*
+ * Drain whatever is left in the scatter-gather list into the request buffer,
+ * appending after the bytes already stored there.
+ *
+ * Return: 0 on success; -EINVAL if the pending sg data does not fit in the
+ * free part of the buffer, or if the sg list ends while sg_data_total still
+ * reports pending data.
+ */
+static int flush_sg_to_ocs_buffer(struct ocs_hcu_rctx *rctx)
+{
+       const size_t room = sizeof(rctx->buffer) - rctx->buf_cnt;
+       unsigned int in_node;
+       size_t chunk;
+
+       if (rctx->sg_data_total > room) {
+               WARN(1, "sg data does not fit in buffer\n");
+               return -EINVAL;
+       }
+
+       while (rctx->sg_data_total != 0) {
+               if (!rctx->sg) {
+                       WARN(1, "sg == NULL, but sg_data_total != 0\n");
+                       return -EINVAL;
+               }
+
+               /* Bytes of the current node that have not been consumed. */
+               in_node = rctx->sg->length - rctx->sg_data_offset;
+               if (in_node == 0) {
+                       /* Node exhausted: move on to the following one. */
+                       rctx->sg = sg_next(rctx->sg);
+                       rctx->sg_data_offset = 0;
+                       continue;
+               }
+
+               /* Copy no more than the node holds or the request needs. */
+               chunk = min(in_node, rctx->sg_data_total);
+               scatterwalk_map_and_copy(rctx->buffer + rctx->buf_cnt,
+                                        rctx->sg, rctx->sg_data_offset,
+                                        chunk, 0);
+
+               rctx->buf_cnt += chunk;
+               rctx->sg_data_total -= chunk;
+               rctx->sg_data_offset += chunk;
+       }
+
+       return 0;
+}
+
+/*
+ * Return the HCU device bound to the transform of @req.
+ *
+ * If the transform context has no device yet, bind it to the first device in
+ * the driver device list (there should be one and only one device). A warning
+ * is emitted, and NULL returned, when the list is empty.
+ */
+static struct ocs_hcu_dev *kmb_ocs_hcu_find_dev(struct ahash_request *req)
+{
+       struct ocs_hcu_ctx *tctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+
+       if (!tctx->hcu_dev) {
+               /* No device bound so far: take the first one registered. */
+               spin_lock_bh(&ocs_hcu.lock);
+               tctx->hcu_dev = list_first_entry_or_null(&ocs_hcu.dev_list,
+                                                        struct ocs_hcu_dev,
+                                                        list);
+               spin_unlock_bh(&ocs_hcu.lock);
+
+               WARN(!tctx->hcu_dev, "No HCU device available\n");
+       }
+
+       return tctx->hcu_dev;
+}
+
+/*
+ * Release the DMA resources of a request: the DMA-able copy of the context
+ * buffer (wiped before being freed) and the OCS DMA linked list, unmapping
+ * the request sg entries if any were mapped. Both descriptors in @rctx are
+ * zeroed afterwards, so the function can be called again safely.
+ */
+static void kmb_ocs_free_dma_list(struct ocs_hcu_rctx *rctx)
+{
+       struct ocs_hcu_dev *hcu_dev = rctx->hcu_dev;
+       struct device *dev = hcu_dev->dev;
+       struct ocs_dma_list *list = &rctx->dma_list;
+       struct dma_buf *buf = &rctx->dma_buf;
+
+       /* DMA-able context buffer: present only if it was allocated. */
+       if (buf->vaddr) {
+               memzero_explicit(buf->vaddr, buf->size);
+               dma_free_coherent(dev, buf->size, buf->vaddr, buf->dma_addr);
+               memset(buf, 0, sizeof(*buf));
+       }
+
+       /* DMA linked list: present only if the backing array exists. */
+       if (list->head) {
+               if (rctx->sg_dma_nents)
+                       dma_unmap_sg(dev, hcu_dev->req->src,
+                                    rctx->sg_dma_nents, DMA_TO_DEVICE);
+
+               dma_free_coherent(dev, sizeof(*list->head) * list->nents,
+                                 list->head, list->dma_addr);
+
+               memset(list, 0, sizeof(*list));
+       }
+}
+
+/*
+ * Append a descriptor for the region (@addr, @len) to the OCS DMA list of
+ * @rctx.
+ *
+ * Return: 0 on success (a zero @len is silently ignored); -EINVAL if @addr has
+ * bits outside OCS_HCU_DMA_BIT_MASK; -ENOMEM if the backing array is full.
+ */
+static int kmb_ocs_add_dma_tail(struct ocs_hcu_rctx *rctx,
+                               dma_addr_t addr, size_t len)
+{
+       struct device *dev = rctx->hcu_dev->dev;
+       struct ocs_dma_list *list = &rctx->dma_list;
+       struct ocs_dma_desc *prev = list->tail;
+       struct ocs_dma_desc *next;
+       ptrdiff_t idx;
+
+       /* Never create an empty descriptor. */
+       if (len == 0)
+               return 0;
+
+       if (addr & ~OCS_HCU_DMA_BIT_MASK) {
+               dev_err(dev,
+                       "Unexpected error: Invalid DMA address for OCS HCU\n");
+               return -EINVAL;
+       }
+
+       /* The first descriptor goes at the head, later ones after the tail. */
+       if (prev)
+               next = prev + 1;
+       else
+               next = list->head;
+
+       idx = next - list->head;
+       if (idx >= list->nents) {
+               dev_err(dev, "Unexpected error: OCS DMA list is full\n");
+               return -ENOMEM;
+       }
+
+       /*
+        * When appending to a non-empty list, the previous tail stops being
+        * the terminator and gets linked to the DMA address of the new one.
+        */
+       if (prev) {
+               prev->ll_flags &= ~OCS_LL_DMA_FLAG_TERMINATE;
+               prev->nxt_desc = list->dma_addr + sizeof(*next) * idx;
+       }
+
+       /* The new descriptor terminates the list. */
+       next->src_adr = (u32)addr;
+       next->src_len = (u32)len;
+       next->ll_flags = OCS_LL_DMA_FLAG_TERMINATE;
+       next->nxt_desc = 0;
+
+       list->tail = next;
+
+       return 0;
+}
+
+/*
+ * Initialize OCS DMA list:
+ * - Allocate DMA linked list (number of elements: SG entries to process +
+ *   context buffer, if not empty).
+ * - Map the SG entries to process for DMA.
+ * - Allocate DMA-able context buffer (if needed) and add it to the DMA list.
+ * - Add SG entries to the DMA list.
+ *
+ * Note: if this is a final request, we process all the data in the SG list,
+ * otherwise we can only process up to the maximum amount of block-aligned data
+ * (the remainder will be put into the context buffer and processed in the next
+ * request).
+ *
+ * Return: 0 on success (including when the request holds no data at all), a
+ * negative error code otherwise. On failure, everything set up by this
+ * function is released with kmb_ocs_free_dma_list().
+ */
+static int kmb_ocs_init_dma_list(struct ocs_hcu_dev *hcu_dev)
+{
+       struct ahash_request *req = hcu_dev->req;
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+       struct device *dev = hcu_dev->dev;
+       struct scatterlist *sg = req->src;
+       unsigned int remainder = 0;
+       unsigned int total;
+       size_t nents;
+       size_t count;
+       int sg_nents;
+       int rc;
+       int i;
+
+       total = kmb_get_total_data(rctx);
+       if (!total)
+               return 0;
+
+       /*
+        * If this is not a final DMA (terminated DMA), the data passed to the
+        * HCU must be aligned to the block size; compute the remainder data to
+        * be processed in the next request.
+        */
+       if (!(rctx->flags & REQ_FINAL))
+               remainder = total % rctx->blk_sz;
+
+       /*
+        * Determine the number of scatter gather list nodes to process.
+        * sg_nents_for_len() reports a negative error code when the list is
+        * shorter than the requested length: do not use that as a count.
+        */
+       sg_nents = sg_nents_for_len(sg, rctx->sg_data_total - remainder);
+       if (sg_nents < 0) {
+               dev_err(dev, "Invalid number of SG entries\n");
+               return sg_nents;
+       }
+       nents = sg_nents;
+       rctx->sg_dma_nents = nents;
+
+       /* Add extra DMA entry for data in context buffer, if any. */
+       if (rctx->buf_cnt)
+               nents++;
+
+       /* Total size of the DMA list to allocate. */
+       rctx->dma_list.head =
+               dma_alloc_coherent(dev, sizeof(*rctx->dma_list.head) * nents,
+                                  &rctx->dma_list.dma_addr, GFP_KERNEL);
+       if (!rctx->dma_list.head) {
+               dev_err(dev, "Failed to allocated DMA list\n");
+               return -ENOMEM;
+       }
+       rctx->dma_list.nents = nents;
+       rctx->dma_list.tail = NULL;
+
+       rc = dma_map_sg(hcu_dev->dev, req->src, rctx->sg_dma_nents,
+                       DMA_TO_DEVICE);
+       if (rc != rctx->sg_dma_nents) {
+               dev_err(dev, "Failed to MAP SG\n");
+               /*
+                * A zero return means nothing was mapped: make sure the
+                * cleanup path does not unmap entries that were never mapped.
+                */
+               if (!rc)
+                       rctx->sg_dma_nents = 0;
+               rc = -ENOMEM;
+               goto cleanup;
+       }
+
+       /*
+        * If context buffer is not empty, create DMA-able buffer for it, copy
+        * context to new buffer and add it to the DMA list.
+        */
+       if (rctx->buf_cnt) {
+               rctx->dma_buf.size = rctx->buf_cnt;
+               rctx->dma_buf.vaddr =
+                       dma_alloc_coherent(dev, rctx->dma_buf.size,
+                                          &rctx->dma_buf.dma_addr,
+                                          GFP_KERNEL);
+               if (!rctx->dma_buf.vaddr) {
+                       dev_err(dev, "Failed to allocate DMA Buf\n");
+                       /* 'rc' still holds the dma_map_sg() count here. */
+                       rc = -ENOMEM;
+                       goto cleanup;
+               }
+               memcpy(rctx->dma_buf.vaddr, rctx->buffer, rctx->buf_cnt);
+               rc = kmb_ocs_add_dma_tail(rctx, rctx->dma_buf.dma_addr,
+                                         rctx->dma_buf.size);
+               if (rc)
+                       goto cleanup;
+               rctx->buf_cnt = 0;
+       }
+
+       /* Add the SG nodes to be processed to the DMA linked list. */
+       for_each_sg(req->src, rctx->sg, rctx->sg_dma_nents, i) {
+               count = min(rctx->sg_data_total - remainder,
+                           rctx->sg->length - rctx->sg_data_offset);
+               /*
+                * Do not create a zero length DMA descriptor. Check in case of
+                * zero length SG node.
+                */
+               if (count == 0)
+                       continue;
+               /* Add sg to HCU DMA list. */
+               rc = kmb_ocs_add_dma_tail(rctx, rctx->sg->dma_address,
+                                         count);
+               if (rc)
+                       goto cleanup;
+
+               /* Update amount of data remaining in SG list. */
+               rctx->sg_data_total -= count;
+
+               /*
+                * If remaining data is equal to remainder (note: 'less than'
+                * case will never happen in practice), we are done: update
+                * offset and exit the loop.
+                */
+               if (rctx->sg_data_total <= remainder) {
+                       rctx->sg_data_offset += count;
+                       break;
+               }
+
+               /*
+                * If we get here is because we need to process the next sg in
+                * the list; set offset within the sg to 0.
+                */
+               rctx->sg_data_offset = 0;
+       }
+
+       return 0;
+cleanup:
+       dev_err(dev, "Failed to map DMA buffer.\n");
+       kmb_ocs_free_dma_list(rctx);
+
+       return rc;
+}
+
+/*
+ * Wipe the sensitive material tied to the request currently assigned to
+ * @hcu_dev: the request buffer and the key held by the hardware.
+ *
+ * The copy of the key kept in the tfm context (ctx->key) is not touched here;
+ * it is meant to be cleared when the tfm is de-initialized (i.e., in
+ * kmb_ocs_hcu_hmac_cra_exit()).
+ */
+static void kmb_ocs_hcu_secure_cleanup(struct ocs_hcu_dev *hcu_dev)
+{
+       struct ahash_request *req = hcu_dev->req;
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+
+       /* Scrub the request buffer first, then the key in HW. */
+       memzero_explicit(&rctx->buffer[0], sizeof(rctx->buffer));
+       ocs_hcu_clear_key(hcu_dev);
+}
+
+/*
+ * Complete the request currently assigned to @hcu_dev.
+ *
+ * @error is the status of the processing done so far. When it is zero, the
+ * intermediate data (including the digest) is read back from the hardware;
+ * then, for a final request the digest is copied to the request result, while
+ * for a non-final request the state is saved for the next request. In all
+ * cases the hardware is disabled, the DMA resources are released and the
+ * request is finalized with the resulting status.
+ */
+static void kmb_ocs_hcu_finish_request(struct ocs_hcu_dev *hcu_dev, int error)
+{
+       struct ahash_request *req = hcu_dev->req;
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+
+       if (error)
+               goto exit;
+
+       /* Get the intermediate data (including digest). */
+       error = ocs_hcu_get_intermediate_data(hcu_dev, &rctx->idata,
+                                             rctx->algo);
+       if (error)
+               goto exit;
+
+       if (rctx->flags & REQ_FINAL) {
+               /*
+                * Final request: the digest in the intermediate data is
+                * actually the final digest, so hand it to the caller.
+                */
+               memcpy(req->result, rctx->idata.digest, rctx->dig_sz);
+       } else {
+               /*
+                * Not the final request:
+                * - flag the intermediate data, so that it is written back to
+                *   HW when the next request is processed;
+                * - move what is left in the SG list to the request buffer,
+                *   so that it is processed during the next request.
+                */
+               rctx->flags |= REQ_FLAGS_INTERMEDIATE_DATA;
+               error = flush_sg_to_ocs_buffer(rctx);
+       }
+
+exit:
+       /* Wipe secrets once the hash is over, successfully or not. */
+       if (error || (rctx->flags & REQ_FINAL))
+               kmb_ocs_hcu_secure_cleanup(hcu_dev);
+       ocs_hcu_hw_disable(hcu_dev);
+       kmb_ocs_free_dma_list(rctx);
+       crypto_finalize_hash_request(hcu_dev->engine, req, error);
+}
+
+/*
+ * Get the hardware ready to process the current request with algorithm @algo:
+ * wait for the HCU to be idle, configure it, restore the intermediate data of
+ * a previous request (if flagged) and, for HW HMAC, write the key once.
+ *
+ * Return: 0 on success; -EIO if the hardware stays busy; -EINVAL if the key
+ * is longer than HCU_HW_KEY_LEN; otherwise the error reported by
+ * ocs_hcu_hw_cfg() or ocs_hcu_write_key().
+ */
+static int kmb_ocs_hcu_set_hw(struct ocs_hcu_dev *hcu_dev, int algo)
+{
+       struct ahash_request *req = hcu_dev->req;
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+       struct ocs_hcu_ctx *ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+       struct device *dev = hcu_dev->dev;
+       int rc;
+
+       if (ocs_hcu_wait_busy(hcu_dev)) {
+               dev_err(dev, "%s HW Busy, cannot continue.\n", __func__);
+               return -EIO;
+       }
+
+       /* Configure the hardware for the current request. */
+       rc = ocs_hcu_hw_cfg(hcu_dev, algo);
+       if (rc)
+               return rc;
+
+       /* Resume from the state saved by the previous request, if any. */
+       if (rctx->flags & REQ_FLAGS_INTERMEDIATE_DATA) {
+               ocs_hcu_set_intermediate_data(hcu_dev, &rctx->idata, algo);
+               rctx->flags &= ~REQ_FLAGS_INTERMEDIATE_DATA;
+       }
+
+       /* The key is needed only by HW HMAC, and only if not set already. */
+       if (!(rctx->flags & REQ_FLAGS_HMAC_HW))
+               return 0;
+       if (rctx->flags & REQ_FLAGS_HMAC_HW_KEY_SET)
+               return 0;
+
+       if (ctx->key_len > HCU_HW_KEY_LEN) {
+               dev_err(dev,
+                       "Unexpected error: key len > HCU_HW_KEY_LEN\n");
+               return -EINVAL;
+       }
+
+       /*
+        * Hardware requires all the bytes of the HW Key vector to be written.
+        * So pad with zero until we reach HCU_HW_KEY_LEN.
+        */
+       memzero_explicit(ctx->key + ctx->key_len,
+                        HCU_HW_KEY_LEN - ctx->key_len);
+       rc = ocs_hcu_write_key(hcu_dev, ctx->key, HCU_HW_KEY_LEN);
+       if (rc)
+               return rc;
+
+       rctx->flags |= REQ_FLAGS_HMAC_HW_KEY_SET;
+
+       return 0;
+}
+
+/*
+ * Hash @sz bytes of @buf through ocs_hcu_hash_cpu(), after setting up the
+ * hardware for @algo; @term is passed on unchanged.
+ *
+ * Return: 0 on success; the error from kmb_ocs_hcu_set_hw() if the hardware
+ * setup fails; -EIO if ocs_hcu_hash_cpu() returns anything other than @sz.
+ */
+static int kmb_ocs_hcu_hash_pio(struct ocs_hcu_dev *hcu_dev,
+                               u8 *buf, u32 sz, u32 algo, bool term)
+{
+       /* Configure the hardware for the current request. */
+       int rc = kmb_ocs_hcu_set_hw(hcu_dev, algo);
+
+       if (rc)
+               return rc;
+
+       return (sz != ocs_hcu_hash_cpu(hcu_dev, buf, sz, algo, term)) ?
+               -EIO : 0;
+}
+
+/*
+ * Start the DMA transfer of the OCS DMA list built for the current request.
+ *
+ * Return: 0 if there is no DMA list or the transfer was started; otherwise the
+ * error from kmb_ocs_hcu_set_hw() or ocs_hcu_ll_dma_start().
+ */
+static int kmb_ocs_hcu_tx_dma(struct ocs_hcu_dev *hcu_dev)
+{
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(hcu_dev->req);
+       int rc;
+
+       /* No list, nothing to transfer. */
+       if (!rctx->dma_list.head)
+               return 0;
+
+       /* Configure the hardware for the current request. */
+       rc = kmb_ocs_hcu_set_hw(hcu_dev, rctx->algo);
+       if (rc)
+               return rc;
+
+       /*
+        * Start the DMA engine with the descriptor address stored; the
+        * transfer is flagged as terminated only for a final request.
+        */
+       rc = ocs_hcu_ll_dma_start(hcu_dev, rctx->dma_list.dma_addr,
+                                 (rctx->flags & REQ_FINAL) != 0);
+
+       return rc;
+}
+
+/*
+ * Hand @req over to the crypto engine of the HCU device serving it.
+ *
+ * Return: -ENOENT if no HCU device is available; otherwise the value returned
+ * by crypto_transfer_hash_request_to_engine().
+ */
+static int kmb_ocs_hcu_handle_queue(struct ahash_request *req)
+{
+       struct ocs_hcu_dev *hcu_dev;
+
+       hcu_dev = kmb_ocs_hcu_find_dev(req);
+       if (hcu_dev)
+               return crypto_transfer_hash_request_to_engine(hcu_dev->engine,
+                                                             req);
+
+       return -ENOENT;
+}
+
+/*
+ * Prepare a request before it is processed: record it as the current request
+ * of the HCU device and, if it carries data, build the OCS DMA list for it
+ * and flag the pending data.
+ *
+ * Return: 0 on success; -ENOENT if no HCU device is available; otherwise the
+ * error from kmb_ocs_init_dma_list().
+ */
+static int kmb_ocs_hcu_prepare_request(struct crypto_engine *engine, void *areq)
+{
+       struct ahash_request *req = container_of(areq, struct ahash_request,
+                                                base);
+       struct ocs_hcu_dev *hcu_dev = kmb_ocs_hcu_find_dev(req);
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+       int rc;
+
+       if (!hcu_dev)
+               return -ENOENT;
+
+       hcu_dev->req = req;
+
+       /*
+        * A request without data (i.e., a final request without data) needs
+        * no further preparation.
+        */
+       if (kmb_get_total_data(rctx) != 0) {
+               /* Move the data into the HCU DMA linked list. */
+               rc = kmb_ocs_init_dma_list(hcu_dev);
+               if (rc)
+                       return rc;
+
+               /* Set the flag to notify of pending data. */
+               rctx->flags |= REQ_FLAGS_DO_DATA;
+       }
+
+       return 0;
+}
+
+/*
+ * Finalize the hash of the current request when there is no more data to
+ * send: set up the hardware and call ocs_hcu_tx_data_done().
+ *
+ * Return: 0 on success, or the error from kmb_ocs_hcu_set_hw().
+ */
+static int kmb_ocs_hcu_do_final(struct ocs_hcu_dev *hcu_dev)
+{
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(hcu_dev->req);
+       int rc = kmb_ocs_hcu_set_hw(hcu_dev, rctx->algo);
+
+       if (!rc)
+               ocs_hcu_tx_data_done(hcu_dev);
+
+       return rc;
+}
+
+/*
+ * Prepare the inner pad (k ^ ipad) for SW-assisted HMAC and store it as the
+ * first block of the request buffer.
+ *
+ * The key in the tfm context is zero-padded in place up to the block size and
+ * its recorded length is updated accordingly.
+ * NOTE(review): the key and its length live in the tfm context, not in the
+ * request context; confirm that updating them here is safe with respect to
+ * other requests using the same tfm.
+ *
+ * Return: 0 on success; -EINVAL if the key is longer than the block size
+ * (which should never happen, since longer keys are expected to be hashed at
+ * setkey time).
+ */
+static int prepare_ipad(struct ahash_request *req)
+{
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+       struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+       struct ocs_hcu_ctx *ctx = crypto_ahash_ctx(tfm);
+       size_t i;
+
+       WARN(rctx->buf_cnt != 0, "rctx->buf_cnt = %zu\n", rctx->buf_cnt);
+       WARN(!(rctx->flags & REQ_FLAGS_HMAC_SW), "Not HMAC SW");
+       /*
+        * Key length must be equal to block size. If key is shorter,
+        * we pad it with zero (note: key cannot be longer, since
+        * longer keys are hashed by kmb_ocs_hcu_setkey()).
+        *
+        * Bail out if the invariant is broken anyway: the padding length
+        * below would otherwise underflow and wipe memory past the key.
+        */
+       if (WARN_ON(ctx->key_len > rctx->blk_sz))
+               return -EINVAL;
+       memzero_explicit(&ctx->key[ctx->key_len],
+                        rctx->blk_sz - ctx->key_len);
+       ctx->key_len = rctx->blk_sz;
+       /*
+        * Prepare IPAD for HMAC. Only done for first block.
+        * HMAC(k,m) = H(k ^ opad || H(k ^ ipad || m))
+        * k ^ ipad will be first hashed block.
+        * k ^ opad will be calculated in the final request.
+        * Only needed if not using HW HMAC.
+        */
+       for (i = 0; i < rctx->blk_sz; i++)
+               rctx->buffer[i] = ctx->key[i] ^ HMAC_IPAD_VALUE;
+       rctx->buf_cnt = rctx->blk_sz;
+
+       return 0;
+}
+
+/*
+ * Process one request: initialize the hardware and either start the DMA
+ * transfer of the pending data or, for a final request without data,
+ * finalize the hash.
+ *
+ * Return: -ENOENT if no HCU device is available; -EINVAL if the request has
+ * neither pending data nor the final flag; otherwise the value returned by
+ * kmb_ocs_hcu_tx_dma() or kmb_ocs_hcu_do_final().
+ */
+static int kmb_ocs_hcu_do_one_request(struct crypto_engine *engine, void *areq)
+{
+       struct ahash_request *req = container_of(areq, struct ahash_request,
+                                                base);
+       struct ocs_hcu_dev *hcu_dev = kmb_ocs_hcu_find_dev(req);
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+
+       if (!hcu_dev)
+               return -ENOENT;
+
+       hcu_dev->req = req;
+
+       /* Initialize the hardware. */
+       ocs_hcu_hw_init(hcu_dev);
+
+       if (!(rctx->flags & REQ_FLAGS_DO_DATA)) {
+               /*
+                * No data to process: this is only legitimate for a final
+                * request, in which case the hash is finalized; anything else
+                * means something was wrong with the request.
+                */
+               if (rctx->flags & REQ_FINAL)
+                       return kmb_ocs_hcu_do_final(hcu_dev);
+
+               return -EINVAL;
+       }
+
+       /*
+        * There is data to process (i.e., this is an update request or a
+        * final request with data): consume the flag and process the data.
+        */
+       rctx->flags &= ~REQ_FLAGS_DO_DATA;
+
+       return kmb_ocs_hcu_tx_dma(hcu_dev);
+}
+
+/*
+ * Start a new hash operation: reset the request context and derive block
+ * size and algorithm from the digest size of the transform. The HMAC flag is
+ * set when the transform is a HMAC one.
+ *
+ * Return: 0 on success; -ENOENT if no HCU device is available; -EINVAL if the
+ * digest size is not one of the supported ones.
+ */
+static int kmb_ocs_hcu_init(struct ahash_request *req)
+{
+       struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+       struct ocs_hcu_ctx *ctx = crypto_ahash_ctx(tfm);
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+       struct ocs_hcu_dev *hcu_dev = kmb_ocs_hcu_find_dev(req);
+
+       if (!hcu_dev)
+               return -ENOENT;
+
+       /* Start from a clean request context. */
+       memset(rctx, 0, sizeof(*rctx));
+
+       rctx->hcu_dev = hcu_dev;
+       rctx->dig_sz = crypto_ahash_digestsize(tfm);
+
+       switch (rctx->dig_sz) {
+#ifdef CONFIG_CRYPTO_DEV_KEEMBAY_OCS_HCU_HMAC_SHA224
+       case SHA224_DIGEST_SIZE:
+               rctx->algo = OCS_HCU_ALGO_SHA224;
+               rctx->blk_sz = SHA224_BLOCK_SIZE;
+               break;
+#endif /* CONFIG_CRYPTO_DEV_KEEMBAY_OCS_HCU_HMAC_SHA224 */
+       case SHA256_DIGEST_SIZE:
+               /*
+                * SHA256 and SM3 share the digest size: the tfm context tells
+                * which of the two is wanted.
+                */
+               if (ctx->is_sm3_tfm)
+                       rctx->algo = OCS_HCU_ALGO_SM3;
+               else
+                       rctx->algo = OCS_HCU_ALGO_SHA256;
+               rctx->blk_sz = SHA256_BLOCK_SIZE;
+               break;
+       case SHA384_DIGEST_SIZE:
+               rctx->algo = OCS_HCU_ALGO_SHA384;
+               rctx->blk_sz = SHA384_BLOCK_SIZE;
+               break;
+       case SHA512_DIGEST_SIZE:
+               rctx->algo = OCS_HCU_ALGO_SHA512;
+               rctx->blk_sz = SHA512_BLOCK_SIZE;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* Clear the intermediate data. */
+       memzero_explicit(&rctx->idata, sizeof(rctx->idata));
+
+       /* Only HMAC transforms need the HMAC flag. */
+       if (ctx->is_hmac_tfm)
+               rctx->flags |= REQ_FLAGS_HMAC;
+
+       return 0;
+}
+
+/*
+ * Add the data of @req to the running hash.
+ *
+ * Small amounts of data are just accumulated in the request buffer; the
+ * request is queued to the crypto engine only when the data no longer fits
+ * there. For HMAC transforms, the first update with data switches the request
+ * to SW-assisted HMAC.
+ *
+ * Return: 0 if there is no data or the data was buffered; -EINVAL if the
+ * total amount of pending data would overflow; otherwise the value returned
+ * by prepare_ipad(), flush_sg_to_ocs_buffer() or kmb_ocs_hcu_handle_queue().
+ */
+static int kmb_ocs_hcu_update(struct ahash_request *req)
+{
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+       unsigned int new_total;
+       int rc;
+
+       if (!req->nbytes)
+               return 0;
+
+       /*
+        * Check for overflow. The sum goes to a temporary because
+        * check_add_overflow() stores the (wrapped) result even when it
+        * reports an overflow, and the request state must stay untouched on
+        * the error path.
+        */
+       if (check_add_overflow(rctx->sg_data_total, req->nbytes, &new_total))
+               return -EINVAL;
+       rctx->sg_data_total = new_total;
+
+       rctx->sg_data_offset = 0;
+       rctx->sg = req->src;
+
+       /*
+        * If we are doing HMAC, then we must use SW-assisted HMAC, since HW
+        * HMAC does not support context switching (so can only be used with
+        * finup() or digest()).
+        */
+       if (rctx->flags & REQ_FLAGS_HMAC &&
+           !(rctx->flags & REQ_FLAGS_HMAC_SW)) {
+               rctx->flags |= REQ_FLAGS_HMAC_SW;
+               rc = prepare_ipad(req);
+               if (rc)
+                       return rc;
+       }
+
+       /*
+        * If remaining sg_data fits into ctx buffer, just copy it there; we'll
+        * process it at the next update() or final().
+        */
+       if (rctx->sg_data_total <= (sizeof(rctx->buffer) - rctx->buf_cnt))
+               return flush_sg_to_ocs_buffer(rctx);
+
+       return kmb_ocs_hcu_handle_queue(req);
+}
+
+/*
+ * Finalize a hash / HMAC request.
+ *
+ * For HMAC requests that have not been forced to SW-assisted HMAC yet, this
+ * is where the choice between HW HMAC and SW-assisted HMAC is made.
+ *
+ * Return: the error from prepare_ipad() if it fails, otherwise the result of
+ * kmb_ocs_hcu_handle_queue().
+ */
+static int kmb_ocs_hcu_final(struct ahash_request *req)
+{
+       struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+       struct ocs_hcu_ctx *ctx = crypto_ahash_ctx(tfm);
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+       bool hmac_mode_pending;
+       int rc;
+
+       rctx->flags |= REQ_FINAL;
+       rctx->sg_data_offset = 0;
+       rctx->sg = req->src;
+
+       /* HMAC request for which SW HMAC has not been selected so far. */
+       hmac_mode_pending = (rctx->flags & REQ_FLAGS_HMAC) &&
+                           !(rctx->flags & REQ_FLAGS_HMAC_SW);
+       if (hmac_mode_pending) {
+               /*
+                * No data has been processed so far, so HW HMAC is still an
+                * option. It cannot be used for zero-length messages nor for
+                * keys longer than HCU_HW_KEY_LEN; in those cases fall back
+                * to SW-assisted HMAC.
+                */
+               if (!kmb_get_total_data(rctx) ||
+                   ctx->key_len > HCU_HW_KEY_LEN) {
+                       rctx->flags |= REQ_FLAGS_HMAC_SW;
+                       rc = prepare_ipad(req);
+                       if (rc)
+                               return rc;
+               } else {
+                       rctx->algo |= OCS_HCU_ALGO_HMAC_MASK;
+                       rctx->flags |= REQ_FLAGS_HMAC_HW;
+               }
+       }
+
+       /*
+        * With SW HMAC the OPAD step is still needed: request it, so that it
+        * is performed in the IRQ thread once the full message is hashed.
+        */
+       if (rctx->flags & REQ_FLAGS_HMAC_SW)
+               rctx->flags |= REQ_FLAGS_HMAC_SW_DO_OPAD;
+
+       return kmb_ocs_hcu_handle_queue(req);
+}
+
+/*
+ * Account for the data carried by 'req' and finalize the request.
+ *
+ * Return: -EINVAL if the running data total would overflow, otherwise the
+ * result of kmb_ocs_hcu_final().
+ */
+static int kmb_ocs_hcu_finup(struct ahash_request *req)
+{
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(req);
+       bool overflow;
+
+       /* Add the new bytes to the running total, detecting wrap-around. */
+       overflow = check_add_overflow(rctx->sg_data_total, req->nbytes,
+                                     &rctx->sg_data_total);
+       if (overflow)
+               return -EINVAL;
+
+       return kmb_ocs_hcu_final(req);
+}
+
+/*
+ * One-shot operation: initialize the request, then run finup() on it.
+ *
+ * Return: -ENOENT if no HCU device is found for the request, otherwise the
+ * error from kmb_ocs_hcu_init() or the result of kmb_ocs_hcu_finup().
+ */
+static int kmb_ocs_hcu_digest(struct ahash_request *req)
+{
+       int rc;
+
+       /* Bail out early if there is no device to serve the request. */
+       if (!kmb_ocs_hcu_find_dev(req))
+               return -ENOENT;
+
+       rc = kmb_ocs_hcu_init(req);
+       if (rc)
+               return rc;
+
+       return kmb_ocs_hcu_finup(req);
+}
+
+/*
+ * Export the state of 'req' to 'out'.
+ *
+ * The whole request context is the exported state, so it is copied as is.
+ * Always returns 0.
+ */
+static int kmb_ocs_hcu_export(struct ahash_request *req, void *out)
+{
+       const struct ocs_hcu_rctx *state = ahash_request_ctx(req);
+
+       memcpy(out, state, sizeof(*state));
+
+       return 0;
+}
+
+/*
+ * Import a previously exported state from 'in' into 'req'.
+ *
+ * The exported state is a full copy of the request context, so it simply
+ * overwrites the current one. Always returns 0.
+ */
+static int kmb_ocs_hcu_import(struct ahash_request *req, const void *in)
+{
+       struct ocs_hcu_rctx *state = ahash_request_ctx(req);
+
+       memcpy(state, in, sizeof(*state));
+
+       return 0;
+}
+
+/*
+ * Set the HMAC key of 'tfm'.
+ *
+ * Keys not longer than the algorithm block size are copied verbatim into the
+ * tfm context. Longer keys are hashed with the OCS hash matching the digest
+ * size of 'tfm', and the resulting digest becomes the key.
+ *
+ * Return: 0 on success, -EINVAL if the digest size is not supported, -ENOMEM
+ * on allocation failure, or the error reported while allocating the hash tfm
+ * or computing the digest.
+ */
+static int kmb_ocs_hcu_setkey(struct crypto_ahash *tfm, const u8 *key,
+                             unsigned int keylen)
+{
+       struct ocs_hcu_ctx *ctx = crypto_ahash_ctx(tfm);
+       unsigned int digestsize = crypto_ahash_digestsize(tfm);
+       size_t blk_sz = crypto_ahash_blocksize(tfm);
+       struct crypto_ahash *ahash_tfm;
+       struct ahash_request *req;
+       struct crypto_wait wait;
+       struct scatterlist sg;
+       const char *alg_name;
+       u8 *buf;
+       int rc;
+
+       /*
+        * Key length must be equal to block size:
+        * - If key is shorter, we are done for now (the key will be padded
+        *   later on); this is to maximize the use of HW HMAC (which works
+        *   only for keys <= 64 bytes).
+        * - If key is longer, we hash it.
+        */
+       if (keylen <= blk_sz) {
+               memcpy(ctx->key, key, keylen);
+               ctx->key_len = keylen;
+               return 0;
+       }
+
+       /* Pick the OCS hash used to shorten the key. */
+       switch (digestsize) {
+#ifdef CONFIG_CRYPTO_DEV_KEEMBAY_OCS_HCU_HMAC_SHA224
+       case SHA224_DIGEST_SIZE:
+               alg_name = "sha224-keembay-ocs";
+               break;
+#endif /* CONFIG_CRYPTO_DEV_KEEMBAY_OCS_HCU_HMAC_SHA224 */
+       case SHA256_DIGEST_SIZE:
+               /* SHA256 and SM3 share the digest size: ask the tfm context. */
+               alg_name = ctx->is_sm3_tfm ? "sm3-keembay-ocs" :
+                                            "sha256-keembay-ocs";
+               break;
+       case SHA384_DIGEST_SIZE:
+               alg_name = "sha384-keembay-ocs";
+               break;
+       case SHA512_DIGEST_SIZE:
+               alg_name = "sha512-keembay-ocs";
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       ahash_tfm = crypto_alloc_ahash(alg_name, 0, 0);
+       if (IS_ERR(ahash_tfm))
+               return PTR_ERR(ahash_tfm);
+
+       req = ahash_request_alloc(ahash_tfm, GFP_KERNEL);
+       if (!req) {
+               rc = -ENOMEM;
+               goto err_free_ahash;
+       }
+
+       crypto_init_wait(&wait);
+       ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+                                  crypto_req_done, &wait);
+       crypto_ahash_clear_flags(ahash_tfm, ~0);
+
+       /*
+        * Hash a private heap copy of the key rather than the caller's
+        * buffer (presumably because the caller's memory may not be suitable
+        * for a scatterlist -- NOTE(review): confirm).
+        */
+       buf = kmemdup(key, keylen, GFP_KERNEL);
+       if (!buf) {
+               rc = -ENOMEM;
+               goto err_free_req;
+       }
+
+       sg_init_one(&sg, buf, keylen);
+       /* The digest of the key is written straight into the tfm context. */
+       ahash_request_set_crypt(req, &sg, ctx->key, keylen);
+
+       rc = crypto_wait_req(crypto_ahash_digest(req), &wait);
+       if (rc == 0)
+               ctx->key_len = digestsize;
+
+       /* The copy holds raw key material: wipe it before freeing it. */
+       memzero_explicit(buf, keylen);
+       kfree(buf);
+err_free_req:
+       ahash_request_free(req);
+err_free_ahash:
+       crypto_free_ahash(ahash_tfm);
+
+       return rc;
+}
+
+/*
+ * Common tfm initialization: declare the per-request context size, zero the
+ * tfm context and install the crypto engine callbacks.
+ */
+static void __cra_init(struct crypto_tfm *tfm, struct ocs_hcu_ctx *ctx)
+{
+       struct crypto_ahash *ahash = __crypto_ahash_cast(tfm);
+
+       crypto_ahash_set_reqsize(ahash, sizeof(struct ocs_hcu_rctx));
+
+       /* Start from an all-zero context. */
+       memzero_explicit(ctx, sizeof(*ctx));
+
+       /* Hook up the engine operations. */
+       ctx->engine_ctx.op.prepare_request = kmb_ocs_hcu_prepare_request;
+       ctx->engine_ctx.op.do_one_request = kmb_ocs_hcu_do_one_request;
+}
+
+/* Initialize a tfm using a plain (non-HMAC, non-SM3) hash. Always returns 0. */
+static int kmb_ocs_hcu_sha_cra_init(struct crypto_tfm *tfm)
+{
+       __cra_init(tfm, crypto_tfm_ctx(tfm));
+
+       return 0;
+}
+
+/* Initialize a tfm using SM3 and mark its context as such. Always returns 0. */
+static int kmb_ocs_hcu_sm3_cra_init(struct crypto_tfm *tfm)
+{
+       struct ocs_hcu_ctx *sm3_ctx = crypto_tfm_ctx(tfm);
+
+       __cra_init(tfm, sm3_ctx);
+
+       /* Must be set after __cra_init(), which zeroes the context. */
+       sm3_ctx->is_sm3_tfm = true;
+
+       return 0;
+}
+
+/*
+ * Initialize a tfm using HMAC-SM3 and mark its context as both HMAC and SM3.
+ * Always returns 0.
+ */
+static int kmb_ocs_hcu_hmac_sm3_cra_init(struct crypto_tfm *tfm)
+{
+       struct ocs_hcu_ctx *hmac_ctx = crypto_tfm_ctx(tfm);
+
+       __cra_init(tfm, hmac_ctx);
+
+       /* Must be set after __cra_init(), which zeroes the context. */
+       hmac_ctx->is_hmac_tfm = true;
+       hmac_ctx->is_sm3_tfm = true;
+
+       return 0;
+}
+
+/* Initialize a tfm using HMAC and mark its context as such. Always returns 0. */
+static int kmb_ocs_hcu_hmac_cra_init(struct crypto_tfm *tfm)
+{
+       struct ocs_hcu_ctx *hmac_ctx = crypto_tfm_ctx(tfm);
+
+       __cra_init(tfm, hmac_ctx);
+
+       /* Must be set after __cra_init(), which zeroes the context. */
+       hmac_ctx->is_hmac_tfm = true;
+
+       return 0;
+}
+
+/*
+ * Called when a HMAC 'tfm' is de-initialized: wipe the key stored in its
+ * context so that it does not linger in memory.
+ */
+static void kmb_ocs_hcu_hmac_cra_exit(struct crypto_tfm *tfm)
+{
+       struct ocs_hcu_ctx *hmac_ctx = crypto_tfm_ctx(tfm);
+
+       memzero_explicit(hmac_ctx->key, sizeof(hmac_ctx->key));
+}
+
+/*
+ * Hash algorithms exposed by this driver.
+ *
+ * All entries share the same init/update/final/finup/digest/export/import
+ * callbacks, the same priority, flags and alignment mask, and declare
+ * sizeof(struct ocs_hcu_rctx) as their exported state size. The hmac(...)
+ * entries additionally provide .setkey and a .cra_exit callback. The sha224
+ * and hmac(sha224) entries are only built when
+ * CONFIG_CRYPTO_DEV_KEEMBAY_OCS_HCU_HMAC_SHA224 is defined.
+ */
+static struct ahash_alg ocs_hcu_algs[] = {
+#ifdef CONFIG_CRYPTO_DEV_KEEMBAY_OCS_HCU_HMAC_SHA224
+{
+       .init           = kmb_ocs_hcu_init,
+       .update         = kmb_ocs_hcu_update,
+       .final          = kmb_ocs_hcu_final,
+       .finup          = kmb_ocs_hcu_finup,
+       .digest         = kmb_ocs_hcu_digest,
+       .export         = kmb_ocs_hcu_export,
+       .import         = kmb_ocs_hcu_import,
+       .halg = {
+               .digestsize     = SHA224_DIGEST_SIZE,
+               .statesize      = sizeof(struct ocs_hcu_rctx),
+               .base   = {
+                       .cra_name               = "sha224",
+                       .cra_driver_name        = "sha224-keembay-ocs",
+                       .cra_priority           = 255,
+                       .cra_flags              = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize          = SHA224_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct ocs_hcu_ctx),
+                       .cra_alignmask          = KMB_OCS_HCU_ALIGN_MASK,
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = kmb_ocs_hcu_sha_cra_init,
+               }
+       }
+},
+{
+       .init           = kmb_ocs_hcu_init,
+       .update         = kmb_ocs_hcu_update,
+       .final          = kmb_ocs_hcu_final,
+       .finup          = kmb_ocs_hcu_finup,
+       .digest         = kmb_ocs_hcu_digest,
+       .export         = kmb_ocs_hcu_export,
+       .import         = kmb_ocs_hcu_import,
+       .setkey         = kmb_ocs_hcu_setkey,
+       .halg = {
+               .digestsize     = SHA224_DIGEST_SIZE,
+               .statesize      = sizeof(struct ocs_hcu_rctx),
+               .base   = {
+                       .cra_name               = "hmac(sha224)",
+                       .cra_driver_name        = "hmac-sha224-keembay-ocs",
+                       .cra_priority           = 255,
+                       .cra_flags              = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize          = SHA224_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct ocs_hcu_ctx),
+                       .cra_alignmask          = KMB_OCS_HCU_ALIGN_MASK,
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = kmb_ocs_hcu_hmac_cra_init,
+                       .cra_exit               = kmb_ocs_hcu_hmac_cra_exit,
+               }
+       }
+},
+#endif /* CONFIG_CRYPTO_DEV_KEEMBAY_OCS_HCU_HMAC_SHA224 */
+{
+       .init           = kmb_ocs_hcu_init,
+       .update         = kmb_ocs_hcu_update,
+       .final          = kmb_ocs_hcu_final,
+       .finup          = kmb_ocs_hcu_finup,
+       .digest         = kmb_ocs_hcu_digest,
+       .export         = kmb_ocs_hcu_export,
+       .import         = kmb_ocs_hcu_import,
+       .halg = {
+               .digestsize     = SHA256_DIGEST_SIZE,
+               .statesize      = sizeof(struct ocs_hcu_rctx),
+               .base   = {
+                       .cra_name               = "sha256",
+                       .cra_driver_name        = "sha256-keembay-ocs",
+                       .cra_priority           = 255,
+                       .cra_flags              = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize          = SHA256_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct ocs_hcu_ctx),
+                       .cra_alignmask          = KMB_OCS_HCU_ALIGN_MASK,
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = kmb_ocs_hcu_sha_cra_init,
+               }
+       }
+},
+{
+       .init           = kmb_ocs_hcu_init,
+       .update         = kmb_ocs_hcu_update,
+       .final          = kmb_ocs_hcu_final,
+       .finup          = kmb_ocs_hcu_finup,
+       .digest         = kmb_ocs_hcu_digest,
+       .export         = kmb_ocs_hcu_export,
+       .import         = kmb_ocs_hcu_import,
+       .setkey         = kmb_ocs_hcu_setkey,
+       .halg = {
+               .digestsize     = SHA256_DIGEST_SIZE,
+               .statesize      = sizeof(struct ocs_hcu_rctx),
+               .base   = {
+                       .cra_name               = "hmac(sha256)",
+                       .cra_driver_name        = "hmac-sha256-keembay-ocs",
+                       .cra_priority           = 255,
+                       .cra_flags              = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize          = SHA256_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct ocs_hcu_ctx),
+                       .cra_alignmask          = KMB_OCS_HCU_ALIGN_MASK,
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = kmb_ocs_hcu_hmac_cra_init,
+                       .cra_exit               = kmb_ocs_hcu_hmac_cra_exit,
+               }
+       }
+},
+{
+       .init           = kmb_ocs_hcu_init,
+       .update         = kmb_ocs_hcu_update,
+       .final          = kmb_ocs_hcu_final,
+       .finup          = kmb_ocs_hcu_finup,
+       .digest         = kmb_ocs_hcu_digest,
+       .export         = kmb_ocs_hcu_export,
+       .import         = kmb_ocs_hcu_import,
+       .halg = {
+               .digestsize     = SM3_DIGEST_SIZE,
+               .statesize      = sizeof(struct ocs_hcu_rctx),
+               .base   = {
+                       .cra_name               = "sm3",
+                       .cra_driver_name        = "sm3-keembay-ocs",
+                       .cra_priority           = 255,
+                       .cra_flags              = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize          = SM3_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct ocs_hcu_ctx),
+                       .cra_alignmask          = KMB_OCS_HCU_ALIGN_MASK,
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = kmb_ocs_hcu_sm3_cra_init,
+               }
+       }
+},
+{
+       .init           = kmb_ocs_hcu_init,
+       .update         = kmb_ocs_hcu_update,
+       .final          = kmb_ocs_hcu_final,
+       .finup          = kmb_ocs_hcu_finup,
+       .digest         = kmb_ocs_hcu_digest,
+       .export         = kmb_ocs_hcu_export,
+       .import         = kmb_ocs_hcu_import,
+       .setkey         = kmb_ocs_hcu_setkey,
+       .halg = {
+               .digestsize     = SM3_DIGEST_SIZE,
+               .statesize      = sizeof(struct ocs_hcu_rctx),
+               .base   = {
+                       .cra_name               = "hmac(sm3)",
+                       .cra_driver_name        = "hmac-sm3-keembay-ocs",
+                       .cra_priority           = 255,
+                       .cra_flags              = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize          = SM3_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct ocs_hcu_ctx),
+                       .cra_alignmask          = KMB_OCS_HCU_ALIGN_MASK,
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = kmb_ocs_hcu_hmac_sm3_cra_init,
+                       .cra_exit               = kmb_ocs_hcu_hmac_cra_exit,
+               }
+       }
+},
+{
+       .init           = kmb_ocs_hcu_init,
+       .update         = kmb_ocs_hcu_update,
+       .final          = kmb_ocs_hcu_final,
+       .finup          = kmb_ocs_hcu_finup,
+       .digest         = kmb_ocs_hcu_digest,
+       .export         = kmb_ocs_hcu_export,
+       .import         = kmb_ocs_hcu_import,
+       .halg = {
+               .digestsize     = SHA384_DIGEST_SIZE,
+               .statesize      = sizeof(struct ocs_hcu_rctx),
+               .base   = {
+                       .cra_name               = "sha384",
+                       .cra_driver_name        = "sha384-keembay-ocs",
+                       .cra_priority           = 255,
+                       .cra_flags              = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize          = SHA384_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct ocs_hcu_ctx),
+                       .cra_alignmask          = KMB_OCS_HCU_ALIGN_MASK,
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = kmb_ocs_hcu_sha_cra_init,
+               }
+       }
+},
+{
+       .init           = kmb_ocs_hcu_init,
+       .update         = kmb_ocs_hcu_update,
+       .final          = kmb_ocs_hcu_final,
+       .finup          = kmb_ocs_hcu_finup,
+       .digest         = kmb_ocs_hcu_digest,
+       .export         = kmb_ocs_hcu_export,
+       .import         = kmb_ocs_hcu_import,
+       .setkey         = kmb_ocs_hcu_setkey,
+       .halg = {
+               .digestsize     = SHA384_DIGEST_SIZE,
+               .statesize      = sizeof(struct ocs_hcu_rctx),
+               .base   = {
+                       .cra_name               = "hmac(sha384)",
+                       .cra_driver_name        = "hmac-sha384-keembay-ocs",
+                       .cra_priority           = 255,
+                       .cra_flags              = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize          = SHA384_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct ocs_hcu_ctx),
+                       .cra_alignmask          = KMB_OCS_HCU_ALIGN_MASK,
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = kmb_ocs_hcu_hmac_cra_init,
+                       .cra_exit               = kmb_ocs_hcu_hmac_cra_exit,
+               }
+       }
+},
+{
+       .init           = kmb_ocs_hcu_init,
+       .update         = kmb_ocs_hcu_update,
+       .final          = kmb_ocs_hcu_final,
+       .finup          = kmb_ocs_hcu_finup,
+       .digest         = kmb_ocs_hcu_digest,
+       .export         = kmb_ocs_hcu_export,
+       .import         = kmb_ocs_hcu_import,
+       .halg = {
+               .digestsize     = SHA512_DIGEST_SIZE,
+               .statesize      = sizeof(struct ocs_hcu_rctx),
+               .base   = {
+                       .cra_name               = "sha512",
+                       .cra_driver_name        = "sha512-keembay-ocs",
+                       .cra_priority           = 255,
+                       .cra_flags              = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize          = SHA512_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct ocs_hcu_ctx),
+                       .cra_alignmask          = KMB_OCS_HCU_ALIGN_MASK,
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = kmb_ocs_hcu_sha_cra_init,
+               }
+       }
+},
+{
+       .init           = kmb_ocs_hcu_init,
+       .update         = kmb_ocs_hcu_update,
+       .final          = kmb_ocs_hcu_final,
+       .finup          = kmb_ocs_hcu_finup,
+       .digest         = kmb_ocs_hcu_digest,
+       .export         = kmb_ocs_hcu_export,
+       .import         = kmb_ocs_hcu_import,
+       .setkey         = kmb_ocs_hcu_setkey,
+       .halg = {
+               .digestsize     = SHA512_DIGEST_SIZE,
+               .statesize      = sizeof(struct ocs_hcu_rctx),
+               .base   = {
+                       .cra_name               = "hmac(sha512)",
+                       .cra_driver_name        = "hmac-sha512-keembay-ocs",
+                       .cra_priority           = 255,
+                       .cra_flags              = CRYPTO_ALG_ASYNC,
+                       .cra_blocksize          = SHA512_BLOCK_SIZE,
+                       .cra_ctxsize            = sizeof(struct ocs_hcu_ctx),
+                       .cra_alignmask          = KMB_OCS_HCU_ALIGN_MASK,
+                       .cra_module             = THIS_MODULE,
+                       .cra_init               = kmb_ocs_hcu_hmac_cra_init,
+                       .cra_exit               = kmb_ocs_hcu_hmac_cra_exit,
+               }
+       }
+}
+
+};
+
+/*
+ * Threaded IRQ handler.
+ *
+ * Completes the current request, unless it is a SW-assisted HMAC request
+ * still waiting for its OPAD step: in that case the final hash
+ * H(k ^ opad || H(k ^ ipad || m)) is started here and the request is
+ * completed the next time this handler runs.
+ *
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t kmb_ocs_hcu_irq_thread(int irq, void *dev_id)
+{
+       struct ocs_hcu_dev *hcu_dev = (struct ocs_hcu_dev *)dev_id;
+       struct ocs_hcu_rctx *rctx = ahash_request_ctx(hcu_dev->req);
+       struct crypto_ahash *tfm = crypto_ahash_reqtfm(hcu_dev->req);
+       struct ocs_hcu_ctx *ctx = crypto_ahash_ctx(tfm);
+       int rc = 0;
+       int pos;
+
+       /* The hardware reported an error: fail the request. */
+       if (hcu_dev->flags & HCU_FLAGS_HCU_ERROR_MASK) {
+               rc = -EIO;
+               goto finish;
+       }
+
+       /* No OPAD step pending: the request is complete. */
+       if (!(rctx->flags & REQ_FLAGS_HMAC_SW_DO_OPAD))
+               goto finish;
+
+       /*
+        * We are finalizing a SW HMAC request and H(k ^ ipad || m) has just
+        * been computed: fetch the intermediate data, which includes that
+        * digest.
+        */
+       rc = ocs_hcu_get_intermediate_data(hcu_dev, &rctx->idata,
+                                          rctx->algo);
+       if (rc)
+               goto finish;
+
+       /*
+        * Build (k ^ opad || digest) in the request buffer, which is not
+        * used anymore at this point.
+        * Note: the key has been padded / hashed already, so its length is
+        * expected to be equal to the block size.
+        */
+       WARN_ON(ctx->key_len != rctx->blk_sz);
+       for (pos = 0; pos < rctx->blk_sz; pos++)
+               rctx->buffer[pos] = ctx->key[pos] ^ HMAC_OPAD_VALUE;
+       for (pos = 0; pos < rctx->dig_sz; pos++)
+               rctx->buffer[rctx->blk_sz + pos] = rctx->idata.digest[pos];
+
+       /* Clear DO_OPAD, so that the next interrupt completes the request. */
+       rctx->flags &= ~(REQ_FLAGS_HMAC_SW_DO_OPAD);
+
+       /* Hash the buffer to obtain the final HMAC. */
+       rc = kmb_ocs_hcu_hash_pio(hcu_dev, rctx->buffer,
+                                 rctx->blk_sz + rctx->dig_sz,
+                                 rctx->algo, true);
+       if (!rc)
+               return IRQ_HANDLED;
+
+finish:
+       kmb_ocs_hcu_finish_request(hcu_dev, rc);
+
+       return IRQ_HANDLED;
+}
+
+/* Device tree driver match. */
+static const struct of_device_id kmb_ocs_hcu_of_match[] = {
+       {
+               .compatible = "intel,keembay-ocs-hcu",
+       },
+       {}
+};
+/* Export the match table, so that a modular build is autoloaded on match. */
+MODULE_DEVICE_TABLE(of, kmb_ocs_hcu_of_match);
+
+/*
+ * Platform driver remove callback: unregister the algorithms, drop the
+ * device from the driver device list, stop the crypto engine and disable
+ * the hardware.
+ *
+ * Return: -ENODEV if no driver data is attached to 'pdev', otherwise the
+ * result of crypto_engine_exit().
+ */
+static int kmb_ocs_hcu_remove(struct platform_device *pdev)
+{
+       struct ocs_hcu_dev *hcu_dev = platform_get_drvdata(pdev);
+       int rc;
+
+       if (!hcu_dev)
+               return -ENODEV;
+
+       crypto_unregister_ahashes(ocs_hcu_algs, ARRAY_SIZE(ocs_hcu_algs));
+
+       /* Unlink the device from the driver device list. */
+       spin_lock(&ocs_hcu.lock);
+       list_del(&hcu_dev->list);
+       spin_unlock(&ocs_hcu.lock);
+
+       rc = crypto_engine_exit(hcu_dev->engine);
+
+       ocs_hcu_hw_disable(hcu_dev);
+
+       return rc;
+}
+
+/*
+ * Platform driver probe callback.
+ *
+ * Allocates the device structure, sets the DMA mask, maps the registers,
+ * requests the (threaded) IRQ, adds the device to the driver device list,
+ * starts a crypto engine for it and registers the hash algorithms.
+ *
+ * All resources acquired before the device is added to the list are
+ * device-managed, so failures up to that point simply return the error.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+static int kmb_ocs_hcu_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct ocs_hcu_dev *hcu_dev;
+       struct resource *hcu_mem;
+       int rc;
+
+       hcu_dev = devm_kzalloc(dev, sizeof(*hcu_dev), GFP_KERNEL);
+       if (!hcu_dev)
+               return -ENOMEM;
+
+       hcu_dev->dev = dev;
+
+       platform_set_drvdata(pdev, hcu_dev);
+       rc = dma_set_mask_and_coherent(&pdev->dev, OCS_HCU_DMA_BIT_MASK);
+       if (rc)
+               return rc;
+
+       INIT_LIST_HEAD(&hcu_dev->list);
+
+       /* Get the memory address and remap. */
+       hcu_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!hcu_mem) {
+               dev_err(dev, "Could not retrieve io mem resource.\n");
+               return -ENODEV;
+       }
+
+       hcu_dev->io_base = devm_ioremap_resource(dev, hcu_mem);
+       if (IS_ERR(hcu_dev->io_base)) {
+               dev_err(dev, "Could not io-remap mem resource.\n");
+               return PTR_ERR(hcu_dev->io_base);
+       }
+
+       /* Get and request IRQ. */
+       hcu_dev->irq = platform_get_irq(pdev, 0);
+       if (hcu_dev->irq < 0) {
+               dev_err(dev, "Could not retrieve IRQ.\n");
+               return hcu_dev->irq;
+       }
+
+       rc = devm_request_threaded_irq(&pdev->dev, hcu_dev->irq,
+                                      ocs_hcu_irq_handler,
+                                      kmb_ocs_hcu_irq_thread,
+                                      0, "keembay-ocs-hcu", hcu_dev);
+       if (rc < 0) {
+               dev_err(dev, "Could not request IRQ.\n");
+               return rc;
+       }
+
+       /* From here on, error paths must unlink the device from the list. */
+       spin_lock(&ocs_hcu.lock);
+       list_add_tail(&hcu_dev->list, &ocs_hcu.dev_list);
+       spin_unlock(&ocs_hcu.lock);
+
+       /* Initialize crypto engine */
+       hcu_dev->engine = crypto_engine_alloc_init(dev, 1);
+       if (!hcu_dev->engine) {
+               /* 'rc' is still 0 here: set it, or probe would succeed. */
+               rc = -ENOMEM;
+               goto list_del;
+       }
+
+       rc = crypto_engine_start(hcu_dev->engine);
+       if (rc) {
+               dev_err(dev, "Could not start engine.\n");
+               goto cleanup;
+       }
+
+       /* Security infrastructure guarantees OCS clock is enabled. */
+
+       rc = crypto_register_ahashes(ocs_hcu_algs, ARRAY_SIZE(ocs_hcu_algs));
+       if (rc) {
+               dev_err(dev, "Could not register algorithms.\n");
+               goto cleanup;
+       }
+
+       return 0;
+cleanup:
+       crypto_engine_exit(hcu_dev->engine);
+list_del:
+       spin_lock(&ocs_hcu.lock);
+       list_del(&hcu_dev->list);
+       spin_unlock(&ocs_hcu.lock);
+
+       return rc;
+}
+
+/* Platform driver descriptor: name, device tree match table and callbacks. */
+static struct platform_driver kmb_ocs_hcu_driver = {
+       .driver = {
+               .name = DRV_NAME,
+               .of_match_table = kmb_ocs_hcu_of_match,
+       },
+       .remove = kmb_ocs_hcu_remove,
+       .probe = kmb_ocs_hcu_probe,
+};
+
+/* Register the platform driver at module load, unregister it at unload. */
+module_platform_driver(kmb_ocs_hcu_driver);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/crypto/keembay/ocs-hcu.c b/drivers/crypto/keembay/ocs-hcu.c
new file mode 100644
index 000000000000..5523a03975a7
--- /dev/null
+++ b/drivers/crypto/keembay/ocs-hcu.c
@@ -0,0 +1,593 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel Keem Bay OCS HCU Crypto Driver.
+ *
+ * Copyright (C) 2018-2020 Intel Corporation
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/iopoll.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+
+#include <crypto/sha.h>
+
+#include "ocs-hcu.h"
+
+/*
+ * Register offsets, in bytes, relative to the HCU I/O base
+ * (hcu_dev->io_base).
+ */
+
+/* HCU core registers; OCS_HCU_KEY_0..15 form the 16-word HMAC key vector. */
+#define OCS_HCU_MODE                   0x00
+#define OCS_HCU_CHAIN                  0x04
+#define OCS_HCU_OPERATION              0x08
+#define OCS_HCU_KEY_0                  0x0C
+#define OCS_HCU_KEY_1                  0x10
+#define OCS_HCU_KEY_2                  0x14
+#define OCS_HCU_KEY_3                  0x18
+#define OCS_HCU_KEY_4                  0x1C
+#define OCS_HCU_KEY_5                  0x20
+#define OCS_HCU_KEY_6                  0x24
+#define OCS_HCU_KEY_7                  0x28
+#define OCS_HCU_KEY_8                  0x2C
+#define OCS_HCU_KEY_9                  0x30
+#define OCS_HCU_KEY_10                 0x34
+#define OCS_HCU_KEY_11                 0x38
+#define OCS_HCU_KEY_12                 0x3C
+#define OCS_HCU_KEY_13                 0x40
+#define OCS_HCU_KEY_14                 0x44
+#define OCS_HCU_KEY_15                 0x48
+#define OCS_HCU_ISR                    0x50
+#define OCS_HCU_IER                    0x54
+#define OCS_HCU_STATUS                 0x58
+#define OCS_HCU_MSG_LEN_LO             0x60
+#define OCS_HCU_MSG_LEN_HI             0x64
+#define OCS_HCU_KEY_BYTE_ORDER_CFG     0x80
+/* DMA engine registers. */
+#define OCS_HCU_DMA_SRC_ADDR           0x400
+#define OCS_HCU_DMA_DST_ADDR           0x404
+#define OCS_HCU_DMA_SRC_SIZE           0x408
+#define OCS_HCU_DMA_DST_SIZE           0x40C
+#define OCS_HCU_DMA_DMA_MODE           0x410
+#define OCS_HCU_DMA_OTHER_MODE         0x414
+#define OCS_HCU_DMA_NEXT_SRC_DESCR     0x418
+#define OCS_HCU_DMA_NEXT_DST_DESCR     0x41C
+#define OCS_HCU_DMA_WHILE_ACTIVE_MODE  0x420
+#define OCS_HCU_DMA_LOG                        0x424
+#define OCS_HCU_DMA_STATUS             0x428
+#define OCS_HCU_DMA_PERF_CNTR          0x42C
+#define OCS_HCU_DMA_VALID_SAI_31_0     0x440
+#define OCS_HCU_DMA_VALID_SAI_63_32    0x444
+#define OCS_HCU_DMA_VALID_SAI_95_64    0x448
+#define OCS_HCU_DMA_VALID_SAI_127_96   0x44C
+#define OCS_HCU_DMA_VALID_SAI_159_128  0x450
+#define OCS_HCU_DMA_VALID_SAI_191_160  0x454
+#define OCS_HCU_DMA_VALID_SAI_223_192  0x458
+#define OCS_HCU_DMA_VALID_SAI_255_224  0x45C
+/* DMA interrupt status / enable / mask registers. */
+#define OCS_HCU_DMA_MSI_ISR            0x480
+#define OCS_HCU_DMA_MSI_IER            0x484
+#define OCS_HCU_DMA_MSI_MASK           0x488
+#define OCS_HCU_DMA_MSI_MA             0x800
+#define OCS_HCU_DMA_MSI_DA             0x804
+#define OCS_HCU_DMA_MSI_EN             0x808
+/* Input FIFO written by the CPU-driven (non-DMA) hashing helpers. */
+#define OCS_HCU_DMA_INBUFFER_WRITE_FIFO        0x600
+#define OCS_HCU_DMA_OUTBUFFER_READ_FIFO        0x700
+
+/* Register bit definitions. */
+
+/* Position and mask of the algorithm selector field in OCS_HCU_MODE. */
+#define HCU_MODE_ALGO_SHIFT            16
+#define HCU_MODE_ALGO_MASK             (OCS_HCU_ALGO_MASK \
+                                        << HCU_MODE_ALGO_SHIFT)
+
+/* Busy flag in OCS_HCU_STATUS. */
+#define HCU_STATUS_BUSY_MASK           BIT(0)
+
+/* Value for OCS_HCU_KEY_BYTE_ORDER_CFG enabling byte swapping of key words. */
+#define HCU_BYTE_ORDER_SWAP            BIT(0)
+
+/* Bits of OCS_HCU_ISR / OCS_HCU_IER. */
+#define HCU_IRQ_HASH_DONE              BIT(2)
+#define HCU_IRQ_HASH_ERR               (BIT(3) | BIT(1) | BIT(0))
+
+/* Bits of OCS_HCU_DMA_MSI_ISR / OCS_HCU_DMA_MSI_IER. */
+#define HCU_DMA_IRQ_SRC_DONE           BIT(0)
+#define HCU_DMA_IRQ_DST_DONE           BIT(1)
+#define HCU_DMA_IRQ_SAI_ERR            BIT(2)
+#define HCU_DMA_IRQ_BAD_COMP_ERR       BIT(3)
+#define HCU_DMA_IRQ_INBUF_RD_ERR       BIT(4)
+#define HCU_DMA_IRQ_INBUF_WD_ERR       BIT(5)
+#define HCU_DMA_IRQ_OUTBUF_WR_ERR      BIT(6)
+#define HCU_DMA_IRQ_OUTBUF_RD_ERR      BIT(7)
+#define HCU_DMA_IRQ_CRD_ERR            BIT(8)
+/* Union of all DMA error interrupt bits. */
+#define HCU_DMA_IRQ_ERR_MASK           (HCU_DMA_IRQ_SAI_ERR | \
+                                        HCU_DMA_IRQ_BAD_COMP_ERR | \
+                                        HCU_DMA_IRQ_INBUF_RD_ERR | \
+                                        HCU_DMA_IRQ_INBUF_WD_ERR | \
+                                        HCU_DMA_IRQ_OUTBUF_WR_ERR | \
+                                        HCU_DMA_IRQ_OUTBUF_RD_ERR | \
+                                        HCU_DMA_IRQ_CRD_ERR)
+
+/* Bits written to OCS_HCU_DMA_DMA_MODE when starting a linked-list DMA. */
+#define HCU_DMA_SNOOP_MASK             (0x7 << 28)
+#define HCU_DMA_SRC_LL_EN              BIT(25)
+#define HCU_DMA_EN                     BIT(31)
+/* Source-done flag polled in OCS_HCU_DMA_STATUS. */
+#define HCU_DMA_STAT_SRC_DONE          BIT(15)
+
+/* HMAC enable bit in OCS_HCU_MODE. */
+#define HCU_MODE_HMAC_SHIFT            22
+#define HCU_MODE_HMAC_MASK             BIT(HCU_MODE_HMAC_SHIFT)
+
+/*
+ * Endianness configuration written to OCS_HCU_MODE by ocs_hcu_hw_init(),
+ * shifted left by HCU_DATA_WRITE_ENDIANNESS_OFFSET.
+ */
+#define KMB_HCU_ENDIANNESS_VALUE       (0x2A)
+
+#define HCU_DMA_MSI_UNMASK             BIT(0)
+#define HCU_DMA_MSI_DISABLE            0
+#define HCU_IRQ_DISABLE                        0
+
+/* Commands written to OCS_HCU_OPERATION. */
+#define OCS_HCU_START                  BIT(0)
+#define OCS_HCU_TERMINATE              BIT(1)
+
+/* Bit offsets of the endianness fields in OCS_HCU_MODE. */
+#define HCU_CHAIN_WRITE_ENDIANNESS_OFFSET      30
+#define HCU_CHAIN_READ_ENDIANNESS_OFFSET       28
+#define HCU_DATA_WRITE_ENDIANNESS_OFFSET       26
+
+/* Number of 32-bit words transferred through OCS_HCU_CHAIN per algorithm. */
+#define OCS_HCU_NUM_CHAINS_SHA256_224_SM3      8
+#define OCS_HCU_NUM_CHAINS_SHA384_512          16
+
+/* Number of polling retries in ocs_hcu_hash_final_cpu(). */
+#define OCS_HCU_HASH_FINAL_CPU_RETRIES         1000
+/*
+ * While polling on a busy HCU, wait at most 200us between two consecutive
+ * checks.
+ */
+#define OCS_HCU_WAIT_BUSY_RETRY_DELAY_US       200
+/* Wait on a busy HCU for at most 1 second. */
+#define OCS_HCU_WAIT_BUSY_TIMEOUT_US           1000000
+
+/*
+ * ocs_hcu_num_chains() - Number of 32-bit chain words used by an algorithm.
+ * @algo:      Algorithm identifier; only the OCS_HCU_ALGO_MASK bits are
+ *             considered, so an HMAC flag in @algo is ignored.
+ *
+ * Return: the number of words to transfer through OCS_HCU_CHAIN, or 0 if the
+ *        algorithm is not supported.
+ */
+static inline u32 ocs_hcu_num_chains(u32 algo)
+{
+       switch (algo & OCS_HCU_ALGO_MASK) {
+       case OCS_HCU_ALGO_SHA224:
+       case OCS_HCU_ALGO_SHA256:
+       case OCS_HCU_ALGO_SM3:
+               return OCS_HCU_NUM_CHAINS_SHA256_224_SM3;
+       case OCS_HCU_ALGO_SHA384:
+       case OCS_HCU_ALGO_SHA512:
+               return OCS_HCU_NUM_CHAINS_SHA384_512;
+       default:
+               return 0;
+       }
+}
+
+/*
+ * ocs_hcu_block_size() - Block size, in bytes, of a hashing algorithm.
+ * @algo:      Algorithm identifier; only the OCS_HCU_ALGO_MASK bits are
+ *             considered.
+ *
+ * Return: the block size of the algorithm, or 0 if it is not supported.
+ */
+static inline u32 ocs_hcu_block_size(u32 algo)
+{
+       const u32 hash_algo = algo & OCS_HCU_ALGO_MASK;
+
+       if (hash_algo == OCS_HCU_ALGO_SHA224)
+               return SHA224_BLOCK_SIZE;
+
+       /* SM3 uses the same block size as SHA-256. */
+       if (hash_algo == OCS_HCU_ALGO_SHA256 || hash_algo == OCS_HCU_ALGO_SM3)
+               return SHA256_BLOCK_SIZE;
+
+       if (hash_algo == OCS_HCU_ALGO_SHA384)
+               return SHA384_BLOCK_SIZE;
+
+       if (hash_algo == OCS_HCU_ALGO_SHA512)
+               return SHA512_BLOCK_SIZE;
+
+       return 0;
+}
+
+/**
+ * ocs_hcu_wait_busy() - Wait for HCU OCS hardware to become usable.
+ * @hcu_dev:   OCS HCU device to wait for.
+ *
+ * Polls OCS_HCU_STATUS until the busy bit clears, waiting up to
+ * OCS_HCU_WAIT_BUSY_RETRY_DELAY_US between reads and giving up after
+ * OCS_HCU_WAIT_BUSY_TIMEOUT_US.
+ *
+ * Return: 0 if device free, -ETIMEDOUT if device busy and internal timeout has
+ *        expired.
+ */
+int ocs_hcu_wait_busy(struct ocs_hcu_dev *hcu_dev)
+{
+       /* readl() returns a 32-bit value; keep the poll variable that width. */
+       u32 val;
+
+       return readl_poll_timeout(hcu_dev->io_base + OCS_HCU_STATUS, val,
+                                 !(val & HCU_STATUS_BUSY_MASK),
+                                 OCS_HCU_WAIT_BUSY_RETRY_DELAY_US,
+                                 OCS_HCU_WAIT_BUSY_TIMEOUT_US);
+}
+
+/*
+ * Clear any pending HCU interrupt, then enable the hash-done and hash-error
+ * interrupt sources.
+ */
+static void ocs_hcu_done_irq_en(struct ocs_hcu_dev *hcu_dev)
+{
+       void __iomem *base = hcu_dev->io_base;
+       const u32 irq_mask = HCU_IRQ_HASH_DONE | HCU_IRQ_HASH_ERR;
+
+       /* Drop stale status before enabling anything. */
+       writel(0xFFFFFFFF, base + OCS_HCU_ISR);
+       writel(irq_mask, base + OCS_HCU_IER);
+}
+
+/*
+ * Clear any pending DMA interrupt, then enable and unmask the DMA
+ * source-done and DMA error interrupt sources.
+ */
+static void ocs_hcu_dma_irq_en(struct ocs_hcu_dev *hcu_dev)
+{
+       void __iomem *base = hcu_dev->io_base;
+       const u32 irq_mask = HCU_DMA_IRQ_ERR_MASK | HCU_DMA_IRQ_SRC_DONE;
+
+       /* Drop stale status before enabling anything. */
+       writel(0xFFFFFFFF, base + OCS_HCU_DMA_MSI_ISR);
+       /* Only source completion and errors are of interest. */
+       writel(irq_mask, base + OCS_HCU_DMA_MSI_IER);
+       writel(HCU_DMA_MSI_UNMASK, base + OCS_HCU_DMA_MSI_MASK);
+}
+
+/* Disable every HCU and HCU DMA interrupt source. */
+static void ocs_hcu_irq_dis(struct ocs_hcu_dev *hcu_dev)
+{
+       void __iomem *base = hcu_dev->io_base;
+
+       writel(HCU_IRQ_DISABLE, base + OCS_HCU_IER);
+       writel(HCU_DMA_MSI_DISABLE, base + OCS_HCU_DMA_MSI_IER);
+}
+
+/**
+ * ocs_hcu_get_intermediate_data() - Read back the current hashing state.
+ * @hcu_dev:   The target HCU device.
+ * @data:      Where to store the chain words and the processed message
+ *             length.
+ * @algo:      The algorithm being used; it determines how many chain words
+ *             are read.
+ *
+ * The saved state allows a hashing process to be resumed later with
+ * ocs_hcu_set_intermediate_data().
+ *
+ * Note: once all data has been processed, the chain words hold the hashing
+ * result, so this function is also used to retrieve the final digest.
+ *
+ * Return: 0 on success, -EINVAL if @data is NULL, -EBUSY if the HCU did not
+ *        become idle in time.
+ */
+int ocs_hcu_get_intermediate_data(struct ocs_hcu_dev *hcu_dev,
+                                 struct ocs_hcu_idata *data, u32 algo)
+{
+       const int num_words = ocs_hcu_num_chains(algo);
+       void __iomem *base = hcu_dev->io_base;
+       u32 *words;
+       int idx;
+
+       /* No destination buffer supplied. */
+       if (!data)
+               return -EINVAL;
+
+       /* The chain registers are only meaningful once the HCU is idle. */
+       if (ocs_hcu_wait_busy(hcu_dev))
+               return -EBUSY;
+
+       words = (u32 *)data->digest;
+       for (idx = 0; idx < num_words; idx++)
+               words[idx] = readl(base + OCS_HCU_CHAIN);
+
+       data->msg_len_lo = readl(base + OCS_HCU_MSG_LEN_LO);
+       data->msg_len_hi = readl(base + OCS_HCU_MSG_LEN_HI);
+
+       return 0;
+}
+
+/**
+ * ocs_hcu_set_intermediate_data() - Restore a previously saved hashing state.
+ * @hcu_dev:   The target HCU device.
+ * @data:      The intermediate data (chain words and message length) to be
+ *             written to the hardware.
+ * @algo:      The algorithm being used; it determines how many chain words
+ *             are written.
+ *
+ * This function is used to continue a previous hashing process.
+ */
+void ocs_hcu_set_intermediate_data(struct ocs_hcu_dev *hcu_dev,
+                                  struct ocs_hcu_idata *data, u32 algo)
+{
+       void __iomem *base = hcu_dev->io_base;
+       const u32 *word = (u32 *)data->digest;
+       int remaining = ocs_hcu_num_chains(algo);
+
+       /* Every chain word goes through the same OCS_HCU_CHAIN register. */
+       while (remaining-- > 0)
+               writel(*word++, base + OCS_HCU_CHAIN);
+
+       writel(data->msg_len_lo, base + OCS_HCU_MSG_LEN_LO);
+       writel(data->msg_len_hi, base + OCS_HCU_MSG_LEN_HI);
+}
+
+/**
+ * ocs_hcu_hw_init() - Initialize the HCU device.
+ * @hcu_dev:   The HCU device to initialize.
+ *
+ * Programs the endianness configuration into OCS_HCU_MODE and marks the
+ * device as initialized (HCU_FLAGS_HCU_INIT). Calling it again on an already
+ * initialized device is a no-op.
+ */
+void ocs_hcu_hw_init(struct ocs_hcu_dev *hcu_dev)
+{
+       u32 cfg;
+
+       /* Return if HW is already initialized. */
+       if (hcu_dev->flags & HCU_FLAGS_HCU_INIT)
+               return;
+
+       /*
+        * Shift in unsigned arithmetic: 0x2A << 26 does not fit in a signed
+        * int, so shifting the plain int constant would overflow.
+        */
+       cfg = (u32)KMB_HCU_ENDIANNESS_VALUE << HCU_DATA_WRITE_ENDIANNESS_OFFSET;
+       writel(cfg, hcu_dev->io_base + OCS_HCU_MODE);
+       hcu_dev->flags |= HCU_FLAGS_HCU_INIT;
+}
+
+/**
+ * ocs_hcu_hw_cfg() - Select the algorithm (and HMAC mode) in the HCU.
+ * @hcu_dev:   The HCU device to configure.
+ * @algo:      The algorithm to be used by the HCU device, optionally combined
+ *             with OCS_HCU_ALGO_HMAC_MASK to request HMAC mode.
+ *
+ * NOTE: This function must be called after ocs_hcu_hw_init().
+ *
+ * Return: 0 on success, -EINVAL if the algorithm is not supported, -EPERM if
+ *        the device has not been initialized.
+ */
+int ocs_hcu_hw_cfg(struct ocs_hcu_dev *hcu_dev, u32 algo)
+{
+       u32 mode = readl(hcu_dev->io_base + OCS_HCU_MODE);
+       const u32 hash_algo = algo & OCS_HCU_ALGO_MASK;
+
+       switch (hash_algo) {
+       case OCS_HCU_ALGO_SHA256:
+       case OCS_HCU_ALGO_SHA224:
+       case OCS_HCU_ALGO_SHA384:
+       case OCS_HCU_ALGO_SHA512:
+       case OCS_HCU_ALGO_SM3:
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (!(hcu_dev->flags & HCU_FLAGS_HCU_INIT))
+               return -EPERM;
+
+       /* Replace the algorithm and HMAC fields, keep everything else. */
+       mode &= ~(HCU_MODE_ALGO_MASK | HCU_MODE_HMAC_MASK);
+       mode |= hash_algo << HCU_MODE_ALGO_SHIFT;
+       if (algo & OCS_HCU_ALGO_HMAC_MASK)
+               mode |= HCU_MODE_HMAC_MASK;
+
+       writel(mode, hcu_dev->io_base + OCS_HCU_MODE);
+
+       return 0;
+}
+
+/**
+ * ocs_hcu_hw_disable() - Disable the HCU hardware.
+ * @hcu_dev:   The HCU device to disable.
+ *
+ * Clears OCS_HCU_MODE, disables all interrupts and drops the
+ * HCU_FLAGS_HCU_INIT flag. Does nothing if the device is not initialized.
+ */
+void ocs_hcu_hw_disable(struct ocs_hcu_dev *hcu_dev)
+{
+       if (!(hcu_dev->flags & HCU_FLAGS_HCU_INIT))
+               return;
+
+       /* Clear hardware. */
+       writel(0, hcu_dev->io_base + OCS_HCU_MODE);
+       ocs_hcu_irq_dis(hcu_dev);
+       hcu_dev->flags &= ~HCU_FLAGS_HCU_INIT;
+}
+
+/**
+ * ocs_hcu_tx_data_done() - Request the hardware to compute the final hash.
+ * @hcu_dev:   The HCU device to use.
+ *
+ * Enables the HCU done/error interrupts and then issues the TERMINATE
+ * operation.
+ */
+void ocs_hcu_tx_data_done(struct ocs_hcu_dev *hcu_dev)
+{
+       void __iomem *op_reg = hcu_dev->io_base + OCS_HCU_OPERATION;
+
+       /*
+        * HCU_DONE is raised once the final hash has been computed, so the
+        * interrupt is enabled before the operation is terminated.
+        */
+       ocs_hcu_done_irq_en(hcu_dev);
+       writel(OCS_HCU_TERMINATE, op_reg);
+}
+
+/*
+ * ocs_hcu_hash_final_cpu() - Feed the trailing (non block-aligned) data to
+ *                           the HCU using CPU writes.
+ * @hcu_dev:   The OCS HCU device to use.
+ * @buf:       The data to write; accessed as 32-bit words first, then bytes.
+ * @sz:                The number of bytes to write.
+ *
+ * After pushing the data into the input FIFO, poll until the DMA status
+ * reports source completion.
+ *
+ * Return: @sz if all data was consumed, 0 if a DMA error was flagged or the
+ *        hardware did not report completion within the retry budget.
+ */
+static unsigned int ocs_hcu_hash_final_cpu(struct ocs_hcu_dev *hcu_dev,
+                                          u8 *buf, u32 sz)
+{
+       void __iomem *base = hcu_dev->io_base;
+       const u32 sz_32 = sz / sizeof(u32);
+       const u32 *buf_32 = (const u32 *)buf;
+       int retries;
+       u32 i;
+
+       /* Write in using full register size. */
+       for (i = 0; i < sz_32; i++)
+               writel(buf_32[i], base + OCS_HCU_DMA_INBUFFER_WRITE_FIFO);
+
+       /* Write final bytes into buffer. */
+       for (i = sz_32 * sizeof(u32); i < sz; i++)
+               writeb(buf[i], base + OCS_HCU_DMA_INBUFFER_WRITE_FIFO);
+
+       /* Wait until the writes are complete. */
+       for (retries = OCS_HCU_HASH_FINAL_CPU_RETRIES; retries >= 0; retries--) {
+               if (readl(base + OCS_HCU_DMA_STATUS) & HCU_DMA_STAT_SRC_DONE)
+                       return sz;
+
+               if (readl(base + OCS_HCU_DMA_MSI_ISR) & HCU_DMA_IRQ_ERR_MASK)
+                       return 0;
+
+               usleep_range(100, 200);
+       }
+
+       /*
+        * Completion was never signalled: report that nothing was hashed
+        * instead of pretending the write succeeded.
+        */
+       return 0;
+}
+
+/*
+ * ocs_hcu_hash_block_aligned_cpu() - Feed whole blocks to the HCU using CPU
+ *                                   writes.
+ * @hcu_dev:   The OCS HCU device to use.
+ * @buf:       The data to write; accessed as 32-bit words.
+ * @sz:                The size of @buf in bytes; a trailing partial block is not
+ *             written.
+ * @algo:      The hashing algorithm, which determines the block size.
+ *
+ * Return: the number of bytes written, or 0 if the algorithm is unknown or a
+ *        DMA error is flagged after the writes.
+ */
+static unsigned int ocs_hcu_hash_block_aligned_cpu(struct ocs_hcu_dev *hcu_dev,
+                                                  u8 *buf, u32 sz, u32 algo)
+{
+       void __iomem *base = hcu_dev->io_base;
+       const u32 blk_sz = ocs_hcu_block_size(algo);
+       const u32 *words = (u32 *)buf;
+       u32 num_words;
+       u32 idx;
+
+       if (!blk_sz)
+               return 0;
+
+       /* Words per block times the number of complete blocks in @buf. */
+       num_words = (blk_sz / sizeof(u32)) * (sz / blk_sz);
+
+       for (idx = 0; idx < num_words; idx++)
+               writel(words[idx], base + OCS_HCU_DMA_INBUFFER_WRITE_FIFO);
+
+       if (readl(base + OCS_HCU_DMA_MSI_ISR) & HCU_DMA_IRQ_ERR_MASK)
+               return 0;
+
+       return num_words * sizeof(u32);
+}
+
+/* Issue the START command to the HCU operation register. */
+static void ocs_hcu_start_hash(struct ocs_hcu_dev *hcu_dev)
+{
+       void __iomem *op_reg = hcu_dev->io_base + OCS_HCU_OPERATION;
+
+       writel(OCS_HCU_START, op_reg);
+}
+
+/**
+ * ocs_hcu_hash_cpu() - Perform OCS HCU hashing without using DMA.
+ * @hcu_dev:   The OCS HCU device to use.
+ * @buf:       The data to hash.
+ * @sz:                The size of the data to hash.
+ * @algo:      The hashing algorithm to use.
+ * @finalize:  Whether or not this is the last hashing operation and therefore
+ *             the final hash should be computed even if data is not
+ *             block-aligned.
+ *
+ * Whole blocks are always written; the remaining bytes are only written, and
+ * the operation terminated, when @finalize is set.
+ *
+ * Return: the number of bytes hashed (0 if @buf is NULL).
+ */
+unsigned int ocs_hcu_hash_cpu(struct ocs_hcu_dev *hcu_dev,
+                             u8 *buf, u32 sz, u32 algo, bool finalize)
+{
+       unsigned int done;
+
+       if (!buf)
+               return 0;
+
+       done = ocs_hcu_hash_block_aligned_cpu(hcu_dev, buf, sz, algo);
+       if (!finalize)
+               return done;
+
+       /* Push whatever the block-aligned pass left behind. */
+       if (sz != done)
+               done += ocs_hcu_hash_final_cpu(hcu_dev, buf + done, sz - done);
+
+       ocs_hcu_tx_data_done(hcu_dev);
+
+       return done;
+}
+
+/**
+ * ocs_hcu_ll_dma_start() - Start OCS HCU hashing via DMA
+ * @hcu_dev:   The OCS HCU device to use.
+ * @head:      The DMA address of the head of the OCS DMA linked list to
+ *             hash. It must be a valid mapping that fits in
+ *             OCS_HCU_DMA_BIT_MASK.
+ * @finalize:  Whether or not this is the last hashing operation and therefore
+ *             the final hash should be computed even if data is not
+ *             block-aligned.
+ *
+ * Return: 0 on success, -EINVAL if @head is not a usable DMA address.
+ */
+int ocs_hcu_ll_dma_start(struct ocs_hcu_dev *hcu_dev, dma_addr_t head,
+                        bool finalize)
+{
+       u32 cfg = HCU_DMA_SNOOP_MASK | HCU_DMA_SRC_LL_EN | HCU_DMA_EN;
+
+       if (head == DMA_MAPPING_ERROR || head & ~OCS_HCU_DMA_BIT_MASK)
+               return -EINVAL;
+
+       /*
+        * For final requests we use HCU_DONE IRQ to be notified when all input
+        * data has been processed by the HCU; however, we cannot do so for
+        * non-final requests, because we don't get a HCU_DONE IRQ when we
+        * don't terminate the operation.
+        *
+        * Therefore, for non-final requests, we use the DMA IRQ, which
+        * triggers when DMA has finished feeding all the input data to the
+        * HCU, but the HCU may still be processing it. This is fine, since we
+        * will wait for the HCU processing to be completed when we try to read
+        * intermediate results, in ocs_hcu_get_intermediate_data().
+        */
+       if (finalize)
+               ocs_hcu_done_irq_en(hcu_dev);
+       else
+               ocs_hcu_dma_irq_en(hcu_dev);
+
+       /*
+        * Point the DMA engine at the descriptor list, start the hash
+        * operation and only then enable DMA in linked-list mode.
+        * NOTE(review): the write order is presumably required by the
+        * hardware - confirm against the OCS documentation before changing.
+        */
+       writel(head, hcu_dev->io_base + OCS_HCU_DMA_NEXT_SRC_DESCR);
+       writel(0, hcu_dev->io_base + OCS_HCU_DMA_SRC_SIZE);
+       writel(0, hcu_dev->io_base + OCS_HCU_DMA_DST_SIZE);
+       ocs_hcu_start_hash(hcu_dev);
+       writel(cfg, hcu_dev->io_base + OCS_HCU_DMA_DMA_MODE);
+
+       /* For the last request, also ask for the final hash computation. */
+       if (finalize)
+               writel(OCS_HCU_TERMINATE, hcu_dev->io_base + OCS_HCU_OPERATION);
+
+       return 0;
+}
+
+/**
+ * ocs_hcu_write_key() - Write key to OCS HMAC KEY registers.
+ * @hcu_dev:   The OCS HCU device the key should be written to.
+ * @key:       The key to be written; accessed as 32-bit words.
+ * @len:       The size of the key to write. It must be HCU_HW_KEY_LEN.
+ *
+ * Return:     0 on success, -EINVAL if @len is not HCU_HW_KEY_LEN.
+ */
+int ocs_hcu_write_key(struct ocs_hcu_dev *hcu_dev, u8 *key, size_t len)
+{
+       const size_t num_words = HCU_HW_KEY_LEN / sizeof(u32);
+       const u32 *words = (u32 *)key;
+       void __iomem *reg;
+       size_t idx;
+
+       if (len != HCU_HW_KEY_LEN)
+               return -EINVAL;
+
+       /*
+        * The hardware expects the most significant byte of the key at the
+        * highest address of the HCU key vector, i.e. the key has to be
+        * written in reverse order.
+        *
+        * Enabling byte swapping for the key vector reverses the bytes within
+        * each 32-bit word written to OCS_HCU_KEY_[0..15]:
+        * 3 <---> 0, 2 <---> 1.
+        */
+       writel(HCU_BYTE_ORDER_SWAP,
+              hcu_dev->io_base + OCS_HCU_KEY_BYTE_ORDER_CFG);
+
+       /*
+        * The words themselves are then written last-to-first, walking the
+        * key registers upwards from OCS_HCU_KEY_0.
+        */
+       reg = hcu_dev->io_base + OCS_HCU_KEY_0;
+       for (idx = num_words; idx > 0; idx--, reg += sizeof(u32))
+               writel(words[idx - 1], reg);
+
+       return 0;
+}
+
+/**
+ * ocs_hcu_clear_key() - Clear key stored in OCS HMAC KEY registers.
+ * @hcu_dev:   The OCS HCU device whose key registers should be cleared.
+ *
+ * Writes zero to every 32-bit register of the key vector.
+ */
+void ocs_hcu_clear_key(struct ocs_hcu_dev *hcu_dev)
+{
+       unsigned int reg_off;
+
+       /* Clear OCS_HCU_KEY_[0..15] */
+       for (reg_off = 0; reg_off < HCU_HW_KEY_LEN; reg_off += sizeof(u32))
+               writel(0, hcu_dev->io_base + OCS_HCU_KEY_0 + reg_off);
+}
+
+/**
+ * ocs_hcu_irq_handler() - Hard IRQ handler for the OCS HCU.
+ * @irq:       The IRQ number (unused).
+ * @dev_id:    The struct ocs_hcu_dev the interrupt belongs to.
+ *
+ * Reads and acknowledges both the HCU and the HCU DMA interrupt status,
+ * recording errors in the device flags.
+ *
+ * Return: IRQ_WAKE_THREAD if a completion or an error was signalled,
+ *        IRQ_NONE otherwise.
+ */
+irqreturn_t ocs_hcu_irq_handler(int irq, void *dev_id)
+{
+       struct ocs_hcu_dev *hcu_dev = dev_id;
+       void __iomem *base = hcu_dev->io_base;
+       const u32 hcu_irq = readl(base + OCS_HCU_ISR);
+       const u32 dma_irq = readl(base + OCS_HCU_DMA_MSI_ISR);
+       irqreturn_t rc = IRQ_NONE;
+
+       /* HCU status: both errors and completion wake the thread. */
+       if (hcu_irq & HCU_IRQ_HASH_ERR)
+               hcu_dev->flags |= HCU_FLAGS_HCU_ERR;
+       if (hcu_irq & (HCU_IRQ_HASH_ERR | HCU_IRQ_HASH_DONE))
+               rc = IRQ_WAKE_THREAD;
+
+       /* Clear the HCU interrupt. */
+       writel(hcu_irq, base + OCS_HCU_ISR);
+
+       /* DMA status: same policy as for the HCU. */
+       if (dma_irq & HCU_DMA_IRQ_ERR_MASK)
+               hcu_dev->flags |= HCU_FLAGS_HCU_DMA_ERR;
+       if (dma_irq & (HCU_DMA_IRQ_ERR_MASK | HCU_DMA_IRQ_SRC_DONE))
+               rc = IRQ_WAKE_THREAD;
+
+       /* Clear the HCU DMA interrupt. */
+       writel(dma_irq, base + OCS_HCU_DMA_MSI_ISR);
+
+       return rc;
+}
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/crypto/keembay/ocs-hcu.h b/drivers/crypto/keembay/ocs-hcu.h
new file mode 100644
index 000000000000..3c7e3881791c
--- /dev/null
+++ b/drivers/crypto/keembay/ocs-hcu.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel Keem Bay OCS HCU Crypto Driver.
+ *
+ * Copyright (C) 2018-2020 Intel Corporation
+ */
+
+#include <linux/dma-mapping.h>
+
+#ifndef _CRYPTO_OCS_HCU_H
+#define _CRYPTO_OCS_HCU_H
+
+/*
+ * Flag bits for OCS DMA linked-list descriptors.
+ * NOTE(review): presumably stored in ocs_dma_desc.ll_flags - confirm against
+ * the code building the descriptor list.
+ */
+#define OCS_LL_DMA_FLAG_TERMINATE      BIT(31)
+#define OCS_LL_DMA_FLAG_FREEZE         BIT(30)
+#define OCS_LL_DMA_FLAG_RESERVED       (BIT(30) - 1)
+
+/* Algorithm identifiers, as programmed into the HCU mode register. */
+#define OCS_HCU_ALGO_SHA256            2
+#define OCS_HCU_ALGO_SHA224            3
+#define OCS_HCU_ALGO_SHA384            4
+#define OCS_HCU_ALGO_SHA512            5
+#define OCS_HCU_ALGO_SM3               6
+
+/* Mask extracting the algorithm identifier from an 'algo' argument. */
+#define OCS_HCU_ALGO_MASK              (BIT(3) - 1)
+
+/* Flag OR-ed into an 'algo' argument to request HMAC mode. */
+#define OCS_HCU_ALGO_HMAC_SHIFT                4
+#define OCS_HCU_ALGO_HMAC_MASK         BIT(OCS_HCU_ALGO_HMAC_SHIFT)
+
+#define OCS_HCU_DMA_NO_SNOOP           1
+#define OCS_HCU_DMA_SNOOP              0
+#define OCS_HCU_DMA_BTF_SWAP           1
+#define OCS_HCU_DMA_BTF_NO_SWAP                0
+#define OCS_HCU_DMA_ADDR_MODE_FIXED    1
+#define OCS_HCU_DMA_ADDR_MODE_LINEAR   0
+/* Descriptor list addresses must fit in 32 bits (see ocs_hcu_ll_dma_start()). */
+#define OCS_HCU_DMA_BIT_MASK           DMA_BIT_MASK(32)
+
+#define OCS_HCU_MAX_CHAIN_NUM          16
+
+/* Device flags */
+#define HCU_FLAGS_HCU_INIT             BIT(0)
+#define HCU_FLAGS_HCU_ERR              BIT(17)
+#define HCU_FLAGS_HCU_DMA_ERR          BIT(18)
+#define HCU_FLAGS_HCU_ERROR_MASK       (HCU_FLAGS_HCU_DMA_ERR | \
+                                        HCU_FLAGS_HCU_ERR)
+
+/* Size, in bytes, of the hardware key vector (OCS_HCU_KEY_0..15). */
+#define HCU_HW_KEY_LEN                 64
+
+/**
+ * struct ocs_hcu_dev - HCU device context.
+ * @list: List of device contexts.
+ * @dev: OCS HCU device.
+ * @irq: IRQ number.
+ * @io_base: IO Base of HCU.
+ * @flags: HW flags indicating state (HCU_FLAGS_* bits).
+ * @req: Request being operated on.
+ * @engine: Crypto engine for the device.
+ */
+struct ocs_hcu_dev {
+       struct list_head list;
+       struct device *dev;
+       int irq;
+       /* Base address of the memory-mapped OCS HCU registers. */
+       void __iomem *io_base;
+       /* Status of the OCS HCU device: init state and error indications. */
+       u32 flags;
+       /* Active request. */
+       struct ahash_request *req;
+       struct crypto_engine *engine;
+};
+
+/**
+ * struct ocs_dma_desc - OCS DMA linked list descriptor.
+ * @src_adr:   Source address of the data.
+ * @src_len:   Length of data to be fetched.
+ * @nxt_desc:  Next descriptor to fetch.
+ * @ll_flags:  Flags (freeze / terminate) for the DMA engine.
+ */
+struct ocs_dma_desc {
+       u32 src_adr;
+       u32 src_len;
+       u32 nxt_desc;
+       u32 ll_flags;
+};
+
+/**
+ * struct ocs_hcu_idata - Intermediate data generated by the HCU.
+ * @msg_len_lo: Length of data the HCU has operated on in bits, low 32b.
+ * @msg_len_hi: Length of data the HCU has operated on in bits, high 32b.
+ * @digest: The digest read from the HCU. If the HCU is terminated, it will
+ *         contain the actual hash digest. Otherwise it is the intermediate
+ *         state. The buffer is accessed as an array of 32-bit words, hence
+ *         the alignment requirement.
+ */
+struct ocs_hcu_idata {
+       u32 msg_len_lo;
+       u32 msg_len_hi;
+       u8 digest[SHA512_DIGEST_SIZE] __aligned(sizeof(u32));
+};
+
+/* Interrupt handling. */
+irqreturn_t ocs_hcu_irq_handler(int irq, void *dev_id);
+/* Saving and restoring the hashing state. */
+int ocs_hcu_get_intermediate_data(struct ocs_hcu_dev *hcu_dev,
+                                 struct ocs_hcu_idata *data, u32 algo);
+void ocs_hcu_set_intermediate_data(struct ocs_hcu_dev *hcu_dev,
+                                  struct ocs_hcu_idata *data, u32 algo);
+/* Hardware initialization and configuration. */
+void ocs_hcu_hw_init(struct ocs_hcu_dev *hcu_dev);
+void ocs_hcu_hw_disable(struct ocs_hcu_dev *hcu_dev);
+int ocs_hcu_hw_cfg(struct ocs_hcu_dev *hcu_dev, u32 algo);
+/* Feeding data to the HCU, either by CPU writes or by linked-list DMA. */
+unsigned int ocs_hcu_hash_cpu(struct ocs_hcu_dev *hcu_dev,
+                             u8 *buf, u32 sz, u32 algo, bool finalize);
+int ocs_hcu_ll_dma_start(struct ocs_hcu_dev *hcu_dev, dma_addr_t head,
+                        bool finalize);
+void ocs_hcu_tx_data_done(struct ocs_hcu_dev *hcu_dev);
+int ocs_hcu_wait_busy(struct ocs_hcu_dev *hcu_dev);
+/* HMAC key handling. */
+int ocs_hcu_write_key(struct ocs_hcu_dev *hcu_dev, u8 *key, size_t len);
+void ocs_hcu_clear_key(struct ocs_hcu_dev *hcu_dev);
+
+#endif /* _CRYPTO_OCS_HCU_H */
-- 
2.26.2

Reply via email to