This only works with separate buffer meta-data, i.e. non-interleaved meta-data. The sgl method requires block-multiple transfers, which may not happen with interleaved meta-data, so more work is needed to double buffer. I don't think there is any use for interleaved meta-data from a host driver perspective anyway, except perhaps when the controller PRACT bit is set to 1.
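For reference, a minimal sketch (not part of the patch) of the separate-buffer layout arithmetic the new code relies on: meta-data for a command starting at slba lives at a fixed byte offset within the namespace's meta-data region, and its length is a multiple of the format's meta-data size. The struct and helper names below are hypothetical stand-ins for illustration, not the actual NvmeNamespace fields:

    #include <stdint.h>
    #include <stdio.h>

    #define BDRV_SECTOR_SIZE 512ULL

    /* Illustrative stand-in for the namespace fields the patch adds/uses;
     * names here are made up for this example. */
    struct meta_layout {
        uint64_t meta_start_block;  /* first backing-file sector of the meta-data region */
        uint32_t ms;                /* meta-data bytes per logical block */
    };

    /* Byte offset in the backing file of the meta-data for a transfer starting at slba. */
    static int64_t meta_offset(const struct meta_layout *l, uint64_t slba)
    {
        return l->meta_start_block * BDRV_SECTOR_SIZE + slba * l->ms;
    }

    /* Total meta-data bytes for a transfer of nlb + 1 logical blocks (nlb is 0-based). */
    static uint32_t meta_bytes(const struct meta_layout *l, uint16_t nlb)
    {
        return (uint32_t)(nlb + 1) * l->ms;
    }

    int main(void)
    {
        struct meta_layout l = { .meta_start_block = 2048, .ms = 8 };
        /* e.g. a transfer of 8 blocks (nlb = 7) starting at LBA 100 */
        printf("offset=%lld size=%u\n",
            (long long)meta_offset(&l, 100), meta_bytes(&l, 7));
        return 0;
    }

nvme_rw_cb() in the patch computes the same offset and length, then moves the bytes with bdrv_pread()/bdrv_pwrite() and pci_dma_read()/pci_dma_write() against the host buffer addressed by MPTR.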
Signed-off-by: Keith Busch <keith.bu...@intel.com>
---
 hw/nvme.c |   72 +++++++++++++++++++++++++++++++++++++++++++++++++-----------
 hw/nvme.h |    3 ++
 2 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/hw/nvme.c b/hw/nvme.c
index aec1ef7..8260e7c 100644
--- a/hw/nvme.c
+++ b/hw/nvme.c
@@ -59,7 +59,7 @@ enum {
 };
 #define DBGBIT(x) (1 << x)
-int debug_flags = DBGBIT(IO_DBG) | DBGBIT(DBG) | DBGBIT(INFO) | DBGBIT(ERR) | DBGBIT(ADM_DBG);
+int debug_flags = DBGBIT(INFO) | DBGBIT(ERR) | DBGBIT(ADM_DBG);
 #define NVME_LOG(level, fmt, ...) \
     do {\
         if (debug_flags & DBGBIT(level)) { \
@@ -460,14 +460,32 @@ static void nvme_rw_cb(void *opaque, int ret)
     if (!req->rw) {
         nvme_update_ns_util(ns, req->slba, req->nlb);
     }
-
     if (!ret) {
         req->cqe.status = NVME_SUCCESS << 1;
+        if (req->meta_size) {
+            uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+            int64_t offset = ns->meta_start_block * BDRV_SECTOR_SIZE +
+                req->slba * ns->id_ns.lbaf[lba_index].ms;
+            void *buf = g_malloc(req->meta_size);
+            if (req->rw) {
+                ret = bdrv_pread(n->conf.bs, offset, buf, req->meta_size);
+                pci_dma_write(&n->dev, req->mptr, buf, req->meta_size);
+            } else {
+                pci_dma_read(&n->dev, req->mptr, buf, req->meta_size);
+                ret = bdrv_pwrite(n->conf.bs, offset, buf, req->meta_size);
+            }
+            if (ret != req->meta_size) {
+                NVME_LOG(ERR, "meta-data transfer error ret:%d", ret);
+                req->cqe.status = NVME_INTERNAL_DEV_ERROR << 1;
+            }
+            g_free(buf);
+        }
     } else {
         NVME_LOG(ERR, "nsid:%u nlb:%u slba:%"PRIu64" ret:%d", ns->id,
             req->nlb, req->slba, ret);
         req->cqe.status = NVME_INTERNAL_DEV_ERROR << 1;
     }
+
     nvme_enqueue_req_completion(cq, req);
 }
@@ -478,6 +496,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
     uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
     uint64_t data_size = (rw->nlb + 1) << data_shift;
+    uint64_t meta_size = (rw->nlb + 1) * ns->id_ns.lbaf[lba_index].ms;
     int data_dir = rw->opcode == NVME_CMD_WRITE ? 0 : 1;
     uint16_t ret;
@@ -498,6 +517,13 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
             offsetof(NvmeRwCmd, nlb), rw->slba + rw->nlb, ns->id);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
+    if (meta_size && !rw->mptr &&
+            !NVME_ID_NS_FLBAS_EXTENDED(ns->id_ns.flbas)) {
+        NVME_LOG(ERR, "meta-data pointer required for namespace format");
+        nvme_set_error_page(n, req->sq->id, cmd->cid, NVME_INVALID_FIELD,
+            offsetof(NvmeRwCmd, mptr), rw->slba + rw->nlb, ns->id);
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }

     ret = nvme_map_prp(&req->qsg, rw->prp1, rw->prp2, data_size, n);
     if (ret == NVME_SUCCESS) {
@@ -505,6 +531,8 @@
         uint32_t nlb = (rw->nlb + 1) << (data_shift - 9);
         assert(nlb * BDRV_SECTOR_SIZE == req->qsg.size);

+        req->meta_size = meta_size;
+        req->mptr = rw->mptr;
         req->slba = rw->slba;
         req->nlb = rw->nlb;
         req->ns = ns;
@@ -512,6 +540,7 @@
         req->aiocb = data_dir ?
             dma_bdrv_read(n->conf.bs, &req->qsg, slba, nvme_rw_cb, req) :
             dma_bdrv_write(n->conf.bs, &req->qsg, slba, nvme_rw_cb, req);
+
         NVME_LOG(IO_DBG, "req:%p aiocb:%p ns:%u sq:%u cid:%x nlb:%u slba:%"PRIu64" "\
             "aio slba:%"PRIu64" aio nlb:%u", req, req->aiocb, ns->id,
@@ -1228,6 +1257,8 @@ static uint16_t nvme_abort_req(NvmeCtrl *n, NvmeCmd *cmd, uint32_t *result)
     }
     sq = n->sq[sqid];
+    /* scan the queue for work that hasn't been processed yet.
+     * TODO: scan the outstanding work and issue an aio cancel */
     while ((sq->head + index) % sq->size != sq->tail) {
         NvmeCmd abort_cmd;
         hwaddr addr;
@@ -1315,8 +1346,7 @@ static uint16_t nvme_format_namespace(NvmeNamespace *ns, uint8_t lba_idx,
     ns->util = bitmap_new(ns->id_ns.nsze);

     if (sec_erase) {
-        /* TODO: write zeros, complete asynchronously */
-        ;
+        /* TODO: write zeros, complete asynchronously */ ;
     }

     return NVME_SUCCESS;
@@ -1427,7 +1457,6 @@ static void nvme_sq_process(void *opaque)
         req->aiocb = NULL;
         req->aiocb_dsm = NULL;
-        printf("about to call nvme command handler\n");
         status = sq->id ? nvme_io_cmd(n, &cmd, req) :
             nvme_admin_cmd(n, &cmd, req);
         if (status != NVME_NO_COMPLETE) {
@@ -1714,7 +1743,7 @@ static int nvme_init(PCIDevice *pci_dev)
     NvmeIdCtrl *id = &n->id_ctrl;
     uint8_t *pci_conf;
     int64_t bs_size;
-    int i, j;
+    int i, j, k;

     NVME_LOG(DBG, "new controller B:D.f: %02x:%02x.%u",
         pci_bus_num(pci_dev->bus), PCI_SLOT(pci_dev->devfn),
@@ -1757,7 +1786,7 @@
         NVME_LOG(ERR, "requested invalid volatile write cache:%u", n->vwc);
         return -1;
     }
-    if (n->lba_index > 3) {
+    if (n->lba_index > 15) {
         NVME_LOG(ERR, "requested invalid lba index:%u", n->lba_index);
         return -1;
     }
@@ -1854,18 +1883,34 @@
     NVME_LOG(DBG, "ctrl:%u cap:%016lx", n->instance, n->bar.cap);

     for (i = 0; i < n->num_namespaces; i++) {
+        uint64_t blks;
         NvmeNamespace *ns = &n->namespaces[i];
         NvmeIdNs *id_ns = &ns->id_ns;
-        id_ns->nlbaf = 0x4;
+        id_ns->nlbaf = 0x7;
         id_ns->flbas = n->lba_index;
-
-        for (j = 0; j <= id_ns->nlbaf; j++) {
-            id_ns->lbaf[j].ds = 9 + j;
+        id_ns->mc = 1 << 1;
+        id_ns->dps = DPS_TYPE_1;
+        id_ns->dpc = 1 << 3 | 1;
+        for (j = 0; j < 2; j++) {
+            for (k = 0; k <= id_ns->nlbaf; k++) {
+                id_ns->lbaf[k + j * (id_ns->nlbaf)].ds = 9 + k;
+                id_ns->lbaf[k + j * (id_ns->nlbaf)].ms = j * n->meta;
+            }
         }
-        id_ns->ncap = id_ns->nsze = (n->ns_size) >> id_ns->lbaf[j].ds;
+
+        blks = n->ns_size / ((1 << id_ns->lbaf[n->lba_index].ds) +
+                id_ns->lbaf[n->lba_index].ms);
+        id_ns->ncap = id_ns->nsze = blks;

         ns->id = i + 1;
         ns->ctrl = n;
-        ns->start_block = (n->ns_size / BDRV_SECTOR_SIZE) * i;
+        ns->start_block = (n->ns_size / BDRV_SECTOR_SIZE +
+                n->meta * id_ns->nsze) * i;
+        if (n->meta) {
+            id_ns->nlbaf = 0xf;
+            ns->meta_start_block = ns->start_block +
+                (id_ns->nsze << id_ns->lbaf[n->lba_index].ds) /
+                BDRV_SECTOR_SIZE;
+        }
         ns->util = bitmap_new(id_ns->nsze);
     }
@@ -1875,7 +1920,6 @@
         "\tfile size:%"PRIu64"", instance, n->num_namespaces, n->ns_size,
         n->num_queues, n->db_stride, n->reg_size, n->max_q_ents, n->cqr,
         n->mdts, n->aerl, n->acl, n->elpe, bdrv_getlength(n->conf.bs));
-
     return 0;
 }
diff --git a/hw/nvme.h b/hw/nvme.h
index 926f843..5264232 100644
--- a/hw/nvme.h
+++ b/hw/nvme.h
@@ -596,6 +596,8 @@ typedef struct NvmeRequest {
     uint16_t rw;
     uint16_t nlb;
     uint16_t aio_count;
+    uint32_t meta_size;
+    uint64_t mptr;
     NvmeCqe cqe;
     QEMUSGList qsg;
     QTAILQ_ENTRY(NvmeRequest)entry;
@@ -651,6 +653,7 @@ typedef struct NvmeNamespace {
     unsigned long *util;
     unsigned long *uncorrectable;
     uint64_t start_block;
+    uint64_t meta_start_block;
 } NvmeNamespace;

 typedef struct NvmeCtrl {
-- 
1.7.0.4