From 4d95620bc3105916e69c40cff8e2e3d55bd6c4ae Mon Sep 17 00:00:00 2001 From: Paul Winder Date: Wed, 16 Oct 2019 10:19:13 +0100 Subject: [PATCH] 11827 Increase concurrency through blkdev 11847 The nvme cmd completion queue is contentious Reviewed by: Robert Mustacchi Reviewed by: Hans Rosenfeld Reviewed by: Matthias Scheler Approved by: Dan McDonald --- usr/src/uts/common/io/blkdev/blkdev.c | 306 ++++++++++++++----- usr/src/uts/common/io/nvme/nvme.c | 135 ++++++-- usr/src/uts/common/io/nvme/nvme_var.h | 5 +- usr/src/uts/common/io/sdcard/impl/sda_slot.c | 3 +- usr/src/uts/common/io/skd/skd.c | 3 +- usr/src/uts/common/io/vioblk/vioblk.c | 3 +- usr/src/uts/common/sys/blkdev.h | 30 +- 7 files changed, 355 insertions(+), 130 deletions(-) diff --git a/usr/src/uts/common/io/blkdev/blkdev.c b/usr/src/uts/common/io/blkdev/blkdev.c index d3b96c9f8a..3405187f60 100644 --- a/usr/src/uts/common/io/blkdev/blkdev.c +++ b/usr/src/uts/common/io/blkdev/blkdev.c @@ -24,6 +24,7 @@ * Copyright 2012 Alexey Zaytsev All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright 2017 The MathWorks, Inc. All rights reserved. + * Copyright 2019 Western Digital Corporation. */ #include @@ -53,19 +54,93 @@ #include #include +/* + * blkdev is a driver which provides a lot of the common functionality + * a block device driver may need and helps by removing code which + * is frequently duplicated in block device drivers. + * + * Within this driver all the struct cb_ops functions required for a + * block device driver are written with appropriate call back functions + * to be provided by the parent driver. + * + * To use blkdev, a driver needs to: + * 1. Create a bd_ops_t structure which has the call back operations + * blkdev will use. + * 2. Create a handle by calling bd_alloc_handle(). One of the + * arguments to this function is the bd_ops_t. + * 3. Call bd_attach_handle(). This will instantiate a blkdev device + * as a child device node of the calling driver. + * + * A parent driver is not restricted to just allocating and attaching a + * single instance, it may attach as many as it wishes. For each handle + * attached, appropriate entries in /dev/[r]dsk are created. + * + * The bd_ops_t routines that a parent of blkdev need to provide are: + * + * o_drive_info: Provide information to blkdev such as how many I/O queues + * to create and the size of those queues. Also some device + * specifics such as EUI, vendor, product, model, serial + * number .... + * + * o_media_info: Provide information about the media. Eg size and block size. + * + * o_devid_init: Creates and initializes the device id. Typically calls + * ddi_devid_init(). + * + * o_sync_cache: Issues a device appropriate command to flush any write + * caches. + * + * o_read: Read data as described by bd_xfer_t argument. + * + * o_write: Write data as described by bd_xfer_t argument. + * + * + * Queues + * ------ + * Part of the drive_info data is a queue count. blkdev will create + * "queue count" number of waitq/runq pairs. Each waitq/runq pair + * operates independently. As an I/O is scheduled up to the parent + * driver via o_read or o_write its queue number is given. If the + * parent driver supports multiple hardware queues it can then select + * where to submit the I/O request. + * + * Currently blkdev uses a simplistic round-robin queue selection method. + * It has the advantage that it is lockless. 
In the future it will be + * worthwhile reviewing this strategy for something which prioritizes queues + * depending on how busy they are. + * + * Each waitq/runq pair is protected by its mutex (q_iomutex). Incoming + * I/O requests are initially added to the waitq. They are taken off the + * waitq, added to the runq and submitted, providing the runq is less + * than the qsize as specified in the drive_info. As an I/O request + * completes, the parent driver is required to call bd_xfer_done(), which + * will remove the I/O request from the runq and pass I/O completion + * status up the stack. + * + * Locks + * ----- + * There are 4 instance global locks d_ocmutex, d_ksmutex, d_errmutex and + * d_statemutex. As well a q_iomutex per waitq/runq pair. + * + * Currently, there is no lock hierarchy. Nowhere do we ever own more than + * one lock, any change needs to be documented here with a defined + * hierarchy. + */ + #define BD_MAXPART 64 #define BDINST(dev) (getminor(dev) / BD_MAXPART) #define BDPART(dev) (getminor(dev) % BD_MAXPART) typedef struct bd bd_t; typedef struct bd_xfer_impl bd_xfer_impl_t; +typedef struct bd_queue bd_queue_t; struct bd { void *d_private; dev_info_t *d_dip; kmutex_t d_ocmutex; - kmutex_t d_iomutex; - kmutex_t *d_errmutex; + kmutex_t d_ksmutex; + kmutex_t d_errmutex; kmutex_t d_statemutex; kcondvar_t d_statecv; enum dkio_state d_state; @@ -73,8 +148,9 @@ struct bd { unsigned d_open_lyr[BD_MAXPART]; /* open count */ uint64_t d_open_excl; /* bit mask indexed by partition */ uint64_t d_open_reg[OTYPCNT]; /* bit mask */ + uint64_t d_io_counter; - uint32_t d_qsize; + uint32_t d_qcount; uint32_t d_qactive; uint32_t d_maxxfer; uint32_t d_blkshift; @@ -83,8 +159,7 @@ struct bd { ddi_devid_t d_devid; kmem_cache_t *d_cache; - list_t d_runq; - list_t d_waitq; + bd_queue_t *d_queues; kstat_t *d_ksp; kstat_io_t *d_kiop; kstat_t *d_errstats; @@ -117,6 +192,7 @@ struct bd_xfer_impl { list_node_t i_linkage; bd_t *i_bd; buf_t *i_bp; + bd_queue_t *i_bq; uint_t i_num_win; uint_t i_cur_win; off_t i_offset; @@ -126,6 +202,14 @@ struct bd_xfer_impl { size_t i_resid; }; +struct bd_queue { + kmutex_t q_iomutex; + uint32_t q_qsize; + uint32_t q_qactive; + list_t q_runq; + list_t q_waitq; +}; + #define i_dmah i_public.x_dmah #define i_dmac i_public.x_dmac #define i_ndmac i_public.x_ndmac @@ -133,6 +217,7 @@ struct bd_xfer_impl { #define i_nblks i_public.x_nblks #define i_blkno i_public.x_blkno #define i_flags i_public.x_flags +#define i_qnum i_public.x_qnum /* @@ -166,7 +251,7 @@ static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, static int bd_tg_getinfo(dev_info_t *, int, void *, void *); static int bd_xfer_ctor(void *, void *, int); static void bd_xfer_dtor(void *, void *); -static void bd_sched(bd_t *); +static void bd_sched(bd_t *, bd_queue_t *); static void bd_submit(bd_t *, bd_xfer_impl_t *); static void bd_runq_exit(bd_xfer_impl_t *, int); static void bd_update_state(bd_t *); @@ -181,20 +266,20 @@ struct cmlb_tg_ops bd_tg_ops = { }; static struct cb_ops bd_cb_ops = { - bd_open, /* open */ - bd_close, /* close */ - bd_strategy, /* strategy */ - nodev, /* print */ + bd_open, /* open */ + bd_close, /* close */ + bd_strategy, /* strategy */ + nodev, /* print */ bd_dump, /* dump */ - bd_read, /* read */ - bd_write, /* write */ - bd_ioctl, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* poll */ - bd_prop_op, /* cb_prop_op */ - 0, /* streamtab */ + bd_read, /* read */ + bd_write, /* write */ + bd_ioctl, /* ioctl */ + nodev, /* 
devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + bd_prop_op, /* cb_prop_op */ + 0, /* streamtab */ D_64BIT | D_MP, /* Driver comaptibility flag */ CB_REV, /* cb_rev */ bd_aread, /* async read */ @@ -202,15 +287,15 @@ static struct cb_ops bd_cb_ops = { }; struct dev_ops bd_dev_ops = { - DEVO_REV, /* devo_rev, */ - 0, /* refcnt */ + DEVO_REV, /* devo_rev, */ + 0, /* refcnt */ bd_getinfo, /* getinfo */ - nulldev, /* identify */ - nulldev, /* probe */ - bd_attach, /* attach */ + nulldev, /* identify */ + nulldev, /* probe */ + bd_attach, /* attach */ bd_detach, /* detach */ - nodev, /* reset */ - &bd_cb_ops, /* driver operations */ + nodev, /* reset */ + &bd_cb_ops, /* driver operations */ NULL, /* bus operations */ NULL, /* power */ ddi_quiesce_not_needed, /* quiesce */ @@ -350,6 +435,7 @@ bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive) bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT); + mutex_init(&bd->d_errmutex, NULL, MUTEX_DRIVER, NULL); if (bd->d_errstats == NULL) { /* * Even if we cannot create the kstat, we create a @@ -359,17 +445,8 @@ bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive) */ bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats), KM_SLEEP); - bd->d_errmutex = kmem_zalloc(sizeof (kmutex_t), KM_SLEEP); - mutex_init(bd->d_errmutex, NULL, MUTEX_DRIVER, NULL); } else { - if (bd->d_errstats->ks_lock == NULL) { - bd->d_errstats->ks_lock = kmem_zalloc(sizeof (kmutex_t), - KM_SLEEP); - mutex_init(bd->d_errstats->ks_lock, NULL, MUTEX_DRIVER, - NULL); - } - - bd->d_errmutex = bd->d_errstats->ks_lock; + bd->d_errstats->ks_lock = &bd->d_errmutex; bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data; } @@ -436,7 +513,7 @@ bd_init_errstats(bd_t *bd, bd_drive_t *drive) { struct bd_errstats *est = bd->d_kerr; - mutex_enter(bd->d_errmutex); + mutex_enter(&bd->d_errmutex); if (drive->d_model_len > 0 && KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) { @@ -454,7 +531,23 @@ bd_init_errstats(bd_t *bd, bd_drive_t *drive) bd_errstats_setstr(&est->bd_serial, drive->d_serial, drive->d_serial_len, "0 "); - mutex_exit(bd->d_errmutex); + mutex_exit(&bd->d_errmutex); +} + +static void +bd_queues_free(bd_t *bd) +{ + uint32_t i; + + for (i = 0; i < bd->d_qcount; i++) { + bd_queue_t *bq = &bd->d_queues[i]; + + mutex_destroy(&bq->q_iomutex); + list_destroy(&bq->q_waitq); + list_destroy(&bq->q_runq); + } + + kmem_free(bd->d_queues, sizeof (*bd->d_queues) * bd->d_qcount); } static int @@ -464,6 +557,7 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) bd_handle_t hdl; bd_t *bd; bd_drive_t drive; + uint32_t i; int rv; char name[16]; char kcache[32]; @@ -537,23 +631,18 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) hdl->h_bd = bd; ddi_set_driver_private(dip, bd); - mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&bd->d_ksmutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL); cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL); - list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t), - offsetof(struct bd_xfer_impl, i_linkage)); - list_create(&bd->d_runq, sizeof (bd_xfer_impl_t), - offsetof(struct bd_xfer_impl, i_linkage)); - bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8, bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0); bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT); if (bd->d_ksp != NULL) { - bd->d_ksp->ks_lock 
= &bd->d_iomutex; + bd->d_ksp->ks_lock = &bd->d_ksmutex; kstat_install(bd->d_ksp); bd->d_kiop = bd->d_ksp->ks_data; } else { @@ -571,8 +660,12 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) bd->d_state = DKIO_NONE; bzero(&drive, sizeof (drive)); + /* + * Default to one queue, parent driver can override. + */ + drive.d_qcount = 1; bd->d_ops.o_drive_info(bd->d_private, &drive); - bd->d_qsize = drive.d_qsize; + bd->d_qcount = drive.d_qcount; bd->d_removable = drive.d_removable; bd->d_hotpluggable = drive.d_hotpluggable; @@ -585,6 +678,21 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) bd_init_errstats(bd, &drive); bd_update_state(bd); + bd->d_queues = kmem_alloc(sizeof (*bd->d_queues) * bd->d_qcount, + KM_SLEEP); + for (i = 0; i < bd->d_qcount; i++) { + bd_queue_t *bq = &bd->d_queues[i]; + + bq->q_qsize = drive.d_qsize; + bq->q_qactive = 0; + mutex_init(&bq->q_iomutex, NULL, MUTEX_DRIVER, NULL); + + list_create(&bq->q_waitq, sizeof (bd_xfer_impl_t), + offsetof(struct bd_xfer_impl, i_linkage)); + list_create(&bq->q_runq, sizeof (bd_xfer_impl_t), + offsetof(struct bd_xfer_impl, i_linkage)); + } + rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT, bd->d_removable, bd->d_hotpluggable, /*LINTED: E_BAD_PTR_CAST_ALIGN*/ @@ -594,12 +702,11 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) if (rv != 0) { cmlb_free_handle(&bd->d_cmlbh); kmem_cache_destroy(bd->d_cache); - mutex_destroy(&bd->d_iomutex); + mutex_destroy(&bd->d_ksmutex); mutex_destroy(&bd->d_ocmutex); mutex_destroy(&bd->d_statemutex); cv_destroy(&bd->d_statecv); - list_destroy(&bd->d_waitq); - list_destroy(&bd->d_runq); + bd_queues_free(bd); if (bd->d_ksp != NULL) { kstat_delete(bd->d_ksp); bd->d_ksp = NULL; @@ -670,7 +777,7 @@ bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) bd->d_errstats = NULL; } else { kmem_free(bd->d_kerr, sizeof (struct bd_errstats)); - mutex_destroy(bd->d_errmutex); + mutex_destroy(&bd->d_errmutex); } cmlb_detach(bd->d_cmlbh, 0); @@ -678,12 +785,11 @@ bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) if (bd->d_devid) ddi_devid_free(bd->d_devid); kmem_cache_destroy(bd->d_cache); - mutex_destroy(&bd->d_iomutex); + mutex_destroy(&bd->d_ksmutex); mutex_destroy(&bd->d_ocmutex); mutex_destroy(&bd->d_statemutex); cv_destroy(&bd->d_statecv); - list_destroy(&bd->d_waitq); - list_destroy(&bd->d_runq); + bd_queues_free(bd); ddi_soft_state_free(bd_state, ddi_get_instance(dip)); return (DDI_SUCCESS); } @@ -1195,7 +1301,7 @@ bd_strategy(struct buf *bp) bd_xfer_impl_t *xi; uint32_t shift; int (*func)(void *, bd_xfer_t *); - diskaddr_t lblkno; + diskaddr_t lblkno; part = BDPART(bp->b_edev); inst = BDINST(bp->b_edev); @@ -1520,19 +1626,18 @@ bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie) static void -bd_sched(bd_t *bd) +bd_sched(bd_t *bd, bd_queue_t *bq) { bd_xfer_impl_t *xi; struct buf *bp; int rv; - mutex_enter(&bd->d_iomutex); + mutex_enter(&bq->q_iomutex); - while ((bd->d_qactive < bd->d_qsize) && - ((xi = list_remove_head(&bd->d_waitq)) != NULL)) { - bd->d_qactive++; - kstat_waitq_to_runq(bd->d_kiop); - list_insert_tail(&bd->d_runq, xi); + while ((bq->q_qactive < bq->q_qsize) && + ((xi = list_remove_head(&bq->q_waitq)) != NULL)) { + bq->q_qactive++; + list_insert_tail(&bq->q_runq, xi); /* * Submit the job to the driver. We drop the I/O mutex @@ -1540,7 +1645,11 @@ bd_sched(bd_t *bd) * completion routine calls back into us synchronously. 
*/ - mutex_exit(&bd->d_iomutex); + mutex_exit(&bq->q_iomutex); + + mutex_enter(&bd->d_ksmutex); + kstat_waitq_to_runq(bd->d_kiop); + mutex_exit(&bd->d_ksmutex); rv = xi->i_func(bd->d_private, &xi->i_public); if (rv != 0) { @@ -1549,53 +1658,71 @@ bd_sched(bd_t *bd) biodone(bp); atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32); - - mutex_enter(&bd->d_iomutex); - bd->d_qactive--; + mutex_enter(&bd->d_ksmutex); kstat_runq_exit(bd->d_kiop); - list_remove(&bd->d_runq, xi); + mutex_exit(&bd->d_ksmutex); + + mutex_enter(&bq->q_iomutex); + bq->q_qactive--; + list_remove(&bq->q_runq, xi); bd_xfer_free(xi); } else { - mutex_enter(&bd->d_iomutex); + mutex_enter(&bq->q_iomutex); } } - mutex_exit(&bd->d_iomutex); + mutex_exit(&bq->q_iomutex); } static void bd_submit(bd_t *bd, bd_xfer_impl_t *xi) { - mutex_enter(&bd->d_iomutex); - list_insert_tail(&bd->d_waitq, xi); + uint64_t nv = atomic_inc_64_nv(&bd->d_io_counter); + unsigned q = nv % bd->d_qcount; + bd_queue_t *bq = &bd->d_queues[q]; + + xi->i_bq = bq; + xi->i_qnum = q; + + mutex_enter(&bq->q_iomutex); + list_insert_tail(&bq->q_waitq, xi); + mutex_exit(&bq->q_iomutex); + + mutex_enter(&bd->d_ksmutex); kstat_waitq_enter(bd->d_kiop); - mutex_exit(&bd->d_iomutex); + mutex_exit(&bd->d_ksmutex); - bd_sched(bd); + bd_sched(bd, bq); } static void bd_runq_exit(bd_xfer_impl_t *xi, int err) { - bd_t *bd = xi->i_bd; - buf_t *bp = xi->i_bp; + bd_t *bd = xi->i_bd; + buf_t *bp = xi->i_bp; + bd_queue_t *bq = xi->i_bq; + + mutex_enter(&bq->q_iomutex); + bq->q_qactive--; + list_remove(&bq->q_runq, xi); + mutex_exit(&bq->q_iomutex); - mutex_enter(&bd->d_iomutex); - bd->d_qactive--; + mutex_enter(&bd->d_ksmutex); kstat_runq_exit(bd->d_kiop); - list_remove(&bd->d_runq, xi); - mutex_exit(&bd->d_iomutex); + mutex_exit(&bd->d_ksmutex); if (err == 0) { if (bp->b_flags & B_READ) { - bd->d_kiop->reads++; - bd->d_kiop->nread += (bp->b_bcount - xi->i_resid); + atomic_inc_uint(&bd->d_kiop->reads); + atomic_add_64((uint64_t *)&bd->d_kiop->nread, + bp->b_bcount - xi->i_resid); } else { - bd->d_kiop->writes++; - bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid); + atomic_inc_uint(&bd->d_kiop->writes); + atomic_add_64((uint64_t *)&bd->d_kiop->nwritten, + bp->b_bcount - xi->i_resid); } } - bd_sched(bd); + bd_sched(bd, bq); } static void @@ -1797,6 +1924,19 @@ bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag) { bd_handle_t hdl; + /* + * There is full compatability between the version 0 API and the + * current version. 
+ */ + switch (ops->o_version) { + case BD_OPS_VERSION_0: + case BD_OPS_CURRENT_VERSION: + break; + + default: + return (NULL); + } + hdl = kmem_zalloc(sizeof (*hdl), kmflag); if (hdl != NULL) { hdl->h_ops = *ops; diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c index b77545ead2..9fd95a1137 100644 --- a/usr/src/uts/common/io/nvme/nvme.c +++ b/usr/src/uts/common/io/nvme/nvme.c @@ -542,7 +542,7 @@ static struct modlinkage nvme_modlinkage = { }; static bd_ops_t nvme_bd_ops = { - .o_version = BD_OPS_VERSION_0, + .o_version = BD_OPS_CURRENT_VERSION, .o_drive_info = nvme_bd_driveinfo, .o_media_info = nvme_bd_mediainfo, .o_devid_init = nvme_bd_devid, @@ -831,6 +831,9 @@ nvme_free_cq(nvme_cq_t *cq) { mutex_destroy(&cq->ncq_mutex); + if (cq->ncq_cmd_taskq != NULL) + taskq_destroy(cq->ncq_cmd_taskq); + if (cq->ncq_dma != NULL) nvme_free_dma(cq->ncq_dma); @@ -876,9 +879,11 @@ nvme_destroy_cq_array(nvme_t *nvme, uint_t start) } static int -nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx) +nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx, + uint_t nthr) { nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP); + char name[64]; /* large enough for the taskq name */ mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER, DDI_INTR_PRI(nvme->n_intr_pri)); @@ -892,6 +897,21 @@ nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx) cq->ncq_id = idx; cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx); + /* + * Each completion queue has its own command taskq. + */ + (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u", + ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx); + + cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX, + TASKQ_PREPOPULATE); + + if (cq->ncq_cmd_taskq == NULL) { + dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd " + "taskq for cq %u", idx); + goto fail; + } + *cqp = cq; return (DDI_SUCCESS); @@ -909,7 +929,7 @@ fail: * max number of entries to UINT16_MAX + 1. */ static int -nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry) +nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr) { nvme_cq_t **cq; uint_t i, cq_count; @@ -926,7 +946,7 @@ nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry) nvme->n_cq[i] = cq[i]; for (; i < nvme->n_cq_count; i++) - if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i) != + if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) != DDI_SUCCESS) goto fail; @@ -1158,8 +1178,8 @@ nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq) mutex_enter(&cq->ncq_mutex); while ((cmd = nvme_get_completed(nvme, cq)) != NULL) { - taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq, - cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); + taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd, + TQ_NOSLEEP, &cmd->nc_tqent); completed++; } @@ -2494,6 +2514,7 @@ nvme_init_ns(nvme_t *nvme, int nsid) { nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; nvme_identify_nsid_t *idns; + boolean_t was_ignored; int last_rp; ns->ns_nvme = nvme; @@ -2552,6 +2573,8 @@ nvme_init_ns(nvme_t *nvme, int nsid) if (ns->ns_best_block_size < nvme->n_min_block_size) ns->ns_best_block_size = nvme->n_min_block_size; + was_ignored = ns->ns_ignore; + /* * We currently don't support namespaces that use either: * - protection information @@ -2571,6 +2594,25 @@ nvme_init_ns(nvme_t *nvme, int nsid) ns->ns_ignore = B_FALSE; } + /* + * Keep a count of namespaces which are attachable. 
+ * See comments in nvme_bd_driveinfo() to understand its effect. + */ + if (was_ignored) { + /* + * Previously ignored, but now not. Count it. + */ + if (!ns->ns_ignore) + nvme->n_namespaces_attachable++; + } else { + /* + * Wasn't ignored previously, but now needs to be. + * Discount it. + */ + if (ns->ns_ignore) + nvme->n_namespaces_attachable--; + } + return (DDI_SUCCESS); } @@ -2586,6 +2628,7 @@ nvme_init(nvme_t *nvme) nvme_reg_csts_t csts; int i = 0; uint16_t nqueues; + uint_t tq_threads; char model[sizeof (nvme->n_idctl->id_model) + 1]; char *vendor, *product; @@ -2655,9 +2698,9 @@ nvme_init(nvme_t *nvme) /* * Create the cq array with one completion queue to be assigned - * to the admin queue pair. + * to the admin queue pair and a limited number of taskqs (4). */ - if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len) != + if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) != DDI_SUCCESS) { dev_err(nvme->n_dip, CE_WARN, "!failed to pre-allocate admin completion queue"); @@ -2911,6 +2954,7 @@ nvme_init(nvme_t *nvme) for (i = 0; i != nvme->n_namespace_count; i++) { mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); + nvme->n_ns[i].ns_ignore = B_TRUE; if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS) goto fail; } @@ -2983,8 +3027,21 @@ nvme_init(nvme_t *nvme) (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len", nvme->n_io_cqueue_len); + /* + * Assign the equal quantity of taskq threads to each completion + * queue, capping the total number of threads to the number + * of CPUs. + */ + tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues; + + /* + * In case the calculation above is zero, we need at least one + * thread per completion queue. + */ + tq_threads = MAX(1, tq_threads); + if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1, - nvme->n_io_cqueue_len) != DDI_SUCCESS) { + nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) { dev_err(nvme->n_dip, CE_WARN, "!failed to pre-allocate completion queues"); goto fail; @@ -3350,18 +3407,6 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) nvme->n_progress |= NVME_REGS_MAPPED; - /* - * Create taskq for command completion. - */ - (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq", - ddi_driver_name(dip), ddi_get_instance(dip)); - nvme->n_cmd_taskq = ddi_taskq_create(dip, name, MIN(UINT16_MAX, ncpus), - TASKQ_DEFAULTPRI, 0); - if (nvme->n_cmd_taskq == NULL) { - dev_err(dip, CE_WARN, "!failed to create cmd taskq"); - goto fail; - } - /* * Create PRP DMA cache */ @@ -3487,8 +3532,10 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) if (nvme->n_progress & NVME_INTERRUPTS) nvme_release_interrupts(nvme); - if (nvme->n_cmd_taskq) - ddi_taskq_wait(nvme->n_cmd_taskq); + for (i = 0; i < nvme->n_cq_count; i++) { + if (nvme->n_cq[i]->ncq_cmd_taskq != NULL) + taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq); + } if (nvme->n_ioq_count > 0) { for (i = 1; i != nvme->n_ioq_count + 1; i++) { @@ -3511,9 +3558,6 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) (void) nvme_reset(nvme, B_FALSE); } - if (nvme->n_cmd_taskq) - ddi_taskq_destroy(nvme->n_cmd_taskq); - if (nvme->n_progress & NVME_CTRL_LIMITS) sema_destroy(&nvme->n_abort_sema); @@ -3693,15 +3737,40 @@ nvme_bd_driveinfo(void *arg, bd_drive_t *drive) { nvme_namespace_t *ns = arg; nvme_t *nvme = ns->ns_nvme; + uint_t ns_count = MAX(1, nvme->n_namespaces_attachable); + + /* + * Set the blkdev qcount to the number of submission queues. 
+ * It will then create one waitq/runq pair for each submission + * queue and spread I/O requests across the queues. + */ + drive->d_qcount = nvme->n_ioq_count; + + /* + * I/O activity to individual namespaces is distributed across + * each of the d_qcount blkdev queues (which has been set to + * the number of nvme submission queues). d_qsize is the number + * of submitted and not completed I/Os within each queue that blkdev + * will allow before it starts holding them in the waitq. + * + * Each namespace will create a child blkdev instance, for each one + * we try and set the d_qsize so that each namespace gets an + * equal portion of the submission queue. + * + * If post instantiation of the nvme drive, n_namespaces_attachable + * changes and a namespace is attached it could calculate a + * different d_qsize. It may even be that the sum of the d_qsizes is + * now beyond the submission queue size. Should that be the case + * and the I/O rate is such that blkdev attempts to submit more + * I/Os than the size of the submission queue, the excess I/Os + * will be held behind the semaphore nq_sema. + */ + drive->d_qsize = nvme->n_io_squeue_len / ns_count; /* - * blkdev maintains one queue size per instance (namespace), - * but all namespace share the I/O queues. - * TODO: need to figure out a sane default, or use per-NS I/O queues, - * or change blkdev to handle EAGAIN + * Don't let the queue size drop below the minimum, though. */ - drive->d_qsize = nvme->n_ioq_count * nvme->n_io_squeue_len - / nvme->n_namespace_count; + drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN); /* * d_maxxfer is not set, which means the value is taken from the DMA @@ -3758,7 +3827,7 @@ nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc) if (cmd == NULL) return (ENOMEM); - cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1; + cmd->nc_sqid = xfer->x_qnum + 1; ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); ioq = nvme->n_ioq[cmd->nc_sqid]; diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h index 110f639845..fb8f4ba771 100644 --- a/usr/src/uts/common/io/nvme/nvme_var.h +++ b/usr/src/uts/common/io/nvme/nvme_var.h @@ -106,6 +106,8 @@ struct nvme_cq { uintptr_t ncq_hdbl; int ncq_phase; + taskq_t *ncq_cmd_taskq; + kmutex_t ncq_mutex; }; @@ -180,6 +182,7 @@ struct nvme { int n_pagesize; int n_namespace_count; + uint_t n_namespaces_attachable; uint_t n_ioq_count; uint_t n_cq_count; @@ -200,8 +203,6 @@ struct nvme { ksema_t n_abort_sema; - ddi_taskq_t *n_cmd_taskq; - /* state for devctl minor node */ nvme_minor_state_t n_minor; diff --git a/usr/src/uts/common/io/sdcard/impl/sda_slot.c b/usr/src/uts/common/io/sdcard/impl/sda_slot.c index 1d64f6d89b..dda392c7ea 100644 --- a/usr/src/uts/common/io/sdcard/impl/sda_slot.c +++ b/usr/src/uts/common/io/sdcard/impl/sda_slot.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Western Digital Corporation. */ /* @@ -477,7 +478,7 @@ sda_slot_fini(sda_slot_t *slot) } static bd_ops_t sda_bd_ops = { - BD_OPS_VERSION_0, + BD_OPS_CURRENT_VERSION, sda_mem_bd_driveinfo, sda_mem_bd_mediainfo, NULL, /* devid_init */ diff --git a/usr/src/uts/common/io/skd/skd.c b/usr/src/uts/common/io/skd/skd.c index 78753ba00f..172d00806b 100644 --- a/usr/src/uts/common/io/skd/skd.c +++ b/usr/src/uts/common/io/skd/skd.c @@ -24,6 +24,7 @@ * Copyright 2013 STEC, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. 
All rights reserved. * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019 Western Digital Corporation. */ #include @@ -129,7 +130,7 @@ static int skd_devid_init(void *arg, dev_info_t *, ddi_devid_t *); static bd_ops_t skd_bd_ops = { - BD_OPS_VERSION_0, + BD_OPS_CURRENT_VERSION, skd_bd_driveinfo, skd_bd_mediainfo, skd_devid_init, diff --git a/usr/src/uts/common/io/vioblk/vioblk.c b/usr/src/uts/common/io/vioblk/vioblk.c index df1d1d6dd4..ef67ba12d9 100644 --- a/usr/src/uts/common/io/vioblk/vioblk.c +++ b/usr/src/uts/common/io/vioblk/vioblk.c @@ -23,6 +23,7 @@ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Alexey Zaytsev * Copyright 2019 Joyent Inc. + * Copyright 2019 Western Digital Corporation. */ /* @@ -925,7 +926,7 @@ vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) * stack based on negotiated features. */ bd_ops_t vioblk_bd_ops = { - .o_version = BD_OPS_VERSION_0, + .o_version = BD_OPS_CURRENT_VERSION, .o_drive_info = vioblk_bd_driveinfo, .o_media_info = vioblk_bd_mediainfo, .o_devid_init = vioblk_bd_devid, diff --git a/usr/src/uts/common/sys/blkdev.h b/usr/src/uts/common/sys/blkdev.h index cb6a397520..7802f06dbc 100644 --- a/usr/src/uts/common/sys/blkdev.h +++ b/usr/src/uts/common/sys/blkdev.h @@ -22,6 +22,7 @@ * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Western Digital Corporation. */ #ifndef _SYS_BLKDEV_H @@ -93,6 +94,7 @@ struct bd_xfer { unsigned x_ndmac; caddr_t x_kaddr; unsigned x_flags; + unsigned x_qnum; }; #define BD_XFER_POLL (1U << 0) /* no interrupts (dump) */ @@ -115,6 +117,8 @@ struct bd_drive { size_t d_revision_len; char *d_revision; uint8_t d_eui64[8]; + /* Added at the end to maintain binary compatibility */ + uint32_t d_qcount; }; struct bd_media { @@ -142,18 +146,26 @@ struct bd_media { #define BD_INFO_FLAG_HOTPLUGGABLE (1U << 1) #define BD_INFO_FLAG_READ_ONLY (1U << 2) +/* + * If the API changes and we want to bump the version, add another + * enum value, Eg BD_OPS_VERSION_1. BD_OPS_CURRENT_VERSION should always + * be last. + */ +typedef enum { + BD_OPS_VERSION_0 = 0, + BD_OPS_CURRENT_VERSION +} bd_version_t; + struct bd_ops { - int o_version; - void (*o_drive_info)(void *, bd_drive_t *); - int (*o_media_info)(void *, bd_media_t *); - int (*o_devid_init)(void *, dev_info_t *, ddi_devid_t *); - int (*o_sync_cache)(void *, bd_xfer_t *); - int (*o_read)(void *, bd_xfer_t *); - int (*o_write)(void *, bd_xfer_t *); + bd_version_t o_version; + void (*o_drive_info)(void *, bd_drive_t *); + int (*o_media_info)(void *, bd_media_t *); + int (*o_devid_init)(void *, dev_info_t *, ddi_devid_t *); + int (*o_sync_cache)(void *, bd_xfer_t *); + int (*o_read)(void *, bd_xfer_t *); + int (*o_write)(void *, bd_xfer_t *); }; -#define BD_OPS_VERSION_0 0 - struct bd_errstats { /* these are managed by blkdev itself */ kstat_named_t bd_softerrs; -- 2.45.2
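
Editorial note (not part of the patch): the new blkdev block comment and bd_submit() above describe a lockless round-robin choice of waitq/runq pair — an atomically incremented 64-bit I/O counter taken modulo the queue count. The following is a minimal stand-alone C sketch of that idea; all names here (pick_queue, demo_queue_t, DEMO_QCOUNT) are hypothetical and exist only for illustration.

/*
 * Illustrative sketch of lockless round-robin queue selection, modelled
 * on bd_submit()'s atomic_inc_64_nv(&d_io_counter) % d_qcount.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_QCOUNT	4		/* stand-in for drive->d_qcount */

typedef struct {
	unsigned	id;		/* queue number handed to the parent driver */
	unsigned	submitted;	/* how many I/Os landed on this queue */
} demo_queue_t;

static _Atomic uint64_t	io_counter;	/* stand-in for bd->d_io_counter */
static demo_queue_t	queues[DEMO_QCOUNT];

/* Select a queue without taking any lock. */
static demo_queue_t *
pick_queue(void)
{
	uint64_t nv = atomic_fetch_add(&io_counter, 1) + 1;

	return (&queues[nv % DEMO_QCOUNT]);
}

int
main(void)
{
	for (unsigned i = 0; i < DEMO_QCOUNT; i++)
		queues[i].id = i;

	/* Submit 1000 dummy I/Os; they spread evenly across the queues. */
	for (int i = 0; i < 1000; i++)
		pick_queue()->submitted++;

	for (unsigned i = 0; i < DEMO_QCOUNT; i++)
		printf("queue %u: %u I/Os\n", queues[i].id, queues[i].submitted);
	return (0);
}

The trade-off is the one the comment itself notes: selection is lockless and O(1), but it takes no account of how busy each queue currently is.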
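
Editorial note (not part of the patch): the sizing rules added in nvme_bd_driveinfo() and nvme_init() are plain integer arithmetic — each attachable namespace gets an equal share of the submission queue depth, clamped to a minimum, and taskq threads are spread evenly over the completion queues with at least one per queue. The sketch below works through them with made-up values; the NVME_MIN_IO_QUEUE_LEN value and all variable settings here are hypothetical.

/*
 * Worked example of the d_qsize and tq_threads calculations, using
 * invented numbers purely to show the division and clamping.
 */
#include <stdint.h>
#include <stdio.h>

#define NVME_MIN_IO_QUEUE_LEN	64	/* hypothetical minimum */
#define MIN(a, b)		((a) < (b) ? (a) : (b))
#define MAX(a, b)		((a) > (b) ? (a) : (b))

int
main(void)
{
	unsigned n_io_squeue_len = 1024;	/* submission queue depth */
	unsigned n_namespaces_attachable = 4;	/* attachable namespaces */
	unsigned n_completion_queues = 8;
	unsigned ncpus = 32;

	/* Each namespace gets an equal share of the submission queue ... */
	unsigned d_qsize = n_io_squeue_len / MAX(1, n_namespaces_attachable);
	/* ... but never less than the minimum queue length. */
	d_qsize = MAX(d_qsize, NVME_MIN_IO_QUEUE_LEN);

	/* Spread taskq threads over the completion queues, at least 1 each. */
	unsigned tq_threads =
	    MAX(1, MIN(UINT16_MAX, ncpus) / n_completion_queues);

	/* Prints: d_qsize = 256, tq_threads = 4 */
	printf("d_qsize = %u, tq_threads = %u\n", d_qsize, tq_threads);
	return (0);
}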