
3d33c98653d81c0c8bb83de26636d7e4d1aee96c — Jerry Jelinek 5 years ago 23397fb + 4d95620
	[illumos-gate merge]

commit 4d95620bc3105916e69c40cff8e2e3d55bd6c4ae
    11827 Increase concurrency through blkdev
    11847 The nvme cmd completion queue is contentious
commit d15801816cc716b56ada92df72ee04b7b891291c
    11927 Log, or optionally panic, on zero-length kmem allocations
commit e7253420989203a0028f982d437bf50c3140eba6
    12007 make_test always returns non-zero

	Conflicts:
	usr/src/uts/common/io/nvme/nvme_var.h
	usr/src/man/man9f/kmem_alloc.9f
	usr/src/cmd/mdb/common/modules/genunix/genunix.c
M usr/src/cmd/mdb/common/modules/genunix/genunix.c => usr/src/cmd/mdb/common/modules/genunix/genunix.c +5 -3
@@ 4264,9 4264,11 @@ static const mdb_dcmd_t dcmds[] = {
	{ "bufctl", ":[-vh] [-a addr] [-c caller] [-e earliest] [-l latest] "
		"[-t thd]", "print or filter a bufctl", bufctl, bufctl_help },
	{ "freedby", ":", "given a thread, print its freed buffers", freedby },
	{ "kmalog", "?[ fail | slab ]",
	    "display kmem transaction log and stack traces", kmalog },
	{ "kmastat", NULL, "kernel memory allocator stats", kmastat },
	{ "kmalog", "?[ fail | slab | zerosized ]",
	    "display kmem transaction log and stack traces for specified type",
	    kmalog },
	{ "kmastat", "[-kmg]", "kernel memory allocator stats",
	    kmastat },
	{ "kmausers", "?[-ef] [cache ...]", "current medium and large users "
		"of the kmem allocator", kmausers, kmausers_help },
	{ "kmem_cache", "?[-n name]",

M usr/src/man/man9f/kmem_alloc.9f => usr/src/man/man9f/kmem_alloc.9f +1 -17
@@ 5,11 5,10 @@
.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License").  You may not use this file except in compliance with the License.
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.  See the License for the specific language governing permissions and limitations under the License.
.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE.  If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
.TH KMEM_ALLOC 9F "Oct 22, 2014"
.TH KMEM_ALLOC 9F "Nov 20, 2019"
.SH NAME
kmem_alloc, kmem_zalloc, kmem_free \- allocate kernel memory
.SH SYNOPSIS
.LP
.nf
#include <sys/types.h>
#include <sys/kmem.h>


@@ 30,11 29,8 @@ kmem_alloc, kmem_zalloc, kmem_free \- allocate kernel memory
.fi

.SH INTERFACE LEVEL
.sp
.LP
Architecture independent level 1 (DDI/DKI).
.SH PARAMETERS
.sp
.ne 2
.na
\fB\fIsize\fR\fR


@@ 64,8 60,6 @@ Pointer to allocated memory.
.RE

.SH DESCRIPTION
.sp
.LP
The \fBkmem_alloc()\fR function allocates \fIsize\fR bytes of kernel memory and
returns a pointer to the allocated memory. The allocated memory is at least
double-word aligned, so it can hold any C data structure. No greater alignment


@@ 84,28 78,20 @@ The \fBkmem_free()\fR function frees previously allocated kernel memory. The
buffer address and size must exactly match the original allocation. Memory
cannot be returned piecemeal.
.SH RETURN VALUES
.sp
.LP
If successful, \fBkmem_alloc()\fR and \fBkmem_zalloc()\fR return a pointer to
the allocated memory. If \fBKM_NOSLEEP\fR is set and memory cannot be allocated
without sleeping, \fBkmem_alloc()\fR and \fBkmem_zalloc()\fR return \fINULL\fR.
.SH CONTEXT
.sp
.LP
The \fBkmem_alloc()\fR and \fBkmem_zalloc()\fR functions can be called from
interrupt context only if the \fBKM_NOSLEEP\fR flag is set. They can be called
from user context with any valid \fIflag\fR. The \fBkmem_free()\fR function can
be called from user, interrupt, or kernel context.
.SH SEE ALSO
.sp
.LP
\fBcopyout\fR(9F), \fBfreerbuf\fR(9F), \fBgetrbuf\fR(9F)
.sp
.LP
\fIWriting Device Drivers\fR
.SH WARNINGS
.sp
.LP
Memory allocated using \fBkmem_alloc()\fR is not paged. Available memory is
therefore limited by the total physical memory on the system. It is also
limited by the available kernel virtual address space, which is often the more


@@ 127,8 113,6 @@ garbage. This random garbage may include secure kernel data. Therefore,
uninitialized kernel memory should be handled carefully. For example, never
\fBcopyout\fR(9F) a potentially uninitialized buffer.
.SH NOTES
.sp
.LP
\fBkmem_alloc(0\fR, \fIflag\fR\fB)\fR always returns \fINULL\fR, but
if \fBKM_SLEEP\fR is set, this behavior is considered to be deprecated;
the system may be configured to explicitly panic in this case in lieu

M usr/src/test/util-tests/tests/make/make_test.ksh => usr/src/test/util-tests/tests/make/make_test.ksh +2 -0
@@ 165,3 165,5 @@ test_make_C_invalid

[[ $fail -gt 0 ]] && exit -1

exit 0
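
The bug fixed by 12007 is the classic shell exit-status pitfall: when fail
is 0, the [[ ... ]] test itself evaluates false, and without a trailing exit
the script's status is that of its last command. (The existing exit -1 is
also outside the 0-255 range and maps to 255.) A reduced sketch:

	fail=0
	[[ $fail -gt 0 ]] && exit 1	# false when fail == 0, so $? is 1
	exit 0				# explicit success status is required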


M usr/src/uts/common/fs/nfs/nfs_auth.c => usr/src/uts/common/fs/nfs/nfs_auth.c +1 -0
@@ 24,6 24,7 @@
 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
 * Copyright (c) 2015 by Delphix. All rights reserved.
 * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
 */

#include <sys/param.h>

M usr/src/uts/common/io/blkdev/blkdev.c => usr/src/uts/common/io/blkdev/blkdev.c +223 -83
@@ 24,6 24,7 @@
 * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2017 The MathWorks, Inc.  All rights reserved.
 * Copyright 2019 Western Digital Corporation.
 */

#include <sys/types.h>


@@ 53,19 54,93 @@
#include <sys/blkdev.h>
#include <sys/scsi/impl/inquiry.h>

/*
 * blkdev is a driver which provides much of the common functionality
 * a block device driver may need, removing code which would otherwise
 * be duplicated across block device drivers.
 *
 * Within this driver all the struct cb_ops functions required for a
 * block device driver are written with appropriate call back functions
 * to be provided by the parent driver.
 *
 * To use blkdev, a driver needs to:
 *	1. Create a bd_ops_t structure which has the call back operations
 *	   blkdev will use.
 *	2. Create a handle by calling bd_alloc_handle(). One of the
 *	   arguments to this function is the bd_ops_t.
 *	3. Call bd_attach_handle(). This will instantiate a blkdev device
 *	   as a child device node of the calling driver.
 *
 * A parent driver is not restricted to just allocating and attaching a
 * single instance; it may attach as many as it wishes. For each handle
 * attached, appropriate entries in /dev/[r]dsk are created.
 *
 * The bd_ops_t routines that a parent of blkdev needs to provide are:
 *
 * o_drive_info: Provide information to blkdev such as how many I/O queues
 *		 to create and the size of those queues. Also some device
 *		 specifics such as EUI, vendor, product, model, serial
 *		 number ....
 *
 * o_media_info: Provide information about the media, e.g. size and block size.
 *
 * o_devid_init: Creates and initializes the device id. Typically calls
 *		 ddi_devid_init().
 *
 * o_sync_cache: Issues a device appropriate command to flush any write
 *		 caches.
 *
 * o_read:	 Read data as described by bd_xfer_t argument.
 *
 * o_write:	 Write data as described by bd_xfer_t argument.
 *
 *
 * Queues
 * ------
 * Part of the drive_info data is a queue count. blkdev will create
 * "queue count" number of waitq/runq pairs. Each waitq/runq pair
 * operates independently. As an I/O is scheduled up to the parent
 * driver via o_read or o_write, its queue number is given. If the
 * parent driver supports multiple hardware queues it can then select
 * where to submit the I/O request.
 *
 * Currently blkdev uses a simplistic round-robin queue selection method,
 * which has the advantage of being lockless. In the future it may be
 * worth revisiting this strategy in favor of one that prioritizes queues
 * based on how busy they are.
 *
 * Each waitq/runq pair is protected by its mutex (q_iomutex). Incoming
 * I/O requests are initially added to the waitq. They are taken off the
 * waitq, added to the runq and submitted, provided the runq holds fewer
 * entries than the qsize specified in the drive_info. As an I/O request
 * completes, the parent driver is required to call bd_xfer_done(), which
 * will remove the I/O request from the runq and pass I/O completion
 * status up the stack.
 *
 * Locks
 * -----
 * There are 4 instance-global locks: d_ocmutex, d_ksmutex, d_errmutex and
 * d_statemutex, as well as a q_iomutex per waitq/runq pair.
 *
 * Currently, there is no lock hierarchy; we never hold more than one lock
 * at a time. If that ever changes, the hierarchy must be defined and
 * documented here.
 */
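
To make the registration steps above concrete, a skeletal parent driver
might register with blkdev as follows (a sketch only; the mydrv_* names,
the sc soft state and the DMA attributes are hypothetical):

	static bd_ops_t mydrv_bd_ops = {
		.o_version	= BD_OPS_CURRENT_VERSION,
		.o_drive_info	= mydrv_bd_driveinfo,
		.o_media_info	= mydrv_bd_mediainfo,
		.o_devid_init	= NULL,
		.o_sync_cache	= mydrv_bd_sync,
		.o_read		= mydrv_bd_read,
		.o_write	= mydrv_bd_write,
	};

	/* In attach(9E): create the handle, then attach the child node. */
	sc->sc_bdh = bd_alloc_handle(sc, &mydrv_bd_ops, &mydrv_dma_attr,
	    KM_SLEEP);
	if (sc->sc_bdh == NULL)		/* e.g. unsupported o_version */
		return (DDI_FAILURE);

	if (bd_attach_handle(dip, sc->sc_bdh) != DDI_SUCCESS) {
		bd_free_handle(sc->sc_bdh);
		return (DDI_FAILURE);
	}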

#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)

typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
typedef struct bd_queue bd_queue_t;

struct bd {
	void		*d_private;
	dev_info_t	*d_dip;
	kmutex_t	d_ocmutex;
	kmutex_t	d_iomutex;
	kmutex_t	*d_errmutex;
	kmutex_t	d_ksmutex;
	kmutex_t	d_errmutex;
	kmutex_t	d_statemutex;
	kcondvar_t	d_statecv;
	enum dkio_state	d_state;


@@ 73,8 148,9 @@ struct bd {
	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
	uint64_t	d_open_excl;	/* bit mask indexed by partition */
	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */
	uint64_t	d_io_counter;

	uint32_t	d_qsize;
	uint32_t	d_qcount;
	uint32_t	d_qactive;
	uint32_t	d_maxxfer;
	uint32_t	d_blkshift;


@@ 83,8 159,7 @@ struct bd {
	ddi_devid_t	d_devid;

	kmem_cache_t	*d_cache;
	list_t		d_runq;
	list_t		d_waitq;
	bd_queue_t	*d_queues;
	kstat_t		*d_ksp;
	kstat_io_t	*d_kiop;
	kstat_t		*d_errstats;


@@ 117,6 192,7 @@ struct bd_xfer_impl {
	list_node_t	i_linkage;
	bd_t		*i_bd;
	buf_t		*i_bp;
	bd_queue_t	*i_bq;
	uint_t		i_num_win;
	uint_t		i_cur_win;
	off_t		i_offset;


@@ 126,6 202,14 @@ struct bd_xfer_impl {
	size_t		i_resid;
};

struct bd_queue {
	kmutex_t	q_iomutex;
	uint32_t	q_qsize;
	uint32_t	q_qactive;
	list_t		q_runq;
	list_t		q_waitq;
};

#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac


@@ 133,6 217,7 @@ struct bd_xfer_impl {
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
#define	i_qnum		i_public.x_qnum


/*


@@ 166,7 251,7 @@ static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
static int bd_xfer_ctor(void *, void *, int);
static void bd_xfer_dtor(void *, void *);
static void bd_sched(bd_t *);
static void bd_sched(bd_t *, bd_queue_t *);
static void bd_submit(bd_t *, bd_xfer_impl_t *);
static void bd_runq_exit(bd_xfer_impl_t *, int);
static void bd_update_state(bd_t *);


@@ 181,20 266,20 @@ struct cmlb_tg_ops bd_tg_ops = {
};

static struct cb_ops bd_cb_ops = {
	bd_open, 		/* open */
	bd_close, 		/* close */
	bd_strategy, 		/* strategy */
	nodev, 			/* print */
	bd_open,		/* open */
	bd_close,		/* close */
	bd_strategy,		/* strategy */
	nodev,			/* print */
	bd_dump,		/* dump */
	bd_read, 		/* read */
	bd_write, 		/* write */
	bd_ioctl, 		/* ioctl */
	nodev, 			/* devmap */
	nodev, 			/* mmap */
	nodev, 			/* segmap */
	nochpoll, 		/* poll */
	bd_prop_op, 		/* cb_prop_op */
	0, 			/* streamtab  */
	bd_read,		/* read */
	bd_write,		/* write */
	bd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	bd_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_64BIT | D_MP,		/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	bd_aread,		/* async read */


@@ 202,15 287,15 @@ static struct cb_ops bd_cb_ops = {
};

struct dev_ops bd_dev_ops = {
	DEVO_REV, 		/* devo_rev, */
	0, 			/* refcnt  */
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	bd_getinfo,		/* getinfo */
	nulldev, 		/* identify */
	nulldev, 		/* probe */
	bd_attach, 		/* attach */
	nulldev,		/* identify */
	nulldev,		/* probe */
	bd_attach,		/* attach */
	bd_detach,		/* detach */
	nodev, 			/* reset */
	&bd_cb_ops, 		/* driver operations */
	nodev,			/* reset */
	&bd_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */


@@ 350,6 435,7 @@ bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
	bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error",
	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	mutex_init(&bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
	if (bd->d_errstats == NULL) {
		/*
		 * Even if we cannot create the kstat, we create a


@@ 359,17 445,8 @@ bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
		 */
		bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats),
		    KM_SLEEP);
		bd->d_errmutex = kmem_zalloc(sizeof (kmutex_t), KM_SLEEP);
		mutex_init(bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
	} else {
		if (bd->d_errstats->ks_lock == NULL) {
			bd->d_errstats->ks_lock = kmem_zalloc(sizeof (kmutex_t),
			    KM_SLEEP);
			mutex_init(bd->d_errstats->ks_lock, NULL, MUTEX_DRIVER,
			    NULL);
		}

		bd->d_errmutex = bd->d_errstats->ks_lock;
		bd->d_errstats->ks_lock = &bd->d_errmutex;
		bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data;
	}



@@ 436,7 513,7 @@ bd_init_errstats(bd_t *bd, bd_drive_t *drive)
{
	struct bd_errstats	*est = bd->d_kerr;

	mutex_enter(bd->d_errmutex);
	mutex_enter(&bd->d_errmutex);

	if (drive->d_model_len > 0 &&
	    KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) {


@@ 454,7 531,23 @@ bd_init_errstats(bd_t *bd, bd_drive_t *drive)
	bd_errstats_setstr(&est->bd_serial, drive->d_serial,
	    drive->d_serial_len, "0               ");

	mutex_exit(bd->d_errmutex);
	mutex_exit(&bd->d_errmutex);
}

static void
bd_queues_free(bd_t *bd)
{
	uint32_t i;

	for (i = 0; i < bd->d_qcount; i++) {
		bd_queue_t *bq = &bd->d_queues[i];

		mutex_destroy(&bq->q_iomutex);
		list_destroy(&bq->q_waitq);
		list_destroy(&bq->q_runq);
	}

	kmem_free(bd->d_queues, sizeof (*bd->d_queues) * bd->d_qcount);
}

static int


@@ 464,6 557,7 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
	bd_handle_t	hdl;
	bd_t		*bd;
	bd_drive_t	drive;
	uint32_t	i;
	int		rv;
	char		name[16];
	char		kcache[32];


@@ 537,23 631,18 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
	hdl->h_bd = bd;
	ddi_set_driver_private(dip, bd);

	mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_ksmutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);

	list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
	    offsetof(struct bd_xfer_impl, i_linkage));
	list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
	    offsetof(struct bd_xfer_impl, i_linkage));

	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);

	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (bd->d_ksp != NULL) {
		bd->d_ksp->ks_lock = &bd->d_iomutex;
		bd->d_ksp->ks_lock = &bd->d_ksmutex;
		kstat_install(bd->d_ksp);
		bd->d_kiop = bd->d_ksp->ks_data;
	} else {


@@ 571,8 660,12 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
	bd->d_state = DKIO_NONE;

	bzero(&drive, sizeof (drive));
	/*
	 * Default to one queue; the parent driver can override.
	 */
	drive.d_qcount = 1;
	bd->d_ops.o_drive_info(bd->d_private, &drive);
	bd->d_qsize = drive.d_qsize;
	bd->d_qcount = drive.d_qcount;
	bd->d_removable = drive.d_removable;
	bd->d_hotpluggable = drive.d_hotpluggable;



@@ 585,6 678,21 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
	bd_init_errstats(bd, &drive);
	bd_update_state(bd);

	bd->d_queues = kmem_alloc(sizeof (*bd->d_queues) * bd->d_qcount,
	    KM_SLEEP);
	for (i = 0; i < bd->d_qcount; i++) {
		bd_queue_t *bq = &bd->d_queues[i];

		bq->q_qsize = drive.d_qsize;
		bq->q_qactive = 0;
		mutex_init(&bq->q_iomutex, NULL, MUTEX_DRIVER, NULL);

		list_create(&bq->q_waitq, sizeof (bd_xfer_impl_t),
		    offsetof(struct bd_xfer_impl, i_linkage));
		list_create(&bq->q_runq, sizeof (bd_xfer_impl_t),
		    offsetof(struct bd_xfer_impl, i_linkage));
	}

	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
	    bd->d_removable, bd->d_hotpluggable,
	    /*LINTED: E_BAD_PTR_CAST_ALIGN*/


@@ 594,12 702,11 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
	if (rv != 0) {
		cmlb_free_handle(&bd->d_cmlbh);
		kmem_cache_destroy(bd->d_cache);
		mutex_destroy(&bd->d_iomutex);
		mutex_destroy(&bd->d_ksmutex);
		mutex_destroy(&bd->d_ocmutex);
		mutex_destroy(&bd->d_statemutex);
		cv_destroy(&bd->d_statecv);
		list_destroy(&bd->d_waitq);
		list_destroy(&bd->d_runq);
		bd_queues_free(bd);
		if (bd->d_ksp != NULL) {
			kstat_delete(bd->d_ksp);
			bd->d_ksp = NULL;


@@ 670,7 777,7 @@ bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
		bd->d_errstats = NULL;
	} else {
		kmem_free(bd->d_kerr, sizeof (struct bd_errstats));
		mutex_destroy(bd->d_errmutex);
		mutex_destroy(&bd->d_errmutex);
	}

	cmlb_detach(bd->d_cmlbh, 0);


@@ 678,12 785,11 @@ bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
	if (bd->d_devid)
		ddi_devid_free(bd->d_devid);
	kmem_cache_destroy(bd->d_cache);
	mutex_destroy(&bd->d_iomutex);
	mutex_destroy(&bd->d_ksmutex);
	mutex_destroy(&bd->d_ocmutex);
	mutex_destroy(&bd->d_statemutex);
	cv_destroy(&bd->d_statecv);
	list_destroy(&bd->d_waitq);
	list_destroy(&bd->d_runq);
	bd_queues_free(bd);
	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
	return (DDI_SUCCESS);
}


@@ 1195,7 1301,7 @@ bd_strategy(struct buf *bp)
	bd_xfer_impl_t	*xi;
	uint32_t	shift;
	int		(*func)(void *, bd_xfer_t *);
	diskaddr_t 	lblkno;
	diskaddr_t	lblkno;

	part = BDPART(bp->b_edev);
	inst = BDINST(bp->b_edev);


@@ 1520,19 1626,18 @@ bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)


static void
bd_sched(bd_t *bd)
bd_sched(bd_t *bd, bd_queue_t *bq)
{
	bd_xfer_impl_t	*xi;
	struct buf	*bp;
	int		rv;

	mutex_enter(&bd->d_iomutex);
	mutex_enter(&bq->q_iomutex);

	while ((bd->d_qactive < bd->d_qsize) &&
	    ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
		bd->d_qactive++;
		kstat_waitq_to_runq(bd->d_kiop);
		list_insert_tail(&bd->d_runq, xi);
	while ((bq->q_qactive < bq->q_qsize) &&
	    ((xi = list_remove_head(&bq->q_waitq)) != NULL)) {
		bq->q_qactive++;
		list_insert_tail(&bq->q_runq, xi);

		/*
		 * Submit the job to the driver.  We drop the I/O mutex


@@ 1540,7 1645,11 @@ bd_sched(bd_t *bd)
		 * completion routine calls back into us synchronously.
		 */

		mutex_exit(&bd->d_iomutex);
		mutex_exit(&bq->q_iomutex);

		mutex_enter(&bd->d_ksmutex);
		kstat_waitq_to_runq(bd->d_kiop);
		mutex_exit(&bd->d_ksmutex);

		rv = xi->i_func(bd->d_private, &xi->i_public);
		if (rv != 0) {


@@ 1549,53 1658,71 @@ bd_sched(bd_t *bd)
			biodone(bp);

			atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);

			mutex_enter(&bd->d_iomutex);
			bd->d_qactive--;
			mutex_enter(&bd->d_ksmutex);
			kstat_runq_exit(bd->d_kiop);
			list_remove(&bd->d_runq, xi);
			mutex_exit(&bd->d_ksmutex);

			mutex_enter(&bq->q_iomutex);
			bq->q_qactive--;
			list_remove(&bq->q_runq, xi);
			bd_xfer_free(xi);
		} else {
			mutex_enter(&bd->d_iomutex);
			mutex_enter(&bq->q_iomutex);
		}
	}

	mutex_exit(&bd->d_iomutex);
	mutex_exit(&bq->q_iomutex);
}

static void
bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
{
	mutex_enter(&bd->d_iomutex);
	list_insert_tail(&bd->d_waitq, xi);
	uint64_t	nv = atomic_inc_64_nv(&bd->d_io_counter);
	unsigned	q = nv % bd->d_qcount;
	bd_queue_t	*bq = &bd->d_queues[q];

	xi->i_bq = bq;
	xi->i_qnum = q;

	mutex_enter(&bq->q_iomutex);
	list_insert_tail(&bq->q_waitq, xi);
	mutex_exit(&bq->q_iomutex);

	mutex_enter(&bd->d_ksmutex);
	kstat_waitq_enter(bd->d_kiop);
	mutex_exit(&bd->d_iomutex);
	mutex_exit(&bd->d_ksmutex);

	bd_sched(bd);
	bd_sched(bd, bq);
}

static void
bd_runq_exit(bd_xfer_impl_t *xi, int err)
{
	bd_t	*bd = xi->i_bd;
	buf_t	*bp = xi->i_bp;
	bd_t		*bd = xi->i_bd;
	buf_t		*bp = xi->i_bp;
	bd_queue_t	*bq = xi->i_bq;

	mutex_enter(&bq->q_iomutex);
	bq->q_qactive--;
	list_remove(&bq->q_runq, xi);
	mutex_exit(&bq->q_iomutex);

	mutex_enter(&bd->d_iomutex);
	bd->d_qactive--;
	mutex_enter(&bd->d_ksmutex);
	kstat_runq_exit(bd->d_kiop);
	list_remove(&bd->d_runq, xi);
	mutex_exit(&bd->d_iomutex);
	mutex_exit(&bd->d_ksmutex);

	if (err == 0) {
		if (bp->b_flags & B_READ) {
			bd->d_kiop->reads++;
			bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
			atomic_inc_uint(&bd->d_kiop->reads);
			atomic_add_64((uint64_t *)&bd->d_kiop->nread,
			    bp->b_bcount - xi->i_resid);
		} else {
			bd->d_kiop->writes++;
			bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
			atomic_inc_uint(&bd->d_kiop->writes);
			atomic_add_64((uint64_t *)&bd->d_kiop->nwritten,
			    bp->b_bcount - xi->i_resid);
		}
	}
	bd_sched(bd);
	bd_sched(bd, bq);
}

static void


@@ 1797,6 1924,19 @@ bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
{
	bd_handle_t	hdl;

	/*
	 * There is full compatibility between the version 0 API and the
	 * current version.
	 */
	switch (ops->o_version) {
	case BD_OPS_VERSION_0:
	case BD_OPS_CURRENT_VERSION:
		break;

	default:
		return (NULL);
	}

	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
	if (hdl != NULL) {
		hdl->h_ops = *ops;

M usr/src/uts/common/io/nvme/nvme.c => usr/src/uts/common/io/nvme/nvme.c +102 -33
@@ 559,7 559,7 @@ static struct modlinkage nvme_modlinkage = {
};

static bd_ops_t nvme_bd_ops = {
	.o_version	= BD_OPS_VERSION_0,
	.o_version	= BD_OPS_CURRENT_VERSION,
	.o_drive_info	= nvme_bd_driveinfo,
	.o_media_info	= nvme_bd_mediainfo,
	.o_devid_init	= nvme_bd_devid,


@@ 848,6 848,9 @@ nvme_free_cq(nvme_cq_t *cq)
{
	mutex_destroy(&cq->ncq_mutex);

	if (cq->ncq_cmd_taskq != NULL)
		taskq_destroy(cq->ncq_cmd_taskq);

	if (cq->ncq_dma != NULL)
		nvme_free_dma(cq->ncq_dma);



@@ 893,9 896,11 @@ nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
}

static int
nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx)
nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx,
    uint_t nthr)
{
	nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);
	char name[64];		/* large enough for the taskq name */

	mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(nvme->n_intr_pri));


@@ 909,6 914,21 @@ nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx)
	cq->ncq_id = idx;
	cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);

	/*
	 * Each completion queue has its own command taskq.
	 */
	(void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u",
	    ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx);
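
	/*
	 * For instance 1 this yields names such as "nvme1_cmd_taskq0"
	 * (the admin completion queue); illustrative annotation, derived
	 * from the format string above.
	 */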

	cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX,
	    TASKQ_PREPOPULATE);

	if (cq->ncq_cmd_taskq == NULL) {
		dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd "
		    "taskq for cq %u", idx);
		goto fail;
	}

	*cqp = cq;
	return (DDI_SUCCESS);



@@ 926,7 946,7 @@ fail:
 * max number of entries to UINT16_MAX + 1.
 */
static int
nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry)
nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr)
{
	nvme_cq_t **cq;
	uint_t i, cq_count;


@@ 943,7 963,7 @@ nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry)
		nvme->n_cq[i] = cq[i];

	for (; i < nvme->n_cq_count; i++)
		if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i) !=
		if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) !=
		    DDI_SUCCESS)
			goto fail;



@@ 1179,8 1199,8 @@ nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq)
	mutex_enter(&cq->ncq_mutex);

	while ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
		taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
		    cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
		taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd,
		    TQ_NOSLEEP, &cmd->nc_tqent);

		completed++;
	}


@@ 2515,6 2535,7 @@ nvme_init_ns(nvme_t *nvme, int nsid)
{
	nvme_namespace_t *ns = &nvme->n_ns[nsid - 1];
	nvme_identify_nsid_t *idns;
	boolean_t was_ignored;
	int last_rp;

	ns->ns_nvme = nvme;


@@ 2573,6 2594,8 @@ nvme_init_ns(nvme_t *nvme, int nsid)
	if (ns->ns_best_block_size < nvme->n_min_block_size)
		ns->ns_best_block_size = nvme->n_min_block_size;

	was_ignored = ns->ns_ignore;

	/*
	 * We currently don't support namespaces that use either:
	 * - protection information


@@ 2592,6 2615,25 @@ nvme_init_ns(nvme_t *nvme, int nsid)
		ns->ns_ignore = B_FALSE;
	}

	/*
	 * Keep a count of namespaces which are attachable.
	 * See comments in nvme_bd_driveinfo() to understand its effect.
	 */
	if (was_ignored) {
		/*
		 * Previously ignored, but now not. Count it.
		 */
		if (!ns->ns_ignore)
			nvme->n_namespaces_attachable++;
	} else {
		/*
		 * Wasn't ignored previously, but now needs to be.
		 * Discount it.
		 */
		if (ns->ns_ignore)
			nvme->n_namespaces_attachable--;
	}

	return (DDI_SUCCESS);
}



@@ 2607,6 2649,7 @@ nvme_init(nvme_t *nvme)
	nvme_reg_csts_t csts;
	int i = 0;
	uint16_t nqueues;
	uint_t tq_threads;
	char model[sizeof (nvme->n_idctl->id_model) + 1];
	char *vendor, *product;



@@ 2676,9 2719,9 @@ nvme_init(nvme_t *nvme)

	/*
	 * Create the cq array with one completion queue to be assigned
	 * to the admin queue pair.
	 * to the admin queue pair and a limited number of taskq threads (4).
	 */
	if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len) !=
	if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) !=
	    DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to pre-allocate admin completion queue");


@@ 2932,6 2975,7 @@ nvme_init(nvme_t *nvme)
	for (i = 0; i != nvme->n_namespace_count; i++) {
		mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER,
		    NULL);
		nvme->n_ns[i].ns_ignore = B_TRUE;
		if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS)
			goto fail;
	}


@@ 3004,8 3048,21 @@ nvme_init(nvme_t *nvme)
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
	    nvme->n_io_cqueue_len);

	/*
	 * Assign an equal quantity of taskq threads to each completion
	 * queue, capping the total number of threads at the number
	 * of CPUs.
	 */
	tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues;

	/*
	 * In case the calculation above is zero, we need at least one
	 * thread per completion queue.
	 */
	tq_threads = MAX(1, tq_threads);
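
	/*
	 * Worked example (illustrative): with 32 CPUs and 8 completion
	 * queues each queue's taskq gets 4 threads; with 4 CPUs and 8
	 * completion queues the division yields 0 and the MAX() above
	 * raises it to 1.
	 */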

	if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
	    nvme->n_io_cqueue_len) != DDI_SUCCESS) {
	    nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to pre-allocate completion queues");
		goto fail;


@@ 3424,18 3481,6 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
	nvme->n_progress |= NVME_REGS_MAPPED;

	/*
	 * Create taskq for command completion.
	 */
	(void) snprintf(name, sizeof (name), "%s%d_cmd_taskq",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	nvme->n_cmd_taskq = ddi_taskq_create(dip, name, MIN(UINT16_MAX, ncpus),
	    TASKQ_DEFAULTPRI, 0);
	if (nvme->n_cmd_taskq == NULL) {
		dev_err(dip, CE_WARN, "!failed to create cmd taskq");
		goto fail;
	}

	/*
	 * Create PRP DMA cache
	 */
	(void) snprintf(name, sizeof (name), "%s%d_prp_cache",


@@ 3560,8 3605,10 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
	if (nvme->n_progress & NVME_INTERRUPTS)
		nvme_release_interrupts(nvme);

	if (nvme->n_cmd_taskq)
		ddi_taskq_wait(nvme->n_cmd_taskq);
	for (i = 0; i < nvme->n_cq_count; i++) {
		if (nvme->n_cq[i]->ncq_cmd_taskq != NULL)
			taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq);
	}

	if (nvme->n_ioq_count > 0) {
		for (i = 1; i != nvme->n_ioq_count + 1; i++) {


@@ 3584,9 3631,6 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
		(void) nvme_reset(nvme, B_FALSE);
	}

	if (nvme->n_cmd_taskq)
		ddi_taskq_destroy(nvme->n_cmd_taskq);

	if (nvme->n_progress & NVME_CTRL_LIMITS)
		sema_destroy(&nvme->n_abort_sema);



@@ 3772,15 3816,40 @@ nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
{
	nvme_namespace_t *ns = arg;
	nvme_t *nvme = ns->ns_nvme;
	uint_t ns_count = MAX(1, nvme->n_namespaces_attachable);

	/*
	 * Set the blkdev qcount to the number of submission queues.
	 * It will then create one waitq/runq pair for each submission
	 * queue and spread I/O requests across the queues.
	 */
	drive->d_qcount = nvme->n_ioq_count;

	/*
	 * I/O activity to individual namespaces is distributed across
	 * each of the d_qcount blkdev queues (which has been set to
	 * the number of nvme submission queues). d_qsize is the number
	 * of submitted and not completed I/Os within each queue that blkdev
	 * will allow before it starts holding them in the waitq.
	 *
	 * Each namespace will create a child blkdev instance; for each one
	 * we try to set the d_qsize so that each namespace gets an
	 * equal portion of the submission queue.
	 *
	 * If, after instantiation of the nvme drive, n_namespaces_attachable
	 * changes and a namespace is attached, a different d_qsize could be
	 * calculated. It may even be that the sum of the d_qsizes is
	 * now beyond the submission queue size. Should that be the case
	 * and the I/O rate is such that blkdev attempts to submit more
	 * I/Os than the size of the submission queue, the excess I/Os
	 * will be held behind the semaphore nq_sema.
	 */
	drive->d_qsize = nvme->n_io_squeue_len / ns_count;

	/*
	 * blkdev maintains one queue size per instance (namespace),
	 * but all namespaces share the I/O queues.
	 * TODO: need to figure out a sane default, or use per-NS I/O queues,
	 * or change blkdev to handle EAGAIN
	 * Don't let the queue size drop below the minimum, though.
	 */
	drive->d_qsize = nvme->n_ioq_count * nvme->n_io_squeue_len
	    / nvme->n_namespace_count;
	drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN);
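
	/*
	 * Worked example (illustrative): with n_io_squeue_len == 1024 and
	 * 4 attachable namespaces each namespace is given d_qsize == 256;
	 * the MAX() above keeps the result from dropping below
	 * NVME_MIN_IO_QUEUE_LEN.
	 */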

	/*
	 * d_maxxfer is not set, which means the value is taken from the DMA


@@ 3843,7 3912,7 @@ nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
	if (cmd == NULL)
		return (ENOMEM);

	cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
	cmd->nc_sqid = xfer->x_qnum + 1;
	ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
	ioq = nvme->n_ioq[cmd->nc_sqid];


M usr/src/uts/common/io/nvme/nvme_var.h => usr/src/uts/common/io/nvme/nvme_var.h +4 -3
@@ 106,6 106,8 @@ struct nvme_cq {
	uintptr_t ncq_hdbl;
	int ncq_phase;

	taskq_t *ncq_cmd_taskq;

	kmutex_t ncq_mutex;
};



@@ 183,7 185,8 @@ struct nvme {
	int n_pagesize;

	int n_namespace_count;
	uint_t n_ioq_count; /* number of I/O command queues */
	uint_t n_namespaces_attachable;
	uint_t n_ioq_count;
	uint_t n_cq_count;

	nvme_identify_ctrl_t *n_idctl;


@@ 208,8 211,6 @@ struct nvme {

	ksema_t n_abort_sema;

	ddi_taskq_t *n_cmd_taskq;

	/* state for devctl minor node */
	nvme_minor_state_t n_minor;


M usr/src/uts/common/io/sdcard/impl/sda_slot.c => usr/src/uts/common/io/sdcard/impl/sda_slot.c +2 -1
@@ 21,6 21,7 @@
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2019 Western Digital Corporation.
 */

/*


@@ 477,7 478,7 @@ sda_slot_fini(sda_slot_t *slot)
}

static bd_ops_t sda_bd_ops = {
	BD_OPS_VERSION_0,
	BD_OPS_CURRENT_VERSION,
	sda_mem_bd_driveinfo,
	sda_mem_bd_mediainfo,
	NULL,			/* devid_init */

M usr/src/uts/common/io/skd/skd.c => usr/src/uts/common/io/skd/skd.c +2 -1
@@ 24,6 24,7 @@
 * Copyright 2013 STEC, Inc.  All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright 2019 Western Digital Corporation.
 */

#include	<sys/types.h>


@@ 129,7 130,7 @@ static int  skd_devid_init(void *arg, dev_info_t *, ddi_devid_t *);


static bd_ops_t skd_bd_ops = {
	BD_OPS_VERSION_0,
	BD_OPS_CURRENT_VERSION,
	skd_bd_driveinfo,
	skd_bd_mediainfo,
	skd_devid_init,

M usr/src/uts/common/io/vioblk/vioblk.c => usr/src/uts/common/io/vioblk/vioblk.c +2 -1
@@ 23,6 23,7 @@
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
 * Copyright 2019 Joyent Inc.
 * Copyright 2019 Western Digital Corporation.
 */

/*


@@ 925,7 926,7 @@ vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
	 * stack based on negotiated features.
	 */
	bd_ops_t vioblk_bd_ops = {
		.o_version =		BD_OPS_VERSION_0,
		.o_version =		BD_OPS_CURRENT_VERSION,
		.o_drive_info =		vioblk_bd_driveinfo,
		.o_media_info =		vioblk_bd_mediainfo,
		.o_devid_init =		vioblk_bd_devid,

M usr/src/uts/common/os/fio.c => usr/src/uts/common/os/fio.c +3 -3
@@ 1507,8 1507,8 @@ int
fgetstartvp(int fd, char *path, vnode_t **startvpp)
{
	vnode_t		*startvp;
	file_t 		*startfp;
	char 		startchar;
	file_t		*startfp;
	char		startchar;

	if (fd == AT_FDCWD && path == NULL)
		return (EFAULT);


@@ 1554,7 1554,7 @@ fsetattrat(int fd, char *path, int flags, struct vattr *vap)
{
	vnode_t		*startvp;
	vnode_t		*vp;
	int 		error;
	int		error;

	/*
	 * Since we are never called to set the size of a file, we don't

M usr/src/uts/common/os/kmem.c => usr/src/uts/common/os/kmem.c +2 -0
@@ 1061,6 1061,8 @@ static vmem_t		*kmem_default_arena;
static vmem_t		*kmem_firewall_va_arena;
static vmem_t		*kmem_firewall_arena;

static int		kmem_zerosized;		/* # of zero-sized allocs */

/*
 * kmem slab consolidator thresholds (tunables)
 */

M usr/src/uts/common/sys/blkdev.h => usr/src/uts/common/sys/blkdev.h +21 -9
@@ 22,6 22,7 @@
 * Copyright 2012 DEY Storage Systems, Inc.  All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Western Digital Corporation.
 */

#ifndef	_SYS_BLKDEV_H


@@ 93,6 94,7 @@ struct bd_xfer {
	unsigned		x_ndmac;
	caddr_t			x_kaddr;
	unsigned		x_flags;
	unsigned		x_qnum;
};

#define	BD_XFER_POLL		(1U << 0)	/* no interrupts (dump) */


@@ 115,6 117,8 @@ struct bd_drive {
	size_t			d_revision_len;
	char			*d_revision;
	uint8_t			d_eui64[8];
	/* Added at the end to maintain binary compatibility */
	uint32_t		d_qcount;
};

struct bd_media {


@@ 142,18 146,26 @@ struct bd_media {
#define	BD_INFO_FLAG_HOTPLUGGABLE	(1U << 1)
#define	BD_INFO_FLAG_READ_ONLY		(1U << 2)

/*
 * If the API changes and we want to bump the version, add another
 * enum value, e.g. BD_OPS_VERSION_1. BD_OPS_CURRENT_VERSION should always
 * be last.
 */
typedef enum {
	BD_OPS_VERSION_0 = 0,
	BD_OPS_CURRENT_VERSION
} bd_version_t;
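
/*
 * Illustrative sketch of such a future bump:
 *
 *	BD_OPS_VERSION_0 = 0,
 *	BD_OPS_VERSION_1,
 *	BD_OPS_CURRENT_VERSION
 */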

struct bd_ops {
	int	o_version;
	void	(*o_drive_info)(void *, bd_drive_t *);
	int	(*o_media_info)(void *, bd_media_t *);
	int	(*o_devid_init)(void *, dev_info_t *, ddi_devid_t *);
	int	(*o_sync_cache)(void *, bd_xfer_t *);
	int	(*o_read)(void *, bd_xfer_t *);
	int	(*o_write)(void *, bd_xfer_t *);
	bd_version_t	o_version;
	void		(*o_drive_info)(void *, bd_drive_t *);
	int		(*o_media_info)(void *, bd_media_t *);
	int		(*o_devid_init)(void *, dev_info_t *, ddi_devid_t *);
	int		(*o_sync_cache)(void *, bd_xfer_t *);
	int		(*o_read)(void *, bd_xfer_t *);
	int		(*o_write)(void *, bd_xfer_t *);
};

#define	BD_OPS_VERSION_0		0

struct bd_errstats {
	/* these are managed by blkdev itself */
	kstat_named_t	bd_softerrs;