xref: /illumos-gate/usr/src/uts/common/io/blkdev/blkdev.c (revision 6f0e4dc9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  * Copyright 2017 The MathWorks, Inc.  All rights reserved.
27  * Copyright 2019 Western Digital Corporation.
28  * Copyright 2020 Joyent, Inc.
29  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
30  */
31 
32 #include <sys/types.h>
33 #include <sys/ksynch.h>
34 #include <sys/kmem.h>
35 #include <sys/file.h>
36 #include <sys/errno.h>
37 #include <sys/open.h>
38 #include <sys/buf.h>
39 #include <sys/uio.h>
40 #include <sys/aio_req.h>
41 #include <sys/cred.h>
42 #include <sys/modctl.h>
43 #include <sys/cmlb.h>
44 #include <sys/conf.h>
45 #include <sys/devops.h>
46 #include <sys/list.h>
47 #include <sys/sysmacros.h>
48 #include <sys/dkio.h>
49 #include <sys/dkioc_free_util.h>
50 #include <sys/vtoc.h>
51 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
52 #include <sys/kstat.h>
53 #include <sys/fs/dv_node.h>
54 #include <sys/ddi.h>
55 #include <sys/sunddi.h>
56 #include <sys/note.h>
57 #include <sys/blkdev.h>
58 #include <sys/scsi/impl/inquiry.h>
59 #include <sys/taskq.h>
60 #include <sys/taskq_impl.h>
61 #include <sys/disp.h>
62 #include <sys/sysevent/eventdefs.h>
63 #include <sys/sysevent/dev.h>
64 
65 /*
66  * blkdev is a driver which provides a lot of the common functionality
67  * a block device driver may need and helps by removing code which
68  * is frequently duplicated in block device drivers.
69  *
70  * Within this driver all the struct cb_ops functions required for a
71  * block device driver are written with appropriate call back functions
72  * to be provided by the parent driver.
73  *
74  * To use blkdev, a driver needs to:
75  *	1. Create a bd_ops_t structure which has the call back operations
76  *	   blkdev will use.
77  *	2. Create a handle by calling bd_alloc_handle(). One of the
78  *	   arguments to this function is the bd_ops_t.
79  *	3. Call bd_attach_handle(). This will instantiate a blkdev device
80  *	   as a child device node of the calling driver.
81  *
82  * A parent driver is not restricted to just allocating and attaching a
83  * single instance, it may attach as many as it wishes. For each handle
84  * attached, appropriate entries in /dev/[r]dsk are created.
85  *
 * The bd_ops_t routines that a parent of blkdev needs to provide are:
87  *
88  * o_drive_info: Provide information to blkdev such as how many I/O queues
89  *		 to create and the size of those queues. Also some device
90  *		 specifics such as EUI, vendor, product, model, serial
91  *		 number ....
92  *
93  * o_media_info: Provide information about the media. Eg size and block size.
94  *
95  * o_devid_init: Creates and initializes the device id. Typically calls
96  *		 ddi_devid_init().
97  *
98  * o_sync_cache: Issues a device appropriate command to flush any write
99  *		 caches.
100  *
101  * o_read:	 Read data as described by bd_xfer_t argument.
102  *
103  * o_write:	 Write data as described by bd_xfer_t argument.
104  *
105  * o_free_space: Free the space described by bd_xfer_t argument (optional).
106  *
107  * Queues
108  * ------
109  * Part of the drive_info data is a queue count. blkdev will create
110  * "queue count" number of waitq/runq pairs. Each waitq/runq pair
111  * operates independently. As an I/O is scheduled up to the parent
112  * driver via o_read or o_write its queue number is given. If the
113  * parent driver supports multiple hardware queues it can then select
114  * where to submit the I/O request.
115  *
116  * Currently blkdev uses a simplistic round-robin queue selection method.
117  * It has the advantage that it is lockless. In the future it will be
118  * worthwhile reviewing this strategy for something which prioritizes queues
119  * depending on how busy they are.
120  *
121  * Each waitq/runq pair is protected by its mutex (q_iomutex). Incoming
122  * I/O requests are initially added to the waitq. They are taken off the
123  * waitq, added to the runq and submitted, providing the runq is less
124  * than the qsize as specified in the drive_info. As an I/O request
125  * completes, the parent driver is required to call bd_xfer_done(), which
126  * will remove the I/O request from the runq and pass I/O completion
127  * status up the stack.
128  *
129  * Locks
130  * -----
 * There are 5 instance global locks: d_ocmutex, d_ksmutex, d_errmutex,
 * d_statemutex and d_dle_mutex, as well as a q_iomutex per waitq/runq pair.
133  *
134  * Lock Hierarchy
135  * --------------
136  * The only two locks which may be held simultaneously are q_iomutex and
137  * d_ksmutex. In all cases q_iomutex must be acquired before d_ksmutex.
138  */
139 
/*
 * Each instance owns BD_MAXPART consecutive minor numbers.  BDINST() yields
 * the instance number encoded in a dev_t and BDPART() the index of the
 * minor within that instance.
 */
#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)
143 
/* Forward typedefs for the structures defined below. */
typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
typedef struct bd_queue bd_queue_t;

/*
 * Bit flags kept in bd_t.d_dle_state.
 */
typedef enum {
	BD_DLE_PENDING	= 1 << 0,
	BD_DLE_RUNNING	= 1 << 1
} bd_dle_state_t;
152 
/*
 * Per-instance soft state.  bd_attach() zero-allocates one of these for
 * each instance and bd_detach() releases it again.
 */
struct bd {
	void		*d_private;	/* parent's cookie, from h_private */
	dev_info_t	*d_dip;		/* our own devinfo node */
	kmutex_t	d_ocmutex;	/* open/close */
	kmutex_t	d_ksmutex;	/* kstat */
	kmutex_t	d_errmutex;	/* error kstat (d_errstats/d_kerr) */
	kmutex_t	d_statemutex;
	kcondvar_t	d_statecv;
	enum dkio_state	d_state;
	cmlb_handle_t	d_cmlbh;
	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
	uint64_t	d_open_excl;	/* bit mask indexed by partition */
	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */
	uint64_t	d_io_counter;

	uint32_t	d_qcount;	/* number of entries in d_queues */
	uint32_t	d_qactive;
	uint32_t	d_maxxfer;	/* largest single transfer, in bytes */
	uint32_t	d_blkshift;	/* log2 of the block size */
	uint32_t	d_pblkshift;
	uint64_t	d_numblks;
	ddi_devid_t	d_devid;	/* from o_devid_init(), if provided */

	/* FREE/UNMAP/TRIM limits, copied from the parent's drive info */
	uint64_t	d_max_free_seg;
	uint64_t	d_max_free_blks;
	uint64_t	d_max_free_seg_blks;
	uint64_t	d_free_align;

	kmem_cache_t	*d_cache;	/* cache of bd_xfer_impl_t */
	bd_queue_t	*d_queues;	/* array of d_qcount queues */
	kstat_t		*d_ksp;		/* I/O kstat, NULL if creation failed */
	kstat_io_t	*d_kiop;	/* d_ksp's data, or a scratch copy */
	kstat_t		*d_errstats;	/* error kstat, NULL if creation failed */
	struct bd_errstats *d_kerr;	/* d_errstats' data, or a scratch copy */

	boolean_t	d_rdonly;
	boolean_t	d_ssd;
	boolean_t	d_removable;
	boolean_t	d_hotpluggable;
	boolean_t	d_use_dma;	/* B_TRUE if the handle supplied h_dma */

	ddi_dma_attr_t	d_dma;		/* copy of the parent's DMA attributes */
	bd_ops_t	d_ops;		/* copy of the parent's callbacks */
	bd_handle_t	d_handle;	/* handle this instance was built from */

	kmutex_t	d_dle_mutex;
	taskq_ent_t	d_dle_ent;
	bd_dle_state_t	d_dle_state;
};
202 
/*
 * Handle tying a parent driver to a blkdev instance.  bd_attach() finds it
 * in the node's parent data, copies h_ops, h_dma and h_private into the
 * bd_t, and records the bd_t back in h_bd.
 */
struct bd_handle {
	bd_ops_t	h_ops;
	ddi_dma_attr_t	*h_dma;		/* NULL means DMA is not used */
	dev_info_t	*h_parent;
	dev_info_t	*h_child;
	void		*h_private;
	bd_t		*h_bd;
	char		*h_name;
	char		h_addr[30];	/* enough for w%0.16x,%X */
};
213 
/*
 * Internal transfer descriptor wrapping the public bd_xfer_t that is handed
 * to the parent driver.  Instances come from the per-device kmem cache
 * (bd_t.d_cache) and are initialised by bd_xfer_ctor().
 */
struct bd_xfer_impl {
	bd_xfer_t	i_public;	/* see the i_* accessor macros */
	list_node_t	i_linkage;	/* link on a queue's waitq or runq */
	bd_t		*i_bd;		/* owning instance, set by the ctor */
	buf_t		*i_bp;		/* buf this transfer services */
	bd_queue_t	*i_bq;
	uint_t		i_num_win;	/* number of windows in the transfer */
	uint_t		i_cur_win;	/* index of the current window */
	off_t		i_offset;
	int		(*i_func)(void *, bd_xfer_t *);	/* parent callback */
	uint32_t	i_blkshift;	/* d_blkshift at allocation time */
	size_t		i_len;		/* bytes in the current window */
	size_t		i_resid;	/* starts out as bp->b_bcount */
};
228 
/*
 * One waitq/runq pair, protected by its q_iomutex.  bd_attach() creates
 * d_qcount of these per instance.
 */
struct bd_queue {
	kmutex_t	q_iomutex;
	uint32_t	q_qsize;	/* from the parent's d_qsize */
	uint32_t	q_qactive;
	list_t		q_runq;		/* list of bd_xfer_impl_t */
	list_t		q_waitq;	/* list of bd_xfer_impl_t */
};
236 
/*
 * Shorthand for the fields of the public bd_xfer_t embedded in a
 * bd_xfer_impl_t, so that they read like ordinary i_* members.
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
#define	i_qnum		i_public.x_qnum
#define	i_dfl		i_public.x_dfl

/* B_TRUE when the parent driver supplied an o_free_space callback. */
#define	CAN_FREESPACE(bd) \
	(((bd)->d_ops.o_free_space == NULL) ? B_FALSE : B_TRUE)
249 
/*
 * Private prototypes.
 */

/* Inquiry property and error kstat helpers. */
static void bd_prop_update_inqstring(dev_info_t *, char *, char *, size_t);
static void bd_create_inquiry_props(dev_info_t *, bd_drive_t *);
static void bd_create_errstats(bd_t *, int, bd_drive_t *);
static void bd_destroy_errstats(bd_t *);
static void bd_errstats_setstr(kstat_named_t *, char *, size_t, char *);
static void bd_init_errstats(bd_t *, bd_drive_t *);
static void bd_fini_errstats(bd_t *);

/* dev_ops entry points, referenced from bd_dev_ops. */
static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
static int bd_detach(dev_info_t *, ddi_detach_cmd_t);

/* cb_ops entry points, referenced from bd_cb_ops. */
static int bd_open(dev_t *, int, int, cred_t *);
static int bd_close(dev_t, int, int, cred_t *);
static int bd_strategy(struct buf *);
static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int bd_dump(dev_t, caddr_t, daddr_t, int);
static int bd_read(dev_t, struct uio *, cred_t *);
static int bd_write(dev_t, struct uio *, cred_t *);
static int bd_aread(dev_t, struct aio_req *, cred_t *);
static int bd_awrite(dev_t, struct aio_req *, cred_t *);
static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
    caddr_t, int *);

/* cmlb target ops (see bd_tg_ops), cache callbacks and internal helpers. */
static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
    void *);
static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
static int bd_xfer_ctor(void *, void *, int);
static void bd_xfer_dtor(void *, void *);
static void bd_sched(bd_t *, bd_queue_t *);
static void bd_submit(bd_t *, bd_xfer_impl_t *);
static void bd_runq_exit(bd_xfer_impl_t *, int);
static void bd_update_state(bd_t *);
static int bd_check_state(bd_t *, enum dkio_state *);
static int bd_flush_write_cache(bd_t *, struct dk_callback *);
static int bd_check_uio(dev_t, struct uio *);
static int bd_free_space(dev_t, bd_t *, dkioc_free_list_t *);
291 
/* Target ops vector handed to cmlb_attach() in bd_attach(). */
struct cmlb_tg_ops bd_tg_ops = {
	TG_DK_OPS_VERSION_1,
	bd_tg_rdwr,
	bd_tg_getinfo,
};
297 
/* Character/block entry points, referenced from bd_dev_ops. */
static struct cb_ops bd_cb_ops = {
	bd_open,		/* open */
	bd_close,		/* close */
	bd_strategy,		/* strategy */
	nodev,			/* print */
	bd_dump,		/* dump */
	bd_read,		/* read */
	bd_write,		/* write */
	bd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	bd_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_64BIT | D_MP,		/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	bd_aread,		/* async read */
	bd_awrite		/* async write */
};
318 
/* Device operations vector, registered through modldrv below. */
struct dev_ops bd_dev_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	bd_getinfo,		/* getinfo */
	nulldev,		/* identify */
	nulldev,		/* probe */
	bd_attach,		/* attach */
	bd_detach,		/* detach */
	nodev,			/* reset */
	&bd_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};
333 
/* Driver linkage: module type, description string and dev_ops. */
static struct modldrv modldrv = {
	&mod_driverops,
	"Generic Block Device",
	&bd_dev_ops,
};

/* Module linkage passed to mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, { &modldrv, NULL }
};
343 
/* Soft-state anchor; holds one struct bd per instance. */
static void *bd_state;
/* Module-wide rwlock, initialised in _init() and destroyed in _fini(). */
static krwlock_t bd_lock;
/* Module-wide taskq, created with a single thread in _init(). */
static taskq_t *bd_taskq;
347 
348 int
349 _init(void)
350 {
351 	char taskq_name[TASKQ_NAMELEN];
352 	const char *name;
353 	int rv;
354 
355 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
356 	if (rv != DDI_SUCCESS)
357 		return (rv);
358 
359 	name = mod_modname(&modlinkage);
360 	(void) snprintf(taskq_name, sizeof (taskq_name), "%s_taskq", name);
361 	bd_taskq = taskq_create(taskq_name, 1, minclsyspri, 0, 0, 0);
362 	if (bd_taskq == NULL) {
363 		cmn_err(CE_WARN, "%s: unable to create %s", name, taskq_name);
364 		ddi_soft_state_fini(&bd_state);
365 		return (DDI_FAILURE);
366 	}
367 
368 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
369 
370 	rv = mod_install(&modlinkage);
371 	if (rv != DDI_SUCCESS) {
372 		rw_destroy(&bd_lock);
373 		taskq_destroy(bd_taskq);
374 		ddi_soft_state_fini(&bd_state);
375 	}
376 	return (rv);
377 }
378 
379 int
380 _fini(void)
381 {
382 	int	rv;
383 
384 	rv = mod_remove(&modlinkage);
385 	if (rv == DDI_SUCCESS) {
386 		rw_destroy(&bd_lock);
387 		taskq_destroy(bd_taskq);
388 		ddi_soft_state_fini(&bd_state);
389 	}
390 	return (rv);
391 }
392 
393 int
394 _info(struct modinfo *modinfop)
395 {
396 	return (mod_info(&modlinkage, modinfop));
397 }
398 
399 static int
400 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
401 {
402 	bd_t	*bd;
403 	minor_t	inst;
404 
405 	_NOTE(ARGUNUSED(dip));
406 
407 	inst = BDINST((dev_t)arg);
408 
409 	switch (cmd) {
410 	case DDI_INFO_DEVT2DEVINFO:
411 		bd = ddi_get_soft_state(bd_state, inst);
412 		if (bd == NULL) {
413 			return (DDI_FAILURE);
414 		}
415 		*resultp = (void *)bd->d_dip;
416 		break;
417 
418 	case DDI_INFO_DEVT2INSTANCE:
419 		*resultp = (void *)(intptr_t)inst;
420 		break;
421 
422 	default:
423 		return (DDI_FAILURE);
424 	}
425 	return (DDI_SUCCESS);
426 }
427 
428 static void
429 bd_prop_update_inqstring(dev_info_t *dip, char *name, char *data, size_t len)
430 {
431 	int	ilen;
432 	char	*data_string;
433 
434 	ilen = scsi_ascii_inquiry_len(data, len);
435 	ASSERT3U(ilen, <=, len);
436 	if (ilen <= 0)
437 		return;
438 	/* ensure null termination */
439 	data_string = kmem_zalloc(ilen + 1, KM_SLEEP);
440 	bcopy(data, data_string, ilen);
441 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, name, data_string);
442 	kmem_free(data_string, ilen + 1);
443 }
444 
445 static void
446 bd_create_inquiry_props(dev_info_t *dip, bd_drive_t *drive)
447 {
448 	if (drive->d_vendor_len > 0)
449 		bd_prop_update_inqstring(dip, INQUIRY_VENDOR_ID,
450 		    drive->d_vendor, drive->d_vendor_len);
451 
452 	if (drive->d_product_len > 0)
453 		bd_prop_update_inqstring(dip, INQUIRY_PRODUCT_ID,
454 		    drive->d_product, drive->d_product_len);
455 
456 	if (drive->d_serial_len > 0)
457 		bd_prop_update_inqstring(dip, INQUIRY_SERIAL_NO,
458 		    drive->d_serial, drive->d_serial_len);
459 
460 	if (drive->d_revision_len > 0)
461 		bd_prop_update_inqstring(dip, INQUIRY_REVISION_ID,
462 		    drive->d_revision, drive->d_revision_len);
463 }
464 
465 static void
466 bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
467 {
468 	char	ks_module[KSTAT_STRLEN];
469 	char	ks_name[KSTAT_STRLEN];
470 	int	ndata = sizeof (struct bd_errstats) / sizeof (kstat_named_t);
471 
472 	if (bd->d_errstats != NULL)
473 		return;
474 
475 	(void) snprintf(ks_module, sizeof (ks_module), "%serr",
476 	    ddi_driver_name(bd->d_dip));
477 	(void) snprintf(ks_name, sizeof (ks_name), "%s%d,err",
478 	    ddi_driver_name(bd->d_dip), inst);
479 
480 	bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error",
481 	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);
482 
483 	mutex_init(&bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
484 	if (bd->d_errstats == NULL) {
485 		/*
486 		 * Even if we cannot create the kstat, we create a
487 		 * scratch kstat.  The reason for this is to ensure
488 		 * that we can update the kstat all of the time,
489 		 * without adding an extra branch instruction.
490 		 */
491 		bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats),
492 		    KM_SLEEP);
493 	} else {
494 		bd->d_errstats->ks_lock = &bd->d_errmutex;
495 		bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data;
496 	}
497 
498 	kstat_named_init(&bd->d_kerr->bd_softerrs,	"Soft Errors",
499 	    KSTAT_DATA_UINT32);
500 	kstat_named_init(&bd->d_kerr->bd_harderrs,	"Hard Errors",
501 	    KSTAT_DATA_UINT32);
502 	kstat_named_init(&bd->d_kerr->bd_transerrs,	"Transport Errors",
503 	    KSTAT_DATA_UINT32);
504 
505 	if (drive->d_model_len > 0) {
506 		kstat_named_init(&bd->d_kerr->bd_model,	"Model",
507 		    KSTAT_DATA_STRING);
508 	} else {
509 		kstat_named_init(&bd->d_kerr->bd_vid,	"Vendor",
510 		    KSTAT_DATA_STRING);
511 		kstat_named_init(&bd->d_kerr->bd_pid,	"Product",
512 		    KSTAT_DATA_STRING);
513 	}
514 
515 	kstat_named_init(&bd->d_kerr->bd_revision,	"Revision",
516 	    KSTAT_DATA_STRING);
517 	kstat_named_init(&bd->d_kerr->bd_serial,	"Serial No",
518 	    KSTAT_DATA_STRING);
519 	kstat_named_init(&bd->d_kerr->bd_capacity,	"Size",
520 	    KSTAT_DATA_ULONGLONG);
521 	kstat_named_init(&bd->d_kerr->bd_rq_media_err,	"Media Error",
522 	    KSTAT_DATA_UINT32);
523 	kstat_named_init(&bd->d_kerr->bd_rq_ntrdy_err,	"Device Not Ready",
524 	    KSTAT_DATA_UINT32);
525 	kstat_named_init(&bd->d_kerr->bd_rq_nodev_err,	"No Device",
526 	    KSTAT_DATA_UINT32);
527 	kstat_named_init(&bd->d_kerr->bd_rq_recov_err,	"Recoverable",
528 	    KSTAT_DATA_UINT32);
529 	kstat_named_init(&bd->d_kerr->bd_rq_illrq_err,	"Illegal Request",
530 	    KSTAT_DATA_UINT32);
531 	kstat_named_init(&bd->d_kerr->bd_rq_pfa_err,
532 	    "Predictive Failure Analysis", KSTAT_DATA_UINT32);
533 
534 	bd->d_errstats->ks_private = bd;
535 
536 	kstat_install(bd->d_errstats);
537 	bd_init_errstats(bd, drive);
538 }
539 
540 static void
541 bd_destroy_errstats(bd_t *bd)
542 {
543 	if (bd->d_errstats != NULL) {
544 		bd_fini_errstats(bd);
545 		kstat_delete(bd->d_errstats);
546 		bd->d_errstats = NULL;
547 	} else {
548 		kmem_free(bd->d_kerr, sizeof (struct bd_errstats));
549 		bd->d_kerr = NULL;
550 		mutex_destroy(&bd->d_errmutex);
551 	}
552 }
553 
554 static void
555 bd_errstats_setstr(kstat_named_t *k, char *str, size_t len, char *alt)
556 {
557 	char	*tmp;
558 	size_t	km_len;
559 
560 	if (KSTAT_NAMED_STR_PTR(k) == NULL) {
561 		if (len > 0)
562 			km_len = strnlen(str, len);
563 		else if (alt != NULL)
564 			km_len = strlen(alt);
565 		else
566 			return;
567 
568 		tmp = kmem_alloc(km_len + 1, KM_SLEEP);
569 		bcopy(len > 0 ? str : alt, tmp, km_len);
570 		tmp[km_len] = '\0';
571 
572 		kstat_named_setstr(k, tmp);
573 	}
574 }
575 
576 static void
577 bd_errstats_clrstr(kstat_named_t *k)
578 {
579 	if (KSTAT_NAMED_STR_PTR(k) == NULL)
580 		return;
581 
582 	kmem_free(KSTAT_NAMED_STR_PTR(k), KSTAT_NAMED_STR_BUFLEN(k));
583 	kstat_named_setstr(k, NULL);
584 }
585 
586 static void
587 bd_init_errstats(bd_t *bd, bd_drive_t *drive)
588 {
589 	struct bd_errstats	*est = bd->d_kerr;
590 
591 	mutex_enter(&bd->d_errmutex);
592 
593 	if (drive->d_model_len > 0 &&
594 	    KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) {
595 		bd_errstats_setstr(&est->bd_model, drive->d_model,
596 		    drive->d_model_len, NULL);
597 	} else {
598 		bd_errstats_setstr(&est->bd_vid, drive->d_vendor,
599 		    drive->d_vendor_len, "Unknown ");
600 		bd_errstats_setstr(&est->bd_pid, drive->d_product,
601 		    drive->d_product_len, "Unknown         ");
602 	}
603 
604 	bd_errstats_setstr(&est->bd_revision, drive->d_revision,
605 	    drive->d_revision_len, "0001");
606 	bd_errstats_setstr(&est->bd_serial, drive->d_serial,
607 	    drive->d_serial_len, "0               ");
608 
609 	mutex_exit(&bd->d_errmutex);
610 }
611 
612 static void
613 bd_fini_errstats(bd_t *bd)
614 {
615 	struct bd_errstats	*est = bd->d_kerr;
616 
617 	mutex_enter(&bd->d_errmutex);
618 
619 	bd_errstats_clrstr(&est->bd_model);
620 	bd_errstats_clrstr(&est->bd_vid);
621 	bd_errstats_clrstr(&est->bd_pid);
622 	bd_errstats_clrstr(&est->bd_revision);
623 	bd_errstats_clrstr(&est->bd_serial);
624 
625 	mutex_exit(&bd->d_errmutex);
626 }
627 
628 static void
629 bd_queues_free(bd_t *bd)
630 {
631 	uint32_t i;
632 
633 	for (i = 0; i < bd->d_qcount; i++) {
634 		bd_queue_t *bq = &bd->d_queues[i];
635 
636 		mutex_destroy(&bq->q_iomutex);
637 		list_destroy(&bq->q_waitq);
638 		list_destroy(&bq->q_runq);
639 	}
640 
641 	kmem_free(bd->d_queues, sizeof (*bd->d_queues) * bd->d_qcount);
642 }
643 
/*
 * attach(9E) entry point.
 *
 * For DDI_ATTACH this builds the per-instance soft state from the
 * bd_handle_t found in the node's parent data, queries the parent through
 * o_drive_info(), creates the kstats, the transfer cache and the I/O
 * queues, and attaches the cmlb label handling.  DDI_RESUME is a no-op that
 * returns DDI_SUCCESS; any other command fails.
 *
 * Returns DDI_SUCCESS, or DDI_FAILURE after releasing what was set up.
 */
static int
bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int		inst;
	bd_handle_t	hdl;
	bd_t		*bd;
	bd_drive_t	drive;
	uint32_t	i;
	int		rv;
	char		name[16];
	char		kcache[32];

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* We don't do anything native for suspend/resume */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	inst = ddi_get_instance(dip);
	hdl = ddi_get_parent_data(dip);

	/* "<driver><inst>" for messages, "<driver><inst>_xfer" for the cache */
	(void) snprintf(name, sizeof (name), "%s%d",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);

	if (hdl == NULL) {
		cmn_err(CE_WARN, "%s: missing parent data!", name);
		return (DDI_FAILURE);
	}

	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
		return (DDI_FAILURE);
	}
	bd = ddi_get_soft_state(bd_state, inst);

	if (hdl->h_dma) {
		/* Work on a private copy of the parent's DMA attributes. */
		bd->d_dma = *(hdl->h_dma);
		bd->d_dma.dma_attr_granular =
		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
		bd->d_use_dma = B_TRUE;

		/*
		 * NOTE(review): bd was zero-allocated just above and nothing
		 * has set d_maxxfer yet, so the warning branch looks
		 * unreachable -- confirm.
		 */
		if (bd->d_maxxfer &&
		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
			cmn_err(CE_WARN,
			    "%s: inconsistent maximum transfer size!",
			    name);
			/* We force it */
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		} else {
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		}
	} else {
		bd->d_use_dma = B_FALSE;
		if (bd->d_maxxfer == 0) {
			bd->d_maxxfer = 1024 * 1024;
		}
	}
	bd->d_ops = hdl->h_ops;
	bd->d_private = hdl->h_private;
	bd->d_blkshift = DEV_BSHIFT;	/* 512 bytes, to start */

	/* Round the limit down to a multiple of DEV_BSIZE. */
	if (bd->d_maxxfer % DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
	}
	if (bd->d_maxxfer < DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
		ddi_soft_state_free(bd_state, inst);
		return (DDI_FAILURE);
	}

	/*
	 * NOTE(review): hdl->h_bd and the driver private pointer are not
	 * reset on the failure paths below, although bd is freed there --
	 * confirm that no caller uses them after a failed attach.
	 */
	bd->d_dip = dip;
	bd->d_handle = hdl;
	hdl->h_bd = bd;
	ddi_set_driver_private(dip, bd);

	mutex_init(&bd->d_ksmutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
	mutex_init(&bd->d_dle_mutex, NULL, MUTEX_DRIVER, NULL);
	bd->d_dle_state = 0;

	/* Transfer descriptor cache; bd is the ctor/dtor private argument. */
	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);

	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (bd->d_ksp != NULL) {
		bd->d_ksp->ks_lock = &bd->d_ksmutex;
		kstat_install(bd->d_ksp);
		bd->d_kiop = bd->d_ksp->ks_data;
	} else {
		/*
		 * Even if we cannot create the kstat, we create a
		 * scratch kstat.  The reason for this is to ensure
		 * that we can update the kstat all of the time,
		 * without adding an extra branch instruction.
		 */
		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
	}

	cmlb_alloc_handle(&bd->d_cmlbh);

	bd->d_state = DKIO_NONE;

	bzero(&drive, sizeof (drive));
	/*
	 * Default to one queue, and no restrictions on free space requests
	 * (if driver provides method) parent driver can override.
	 */
	drive.d_qcount = 1;
	drive.d_free_align = 1;
	bd->d_ops.o_drive_info(bd->d_private, &drive);

	/*
	 * Several checks to make sure o_drive_info() didn't return bad
	 * values:
	 *
	 * There must be at least one queue
	 */
	if (drive.d_qcount == 0)
		goto fail_drive_info;

	/* FREE/UNMAP/TRIM alignment needs to be at least 1 block */
	if (drive.d_free_align == 0)
		goto fail_drive_info;

	/*
	 * If d_max_free_blks is not unlimited (not 0), then we cannot allow
	 * an unlimited segment size. It is however permissible to not impose
	 * a limit on the total number of blocks freed while limiting the
	 * amount allowed in an individual segment.
	 */
	if ((drive.d_max_free_blks > 0 && drive.d_max_free_seg_blks == 0))
		goto fail_drive_info;

	/*
	 * If a limit is set on d_max_free_blks (by the above check, we know
	 * if there's a limit on d_max_free_blks, d_max_free_seg_blks cannot
	 * be unlimited), it cannot be smaller than the limit on an individual
	 * segment.
	 */
	if ((drive.d_max_free_blks > 0 &&
	    drive.d_max_free_seg_blks > drive.d_max_free_blks)) {
		goto fail_drive_info;
	}

	bd->d_qcount = drive.d_qcount;
	bd->d_removable = drive.d_removable;
	bd->d_hotpluggable = drive.d_hotpluggable;

	/* The parent may lower, but never raise, the transfer limit. */
	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
		bd->d_maxxfer = drive.d_maxxfer;

	bd->d_free_align = drive.d_free_align;
	bd->d_max_free_seg = drive.d_max_free_seg;
	bd->d_max_free_blks = drive.d_max_free_blks;
	bd->d_max_free_seg_blks = drive.d_max_free_seg_blks;

	bd_create_inquiry_props(dip, &drive);
	bd_create_errstats(bd, inst, &drive);
	bd_update_state(bd);

	/* One waitq/runq pair per queue reported by the parent. */
	bd->d_queues = kmem_alloc(sizeof (*bd->d_queues) * bd->d_qcount,
	    KM_SLEEP);
	for (i = 0; i < bd->d_qcount; i++) {
		bd_queue_t *bq = &bd->d_queues[i];

		bq->q_qsize = drive.d_qsize;
		bq->q_qactive = 0;
		mutex_init(&bq->q_iomutex, NULL, MUTEX_DRIVER, NULL);

		list_create(&bq->q_waitq, sizeof (bd_xfer_impl_t),
		    offsetof(struct bd_xfer_impl, i_linkage));
		list_create(&bq->q_runq, sizeof (bd_xfer_impl_t),
		    offsetof(struct bd_xfer_impl, i_linkage));
	}

	/*
	 * The node type depends on the drive info: DDI_NT_BLOCK_BLKDEV when
	 * an EUI64 is set, else DDI_NT_BLOCK_CHAN when d_lun is not negative,
	 * else DDI_NT_BLOCK.
	 */
	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
	    bd->d_removable, bd->d_hotpluggable,
	    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
	    *(uint64_t *)drive.d_eui64 != 0 ? DDI_NT_BLOCK_BLKDEV :
	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
	if (rv != 0) {
		goto fail_cmlb_attach;
	}

	/*
	 * A devid is optional: neither a missing o_devid_init nor a failure
	 * to create or register the devid fails the attach.
	 */
	if (bd->d_ops.o_devid_init != NULL) {
		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
		if (rv == DDI_SUCCESS) {
			if (ddi_devid_register(dip, bd->d_devid) !=
			    DDI_SUCCESS) {
				cmn_err(CE_WARN,
				    "%s: unable to register devid", name);
			}
		}
	}

	/*
	 * Add a zero-length attribute to tell the world we support
	 * kernel ioctls (for layered drivers).  Also set up properties
	 * used by HAL to identify removable media.
	 */
	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0);
	if (bd->d_removable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "removable-media", NULL, 0);
	}
	if (bd->d_hotpluggable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "hotpluggable", NULL, 0);
	}

	ddi_report_dev(dip);

	return (DDI_SUCCESS);

	/*
	 * Undo what was created after the drive info checks passed, then
	 * fall through to the common teardown.
	 */
fail_cmlb_attach:
	bd_queues_free(bd);
	bd_destroy_errstats(bd);

	/* Undo everything that was set up before the drive info checks. */
fail_drive_info:
	cmlb_free_handle(&bd->d_cmlbh);

	if (bd->d_ksp != NULL) {
		kstat_delete(bd->d_ksp);
		bd->d_ksp = NULL;
	} else {
		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
	}

	kmem_cache_destroy(bd->d_cache);
	cv_destroy(&bd->d_statecv);
	mutex_destroy(&bd->d_statemutex);
	mutex_destroy(&bd->d_ocmutex);
	mutex_destroy(&bd->d_ksmutex);
	mutex_destroy(&bd->d_dle_mutex);
	ddi_soft_state_free(bd_state, inst);
	return (DDI_FAILURE);
}
892 
893 static int
894 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
895 {
896 	bd_t	*bd;
897 
898 	bd = ddi_get_driver_private(dip);
899 
900 	switch (cmd) {
901 	case DDI_DETACH:
902 		break;
903 	case DDI_SUSPEND:
904 		/* We don't suspend, but our parent does */
905 		return (DDI_SUCCESS);
906 	default:
907 		return (DDI_FAILURE);
908 	}
909 
910 	if (bd->d_ksp != NULL) {
911 		kstat_delete(bd->d_ksp);
912 		bd->d_ksp = NULL;
913 	} else {
914 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
915 	}
916 
917 	bd_destroy_errstats(bd);
918 	cmlb_detach(bd->d_cmlbh, 0);
919 	cmlb_free_handle(&bd->d_cmlbh);
920 	if (bd->d_devid)
921 		ddi_devid_free(bd->d_devid);
922 	kmem_cache_destroy(bd->d_cache);
923 	mutex_destroy(&bd->d_ksmutex);
924 	mutex_destroy(&bd->d_ocmutex);
925 	mutex_destroy(&bd->d_statemutex);
926 	cv_destroy(&bd->d_statecv);
927 	mutex_destroy(&bd->d_dle_mutex);
928 	bd_queues_free(bd);
929 	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
930 	return (DDI_SUCCESS);
931 }
932 
933 static int
934 bd_xfer_ctor(void *buf, void *arg, int kmflag)
935 {
936 	bd_xfer_impl_t	*xi;
937 	bd_t		*bd = arg;
938 	int		(*dcb)(caddr_t);
939 
940 	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
941 		dcb = DDI_DMA_SLEEP;
942 	} else {
943 		dcb = DDI_DMA_DONTWAIT;
944 	}
945 
946 	xi = buf;
947 	bzero(xi, sizeof (*xi));
948 	xi->i_bd = bd;
949 
950 	if (bd->d_use_dma) {
951 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
952 		    &xi->i_dmah) != DDI_SUCCESS) {
953 			return (-1);
954 		}
955 	}
956 
957 	return (0);
958 }
959 
960 static void
961 bd_xfer_dtor(void *buf, void *arg)
962 {
963 	bd_xfer_impl_t	*xi = buf;
964 
965 	_NOTE(ARGUNUSED(arg));
966 
967 	if (xi->i_dmah)
968 		ddi_dma_free_handle(&xi->i_dmah);
969 	xi->i_dmah = NULL;
970 }
971 
/*
 * Allocate a transfer descriptor for bp from bd->d_cache and set up its
 * first window.
 *
 * func is stored as the callback, but for a non-empty buf it is replaced by
 * the parent's o_read or o_write depending on B_READ.  kmflag is passed to
 * kmem_cache_alloc(); the DMA bind may wait for resources only when it is
 * KM_SLEEP.
 *
 * Without DMA the buf is mapped in and split into windows of at most
 * d_maxxfer bytes.  With DMA the buf is bound to the descriptor's handle,
 * possibly as a partial mapping with several DMA windows.
 *
 * Returns the descriptor, or NULL after recording the error on bp with
 * bioerror() (ENOMEM, EAGAIN, EINVAL or EFAULT).
 */
static bd_xfer_impl_t *
bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
    int kmflag)
{
	bd_xfer_impl_t		*xi;
	int			rv = 0;
	int			status;
	unsigned		dir;
	int			(*cb)(caddr_t);
	size_t			len;
	uint32_t		shift;

	if (kmflag == KM_SLEEP) {
		cb = DDI_DMA_SLEEP;
	} else {
		cb = DDI_DMA_DONTWAIT;
	}

	xi = kmem_cache_alloc(bd->d_cache, kmflag);
	if (xi == NULL) {
		bioerror(bp, ENOMEM);
		return (NULL);
	}

	ASSERT(bp);

	xi->i_bp = bp;
	xi->i_func = func;
	/* presumably converts DEV_BSIZE units to device blocks -- confirm */
	xi->i_blkno = bp->b_lblkno >> (bd->d_blkshift - DEV_BSHIFT);

	/* An empty buf needs no mapping; func stays the callback. */
	if (bp->b_bcount == 0) {
		xi->i_len = 0;
		xi->i_nblks = 0;
		xi->i_kaddr = NULL;
		xi->i_resid = 0;
		xi->i_num_win = 0;
		goto done;
	}

	if (bp->b_flags & B_READ) {
		dir = DDI_DMA_READ;
		xi->i_func = bd->d_ops.o_read;
	} else {
		dir = DDI_DMA_WRITE;
		xi->i_func = bd->d_ops.o_write;
	}

	shift = bd->d_blkshift;
	xi->i_blkshift = shift;

	if (!bd->d_use_dma) {
		bp_mapin(bp);
		rv = 0;
		xi->i_offset = 0;
		/* ceil(b_bcount / d_maxxfer) windows of up to d_maxxfer */
		xi->i_num_win =
		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
		xi->i_cur_win = 0;
		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
		xi->i_nblks = xi->i_len >> shift;
		xi->i_kaddr = bp->b_un.b_addr;
		xi->i_resid = bp->b_bcount;
	} else {

		/*
		 * We have to use consistent DMA if the address is misaligned.
		 */
		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
		} else {
			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
		}

		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
		    NULL, &xi->i_dmac, &xi->i_ndmac);
		switch (status) {
		case DDI_DMA_MAPPED:
			/* The whole buf fits into a single window. */
			xi->i_num_win = 1;
			xi->i_cur_win = 0;
			xi->i_offset = 0;
			xi->i_len = bp->b_bcount;
			xi->i_nblks = xi->i_len >> shift;
			xi->i_resid = bp->b_bcount;
			rv = 0;
			break;
		case DDI_DMA_PARTIAL_MAP:
			xi->i_cur_win = 0;

			/*
			 * Fetch the window count and activate window 0.  The
			 * window length must be a whole number of blocks.
			 */
			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
			    DDI_SUCCESS) ||
			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
			    &len, &xi->i_dmac, &xi->i_ndmac) !=
			    DDI_SUCCESS) ||
			    (P2PHASE(len, (1U << shift)) != 0)) {
				(void) ddi_dma_unbind_handle(xi->i_dmah);
				rv = EFAULT;
				goto done;
			}
			xi->i_len = len;
			xi->i_nblks = xi->i_len >> shift;
			xi->i_resid = bp->b_bcount;
			rv = 0;
			break;
		case DDI_DMA_NORESOURCES:
			rv = EAGAIN;
			goto done;
		case DDI_DMA_TOOBIG:
			rv = EINVAL;
			goto done;
		case DDI_DMA_NOMAPPING:
		case DDI_DMA_INUSE:
		default:
			rv = EFAULT;
			goto done;
		}
	}

done:
	/* On error, return the descriptor to the cache and fail the buf. */
	if (rv != 0) {
		kmem_cache_free(bd->d_cache, xi);
		bioerror(bp, rv);
		return (NULL);
	}

	return (xi);
}
1098 
1099 static void
1100 bd_xfer_free(bd_xfer_impl_t *xi)
1101 {
1102 	if (xi->i_dmah) {
1103 		(void) ddi_dma_unbind_handle(xi->i_dmah);
1104 	}
1105 	if (xi->i_dfl != NULL) {
1106 		dfl_free((dkioc_free_list_t *)xi->i_dfl);
1107 		xi->i_dfl = NULL;
1108 	}
1109 	kmem_cache_free(xi->i_bd->d_cache, xi);
1110 }
1111 
1112 static int
1113 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1114 {
1115 	dev_t		dev = *devp;
1116 	bd_t		*bd;
1117 	minor_t		part;
1118 	minor_t		inst;
1119 	uint64_t	mask;
1120 	boolean_t	ndelay;
1121 	int		rv;
1122 	diskaddr_t	nblks;
1123 	diskaddr_t	lba;
1124 
1125 	_NOTE(ARGUNUSED(credp));
1126 
1127 	part = BDPART(dev);
1128 	inst = BDINST(dev);
1129 
1130 	if (otyp >= OTYPCNT)
1131 		return (EINVAL);
1132 
1133 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
1134 
1135 	/*
1136 	 * Block any DR events from changing the set of registered
1137 	 * devices while we function.
1138 	 */
1139 	rw_enter(&bd_lock, RW_READER);
1140 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1141 		rw_exit(&bd_lock);
1142 		return (ENXIO);
1143 	}
1144 
1145 	mutex_enter(&bd->d_ocmutex);
1146 
1147 	ASSERT(part < 64);
1148 	mask = (1U << part);
1149 
1150 	bd_update_state(bd);
1151 
1152 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
1153 
1154 		/* non-blocking opens are allowed to succeed */
1155 		if (!ndelay) {
1156 			rv = ENXIO;
1157 			goto done;
1158 		}
1159 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
1160 	    NULL, NULL, 0) == 0) {
1161 
1162 		/*
1163 		 * We read the partinfo, verify valid ranges.  If the
1164 		 * partition is invalid, and we aren't blocking or
1165 		 * doing a raw access, then fail. (Non-blocking and
1166 		 * raw accesses can still succeed to allow a disk with
1167 		 * bad partition data to opened by format and fdisk.)
1168 		 */
1169 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
1170 			rv = ENXIO;
1171 			goto done;
1172 		}
1173 	} else if (!ndelay) {
1174 		/*
1175 		 * cmlb_partinfo failed -- invalid partition or no
1176 		 * disk label.
1177 		 */
1178 		rv = ENXIO;
1179 		goto done;
1180 	}
1181 
1182 	if ((flag & FWRITE) && bd->d_rdonly) {
1183 		rv = EROFS;
1184 		goto done;
1185 	}
1186 
1187 	if ((bd->d_open_excl) & (mask)) {
1188 		rv = EBUSY;
1189 		goto done;
1190 	}
1191 	if (flag & FEXCL) {
1192 		if (bd->d_open_lyr[part]) {
1193 			rv = EBUSY;
1194 			goto done;
1195 		}
1196 		for (int i = 0; i < OTYP_LYR; i++) {
1197 			if (bd->d_open_reg[i] & mask) {
1198 				rv = EBUSY;
1199 				goto done;
1200 			}
1201 		}
1202 	}
1203 
1204 	if (otyp == OTYP_LYR) {
1205 		bd->d_open_lyr[part]++;
1206 	} else {
1207 		bd->d_open_reg[otyp] |= mask;
1208 	}
1209 	if (flag & FEXCL) {
1210 		bd->d_open_excl |= mask;
1211 	}
1212 
1213 	rv = 0;
1214 done:
1215 	mutex_exit(&bd->d_ocmutex);
1216 	rw_exit(&bd_lock);
1217 
1218 	return (rv);
1219 }
1220 
1221 static int
1222 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
1223 {
1224 	bd_t		*bd;
1225 	minor_t		inst;
1226 	minor_t		part;
1227 	uint64_t	mask;
1228 	boolean_t	last = B_TRUE;
1229 
1230 	_NOTE(ARGUNUSED(flag));
1231 	_NOTE(ARGUNUSED(credp));
1232 
1233 	part = BDPART(dev);
1234 	inst = BDINST(dev);
1235 
1236 	ASSERT(part < 64);
1237 	mask = (1U << part);
1238 
1239 	rw_enter(&bd_lock, RW_READER);
1240 
1241 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1242 		rw_exit(&bd_lock);
1243 		return (ENXIO);
1244 	}
1245 
1246 	mutex_enter(&bd->d_ocmutex);
1247 	if (bd->d_open_excl & mask) {
1248 		bd->d_open_excl &= ~mask;
1249 	}
1250 	if (otyp == OTYP_LYR) {
1251 		bd->d_open_lyr[part]--;
1252 	} else {
1253 		bd->d_open_reg[otyp] &= ~mask;
1254 	}
1255 	for (int i = 0; i < 64; i++) {
1256 		if (bd->d_open_lyr[part]) {
1257 			last = B_FALSE;
1258 		}
1259 	}
1260 	for (int i = 0; last && (i < OTYP_LYR); i++) {
1261 		if (bd->d_open_reg[i]) {
1262 			last = B_FALSE;
1263 		}
1264 	}
1265 	mutex_exit(&bd->d_ocmutex);
1266 
1267 	if (last) {
1268 		cmlb_invalidate(bd->d_cmlbh, 0);
1269 	}
1270 	rw_exit(&bd_lock);
1271 
1272 	return (0);
1273 }
1274 
1275 static int
1276 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
1277 {
1278 	minor_t		inst;
1279 	minor_t		part;
1280 	diskaddr_t	pstart;
1281 	diskaddr_t	psize;
1282 	bd_t		*bd;
1283 	bd_xfer_impl_t	*xi;
1284 	buf_t		*bp;
1285 	int		rv;
1286 	uint32_t	shift;
1287 	daddr_t		d_blkno;
1288 	int	d_nblk;
1289 
1290 	rw_enter(&bd_lock, RW_READER);
1291 
1292 	part = BDPART(dev);
1293 	inst = BDINST(dev);
1294 
1295 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1296 		rw_exit(&bd_lock);
1297 		return (ENXIO);
1298 	}
1299 	shift = bd->d_blkshift;
1300 	d_blkno = blkno >> (shift - DEV_BSHIFT);
1301 	d_nblk = nblk >> (shift - DEV_BSHIFT);
1302 	/*
1303 	 * do cmlb, but do it synchronously unless we already have the
1304 	 * partition (which we probably should.)
1305 	 */
1306 	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
1307 	    (void *)1)) {
1308 		rw_exit(&bd_lock);
1309 		return (ENXIO);
1310 	}
1311 
1312 	if ((d_blkno + d_nblk) > psize) {
1313 		rw_exit(&bd_lock);
1314 		return (EINVAL);
1315 	}
1316 	bp = getrbuf(KM_NOSLEEP);
1317 	if (bp == NULL) {
1318 		rw_exit(&bd_lock);
1319 		return (ENOMEM);
1320 	}
1321 
1322 	bp->b_bcount = nblk << DEV_BSHIFT;
1323 	bp->b_resid = bp->b_bcount;
1324 	bp->b_lblkno = blkno;
1325 	bp->b_un.b_addr = caddr;
1326 
1327 	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
1328 	if (xi == NULL) {
1329 		rw_exit(&bd_lock);
1330 		freerbuf(bp);
1331 		return (ENOMEM);
1332 	}
1333 	xi->i_blkno = d_blkno + pstart;
1334 	xi->i_flags = BD_XFER_POLL;
1335 	bd_submit(bd, xi);
1336 	rw_exit(&bd_lock);
1337 
1338 	/*
1339 	 * Generally, we should have run this entirely synchronously
1340 	 * at this point and the biowait call should be a no-op.  If
1341 	 * it didn't happen this way, it's a bug in the underlying
1342 	 * driver not honoring BD_XFER_POLL.
1343 	 */
1344 	(void) biowait(bp);
1345 	rv = geterror(bp);
1346 	freerbuf(bp);
1347 	return (rv);
1348 }
1349 
1350 void
1351 bd_minphys(struct buf *bp)
1352 {
1353 	minor_t inst;
1354 	bd_t	*bd;
1355 	inst = BDINST(bp->b_edev);
1356 
1357 	bd = ddi_get_soft_state(bd_state, inst);
1358 
1359 	/*
1360 	 * In a non-debug kernel, bd_strategy will catch !bd as
1361 	 * well, and will fail nicely.
1362 	 */
1363 	ASSERT(bd);
1364 
1365 	if (bp->b_bcount > bd->d_maxxfer)
1366 		bp->b_bcount = bd->d_maxxfer;
1367 }
1368 
1369 static int
1370 bd_check_uio(dev_t dev, struct uio *uio)
1371 {
1372 	bd_t		*bd;
1373 	uint32_t	shift;
1374 
1375 	if ((bd = ddi_get_soft_state(bd_state, BDINST(dev))) == NULL) {
1376 		return (ENXIO);
1377 	}
1378 
1379 	shift = bd->d_blkshift;
1380 	if ((P2PHASE(uio->uio_loffset, (1U << shift)) != 0) ||
1381 	    (P2PHASE(uio->uio_iov->iov_len, (1U << shift)) != 0)) {
1382 		return (EINVAL);
1383 	}
1384 
1385 	return (0);
1386 }
1387 
1388 static int
1389 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
1390 {
1391 	_NOTE(ARGUNUSED(credp));
1392 	int	ret = bd_check_uio(dev, uio);
1393 	if (ret != 0) {
1394 		return (ret);
1395 	}
1396 	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
1397 }
1398 
1399 static int
1400 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
1401 {
1402 	_NOTE(ARGUNUSED(credp));
1403 	int	ret = bd_check_uio(dev, uio);
1404 	if (ret != 0) {
1405 		return (ret);
1406 	}
1407 	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
1408 }
1409 
1410 static int
1411 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
1412 {
1413 	_NOTE(ARGUNUSED(credp));
1414 	int	ret = bd_check_uio(dev, aio->aio_uio);
1415 	if (ret != 0) {
1416 		return (ret);
1417 	}
1418 	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
1419 }
1420 
1421 static int
1422 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1423 {
1424 	_NOTE(ARGUNUSED(credp));
1425 	int	ret = bd_check_uio(dev, aio->aio_uio);
1426 	if (ret != 0) {
1427 		return (ret);
1428 	}
1429 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
1430 }
1431 
1432 static int
1433 bd_strategy(struct buf *bp)
1434 {
1435 	minor_t		inst;
1436 	minor_t		part;
1437 	bd_t		*bd;
1438 	diskaddr_t	p_lba;
1439 	diskaddr_t	p_nblks;
1440 	diskaddr_t	b_nblks;
1441 	bd_xfer_impl_t	*xi;
1442 	uint32_t	shift;
1443 	int		(*func)(void *, bd_xfer_t *);
1444 	diskaddr_t	lblkno;
1445 
1446 	part = BDPART(bp->b_edev);
1447 	inst = BDINST(bp->b_edev);
1448 
1449 	ASSERT(bp);
1450 
1451 	bp->b_resid = bp->b_bcount;
1452 
1453 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1454 		bioerror(bp, ENXIO);
1455 		biodone(bp);
1456 		return (0);
1457 	}
1458 
1459 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
1460 	    NULL, NULL, 0)) {
1461 		bioerror(bp, ENXIO);
1462 		biodone(bp);
1463 		return (0);
1464 	}
1465 
1466 	shift = bd->d_blkshift;
1467 	lblkno = bp->b_lblkno >> (shift - DEV_BSHIFT);
1468 	if ((P2PHASE(bp->b_lblkno, (1U << (shift - DEV_BSHIFT))) != 0) ||
1469 	    (P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
1470 	    (lblkno > p_nblks)) {
1471 		bioerror(bp, EINVAL);
1472 		biodone(bp);
1473 		return (0);
1474 	}
1475 	b_nblks = bp->b_bcount >> shift;
1476 	if ((lblkno == p_nblks) || (bp->b_bcount == 0)) {
1477 		biodone(bp);
1478 		return (0);
1479 	}
1480 
1481 	if ((b_nblks + lblkno) > p_nblks) {
1482 		bp->b_resid = ((lblkno + b_nblks - p_nblks) << shift);
1483 		bp->b_bcount -= bp->b_resid;
1484 	} else {
1485 		bp->b_resid = 0;
1486 	}
1487 	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1488 
1489 	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1490 	if (xi == NULL) {
1491 		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1492 	}
1493 	if (xi == NULL) {
1494 		/* bd_request_alloc will have done bioerror */
1495 		biodone(bp);
1496 		return (0);
1497 	}
1498 	xi->i_blkno = lblkno + p_lba;
1499 
1500 	bd_submit(bd, xi);
1501 
1502 	return (0);
1503 }
1504 
1505 static int
1506 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1507 {
1508 	minor_t		inst;
1509 	uint16_t	part;
1510 	bd_t		*bd;
1511 	void		*ptr = (void *)arg;
1512 	int		rv;
1513 
1514 	part = BDPART(dev);
1515 	inst = BDINST(dev);
1516 
1517 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1518 		return (ENXIO);
1519 	}
1520 
1521 	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1522 	if (rv != ENOTTY)
1523 		return (rv);
1524 
1525 	if (rvalp != NULL) {
1526 		/* the return value of the ioctl is 0 by default */
1527 		*rvalp = 0;
1528 	}
1529 
1530 	switch (cmd) {
1531 	case DKIOCGMEDIAINFO: {
1532 		struct dk_minfo minfo;
1533 
1534 		/* make sure our state information is current */
1535 		bd_update_state(bd);
1536 		bzero(&minfo, sizeof (minfo));
1537 		minfo.dki_media_type = DK_FIXED_DISK;
1538 		minfo.dki_lbsize = (1U << bd->d_blkshift);
1539 		minfo.dki_capacity = bd->d_numblks;
1540 		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
1541 			return (EFAULT);
1542 		}
1543 		return (0);
1544 	}
1545 	case DKIOCGMEDIAINFOEXT: {
1546 		struct dk_minfo_ext miext;
1547 		size_t len;
1548 
1549 		/* make sure our state information is current */
1550 		bd_update_state(bd);
1551 		bzero(&miext, sizeof (miext));
1552 		miext.dki_media_type = DK_FIXED_DISK;
1553 		miext.dki_lbsize = (1U << bd->d_blkshift);
1554 		miext.dki_pbsize = (1U << bd->d_pblkshift);
1555 		miext.dki_capacity = bd->d_numblks;
1556 
1557 		switch (ddi_model_convert_from(flag & FMODELS)) {
1558 		case DDI_MODEL_ILP32:
1559 			len = sizeof (struct dk_minfo_ext32);
1560 			break;
1561 		default:
1562 			len = sizeof (struct dk_minfo_ext);
1563 			break;
1564 		}
1565 
1566 		if (ddi_copyout(&miext, ptr, len, flag)) {
1567 			return (EFAULT);
1568 		}
1569 		return (0);
1570 	}
1571 	case DKIOCINFO: {
1572 		struct dk_cinfo cinfo;
1573 		bzero(&cinfo, sizeof (cinfo));
1574 		cinfo.dki_ctype = DKC_BLKDEV;
1575 		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1576 		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1577 		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1578 		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1579 		    "%s", ddi_driver_name(bd->d_dip));
1580 		cinfo.dki_unit = inst;
1581 		cinfo.dki_flags = DKI_FMTVOL;
1582 		cinfo.dki_partition = part;
1583 		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1584 		cinfo.dki_addr = 0;
1585 		cinfo.dki_slave = 0;
1586 		cinfo.dki_space = 0;
1587 		cinfo.dki_prio = 0;
1588 		cinfo.dki_vec = 0;
1589 		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
1590 			return (EFAULT);
1591 		}
1592 		return (0);
1593 	}
1594 	case DKIOCREMOVABLE: {
1595 		int i;
1596 		i = bd->d_removable ? 1 : 0;
1597 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1598 			return (EFAULT);
1599 		}
1600 		return (0);
1601 	}
1602 	case DKIOCHOTPLUGGABLE: {
1603 		int i;
1604 		i = bd->d_hotpluggable ? 1 : 0;
1605 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1606 			return (EFAULT);
1607 		}
1608 		return (0);
1609 	}
1610 	case DKIOCREADONLY: {
1611 		int i;
1612 		i = bd->d_rdonly ? 1 : 0;
1613 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1614 			return (EFAULT);
1615 		}
1616 		return (0);
1617 	}
1618 	case DKIOCSOLIDSTATE: {
1619 		int i;
1620 		i = bd->d_ssd ? 1 : 0;
1621 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1622 			return (EFAULT);
1623 		}
1624 		return (0);
1625 	}
1626 	case DKIOCSTATE: {
1627 		enum dkio_state	state;
1628 		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1629 			return (EFAULT);
1630 		}
1631 		if ((rv = bd_check_state(bd, &state)) != 0) {
1632 			return (rv);
1633 		}
1634 		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1635 			return (EFAULT);
1636 		}
1637 		return (0);
1638 	}
1639 	case DKIOCFLUSHWRITECACHE: {
1640 		struct dk_callback *dkc = NULL;
1641 
1642 		if (flag & FKIOCTL)
1643 			dkc = (void *)arg;
1644 
1645 		rv = bd_flush_write_cache(bd, dkc);
1646 		return (rv);
1647 	}
1648 	case DKIOCFREE: {
1649 		dkioc_free_list_t *dfl = NULL;
1650 
1651 		/*
1652 		 * Check free space support early to avoid copyin/allocation
1653 		 * when unnecessary.
1654 		 */
1655 		if (!CAN_FREESPACE(bd))
1656 			return (ENOTSUP);
1657 
1658 		rv = dfl_copyin(ptr, &dfl, flag, KM_SLEEP);
1659 		if (rv != 0)
1660 			return (rv);
1661 
1662 		/*
1663 		 * bd_free_space() consumes 'dfl'. bd_free_space() will
1664 		 * call dfl_iter() which will normally try to pass dfl through
1665 		 * to bd_free_space_cb() which attaches dfl to the bd_xfer_t
1666 		 * that is then queued for the underlying driver. Once the
1667 		 * driver processes the request, the bd_xfer_t instance is
1668 		 * disposed of, including any attached dkioc_free_list_t.
1669 		 *
1670 		 * If dfl cannot be processed by the underlying driver due to
1671 		 * size or alignment requirements of the driver, dfl_iter()
1672 		 * will replace dfl with one or more new dkioc_free_list_t
1673 		 * instances with the correct alignment and sizes for the driver
1674 		 * (and free the original dkioc_free_list_t).
1675 		 */
1676 		rv = bd_free_space(dev, bd, dfl);
1677 		return (rv);
1678 	}
1679 
1680 	case DKIOC_CANFREE: {
1681 		boolean_t supported = CAN_FREESPACE(bd);
1682 
1683 		if (ddi_copyout(&supported, (void *)arg, sizeof (supported),
1684 		    flag) != 0) {
1685 			return (EFAULT);
1686 		}
1687 
1688 		return (0);
1689 	}
1690 
1691 	default:
1692 		break;
1693 
1694 	}
1695 	return (ENOTTY);
1696 }
1697 
1698 static int
1699 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1700     char *name, caddr_t valuep, int *lengthp)
1701 {
1702 	bd_t	*bd;
1703 
1704 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1705 	if (bd == NULL)
1706 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1707 		    name, valuep, lengthp));
1708 
1709 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1710 	    valuep, lengthp, BDPART(dev), 0));
1711 }
1712 
1713 
1714 static int
1715 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1716     size_t length, void *tg_cookie)
1717 {
1718 	bd_t		*bd;
1719 	buf_t		*bp;
1720 	bd_xfer_impl_t	*xi;
1721 	int		rv;
1722 	int		(*func)(void *, bd_xfer_t *);
1723 	int		kmflag;
1724 
1725 	/*
1726 	 * If we are running in polled mode (such as during dump(9e)
1727 	 * execution), then we cannot sleep for kernel allocations.
1728 	 */
1729 	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1730 
1731 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1732 
1733 	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1734 		/* We can only transfer whole blocks at a time! */
1735 		return (EINVAL);
1736 	}
1737 
1738 	if ((bp = getrbuf(kmflag)) == NULL) {
1739 		return (ENOMEM);
1740 	}
1741 
1742 	switch (cmd) {
1743 	case TG_READ:
1744 		bp->b_flags = B_READ;
1745 		func = bd->d_ops.o_read;
1746 		break;
1747 	case TG_WRITE:
1748 		bp->b_flags = B_WRITE;
1749 		func = bd->d_ops.o_write;
1750 		break;
1751 	default:
1752 		freerbuf(bp);
1753 		return (EINVAL);
1754 	}
1755 
1756 	bp->b_un.b_addr = bufaddr;
1757 	bp->b_bcount = length;
1758 	xi = bd_xfer_alloc(bd, bp, func, kmflag);
1759 	if (xi == NULL) {
1760 		rv = geterror(bp);
1761 		freerbuf(bp);
1762 		return (rv);
1763 	}
1764 	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1765 	xi->i_blkno = start;
1766 	bd_submit(bd, xi);
1767 	(void) biowait(bp);
1768 	rv = geterror(bp);
1769 	freerbuf(bp);
1770 
1771 	return (rv);
1772 }
1773 
1774 static int
1775 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1776 {
1777 	bd_t		*bd;
1778 
1779 	_NOTE(ARGUNUSED(tg_cookie));
1780 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1781 
1782 	switch (cmd) {
1783 	case TG_GETPHYGEOM:
1784 	case TG_GETVIRTGEOM:
1785 		/*
1786 		 * We don't have any "geometry" as such, let cmlb
1787 		 * fabricate something.
1788 		 */
1789 		return (ENOTTY);
1790 
1791 	case TG_GETCAPACITY:
1792 		bd_update_state(bd);
1793 		*(diskaddr_t *)arg = bd->d_numblks;
1794 		return (0);
1795 
1796 	case TG_GETBLOCKSIZE:
1797 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1798 		return (0);
1799 
1800 	case TG_GETATTR:
1801 		/*
1802 		 * It turns out that cmlb really doesn't do much for
1803 		 * non-writable media, but lets make the information
1804 		 * available for it in case it does more in the
1805 		 * future.  (The value is currently used for
1806 		 * triggering special behavior for CD-ROMs.)
1807 		 */
1808 		bd_update_state(bd);
1809 		((tg_attribute_t *)arg)->media_is_writable =
1810 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1811 		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1812 		((tg_attribute_t *)arg)->media_is_rotational = B_FALSE;
1813 		return (0);
1814 
1815 	default:
1816 		return (EINVAL);
1817 	}
1818 }
1819 
1820 
1821 static void
1822 bd_sched(bd_t *bd, bd_queue_t *bq)
1823 {
1824 	bd_xfer_impl_t	*xi;
1825 	struct buf	*bp;
1826 	int		rv;
1827 
1828 	mutex_enter(&bq->q_iomutex);
1829 
1830 	while ((bq->q_qactive < bq->q_qsize) &&
1831 	    ((xi = list_remove_head(&bq->q_waitq)) != NULL)) {
1832 		mutex_enter(&bd->d_ksmutex);
1833 		kstat_waitq_to_runq(bd->d_kiop);
1834 		mutex_exit(&bd->d_ksmutex);
1835 
1836 		bq->q_qactive++;
1837 		list_insert_tail(&bq->q_runq, xi);
1838 
1839 		/*
1840 		 * Submit the job to the driver.  We drop the I/O mutex
1841 		 * so that we can deal with the case where the driver
1842 		 * completion routine calls back into us synchronously.
1843 		 */
1844 
1845 		mutex_exit(&bq->q_iomutex);
1846 
1847 		rv = xi->i_func(bd->d_private, &xi->i_public);
1848 		if (rv != 0) {
1849 			bp = xi->i_bp;
1850 			bioerror(bp, rv);
1851 			biodone(bp);
1852 
1853 			atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
1854 
1855 			mutex_enter(&bq->q_iomutex);
1856 
1857 			mutex_enter(&bd->d_ksmutex);
1858 			kstat_runq_exit(bd->d_kiop);
1859 			mutex_exit(&bd->d_ksmutex);
1860 
1861 			bq->q_qactive--;
1862 			list_remove(&bq->q_runq, xi);
1863 			bd_xfer_free(xi);
1864 		} else {
1865 			mutex_enter(&bq->q_iomutex);
1866 		}
1867 	}
1868 
1869 	mutex_exit(&bq->q_iomutex);
1870 }
1871 
1872 static void
1873 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1874 {
1875 	uint64_t	nv = atomic_inc_64_nv(&bd->d_io_counter);
1876 	unsigned	q = nv % bd->d_qcount;
1877 	bd_queue_t	*bq = &bd->d_queues[q];
1878 
1879 	xi->i_bq = bq;
1880 	xi->i_qnum = q;
1881 
1882 	mutex_enter(&bq->q_iomutex);
1883 
1884 	list_insert_tail(&bq->q_waitq, xi);
1885 
1886 	mutex_enter(&bd->d_ksmutex);
1887 	kstat_waitq_enter(bd->d_kiop);
1888 	mutex_exit(&bd->d_ksmutex);
1889 
1890 	mutex_exit(&bq->q_iomutex);
1891 
1892 	bd_sched(bd, bq);
1893 }
1894 
1895 static void
1896 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1897 {
1898 	bd_t		*bd = xi->i_bd;
1899 	buf_t		*bp = xi->i_bp;
1900 	bd_queue_t	*bq = xi->i_bq;
1901 
1902 	mutex_enter(&bq->q_iomutex);
1903 	bq->q_qactive--;
1904 
1905 	mutex_enter(&bd->d_ksmutex);
1906 	kstat_runq_exit(bd->d_kiop);
1907 	mutex_exit(&bd->d_ksmutex);
1908 
1909 	list_remove(&bq->q_runq, xi);
1910 	mutex_exit(&bq->q_iomutex);
1911 
1912 	if (err == 0) {
1913 		if (bp->b_flags & B_READ) {
1914 			atomic_inc_uint(&bd->d_kiop->reads);
1915 			atomic_add_64((uint64_t *)&bd->d_kiop->nread,
1916 			    bp->b_bcount - xi->i_resid);
1917 		} else {
1918 			atomic_inc_uint(&bd->d_kiop->writes);
1919 			atomic_add_64((uint64_t *)&bd->d_kiop->nwritten,
1920 			    bp->b_bcount - xi->i_resid);
1921 		}
1922 	}
1923 	bd_sched(bd, bq);
1924 }
1925 
1926 static void
1927 bd_dle_sysevent_task(void *arg)
1928 {
1929 	nvlist_t *attr = NULL;
1930 	char *path = NULL;
1931 	bd_t *bd = arg;
1932 	dev_info_t *dip = bd->d_dip;
1933 	size_t n;
1934 
1935 	mutex_enter(&bd->d_dle_mutex);
1936 	bd->d_dle_state &= ~BD_DLE_PENDING;
1937 	bd->d_dle_state |= BD_DLE_RUNNING;
1938 	mutex_exit(&bd->d_dle_mutex);
1939 
1940 	dev_err(dip, CE_NOTE, "!dynamic LUN expansion");
1941 
1942 	if (nvlist_alloc(&attr, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0) {
1943 		mutex_enter(&bd->d_dle_mutex);
1944 		bd->d_dle_state &= ~(BD_DLE_RUNNING|BD_DLE_PENDING);
1945 		mutex_exit(&bd->d_dle_mutex);
1946 		return;
1947 	}
1948 
1949 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1950 
1951 	n = snprintf(path, MAXPATHLEN, "/devices");
1952 	(void) ddi_pathname(dip, path + n);
1953 	n = strlen(path);
1954 	n += snprintf(path + n, MAXPATHLEN - n, ":x");
1955 
1956 	for (;;) {
1957 		/*
1958 		 * On receipt of this event, the ZFS sysevent module will scan
1959 		 * active zpools for child vdevs matching this physical path.
1960 		 * In order to catch both whole disk pools and those with an
1961 		 * EFI boot partition, generate separate sysevents for minor
1962 		 * node 'a' and 'b'. (By comparison, io/scsi/targets/sd.c sends
1963 		 * events for just 'a')
1964 		 */
1965 		for (char c = 'a'; c < 'c'; c++) {
1966 			path[n - 1] = c;
1967 
1968 			if (nvlist_add_string(attr, DEV_PHYS_PATH, path) != 0)
1969 				break;
1970 
1971 			(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW,
1972 			    EC_DEV_STATUS, ESC_DEV_DLE, attr, NULL, DDI_SLEEP);
1973 		}
1974 
1975 		mutex_enter(&bd->d_dle_mutex);
1976 		if ((bd->d_dle_state & BD_DLE_PENDING) == 0) {
1977 			bd->d_dle_state &= ~BD_DLE_RUNNING;
1978 			mutex_exit(&bd->d_dle_mutex);
1979 			break;
1980 		}
1981 		bd->d_dle_state &= ~BD_DLE_PENDING;
1982 		mutex_exit(&bd->d_dle_mutex);
1983 	}
1984 
1985 	nvlist_free(attr);
1986 	kmem_free(path, MAXPATHLEN);
1987 }
1988 
1989 static void
1990 bd_update_state(bd_t *bd)
1991 {
1992 	enum	dkio_state	state = DKIO_INSERTED;
1993 	boolean_t		docmlb = B_FALSE;
1994 	bd_media_t		media;
1995 
1996 	bzero(&media, sizeof (media));
1997 
1998 	mutex_enter(&bd->d_statemutex);
1999 	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
2000 		bd->d_numblks = 0;
2001 		state = DKIO_EJECTED;
2002 		goto done;
2003 	}
2004 
2005 	if ((media.m_blksize < 512) ||
2006 	    (!ISP2(media.m_blksize)) ||
2007 	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
2008 		dev_err(bd->d_dip, CE_WARN, "Invalid media block size (%d)",
2009 		    media.m_blksize);
2010 		/*
2011 		 * We can't use the media, treat it as not present.
2012 		 */
2013 		state = DKIO_EJECTED;
2014 		bd->d_numblks = 0;
2015 		goto done;
2016 	}
2017 
2018 	if (((1U << bd->d_blkshift) != media.m_blksize) ||
2019 	    (bd->d_numblks != media.m_nblks)) {
2020 		/* Device size changed */
2021 		docmlb = B_TRUE;
2022 	}
2023 
2024 	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
2025 	bd->d_pblkshift = bd->d_blkshift;
2026 	bd->d_numblks = media.m_nblks;
2027 	bd->d_rdonly = media.m_readonly;
2028 	bd->d_ssd = media.m_solidstate;
2029 
2030 	/*
2031 	 * Only use the supplied physical block size if it is non-zero,
2032 	 * greater or equal to the block size, and a power of 2. Ignore it
2033 	 * if not, it's just informational and we can still use the media.
2034 	 */
2035 	if ((media.m_pblksize != 0) &&
2036 	    (media.m_pblksize >= media.m_blksize) &&
2037 	    (ISP2(media.m_pblksize)))
2038 		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;
2039 
2040 done:
2041 	if (state != bd->d_state) {
2042 		bd->d_state = state;
2043 		cv_broadcast(&bd->d_statecv);
2044 		docmlb = B_TRUE;
2045 	}
2046 	mutex_exit(&bd->d_statemutex);
2047 
2048 	bd->d_kerr->bd_capacity.value.ui64 = bd->d_numblks << bd->d_blkshift;
2049 
2050 	if (docmlb) {
2051 		if (state == DKIO_INSERTED) {
2052 			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
2053 
2054 			mutex_enter(&bd->d_dle_mutex);
2055 			/*
2056 			 * If there is already an event pending, there's
2057 			 * nothing to do; we coalesce multiple events.
2058 			 */
2059 			if ((bd->d_dle_state & BD_DLE_PENDING) == 0) {
2060 				if ((bd->d_dle_state & BD_DLE_RUNNING) == 0) {
2061 					taskq_dispatch_ent(bd_taskq,
2062 					    bd_dle_sysevent_task, bd, 0,
2063 					    &bd->d_dle_ent);
2064 				}
2065 				bd->d_dle_state |= BD_DLE_PENDING;
2066 			}
2067 			mutex_exit(&bd->d_dle_mutex);
2068 		} else {
2069 			cmlb_invalidate(bd->d_cmlbh, 0);
2070 		}
2071 	}
2072 }
2073 
2074 static int
2075 bd_check_state(bd_t *bd, enum dkio_state *state)
2076 {
2077 	clock_t		when;
2078 
2079 	for (;;) {
2080 
2081 		bd_update_state(bd);
2082 
2083 		mutex_enter(&bd->d_statemutex);
2084 
2085 		if (bd->d_state != *state) {
2086 			*state = bd->d_state;
2087 			mutex_exit(&bd->d_statemutex);
2088 			break;
2089 		}
2090 
2091 		when = drv_usectohz(1000000);
2092 		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
2093 		    when, TR_CLOCK_TICK) == 0) {
2094 			mutex_exit(&bd->d_statemutex);
2095 			return (EINTR);
2096 		}
2097 
2098 		mutex_exit(&bd->d_statemutex);
2099 	}
2100 
2101 	return (0);
2102 }
2103 
2104 static int
2105 bd_flush_write_cache_done(struct buf *bp)
2106 {
2107 	struct dk_callback *dc = (void *)bp->b_private;
2108 
2109 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
2110 	kmem_free(dc, sizeof (*dc));
2111 	freerbuf(bp);
2112 	return (0);
2113 }
2114 
2115 static int
2116 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
2117 {
2118 	buf_t			*bp;
2119 	struct dk_callback	*dc;
2120 	bd_xfer_impl_t		*xi;
2121 	int			rv;
2122 
2123 	if (bd->d_ops.o_sync_cache == NULL) {
2124 		return (ENOTSUP);
2125 	}
2126 	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
2127 		return (ENOMEM);
2128 	}
2129 	bp->b_resid = 0;
2130 	bp->b_bcount = 0;
2131 
2132 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
2133 	if (xi == NULL) {
2134 		rv = geterror(bp);
2135 		freerbuf(bp);
2136 		return (rv);
2137 	}
2138 
2139 	/* Make an asynchronous flush, but only if there is a callback */
2140 	if (dkc != NULL && dkc->dkc_callback != NULL) {
2141 		/* Make a private copy of the callback structure */
2142 		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
2143 		*dc = *dkc;
2144 		bp->b_private = dc;
2145 		bp->b_iodone = bd_flush_write_cache_done;
2146 
2147 		bd_submit(bd, xi);
2148 		return (0);
2149 	}
2150 
2151 	/* In case there is no callback, perform a synchronous flush */
2152 	bd_submit(bd, xi);
2153 	(void) biowait(bp);
2154 	rv = geterror(bp);
2155 	freerbuf(bp);
2156 
2157 	return (rv);
2158 }
2159 
/*
 * Completion routine for an asynchronous free-space request; all that
 * is left to do is release the buf.
 */
static int
bd_free_space_done(struct buf *bp)
{
	freerbuf(bp);

	return (0);
}
2166 
2167 static int
2168 bd_free_space_cb(dkioc_free_list_t *dfl, void *arg, int kmflag)
2169 {
2170 	bd_t		*bd = arg;
2171 	buf_t		*bp = NULL;
2172 	bd_xfer_impl_t	*xi = NULL;
2173 	boolean_t	sync = DFL_ISSYNC(dfl) ?  B_TRUE : B_FALSE;
2174 	int		rv = 0;
2175 
2176 	bp = getrbuf(KM_SLEEP);
2177 	bp->b_resid = 0;
2178 	bp->b_bcount = 0;
2179 	bp->b_lblkno = 0;
2180 
2181 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_free_space, kmflag);
2182 	xi->i_dfl = dfl;
2183 
2184 	if (!sync) {
2185 		bp->b_iodone = bd_free_space_done;
2186 		bd_submit(bd, xi);
2187 		return (0);
2188 	}
2189 
2190 	xi->i_flags |= BD_XFER_POLL;
2191 	bd_submit(bd, xi);
2192 
2193 	(void) biowait(bp);
2194 	rv = geterror(bp);
2195 	freerbuf(bp);
2196 
2197 	return (rv);
2198 }
2199 
2200 static int
2201 bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl)
2202 {
2203 	diskaddr_t p_len, p_offset;
2204 	uint64_t offset_bytes, len_bytes;
2205 	minor_t part = BDPART(dev);
2206 	const uint_t bshift = bd->d_blkshift;
2207 	dkioc_free_info_t dfi = {
2208 		.dfi_bshift = bshift,
2209 		.dfi_align = bd->d_free_align << bshift,
2210 		.dfi_max_bytes = bd->d_max_free_blks << bshift,
2211 		.dfi_max_ext = bd->d_max_free_seg,
2212 		.dfi_max_ext_bytes = bd->d_max_free_seg_blks << bshift,
2213 	};
2214 
2215 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_len, &p_offset, NULL,
2216 	    NULL, 0) != 0) {
2217 		dfl_free(dfl);
2218 		return (ENXIO);
2219 	}
2220 
2221 	/*
2222 	 * bd_ioctl created our own copy of dfl, so we can modify as
2223 	 * necessary
2224 	 */
2225 	offset_bytes = (uint64_t)p_offset << bshift;
2226 	len_bytes = (uint64_t)p_len << bshift;
2227 
2228 	dfl->dfl_offset += offset_bytes;
2229 	if (dfl->dfl_offset < offset_bytes) {
2230 		dfl_free(dfl);
2231 		return (EOVERFLOW);
2232 	}
2233 
2234 	return (dfl_iter(dfl, &dfi, offset_bytes + len_bytes, bd_free_space_cb,
2235 	    bd, KM_SLEEP));
2236 }
2237 
2238 /*
2239  * Nexus support.
2240  */
2241 int
2242 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
2243     void *arg, void *result)
2244 {
2245 	bd_handle_t	hdl;
2246 
2247 	switch (ctlop) {
2248 	case DDI_CTLOPS_REPORTDEV:
2249 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
2250 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
2251 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
2252 		return (DDI_SUCCESS);
2253 
2254 	case DDI_CTLOPS_INITCHILD:
2255 		hdl = ddi_get_parent_data((dev_info_t *)arg);
2256 		if (hdl == NULL) {
2257 			return (DDI_NOT_WELL_FORMED);
2258 		}
2259 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
2260 		return (DDI_SUCCESS);
2261 
2262 	case DDI_CTLOPS_UNINITCHILD:
2263 		ddi_set_name_addr((dev_info_t *)arg, NULL);
2264 		ndi_prop_remove_all((dev_info_t *)arg);
2265 		return (DDI_SUCCESS);
2266 
2267 	default:
2268 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
2269 	}
2270 }
2271 
2272 /*
2273  * Functions for device drivers.
2274  */
2275 bd_handle_t
2276 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
2277 {
2278 	bd_handle_t	hdl;
2279 
2280 	switch (ops->o_version) {
2281 	case BD_OPS_VERSION_0:
2282 	case BD_OPS_VERSION_1:
2283 	case BD_OPS_VERSION_2:
2284 		break;
2285 
2286 	default:
2287 		/* Unsupported version */
2288 		return (NULL);
2289 	}
2290 
2291 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
2292 	if (hdl == NULL) {
2293 		return (NULL);
2294 	}
2295 
2296 	switch (ops->o_version) {
2297 	case BD_OPS_VERSION_2:
2298 		hdl->h_ops.o_free_space = ops->o_free_space;
2299 		/*FALLTHRU*/
2300 	case BD_OPS_VERSION_1:
2301 	case BD_OPS_VERSION_0:
2302 		hdl->h_ops.o_drive_info = ops->o_drive_info;
2303 		hdl->h_ops.o_media_info = ops->o_media_info;
2304 		hdl->h_ops.o_devid_init = ops->o_devid_init;
2305 		hdl->h_ops.o_sync_cache = ops->o_sync_cache;
2306 		hdl->h_ops.o_read = ops->o_read;
2307 		hdl->h_ops.o_write = ops->o_write;
2308 		break;
2309 	}
2310 
2311 	hdl->h_dma = dma;
2312 	hdl->h_private = private;
2313 
2314 	return (hdl);
2315 }
2316 
2317 void
2318 bd_free_handle(bd_handle_t hdl)
2319 {
2320 	kmem_free(hdl, sizeof (*hdl));
2321 }
2322 
2323 int
2324 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
2325 {
2326 	dev_info_t	*child;
2327 	bd_drive_t	drive = { 0 };
2328 
2329 	/*
2330 	 * It's not an error if bd_attach_handle() is called on a handle that
2331 	 * already is attached. We just ignore the request to attach and return.
2332 	 * This way drivers using blkdev don't have to keep track about blkdev
2333 	 * state, they can just call this function to make sure it attached.
2334 	 */
2335 	if (hdl->h_child != NULL) {
2336 		return (DDI_SUCCESS);
2337 	}
2338 
2339 	/* if drivers don't override this, make it assume none */
2340 	drive.d_lun = -1;
2341 	hdl->h_ops.o_drive_info(hdl->h_private, &drive);
2342 
2343 	hdl->h_parent = dip;
2344 	hdl->h_name = "blkdev";
2345 
2346 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
2347 	if (*(uint64_t *)drive.d_eui64 != 0) {
2348 		if (drive.d_lun >= 0) {
2349 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2350 			    "w%02X%02X%02X%02X%02X%02X%02X%02X,%X",
2351 			    drive.d_eui64[0], drive.d_eui64[1],
2352 			    drive.d_eui64[2], drive.d_eui64[3],
2353 			    drive.d_eui64[4], drive.d_eui64[5],
2354 			    drive.d_eui64[6], drive.d_eui64[7], drive.d_lun);
2355 		} else {
2356 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2357 			    "w%02X%02X%02X%02X%02X%02X%02X%02X",
2358 			    drive.d_eui64[0], drive.d_eui64[1],
2359 			    drive.d_eui64[2], drive.d_eui64[3],
2360 			    drive.d_eui64[4], drive.d_eui64[5],
2361 			    drive.d_eui64[6], drive.d_eui64[7]);
2362 		}
2363 	} else {
2364 		if (drive.d_lun >= 0) {
2365 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2366 			    "%X,%X", drive.d_target, drive.d_lun);
2367 		} else {
2368 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2369 			    "%X", drive.d_target);
2370 		}
2371 	}
2372 
2373 	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
2374 	    &child) != NDI_SUCCESS) {
2375 		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
2376 		    ddi_driver_name(dip), ddi_get_instance(dip),
2377 		    "blkdev", hdl->h_addr);
2378 		return (DDI_FAILURE);
2379 	}
2380 
2381 	ddi_set_parent_data(child, hdl);
2382 	hdl->h_child = child;
2383 
2384 	if (ndi_devi_online(child, 0) != NDI_SUCCESS) {
2385 		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
2386 		    ddi_driver_name(dip), ddi_get_instance(dip),
2387 		    hdl->h_name, hdl->h_addr);
2388 		(void) ndi_devi_free(child);
2389 		hdl->h_child = NULL;
2390 		return (DDI_FAILURE);
2391 	}
2392 
2393 	return (DDI_SUCCESS);
2394 }
2395 
2396 int
2397 bd_detach_handle(bd_handle_t hdl)
2398 {
2399 	int	circ;
2400 	int	rv;
2401 	char	*devnm;
2402 
2403 	/*
2404 	 * It's not an error if bd_detach_handle() is called on a handle that
2405 	 * already is detached. We just ignore the request to detach and return.
2406 	 * This way drivers using blkdev don't have to keep track about blkdev
2407 	 * state, they can just call this function to make sure it detached.
2408 	 */
2409 	if (hdl->h_child == NULL) {
2410 		return (DDI_SUCCESS);
2411 	}
2412 	ndi_devi_enter(hdl->h_parent, &circ);
2413 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
2414 		rv = ddi_remove_child(hdl->h_child, 0);
2415 	} else {
2416 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
2417 		(void) ddi_deviname(hdl->h_child, devnm);
2418 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
2419 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
2420 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
2421 		kmem_free(devnm, MAXNAMELEN + 1);
2422 	}
2423 	if (rv == 0) {
2424 		hdl->h_child = NULL;
2425 	}
2426 
2427 	ndi_devi_exit(hdl->h_parent, circ);
2428 	return (rv == NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
2429 }
2430 
2431 void
2432 bd_xfer_done(bd_xfer_t *xfer, int err)
2433 {
2434 	bd_xfer_impl_t	*xi = (void *)xfer;
2435 	buf_t		*bp = xi->i_bp;
2436 	int		rv = DDI_SUCCESS;
2437 	bd_t		*bd = xi->i_bd;
2438 	size_t		len;
2439 
2440 	if (err != 0) {
2441 		bd_runq_exit(xi, err);
2442 		atomic_inc_32(&bd->d_kerr->bd_harderrs.value.ui32);
2443 
2444 		bp->b_resid += xi->i_resid;
2445 		bd_xfer_free(xi);
2446 		bioerror(bp, err);
2447 		biodone(bp);
2448 		return;
2449 	}
2450 
2451 	xi->i_cur_win++;
2452 	xi->i_resid -= xi->i_len;
2453 
2454 	if (xi->i_resid == 0) {
2455 		/* Job completed succcessfully! */
2456 		bd_runq_exit(xi, 0);
2457 
2458 		bd_xfer_free(xi);
2459 		biodone(bp);
2460 		return;
2461 	}
2462 
2463 	xi->i_blkno += xi->i_nblks;
2464 
2465 	if (bd->d_use_dma) {
2466 		/* More transfer still pending... advance to next DMA window. */
2467 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
2468 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
2469 	} else {
2470 		/* Advance memory window. */
2471 		xi->i_kaddr += xi->i_len;
2472 		xi->i_offset += xi->i_len;
2473 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
2474 	}
2475 
2476 
2477 	if ((rv != DDI_SUCCESS) ||
2478 	    (P2PHASE(len, (1U << xi->i_blkshift)) != 0)) {
2479 		bd_runq_exit(xi, EFAULT);
2480 
2481 		bp->b_resid += xi->i_resid;
2482 		bd_xfer_free(xi);
2483 		bioerror(bp, EFAULT);
2484 		biodone(bp);
2485 		return;
2486 	}
2487 	xi->i_len = len;
2488 	xi->i_nblks = len >> xi->i_blkshift;
2489 
2490 	/* Submit next window to hardware. */
2491 	rv = xi->i_func(bd->d_private, &xi->i_public);
2492 	if (rv != 0) {
2493 		bd_runq_exit(xi, rv);
2494 
2495 		atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
2496 
2497 		bp->b_resid += xi->i_resid;
2498 		bd_xfer_free(xi);
2499 		bioerror(bp, rv);
2500 		biodone(bp);
2501 	}
2502 }
2503 
2504 void
2505 bd_error(bd_xfer_t *xfer, int error)
2506 {
2507 	bd_xfer_impl_t	*xi = (void *)xfer;
2508 	bd_t		*bd = xi->i_bd;
2509 
2510 	switch (error) {
2511 	case BD_ERR_MEDIA:
2512 		atomic_inc_32(&bd->d_kerr->bd_rq_media_err.value.ui32);
2513 		break;
2514 	case BD_ERR_NTRDY:
2515 		atomic_inc_32(&bd->d_kerr->bd_rq_ntrdy_err.value.ui32);
2516 		break;
2517 	case BD_ERR_NODEV:
2518 		atomic_inc_32(&bd->d_kerr->bd_rq_nodev_err.value.ui32);
2519 		break;
2520 	case BD_ERR_RECOV:
2521 		atomic_inc_32(&bd->d_kerr->bd_rq_recov_err.value.ui32);
2522 		break;
2523 	case BD_ERR_ILLRQ:
2524 		atomic_inc_32(&bd->d_kerr->bd_rq_illrq_err.value.ui32);
2525 		break;
2526 	case BD_ERR_PFA:
2527 		atomic_inc_32(&bd->d_kerr->bd_rq_pfa_err.value.ui32);
2528 		break;
2529 	default:
2530 		cmn_err(CE_PANIC, "bd_error: unknown error type %d", error);
2531 		break;
2532 	}
2533 }
2534 
2535 void
2536 bd_state_change(bd_handle_t hdl)
2537 {
2538 	bd_t		*bd;
2539 
2540 	if ((bd = hdl->h_bd) != NULL) {
2541 		bd_update_state(bd);
2542 	}
2543 }
2544 
/*
 * Install blkdev's bus operations vector into a driver's dev_ops so that the
 * driver can act as the parent of blkdev child nodes.  The vector is a single
 * function-local static table; every caller's devo_bus_ops is pointed at the
 * same table.
 */
void
bd_mod_init(struct dev_ops *devops)
{
	/*
	 * The initializer is positional: entries must stay in the order of
	 * the members of struct bus_ops.  Only bus_ctl is specific to
	 * blkdev; the populated DMA, map and property slots use ddi_*,
	 * i_ddi_* or nullbusmap routines, and the rest are left NULL.
	 */
	static struct bus_ops bd_bus_ops = {
		BUSO_REV,		/* busops_rev */
		nullbusmap,		/* bus_map */
		NULL,			/* bus_get_intrspec (OBSOLETE) */
		NULL,			/* bus_add_intrspec (OBSOLETE) */
		NULL,			/* bus_remove_intrspec (OBSOLETE) */
		i_ddi_map_fault,	/* bus_map_fault */
		NULL,			/* bus_dma_map (OBSOLETE) */
		ddi_dma_allochdl,	/* bus_dma_allochdl */
		ddi_dma_freehdl,	/* bus_dma_freehdl */
		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
		ddi_dma_flush,		/* bus_dma_flush */
		ddi_dma_win,		/* bus_dma_win */
		ddi_dma_mctl,		/* bus_dma_ctl */
		bd_bus_ctl,		/* bus_ctl */
		ddi_bus_prop_op,	/* bus_prop_op */
		NULL,			/* bus_get_eventcookie */
		NULL,			/* bus_add_eventcall */
		NULL,			/* bus_remove_eventcall */
		NULL,			/* bus_post_event */
		NULL,			/* bus_intr_ctl (OBSOLETE) */
		NULL,			/* bus_config */
		NULL,			/* bus_unconfig */
		NULL,			/* bus_fm_init */
		NULL,			/* bus_fm_fini */
		NULL,			/* bus_fm_access_enter */
		NULL,			/* bus_fm_access_exit */
		NULL,			/* bus_power */
		NULL,			/* bus_intr_op */
	};

	devops->devo_bus_ops = &bd_bus_ops;

	/*
	 * NB: The device driver is free to supply its own
	 * character entry device support.
	 */
}
2587 
/*
 * Counterpart of bd_mod_init(): clear the bus operations pointer in the
 * driver's dev_ops.
 */
void
bd_mod_fini(struct dev_ops *devops)
{
	devops->devo_bus_ops = NULL;
}
2593