xref: /illumos-gate/usr/src/uts/common/io/blkdev/blkdev.c (revision 19687f06a4a23ca82910eccd0f898c90ecd3cf62)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  * Copyright 2017 The MathWorks, Inc.  All rights reserved.
27  * Copyright 2019 Western Digital Corporation.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/ksynch.h>
32 #include <sys/kmem.h>
33 #include <sys/file.h>
34 #include <sys/errno.h>
35 #include <sys/open.h>
36 #include <sys/buf.h>
37 #include <sys/uio.h>
38 #include <sys/aio_req.h>
39 #include <sys/cred.h>
40 #include <sys/modctl.h>
41 #include <sys/cmlb.h>
42 #include <sys/conf.h>
43 #include <sys/devops.h>
44 #include <sys/list.h>
45 #include <sys/sysmacros.h>
46 #include <sys/dkio.h>
47 #include <sys/vtoc.h>
48 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
49 #include <sys/kstat.h>
50 #include <sys/fs/dv_node.h>
51 #include <sys/ddi.h>
52 #include <sys/sunddi.h>
53 #include <sys/note.h>
54 #include <sys/blkdev.h>
55 #include <sys/scsi/impl/inquiry.h>
56 
57 /*
58  * blkdev is a driver which provides a lot of the common functionality
59  * a block device driver may need and helps by removing code which
60  * is frequently duplicated in block device drivers.
61  *
62  * Within this driver all the struct cb_ops functions required for a
63  * block device driver are written with appropriate call back functions
64  * to be provided by the parent driver.
65  *
66  * To use blkdev, a driver needs to:
67  *	1. Create a bd_ops_t structure which has the call back operations
68  *	   blkdev will use.
69  *	2. Create a handle by calling bd_alloc_handle(). One of the
70  *	   arguments to this function is the bd_ops_t.
71  *	3. Call bd_attach_handle(). This will instantiate a blkdev device
72  *	   as a child device node of the calling driver.
73  *
74  * A parent driver is not restricted to just allocating and attaching a
75  * single instance, it may attach as many as it wishes. For each handle
76  * attached, appropriate entries in /dev/[r]dsk are created.
77  *
78  * The bd_ops_t routines that a parent of blkdev needs to provide are:
79  *
80  * o_drive_info: Provide information to blkdev such as how many I/O queues
81  *		 to create and the size of those queues. Also some device
82  *		 specifics such as EUI, vendor, product, model, serial
83  *		 number ....
84  *
85  * o_media_info: Provide information about the media. Eg size and block size.
86  *
87  * o_devid_init: Creates and initializes the device id. Typically calls
88  *		 ddi_devid_init().
89  *
90  * o_sync_cache: Issues a device appropriate command to flush any write
91  *		 caches.
92  *
93  * o_read:	 Read data as described by bd_xfer_t argument.
94  *
95  * o_write:	 Write data as described by bd_xfer_t argument.
96  *
97  *
98  * Queues
99  * ------
100  * Part of the drive_info data is a queue count. blkdev will create
101  * "queue count" number of waitq/runq pairs. Each waitq/runq pair
102  * operates independently. As an I/O is scheduled up to the parent
103  * driver via o_read or o_write its queue number is given. If the
104  * parent driver supports multiple hardware queues it can then select
105  * where to submit the I/O request.
106  *
107  * Currently blkdev uses a simplistic round-robin queue selection method.
108  * It has the advantage that it is lockless. In the future it will be
109  * worthwhile reviewing this strategy for something which prioritizes queues
110  * depending on how busy they are.
111  *
112  * Each waitq/runq pair is protected by its mutex (q_iomutex). Incoming
113  * I/O requests are initially added to the waitq. They are taken off the
114  * waitq, added to the runq and submitted, providing the runq is less
115  * than the qsize as specified in the drive_info. As an I/O request
116  * completes, the parent driver is required to call bd_xfer_done(), which
117  * will remove the I/O request from the runq and pass I/O completion
118  * status up the stack.
119  *
120  * Locks
121  * -----
122  * There are 4 instance global locks: d_ocmutex, d_ksmutex, d_errmutex and
123  * d_statemutex, as well as a q_iomutex per waitq/runq pair.
124  *
125  * Lock Hierarchy
126  * --------------
127  * The only two locks which may be held simultaneously are q_iomutex and
128  * d_ksmutex. In all cases q_iomutex must be acquired before d_ksmutex.
129  */
130 
/*
 * Minor number layout: every instance owns BD_MAXPART consecutive minor
 * numbers, so the instance is the quotient and the partition index is the
 * remainder of the minor number divided by BD_MAXPART.
 */
#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)

/* Short names for the driver-private structures defined below. */
typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
typedef struct bd_queue bd_queue_t;
138 
139 struct bd {
140 	void		*d_private;
141 	dev_info_t	*d_dip;
142 	kmutex_t	d_ocmutex;
143 	kmutex_t	d_ksmutex;
144 	kmutex_t	d_errmutex;
145 	kmutex_t	d_statemutex;
146 	kcondvar_t	d_statecv;
147 	enum dkio_state	d_state;
148 	cmlb_handle_t	d_cmlbh;
149 	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
150 	uint64_t	d_open_excl;	/* bit mask indexed by partition */
151 	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */
152 	uint64_t	d_io_counter;
153 
154 	uint32_t	d_qcount;
155 	uint32_t	d_qactive;
156 	uint32_t	d_maxxfer;
157 	uint32_t	d_blkshift;
158 	uint32_t	d_pblkshift;
159 	uint64_t	d_numblks;
160 	ddi_devid_t	d_devid;
161 
162 	kmem_cache_t	*d_cache;
163 	bd_queue_t	*d_queues;
164 	kstat_t		*d_ksp;
165 	kstat_io_t	*d_kiop;
166 	kstat_t		*d_errstats;
167 	struct bd_errstats *d_kerr;
168 
169 	boolean_t	d_rdonly;
170 	boolean_t	d_ssd;
171 	boolean_t	d_removable;
172 	boolean_t	d_hotpluggable;
173 	boolean_t	d_use_dma;
174 
175 	ddi_dma_attr_t	d_dma;
176 	bd_ops_t	d_ops;
177 	bd_handle_t	d_handle;
178 };
179 
180 struct bd_handle {
181 	bd_ops_t	h_ops;
182 	ddi_dma_attr_t	*h_dma;
183 	dev_info_t	*h_parent;
184 	dev_info_t	*h_child;
185 	void		*h_private;
186 	bd_t		*h_bd;
187 	char		*h_name;
188 	char		h_addr[30];	/* enough for w%0.16x,%X */
189 };
190 
191 struct bd_xfer_impl {
192 	bd_xfer_t	i_public;
193 	list_node_t	i_linkage;
194 	bd_t		*i_bd;
195 	buf_t		*i_bp;
196 	bd_queue_t	*i_bq;
197 	uint_t		i_num_win;
198 	uint_t		i_cur_win;
199 	off_t		i_offset;
200 	int		(*i_func)(void *, bd_xfer_t *);
201 	uint32_t	i_blkshift;
202 	size_t		i_len;
203 	size_t		i_resid;
204 };
205 
206 struct bd_queue {
207 	kmutex_t	q_iomutex;
208 	uint32_t	q_qsize;
209 	uint32_t	q_qactive;
210 	list_t		q_runq;
211 	list_t		q_waitq;
212 };
213 
/*
 * Shorthand so that the fields of the embedded public bd_xfer_t can be
 * reached directly through a bd_xfer_impl_t pointer (xi->i_blkno etc.).
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
#define	i_qnum		i_public.x_qnum
222 
223 
224 /*
225  * Private prototypes.
226  */
227 
228 static void bd_prop_update_inqstring(dev_info_t *, char *, char *, size_t);
229 static void bd_create_inquiry_props(dev_info_t *, bd_drive_t *);
230 static void bd_create_errstats(bd_t *, int, bd_drive_t *);
231 static void bd_errstats_setstr(kstat_named_t *, char *, size_t, char *);
232 static void bd_init_errstats(bd_t *, bd_drive_t *);
233 
234 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
235 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
236 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
237 
238 static int bd_open(dev_t *, int, int, cred_t *);
239 static int bd_close(dev_t, int, int, cred_t *);
240 static int bd_strategy(struct buf *);
241 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
242 static int bd_dump(dev_t, caddr_t, daddr_t, int);
243 static int bd_read(dev_t, struct uio *, cred_t *);
244 static int bd_write(dev_t, struct uio *, cred_t *);
245 static int bd_aread(dev_t, struct aio_req *, cred_t *);
246 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
247 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
248     caddr_t, int *);
249 
250 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
251     void *);
252 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
253 static int bd_xfer_ctor(void *, void *, int);
254 static void bd_xfer_dtor(void *, void *);
255 static void bd_sched(bd_t *, bd_queue_t *);
256 static void bd_submit(bd_t *, bd_xfer_impl_t *);
257 static void bd_runq_exit(bd_xfer_impl_t *, int);
258 static void bd_update_state(bd_t *);
259 static int bd_check_state(bd_t *, enum dkio_state *);
260 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
261 static int bd_check_uio(dev_t, struct uio *);
262 
263 struct cmlb_tg_ops bd_tg_ops = {
264 	TG_DK_OPS_VERSION_1,
265 	bd_tg_rdwr,
266 	bd_tg_getinfo,
267 };
268 
269 static struct cb_ops bd_cb_ops = {
270 	bd_open,		/* open */
271 	bd_close,		/* close */
272 	bd_strategy,		/* strategy */
273 	nodev,			/* print */
274 	bd_dump,		/* dump */
275 	bd_read,		/* read */
276 	bd_write,		/* write */
277 	bd_ioctl,		/* ioctl */
278 	nodev,			/* devmap */
279 	nodev,			/* mmap */
280 	nodev,			/* segmap */
281 	nochpoll,		/* poll */
282 	bd_prop_op,		/* cb_prop_op */
283 	0,			/* streamtab  */
284 	D_64BIT | D_MP,		/* Driver comaptibility flag */
285 	CB_REV,			/* cb_rev */
286 	bd_aread,		/* async read */
287 	bd_awrite		/* async write */
288 };
289 
290 struct dev_ops bd_dev_ops = {
291 	DEVO_REV,		/* devo_rev, */
292 	0,			/* refcnt  */
293 	bd_getinfo,		/* getinfo */
294 	nulldev,		/* identify */
295 	nulldev,		/* probe */
296 	bd_attach,		/* attach */
297 	bd_detach,		/* detach */
298 	nodev,			/* reset */
299 	&bd_cb_ops,		/* driver operations */
300 	NULL,			/* bus operations */
301 	NULL,			/* power */
302 	ddi_quiesce_not_needed,	/* quiesce */
303 };
304 
305 static struct modldrv modldrv = {
306 	&mod_driverops,
307 	"Generic Block Device",
308 	&bd_dev_ops,
309 };
310 
311 static struct modlinkage modlinkage = {
312 	MODREV_1, { &modldrv, NULL }
313 };
314 
315 static void *bd_state;
316 static krwlock_t bd_lock;
317 
318 int
319 _init(void)
320 {
321 	int	rv;
322 
323 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
324 	if (rv != DDI_SUCCESS) {
325 		return (rv);
326 	}
327 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
328 	rv = mod_install(&modlinkage);
329 	if (rv != DDI_SUCCESS) {
330 		rw_destroy(&bd_lock);
331 		ddi_soft_state_fini(&bd_state);
332 	}
333 	return (rv);
334 }
335 
336 int
337 _fini(void)
338 {
339 	int	rv;
340 
341 	rv = mod_remove(&modlinkage);
342 	if (rv == DDI_SUCCESS) {
343 		rw_destroy(&bd_lock);
344 		ddi_soft_state_fini(&bd_state);
345 	}
346 	return (rv);
347 }
348 
349 int
350 _info(struct modinfo *modinfop)
351 {
352 	return (mod_info(&modlinkage, modinfop));
353 }
354 
355 static int
356 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
357 {
358 	bd_t	*bd;
359 	minor_t	inst;
360 
361 	_NOTE(ARGUNUSED(dip));
362 
363 	inst = BDINST((dev_t)arg);
364 
365 	switch (cmd) {
366 	case DDI_INFO_DEVT2DEVINFO:
367 		bd = ddi_get_soft_state(bd_state, inst);
368 		if (bd == NULL) {
369 			return (DDI_FAILURE);
370 		}
371 		*resultp = (void *)bd->d_dip;
372 		break;
373 
374 	case DDI_INFO_DEVT2INSTANCE:
375 		*resultp = (void *)(intptr_t)inst;
376 		break;
377 
378 	default:
379 		return (DDI_FAILURE);
380 	}
381 	return (DDI_SUCCESS);
382 }
383 
384 static void
385 bd_prop_update_inqstring(dev_info_t *dip, char *name, char *data, size_t len)
386 {
387 	int	ilen;
388 	char	*data_string;
389 
390 	ilen = scsi_ascii_inquiry_len(data, len);
391 	ASSERT3U(ilen, <=, len);
392 	if (ilen <= 0)
393 		return;
394 	/* ensure null termination */
395 	data_string = kmem_zalloc(ilen + 1, KM_SLEEP);
396 	bcopy(data, data_string, ilen);
397 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, name, data_string);
398 	kmem_free(data_string, ilen + 1);
399 }
400 
401 static void
402 bd_create_inquiry_props(dev_info_t *dip, bd_drive_t *drive)
403 {
404 	if (drive->d_vendor_len > 0)
405 		bd_prop_update_inqstring(dip, INQUIRY_VENDOR_ID,
406 		    drive->d_vendor, drive->d_vendor_len);
407 
408 	if (drive->d_product_len > 0)
409 		bd_prop_update_inqstring(dip, INQUIRY_PRODUCT_ID,
410 		    drive->d_product, drive->d_product_len);
411 
412 	if (drive->d_serial_len > 0)
413 		bd_prop_update_inqstring(dip, INQUIRY_SERIAL_NO,
414 		    drive->d_serial, drive->d_serial_len);
415 
416 	if (drive->d_revision_len > 0)
417 		bd_prop_update_inqstring(dip, INQUIRY_REVISION_ID,
418 		    drive->d_revision, drive->d_revision_len);
419 }
420 
421 static void
422 bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
423 {
424 	char	ks_module[KSTAT_STRLEN];
425 	char	ks_name[KSTAT_STRLEN];
426 	int	ndata = sizeof (struct bd_errstats) / sizeof (kstat_named_t);
427 
428 	if (bd->d_errstats != NULL)
429 		return;
430 
431 	(void) snprintf(ks_module, sizeof (ks_module), "%serr",
432 	    ddi_driver_name(bd->d_dip));
433 	(void) snprintf(ks_name, sizeof (ks_name), "%s%d,err",
434 	    ddi_driver_name(bd->d_dip), inst);
435 
436 	bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error",
437 	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);
438 
439 	mutex_init(&bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
440 	if (bd->d_errstats == NULL) {
441 		/*
442 		 * Even if we cannot create the kstat, we create a
443 		 * scratch kstat.  The reason for this is to ensure
444 		 * that we can update the kstat all of the time,
445 		 * without adding an extra branch instruction.
446 		 */
447 		bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats),
448 		    KM_SLEEP);
449 	} else {
450 		bd->d_errstats->ks_lock = &bd->d_errmutex;
451 		bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data;
452 	}
453 
454 	kstat_named_init(&bd->d_kerr->bd_softerrs,	"Soft Errors",
455 	    KSTAT_DATA_UINT32);
456 	kstat_named_init(&bd->d_kerr->bd_harderrs,	"Hard Errors",
457 	    KSTAT_DATA_UINT32);
458 	kstat_named_init(&bd->d_kerr->bd_transerrs,	"Transport Errors",
459 	    KSTAT_DATA_UINT32);
460 
461 	if (drive->d_model_len > 0) {
462 		kstat_named_init(&bd->d_kerr->bd_model,	"Model",
463 		    KSTAT_DATA_STRING);
464 	} else {
465 		kstat_named_init(&bd->d_kerr->bd_vid,	"Vendor",
466 		    KSTAT_DATA_STRING);
467 		kstat_named_init(&bd->d_kerr->bd_pid,	"Product",
468 		    KSTAT_DATA_STRING);
469 	}
470 
471 	kstat_named_init(&bd->d_kerr->bd_revision,	"Revision",
472 	    KSTAT_DATA_STRING);
473 	kstat_named_init(&bd->d_kerr->bd_serial,	"Serial No",
474 	    KSTAT_DATA_STRING);
475 	kstat_named_init(&bd->d_kerr->bd_capacity,	"Size",
476 	    KSTAT_DATA_ULONGLONG);
477 	kstat_named_init(&bd->d_kerr->bd_rq_media_err,	"Media Error",
478 	    KSTAT_DATA_UINT32);
479 	kstat_named_init(&bd->d_kerr->bd_rq_ntrdy_err,	"Device Not Ready",
480 	    KSTAT_DATA_UINT32);
481 	kstat_named_init(&bd->d_kerr->bd_rq_nodev_err,	"No Device",
482 	    KSTAT_DATA_UINT32);
483 	kstat_named_init(&bd->d_kerr->bd_rq_recov_err,	"Recoverable",
484 	    KSTAT_DATA_UINT32);
485 	kstat_named_init(&bd->d_kerr->bd_rq_illrq_err,	"Illegal Request",
486 	    KSTAT_DATA_UINT32);
487 	kstat_named_init(&bd->d_kerr->bd_rq_pfa_err,
488 	    "Predictive Failure Analysis", KSTAT_DATA_UINT32);
489 
490 	bd->d_errstats->ks_private = bd;
491 
492 	kstat_install(bd->d_errstats);
493 }
494 
495 static void
496 bd_errstats_setstr(kstat_named_t *k, char *str, size_t len, char *alt)
497 {
498 	char	*tmp;
499 	size_t	km_len;
500 
501 	if (KSTAT_NAMED_STR_PTR(k) == NULL) {
502 		if (len > 0)
503 			km_len = strnlen(str, len);
504 		else if (alt != NULL)
505 			km_len = strlen(alt);
506 		else
507 			return;
508 
509 		tmp = kmem_alloc(km_len + 1, KM_SLEEP);
510 		bcopy(len > 0 ? str : alt, tmp, km_len);
511 		tmp[km_len] = '\0';
512 
513 		kstat_named_setstr(k, tmp);
514 	}
515 }
516 
517 static void
518 bd_errstats_clrstr(kstat_named_t *k)
519 {
520 	if (KSTAT_NAMED_STR_PTR(k) == NULL)
521 		return;
522 
523 	kmem_free(KSTAT_NAMED_STR_PTR(k), KSTAT_NAMED_STR_BUFLEN(k));
524 	kstat_named_setstr(k, NULL);
525 }
526 
527 static void
528 bd_init_errstats(bd_t *bd, bd_drive_t *drive)
529 {
530 	struct bd_errstats	*est = bd->d_kerr;
531 
532 	mutex_enter(&bd->d_errmutex);
533 
534 	if (drive->d_model_len > 0 &&
535 	    KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) {
536 		bd_errstats_setstr(&est->bd_model, drive->d_model,
537 		    drive->d_model_len, NULL);
538 	} else {
539 		bd_errstats_setstr(&est->bd_vid, drive->d_vendor,
540 		    drive->d_vendor_len, "Unknown ");
541 		bd_errstats_setstr(&est->bd_pid, drive->d_product,
542 		    drive->d_product_len, "Unknown         ");
543 	}
544 
545 	bd_errstats_setstr(&est->bd_revision, drive->d_revision,
546 	    drive->d_revision_len, "0001");
547 	bd_errstats_setstr(&est->bd_serial, drive->d_serial,
548 	    drive->d_serial_len, "0               ");
549 
550 	mutex_exit(&bd->d_errmutex);
551 }
552 
553 static void
554 bd_fini_errstats(bd_t *bd)
555 {
556 	struct bd_errstats	*est = bd->d_kerr;
557 
558 	mutex_enter(&bd->d_errmutex);
559 
560 	bd_errstats_clrstr(&est->bd_model);
561 	bd_errstats_clrstr(&est->bd_vid);
562 	bd_errstats_clrstr(&est->bd_pid);
563 	bd_errstats_clrstr(&est->bd_revision);
564 	bd_errstats_clrstr(&est->bd_serial);
565 
566 	mutex_exit(&bd->d_errmutex);
567 }
568 
569 static void
570 bd_queues_free(bd_t *bd)
571 {
572 	uint32_t i;
573 
574 	for (i = 0; i < bd->d_qcount; i++) {
575 		bd_queue_t *bq = &bd->d_queues[i];
576 
577 		mutex_destroy(&bq->q_iomutex);
578 		list_destroy(&bq->q_waitq);
579 		list_destroy(&bq->q_runq);
580 	}
581 
582 	kmem_free(bd->d_queues, sizeof (*bd->d_queues) * bd->d_qcount);
583 }
584 
585 static int
586 bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
587 {
588 	int		inst;
589 	bd_handle_t	hdl;
590 	bd_t		*bd;
591 	bd_drive_t	drive;
592 	uint32_t	i;
593 	int		rv;
594 	char		name[16];
595 	char		kcache[32];
596 
597 	switch (cmd) {
598 	case DDI_ATTACH:
599 		break;
600 	case DDI_RESUME:
601 		/* We don't do anything native for suspend/resume */
602 		return (DDI_SUCCESS);
603 	default:
604 		return (DDI_FAILURE);
605 	}
606 
607 	inst = ddi_get_instance(dip);
608 	hdl = ddi_get_parent_data(dip);
609 
610 	(void) snprintf(name, sizeof (name), "%s%d",
611 	    ddi_driver_name(dip), ddi_get_instance(dip));
612 	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
613 
614 	if (hdl == NULL) {
615 		cmn_err(CE_WARN, "%s: missing parent data!", name);
616 		return (DDI_FAILURE);
617 	}
618 
619 	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
620 		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
621 		return (DDI_FAILURE);
622 	}
623 	bd = ddi_get_soft_state(bd_state, inst);
624 
625 	if (hdl->h_dma) {
626 		bd->d_dma = *(hdl->h_dma);
627 		bd->d_dma.dma_attr_granular =
628 		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
629 		bd->d_use_dma = B_TRUE;
630 
631 		if (bd->d_maxxfer &&
632 		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
633 			cmn_err(CE_WARN,
634 			    "%s: inconsistent maximum transfer size!",
635 			    name);
636 			/* We force it */
637 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
638 		} else {
639 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
640 		}
641 	} else {
642 		bd->d_use_dma = B_FALSE;
643 		if (bd->d_maxxfer == 0) {
644 			bd->d_maxxfer = 1024 * 1024;
645 		}
646 	}
647 	bd->d_ops = hdl->h_ops;
648 	bd->d_private = hdl->h_private;
649 	bd->d_blkshift = 9;	/* 512 bytes, to start */
650 
651 	if (bd->d_maxxfer % DEV_BSIZE) {
652 		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
653 		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
654 	}
655 	if (bd->d_maxxfer < DEV_BSIZE) {
656 		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
657 		ddi_soft_state_free(bd_state, inst);
658 		return (DDI_FAILURE);
659 	}
660 
661 	bd->d_dip = dip;
662 	bd->d_handle = hdl;
663 	hdl->h_bd = bd;
664 	ddi_set_driver_private(dip, bd);
665 
666 	mutex_init(&bd->d_ksmutex, NULL, MUTEX_DRIVER, NULL);
667 	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
668 	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
669 	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
670 
671 	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
672 	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
673 
674 	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
675 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
676 	if (bd->d_ksp != NULL) {
677 		bd->d_ksp->ks_lock = &bd->d_ksmutex;
678 		kstat_install(bd->d_ksp);
679 		bd->d_kiop = bd->d_ksp->ks_data;
680 	} else {
681 		/*
682 		 * Even if we cannot create the kstat, we create a
683 		 * scratch kstat.  The reason for this is to ensure
684 		 * that we can update the kstat all of the time,
685 		 * without adding an extra branch instruction.
686 		 */
687 		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
688 	}
689 
690 	cmlb_alloc_handle(&bd->d_cmlbh);
691 
692 	bd->d_state = DKIO_NONE;
693 
694 	bzero(&drive, sizeof (drive));
695 	/*
696 	 * Default to one queue, parent driver can override.
697 	 */
698 	drive.d_qcount = 1;
699 	bd->d_ops.o_drive_info(bd->d_private, &drive);
700 	bd->d_qcount = drive.d_qcount;
701 	bd->d_removable = drive.d_removable;
702 	bd->d_hotpluggable = drive.d_hotpluggable;
703 
704 	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
705 		bd->d_maxxfer = drive.d_maxxfer;
706 
707 	bd_create_inquiry_props(dip, &drive);
708 
709 	bd_create_errstats(bd, inst, &drive);
710 	bd_init_errstats(bd, &drive);
711 	bd_update_state(bd);
712 
713 	bd->d_queues = kmem_alloc(sizeof (*bd->d_queues) * bd->d_qcount,
714 	    KM_SLEEP);
715 	for (i = 0; i < bd->d_qcount; i++) {
716 		bd_queue_t *bq = &bd->d_queues[i];
717 
718 		bq->q_qsize = drive.d_qsize;
719 		bq->q_qactive = 0;
720 		mutex_init(&bq->q_iomutex, NULL, MUTEX_DRIVER, NULL);
721 
722 		list_create(&bq->q_waitq, sizeof (bd_xfer_impl_t),
723 		    offsetof(struct bd_xfer_impl, i_linkage));
724 		list_create(&bq->q_runq, sizeof (bd_xfer_impl_t),
725 		    offsetof(struct bd_xfer_impl, i_linkage));
726 	}
727 
728 	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
729 	    bd->d_removable, bd->d_hotpluggable,
730 	    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
731 	    *(uint64_t *)drive.d_eui64 != 0 ? DDI_NT_BLOCK_BLKDEV :
732 	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
733 	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
734 	if (rv != 0) {
735 		cmlb_free_handle(&bd->d_cmlbh);
736 		kmem_cache_destroy(bd->d_cache);
737 		mutex_destroy(&bd->d_ksmutex);
738 		mutex_destroy(&bd->d_ocmutex);
739 		mutex_destroy(&bd->d_statemutex);
740 		cv_destroy(&bd->d_statecv);
741 		bd_queues_free(bd);
742 		if (bd->d_ksp != NULL) {
743 			kstat_delete(bd->d_ksp);
744 			bd->d_ksp = NULL;
745 		} else {
746 			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
747 		}
748 		ddi_soft_state_free(bd_state, inst);
749 		return (DDI_FAILURE);
750 	}
751 
752 	if (bd->d_ops.o_devid_init != NULL) {
753 		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
754 		if (rv == DDI_SUCCESS) {
755 			if (ddi_devid_register(dip, bd->d_devid) !=
756 			    DDI_SUCCESS) {
757 				cmn_err(CE_WARN,
758 				    "%s: unable to register devid", name);
759 			}
760 		}
761 	}
762 
763 	/*
764 	 * Add a zero-length attribute to tell the world we support
765 	 * kernel ioctls (for layered drivers).  Also set up properties
766 	 * used by HAL to identify removable media.
767 	 */
768 	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
769 	    DDI_KERNEL_IOCTL, NULL, 0);
770 	if (bd->d_removable) {
771 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
772 		    "removable-media", NULL, 0);
773 	}
774 	if (bd->d_hotpluggable) {
775 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
776 		    "hotpluggable", NULL, 0);
777 	}
778 
779 	ddi_report_dev(dip);
780 
781 	return (DDI_SUCCESS);
782 }
783 
784 static int
785 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
786 {
787 	bd_t	*bd;
788 
789 	bd = ddi_get_driver_private(dip);
790 
791 	switch (cmd) {
792 	case DDI_DETACH:
793 		break;
794 	case DDI_SUSPEND:
795 		/* We don't suspend, but our parent does */
796 		return (DDI_SUCCESS);
797 	default:
798 		return (DDI_FAILURE);
799 	}
800 	if (bd->d_ksp != NULL) {
801 		kstat_delete(bd->d_ksp);
802 		bd->d_ksp = NULL;
803 	} else {
804 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
805 	}
806 
807 	if (bd->d_errstats != NULL) {
808 		bd_fini_errstats(bd);
809 		kstat_delete(bd->d_errstats);
810 		bd->d_errstats = NULL;
811 	} else {
812 		kmem_free(bd->d_kerr, sizeof (struct bd_errstats));
813 		mutex_destroy(&bd->d_errmutex);
814 	}
815 
816 	cmlb_detach(bd->d_cmlbh, 0);
817 	cmlb_free_handle(&bd->d_cmlbh);
818 	if (bd->d_devid)
819 		ddi_devid_free(bd->d_devid);
820 	kmem_cache_destroy(bd->d_cache);
821 	mutex_destroy(&bd->d_ksmutex);
822 	mutex_destroy(&bd->d_ocmutex);
823 	mutex_destroy(&bd->d_statemutex);
824 	cv_destroy(&bd->d_statecv);
825 	bd_queues_free(bd);
826 	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
827 	return (DDI_SUCCESS);
828 }
829 
830 static int
831 bd_xfer_ctor(void *buf, void *arg, int kmflag)
832 {
833 	bd_xfer_impl_t	*xi;
834 	bd_t		*bd = arg;
835 	int		(*dcb)(caddr_t);
836 
837 	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
838 		dcb = DDI_DMA_SLEEP;
839 	} else {
840 		dcb = DDI_DMA_DONTWAIT;
841 	}
842 
843 	xi = buf;
844 	bzero(xi, sizeof (*xi));
845 	xi->i_bd = bd;
846 
847 	if (bd->d_use_dma) {
848 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
849 		    &xi->i_dmah) != DDI_SUCCESS) {
850 			return (-1);
851 		}
852 	}
853 
854 	return (0);
855 }
856 
857 static void
858 bd_xfer_dtor(void *buf, void *arg)
859 {
860 	bd_xfer_impl_t	*xi = buf;
861 
862 	_NOTE(ARGUNUSED(arg));
863 
864 	if (xi->i_dmah)
865 		ddi_dma_free_handle(&xi->i_dmah);
866 	xi->i_dmah = NULL;
867 }
868 
869 static bd_xfer_impl_t *
870 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
871     int kmflag)
872 {
873 	bd_xfer_impl_t		*xi;
874 	int			rv = 0;
875 	int			status;
876 	unsigned		dir;
877 	int			(*cb)(caddr_t);
878 	size_t			len;
879 	uint32_t		shift;
880 
881 	if (kmflag == KM_SLEEP) {
882 		cb = DDI_DMA_SLEEP;
883 	} else {
884 		cb = DDI_DMA_DONTWAIT;
885 	}
886 
887 	xi = kmem_cache_alloc(bd->d_cache, kmflag);
888 	if (xi == NULL) {
889 		bioerror(bp, ENOMEM);
890 		return (NULL);
891 	}
892 
893 	ASSERT(bp);
894 
895 	xi->i_bp = bp;
896 	xi->i_func = func;
897 	xi->i_blkno = bp->b_lblkno >> (bd->d_blkshift - DEV_BSHIFT);
898 
899 	if (bp->b_bcount == 0) {
900 		xi->i_len = 0;
901 		xi->i_nblks = 0;
902 		xi->i_kaddr = NULL;
903 		xi->i_resid = 0;
904 		xi->i_num_win = 0;
905 		goto done;
906 	}
907 
908 	if (bp->b_flags & B_READ) {
909 		dir = DDI_DMA_READ;
910 		xi->i_func = bd->d_ops.o_read;
911 	} else {
912 		dir = DDI_DMA_WRITE;
913 		xi->i_func = bd->d_ops.o_write;
914 	}
915 
916 	shift = bd->d_blkshift;
917 	xi->i_blkshift = shift;
918 
919 	if (!bd->d_use_dma) {
920 		bp_mapin(bp);
921 		rv = 0;
922 		xi->i_offset = 0;
923 		xi->i_num_win =
924 		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
925 		xi->i_cur_win = 0;
926 		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
927 		xi->i_nblks = xi->i_len >> shift;
928 		xi->i_kaddr = bp->b_un.b_addr;
929 		xi->i_resid = bp->b_bcount;
930 	} else {
931 
932 		/*
933 		 * We have to use consistent DMA if the address is misaligned.
934 		 */
935 		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
936 		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
937 			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
938 		} else {
939 			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
940 		}
941 
942 		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
943 		    NULL, &xi->i_dmac, &xi->i_ndmac);
944 		switch (status) {
945 		case DDI_DMA_MAPPED:
946 			xi->i_num_win = 1;
947 			xi->i_cur_win = 0;
948 			xi->i_offset = 0;
949 			xi->i_len = bp->b_bcount;
950 			xi->i_nblks = xi->i_len >> shift;
951 			xi->i_resid = bp->b_bcount;
952 			rv = 0;
953 			break;
954 		case DDI_DMA_PARTIAL_MAP:
955 			xi->i_cur_win = 0;
956 
957 			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
958 			    DDI_SUCCESS) ||
959 			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
960 			    &len, &xi->i_dmac, &xi->i_ndmac) !=
961 			    DDI_SUCCESS) ||
962 			    (P2PHASE(len, (1U << shift)) != 0)) {
963 				(void) ddi_dma_unbind_handle(xi->i_dmah);
964 				rv = EFAULT;
965 				goto done;
966 			}
967 			xi->i_len = len;
968 			xi->i_nblks = xi->i_len >> shift;
969 			xi->i_resid = bp->b_bcount;
970 			rv = 0;
971 			break;
972 		case DDI_DMA_NORESOURCES:
973 			rv = EAGAIN;
974 			goto done;
975 		case DDI_DMA_TOOBIG:
976 			rv = EINVAL;
977 			goto done;
978 		case DDI_DMA_NOMAPPING:
979 		case DDI_DMA_INUSE:
980 		default:
981 			rv = EFAULT;
982 			goto done;
983 		}
984 	}
985 
986 done:
987 	if (rv != 0) {
988 		kmem_cache_free(bd->d_cache, xi);
989 		bioerror(bp, rv);
990 		return (NULL);
991 	}
992 
993 	return (xi);
994 }
995 
996 static void
997 bd_xfer_free(bd_xfer_impl_t *xi)
998 {
999 	if (xi->i_dmah) {
1000 		(void) ddi_dma_unbind_handle(xi->i_dmah);
1001 	}
1002 	kmem_cache_free(xi->i_bd->d_cache, xi);
1003 }
1004 
1005 static int
1006 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1007 {
1008 	dev_t		dev = *devp;
1009 	bd_t		*bd;
1010 	minor_t		part;
1011 	minor_t		inst;
1012 	uint64_t	mask;
1013 	boolean_t	ndelay;
1014 	int		rv;
1015 	diskaddr_t	nblks;
1016 	diskaddr_t	lba;
1017 
1018 	_NOTE(ARGUNUSED(credp));
1019 
1020 	part = BDPART(dev);
1021 	inst = BDINST(dev);
1022 
1023 	if (otyp >= OTYPCNT)
1024 		return (EINVAL);
1025 
1026 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
1027 
1028 	/*
1029 	 * Block any DR events from changing the set of registered
1030 	 * devices while we function.
1031 	 */
1032 	rw_enter(&bd_lock, RW_READER);
1033 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1034 		rw_exit(&bd_lock);
1035 		return (ENXIO);
1036 	}
1037 
1038 	mutex_enter(&bd->d_ocmutex);
1039 
1040 	ASSERT(part < 64);
1041 	mask = (1U << part);
1042 
1043 	bd_update_state(bd);
1044 
1045 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
1046 
1047 		/* non-blocking opens are allowed to succeed */
1048 		if (!ndelay) {
1049 			rv = ENXIO;
1050 			goto done;
1051 		}
1052 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
1053 	    NULL, NULL, 0) == 0) {
1054 
1055 		/*
1056 		 * We read the partinfo, verify valid ranges.  If the
1057 		 * partition is invalid, and we aren't blocking or
1058 		 * doing a raw access, then fail. (Non-blocking and
1059 		 * raw accesses can still succeed to allow a disk with
1060 		 * bad partition data to opened by format and fdisk.)
1061 		 */
1062 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
1063 			rv = ENXIO;
1064 			goto done;
1065 		}
1066 	} else if (!ndelay) {
1067 		/*
1068 		 * cmlb_partinfo failed -- invalid partition or no
1069 		 * disk label.
1070 		 */
1071 		rv = ENXIO;
1072 		goto done;
1073 	}
1074 
1075 	if ((flag & FWRITE) && bd->d_rdonly) {
1076 		rv = EROFS;
1077 		goto done;
1078 	}
1079 
1080 	if ((bd->d_open_excl) & (mask)) {
1081 		rv = EBUSY;
1082 		goto done;
1083 	}
1084 	if (flag & FEXCL) {
1085 		if (bd->d_open_lyr[part]) {
1086 			rv = EBUSY;
1087 			goto done;
1088 		}
1089 		for (int i = 0; i < OTYP_LYR; i++) {
1090 			if (bd->d_open_reg[i] & mask) {
1091 				rv = EBUSY;
1092 				goto done;
1093 			}
1094 		}
1095 	}
1096 
1097 	if (otyp == OTYP_LYR) {
1098 		bd->d_open_lyr[part]++;
1099 	} else {
1100 		bd->d_open_reg[otyp] |= mask;
1101 	}
1102 	if (flag & FEXCL) {
1103 		bd->d_open_excl |= mask;
1104 	}
1105 
1106 	rv = 0;
1107 done:
1108 	mutex_exit(&bd->d_ocmutex);
1109 	rw_exit(&bd_lock);
1110 
1111 	return (rv);
1112 }
1113 
1114 static int
1115 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
1116 {
1117 	bd_t		*bd;
1118 	minor_t		inst;
1119 	minor_t		part;
1120 	uint64_t	mask;
1121 	boolean_t	last = B_TRUE;
1122 
1123 	_NOTE(ARGUNUSED(flag));
1124 	_NOTE(ARGUNUSED(credp));
1125 
1126 	part = BDPART(dev);
1127 	inst = BDINST(dev);
1128 
1129 	ASSERT(part < 64);
1130 	mask = (1U << part);
1131 
1132 	rw_enter(&bd_lock, RW_READER);
1133 
1134 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1135 		rw_exit(&bd_lock);
1136 		return (ENXIO);
1137 	}
1138 
1139 	mutex_enter(&bd->d_ocmutex);
1140 	if (bd->d_open_excl & mask) {
1141 		bd->d_open_excl &= ~mask;
1142 	}
1143 	if (otyp == OTYP_LYR) {
1144 		bd->d_open_lyr[part]--;
1145 	} else {
1146 		bd->d_open_reg[otyp] &= ~mask;
1147 	}
1148 	for (int i = 0; i < 64; i++) {
1149 		if (bd->d_open_lyr[part]) {
1150 			last = B_FALSE;
1151 		}
1152 	}
1153 	for (int i = 0; last && (i < OTYP_LYR); i++) {
1154 		if (bd->d_open_reg[i]) {
1155 			last = B_FALSE;
1156 		}
1157 	}
1158 	mutex_exit(&bd->d_ocmutex);
1159 
1160 	if (last) {
1161 		cmlb_invalidate(bd->d_cmlbh, 0);
1162 	}
1163 	rw_exit(&bd_lock);
1164 
1165 	return (0);
1166 }
1167 
1168 static int
1169 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
1170 {
1171 	minor_t		inst;
1172 	minor_t		part;
1173 	diskaddr_t	pstart;
1174 	diskaddr_t	psize;
1175 	bd_t		*bd;
1176 	bd_xfer_impl_t	*xi;
1177 	buf_t		*bp;
1178 	int		rv;
1179 	uint32_t	shift;
1180 	daddr_t		d_blkno;
1181 	int	d_nblk;
1182 
1183 	rw_enter(&bd_lock, RW_READER);
1184 
1185 	part = BDPART(dev);
1186 	inst = BDINST(dev);
1187 
1188 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1189 		rw_exit(&bd_lock);
1190 		return (ENXIO);
1191 	}
1192 	shift = bd->d_blkshift;
1193 	d_blkno = blkno >> (shift - DEV_BSHIFT);
1194 	d_nblk = nblk >> (shift - DEV_BSHIFT);
1195 	/*
1196 	 * do cmlb, but do it synchronously unless we already have the
1197 	 * partition (which we probably should.)
1198 	 */
1199 	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
1200 	    (void *)1)) {
1201 		rw_exit(&bd_lock);
1202 		return (ENXIO);
1203 	}
1204 
1205 	if ((d_blkno + d_nblk) > psize) {
1206 		rw_exit(&bd_lock);
1207 		return (EINVAL);
1208 	}
1209 	bp = getrbuf(KM_NOSLEEP);
1210 	if (bp == NULL) {
1211 		rw_exit(&bd_lock);
1212 		return (ENOMEM);
1213 	}
1214 
1215 	bp->b_bcount = nblk << DEV_BSHIFT;
1216 	bp->b_resid = bp->b_bcount;
1217 	bp->b_lblkno = blkno;
1218 	bp->b_un.b_addr = caddr;
1219 
1220 	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
1221 	if (xi == NULL) {
1222 		rw_exit(&bd_lock);
1223 		freerbuf(bp);
1224 		return (ENOMEM);
1225 	}
1226 	xi->i_blkno = d_blkno + pstart;
1227 	xi->i_flags = BD_XFER_POLL;
1228 	bd_submit(bd, xi);
1229 	rw_exit(&bd_lock);
1230 
1231 	/*
1232 	 * Generally, we should have run this entirely synchronously
1233 	 * at this point and the biowait call should be a no-op.  If
1234 	 * it didn't happen this way, it's a bug in the underlying
1235 	 * driver not honoring BD_XFER_POLL.
1236 	 */
1237 	(void) biowait(bp);
1238 	rv = geterror(bp);
1239 	freerbuf(bp);
1240 	return (rv);
1241 }
1242 
1243 void
1244 bd_minphys(struct buf *bp)
1245 {
1246 	minor_t inst;
1247 	bd_t	*bd;
1248 	inst = BDINST(bp->b_edev);
1249 
1250 	bd = ddi_get_soft_state(bd_state, inst);
1251 
1252 	/*
1253 	 * In a non-debug kernel, bd_strategy will catch !bd as
1254 	 * well, and will fail nicely.
1255 	 */
1256 	ASSERT(bd);
1257 
1258 	if (bp->b_bcount > bd->d_maxxfer)
1259 		bp->b_bcount = bd->d_maxxfer;
1260 }
1261 
1262 static int
1263 bd_check_uio(dev_t dev, struct uio *uio)
1264 {
1265 	bd_t		*bd;
1266 	uint32_t	shift;
1267 
1268 	if ((bd = ddi_get_soft_state(bd_state, BDINST(dev))) == NULL) {
1269 		return (ENXIO);
1270 	}
1271 
1272 	shift = bd->d_blkshift;
1273 	if ((P2PHASE(uio->uio_loffset, (1U << shift)) != 0) ||
1274 	    (P2PHASE(uio->uio_iov->iov_len, (1U << shift)) != 0)) {
1275 		return (EINVAL);
1276 	}
1277 
1278 	return (0);
1279 }
1280 
1281 static int
1282 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
1283 {
1284 	_NOTE(ARGUNUSED(credp));
1285 	int	ret = bd_check_uio(dev, uio);
1286 	if (ret != 0) {
1287 		return (ret);
1288 	}
1289 	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
1290 }
1291 
1292 static int
1293 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
1294 {
1295 	_NOTE(ARGUNUSED(credp));
1296 	int	ret = bd_check_uio(dev, uio);
1297 	if (ret != 0) {
1298 		return (ret);
1299 	}
1300 	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
1301 }
1302 
1303 static int
1304 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
1305 {
1306 	_NOTE(ARGUNUSED(credp));
1307 	int	ret = bd_check_uio(dev, aio->aio_uio);
1308 	if (ret != 0) {
1309 		return (ret);
1310 	}
1311 	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
1312 }
1313 
1314 static int
1315 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1316 {
1317 	_NOTE(ARGUNUSED(credp));
1318 	int	ret = bd_check_uio(dev, aio->aio_uio);
1319 	if (ret != 0) {
1320 		return (ret);
1321 	}
1322 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
1323 }
1324 
1325 static int
1326 bd_strategy(struct buf *bp)
1327 {
1328 	minor_t		inst;
1329 	minor_t		part;
1330 	bd_t		*bd;
1331 	diskaddr_t	p_lba;
1332 	diskaddr_t	p_nblks;
1333 	diskaddr_t	b_nblks;
1334 	bd_xfer_impl_t	*xi;
1335 	uint32_t	shift;
1336 	int		(*func)(void *, bd_xfer_t *);
1337 	diskaddr_t	lblkno;
1338 
1339 	part = BDPART(bp->b_edev);
1340 	inst = BDINST(bp->b_edev);
1341 
1342 	ASSERT(bp);
1343 
1344 	bp->b_resid = bp->b_bcount;
1345 
1346 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1347 		bioerror(bp, ENXIO);
1348 		biodone(bp);
1349 		return (0);
1350 	}
1351 
1352 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
1353 	    NULL, NULL, 0)) {
1354 		bioerror(bp, ENXIO);
1355 		biodone(bp);
1356 		return (0);
1357 	}
1358 
1359 	shift = bd->d_blkshift;
1360 	lblkno = bp->b_lblkno >> (shift - DEV_BSHIFT);
1361 	if ((P2PHASE(bp->b_lblkno, (1U << (shift - DEV_BSHIFT))) != 0) ||
1362 	    (P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
1363 	    (lblkno > p_nblks)) {
1364 		bioerror(bp, EINVAL);
1365 		biodone(bp);
1366 		return (0);
1367 	}
1368 	b_nblks = bp->b_bcount >> shift;
1369 	if ((lblkno == p_nblks) || (bp->b_bcount == 0)) {
1370 		biodone(bp);
1371 		return (0);
1372 	}
1373 
1374 	if ((b_nblks + lblkno) > p_nblks) {
1375 		bp->b_resid = ((lblkno + b_nblks - p_nblks) << shift);
1376 		bp->b_bcount -= bp->b_resid;
1377 	} else {
1378 		bp->b_resid = 0;
1379 	}
1380 	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1381 
1382 	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1383 	if (xi == NULL) {
1384 		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1385 	}
1386 	if (xi == NULL) {
1387 		/* bd_request_alloc will have done bioerror */
1388 		biodone(bp);
1389 		return (0);
1390 	}
1391 	xi->i_blkno = lblkno + p_lba;
1392 
1393 	bd_submit(bd, xi);
1394 
1395 	return (0);
1396 }
1397 
1398 static int
1399 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1400 {
1401 	minor_t		inst;
1402 	uint16_t	part;
1403 	bd_t		*bd;
1404 	void		*ptr = (void *)arg;
1405 	int		rv;
1406 
1407 	part = BDPART(dev);
1408 	inst = BDINST(dev);
1409 
1410 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1411 		return (ENXIO);
1412 	}
1413 
1414 	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1415 	if (rv != ENOTTY)
1416 		return (rv);
1417 
1418 	if (rvalp != NULL) {
1419 		/* the return value of the ioctl is 0 by default */
1420 		*rvalp = 0;
1421 	}
1422 
1423 	switch (cmd) {
1424 	case DKIOCGMEDIAINFO: {
1425 		struct dk_minfo minfo;
1426 
1427 		/* make sure our state information is current */
1428 		bd_update_state(bd);
1429 		bzero(&minfo, sizeof (minfo));
1430 		minfo.dki_media_type = DK_FIXED_DISK;
1431 		minfo.dki_lbsize = (1U << bd->d_blkshift);
1432 		minfo.dki_capacity = bd->d_numblks;
1433 		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
1434 			return (EFAULT);
1435 		}
1436 		return (0);
1437 	}
1438 	case DKIOCGMEDIAINFOEXT: {
1439 		struct dk_minfo_ext miext;
1440 
1441 		/* make sure our state information is current */
1442 		bd_update_state(bd);
1443 		bzero(&miext, sizeof (miext));
1444 		miext.dki_media_type = DK_FIXED_DISK;
1445 		miext.dki_lbsize = (1U << bd->d_blkshift);
1446 		miext.dki_pbsize = (1U << bd->d_pblkshift);
1447 		miext.dki_capacity = bd->d_numblks;
1448 		if (ddi_copyout(&miext, ptr, sizeof (miext), flag)) {
1449 			return (EFAULT);
1450 		}
1451 		return (0);
1452 	}
1453 	case DKIOCINFO: {
1454 		struct dk_cinfo cinfo;
1455 		bzero(&cinfo, sizeof (cinfo));
1456 		cinfo.dki_ctype = DKC_BLKDEV;
1457 		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1458 		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1459 		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1460 		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1461 		    "%s", ddi_driver_name(bd->d_dip));
1462 		cinfo.dki_unit = inst;
1463 		cinfo.dki_flags = DKI_FMTVOL;
1464 		cinfo.dki_partition = part;
1465 		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1466 		cinfo.dki_addr = 0;
1467 		cinfo.dki_slave = 0;
1468 		cinfo.dki_space = 0;
1469 		cinfo.dki_prio = 0;
1470 		cinfo.dki_vec = 0;
1471 		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
1472 			return (EFAULT);
1473 		}
1474 		return (0);
1475 	}
1476 	case DKIOCREMOVABLE: {
1477 		int i;
1478 		i = bd->d_removable ? 1 : 0;
1479 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1480 			return (EFAULT);
1481 		}
1482 		return (0);
1483 	}
1484 	case DKIOCHOTPLUGGABLE: {
1485 		int i;
1486 		i = bd->d_hotpluggable ? 1 : 0;
1487 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1488 			return (EFAULT);
1489 		}
1490 		return (0);
1491 	}
1492 	case DKIOCREADONLY: {
1493 		int i;
1494 		i = bd->d_rdonly ? 1 : 0;
1495 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1496 			return (EFAULT);
1497 		}
1498 		return (0);
1499 	}
1500 	case DKIOCSOLIDSTATE: {
1501 		int i;
1502 		i = bd->d_ssd ? 1 : 0;
1503 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1504 			return (EFAULT);
1505 		}
1506 		return (0);
1507 	}
1508 	case DKIOCSTATE: {
1509 		enum dkio_state	state;
1510 		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1511 			return (EFAULT);
1512 		}
1513 		if ((rv = bd_check_state(bd, &state)) != 0) {
1514 			return (rv);
1515 		}
1516 		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1517 			return (EFAULT);
1518 		}
1519 		return (0);
1520 	}
1521 	case DKIOCFLUSHWRITECACHE: {
1522 		struct dk_callback *dkc = NULL;
1523 
1524 		if (flag & FKIOCTL)
1525 			dkc = (void *)arg;
1526 
1527 		rv = bd_flush_write_cache(bd, dkc);
1528 		return (rv);
1529 	}
1530 
1531 	default:
1532 		break;
1533 
1534 	}
1535 	return (ENOTTY);
1536 }
1537 
1538 static int
1539 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1540     char *name, caddr_t valuep, int *lengthp)
1541 {
1542 	bd_t	*bd;
1543 
1544 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1545 	if (bd == NULL)
1546 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1547 		    name, valuep, lengthp));
1548 
1549 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1550 	    valuep, lengthp, BDPART(dev), 0));
1551 }
1552 
1553 
1554 static int
1555 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1556     size_t length, void *tg_cookie)
1557 {
1558 	bd_t		*bd;
1559 	buf_t		*bp;
1560 	bd_xfer_impl_t	*xi;
1561 	int		rv;
1562 	int		(*func)(void *, bd_xfer_t *);
1563 	int		kmflag;
1564 
1565 	/*
1566 	 * If we are running in polled mode (such as during dump(9e)
1567 	 * execution), then we cannot sleep for kernel allocations.
1568 	 */
1569 	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1570 
1571 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1572 
1573 	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1574 		/* We can only transfer whole blocks at a time! */
1575 		return (EINVAL);
1576 	}
1577 
1578 	if ((bp = getrbuf(kmflag)) == NULL) {
1579 		return (ENOMEM);
1580 	}
1581 
1582 	switch (cmd) {
1583 	case TG_READ:
1584 		bp->b_flags = B_READ;
1585 		func = bd->d_ops.o_read;
1586 		break;
1587 	case TG_WRITE:
1588 		bp->b_flags = B_WRITE;
1589 		func = bd->d_ops.o_write;
1590 		break;
1591 	default:
1592 		freerbuf(bp);
1593 		return (EINVAL);
1594 	}
1595 
1596 	bp->b_un.b_addr = bufaddr;
1597 	bp->b_bcount = length;
1598 	xi = bd_xfer_alloc(bd, bp, func, kmflag);
1599 	if (xi == NULL) {
1600 		rv = geterror(bp);
1601 		freerbuf(bp);
1602 		return (rv);
1603 	}
1604 	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1605 	xi->i_blkno = start;
1606 	bd_submit(bd, xi);
1607 	(void) biowait(bp);
1608 	rv = geterror(bp);
1609 	freerbuf(bp);
1610 
1611 	return (rv);
1612 }
1613 
1614 static int
1615 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1616 {
1617 	bd_t		*bd;
1618 
1619 	_NOTE(ARGUNUSED(tg_cookie));
1620 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1621 
1622 	switch (cmd) {
1623 	case TG_GETPHYGEOM:
1624 	case TG_GETVIRTGEOM:
1625 		/*
1626 		 * We don't have any "geometry" as such, let cmlb
1627 		 * fabricate something.
1628 		 */
1629 		return (ENOTTY);
1630 
1631 	case TG_GETCAPACITY:
1632 		bd_update_state(bd);
1633 		*(diskaddr_t *)arg = bd->d_numblks;
1634 		return (0);
1635 
1636 	case TG_GETBLOCKSIZE:
1637 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1638 		return (0);
1639 
1640 	case TG_GETATTR:
1641 		/*
1642 		 * It turns out that cmlb really doesn't do much for
1643 		 * non-writable media, but lets make the information
1644 		 * available for it in case it does more in the
1645 		 * future.  (The value is currently used for
1646 		 * triggering special behavior for CD-ROMs.)
1647 		 */
1648 		bd_update_state(bd);
1649 		((tg_attribute_t *)arg)->media_is_writable =
1650 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1651 		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1652 		((tg_attribute_t *)arg)->media_is_rotational = B_FALSE;
1653 		return (0);
1654 
1655 	default:
1656 		return (EINVAL);
1657 	}
1658 }
1659 
1660 
1661 static void
1662 bd_sched(bd_t *bd, bd_queue_t *bq)
1663 {
1664 	bd_xfer_impl_t	*xi;
1665 	struct buf	*bp;
1666 	int		rv;
1667 
1668 	mutex_enter(&bq->q_iomutex);
1669 
1670 	while ((bq->q_qactive < bq->q_qsize) &&
1671 	    ((xi = list_remove_head(&bq->q_waitq)) != NULL)) {
1672 		mutex_enter(&bd->d_ksmutex);
1673 		kstat_waitq_to_runq(bd->d_kiop);
1674 		mutex_exit(&bd->d_ksmutex);
1675 
1676 		bq->q_qactive++;
1677 		list_insert_tail(&bq->q_runq, xi);
1678 
1679 		/*
1680 		 * Submit the job to the driver.  We drop the I/O mutex
1681 		 * so that we can deal with the case where the driver
1682 		 * completion routine calls back into us synchronously.
1683 		 */
1684 
1685 		mutex_exit(&bq->q_iomutex);
1686 
1687 		rv = xi->i_func(bd->d_private, &xi->i_public);
1688 		if (rv != 0) {
1689 			bp = xi->i_bp;
1690 			bioerror(bp, rv);
1691 			biodone(bp);
1692 
1693 			atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
1694 
1695 			mutex_enter(&bq->q_iomutex);
1696 
1697 			mutex_enter(&bd->d_ksmutex);
1698 			kstat_runq_exit(bd->d_kiop);
1699 			mutex_exit(&bd->d_ksmutex);
1700 
1701 			bq->q_qactive--;
1702 			list_remove(&bq->q_runq, xi);
1703 			bd_xfer_free(xi);
1704 		} else {
1705 			mutex_enter(&bq->q_iomutex);
1706 		}
1707 	}
1708 
1709 	mutex_exit(&bq->q_iomutex);
1710 }
1711 
1712 static void
1713 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1714 {
1715 	uint64_t	nv = atomic_inc_64_nv(&bd->d_io_counter);
1716 	unsigned	q = nv % bd->d_qcount;
1717 	bd_queue_t	*bq = &bd->d_queues[q];
1718 
1719 	xi->i_bq = bq;
1720 	xi->i_qnum = q;
1721 
1722 	mutex_enter(&bq->q_iomutex);
1723 
1724 	list_insert_tail(&bq->q_waitq, xi);
1725 
1726 	mutex_enter(&bd->d_ksmutex);
1727 	kstat_waitq_enter(bd->d_kiop);
1728 	mutex_exit(&bd->d_ksmutex);
1729 
1730 	mutex_exit(&bq->q_iomutex);
1731 
1732 	bd_sched(bd, bq);
1733 }
1734 
1735 static void
1736 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1737 {
1738 	bd_t		*bd = xi->i_bd;
1739 	buf_t		*bp = xi->i_bp;
1740 	bd_queue_t	*bq = xi->i_bq;
1741 
1742 	mutex_enter(&bq->q_iomutex);
1743 	bq->q_qactive--;
1744 
1745 	mutex_enter(&bd->d_ksmutex);
1746 	kstat_runq_exit(bd->d_kiop);
1747 	mutex_exit(&bd->d_ksmutex);
1748 
1749 	list_remove(&bq->q_runq, xi);
1750 	mutex_exit(&bq->q_iomutex);
1751 
1752 	if (err == 0) {
1753 		if (bp->b_flags & B_READ) {
1754 			atomic_inc_uint(&bd->d_kiop->reads);
1755 			atomic_add_64((uint64_t *)&bd->d_kiop->nread,
1756 			    bp->b_bcount - xi->i_resid);
1757 		} else {
1758 			atomic_inc_uint(&bd->d_kiop->writes);
1759 			atomic_add_64((uint64_t *)&bd->d_kiop->nwritten,
1760 			    bp->b_bcount - xi->i_resid);
1761 		}
1762 	}
1763 	bd_sched(bd, bq);
1764 }
1765 
1766 static void
1767 bd_update_state(bd_t *bd)
1768 {
1769 	enum	dkio_state	state = DKIO_INSERTED;
1770 	boolean_t		docmlb = B_FALSE;
1771 	bd_media_t		media;
1772 
1773 	bzero(&media, sizeof (media));
1774 
1775 	mutex_enter(&bd->d_statemutex);
1776 	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
1777 		bd->d_numblks = 0;
1778 		state = DKIO_EJECTED;
1779 		goto done;
1780 	}
1781 
1782 	if ((media.m_blksize < 512) ||
1783 	    (!ISP2(media.m_blksize)) ||
1784 	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
1785 		cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)",
1786 		    ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip),
1787 		    media.m_blksize);
1788 		/*
1789 		 * We can't use the media, treat it as not present.
1790 		 */
1791 		state = DKIO_EJECTED;
1792 		bd->d_numblks = 0;
1793 		goto done;
1794 	}
1795 
1796 	if (((1U << bd->d_blkshift) != media.m_blksize) ||
1797 	    (bd->d_numblks != media.m_nblks)) {
1798 		/* Device size changed */
1799 		docmlb = B_TRUE;
1800 	}
1801 
1802 	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
1803 	bd->d_pblkshift = bd->d_blkshift;
1804 	bd->d_numblks = media.m_nblks;
1805 	bd->d_rdonly = media.m_readonly;
1806 	bd->d_ssd = media.m_solidstate;
1807 
1808 	/*
1809 	 * Only use the supplied physical block size if it is non-zero,
1810 	 * greater or equal to the block size, and a power of 2. Ignore it
1811 	 * if not, it's just informational and we can still use the media.
1812 	 */
1813 	if ((media.m_pblksize != 0) &&
1814 	    (media.m_pblksize >= media.m_blksize) &&
1815 	    (ISP2(media.m_pblksize)))
1816 		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;
1817 
1818 done:
1819 	if (state != bd->d_state) {
1820 		bd->d_state = state;
1821 		cv_broadcast(&bd->d_statecv);
1822 		docmlb = B_TRUE;
1823 	}
1824 	mutex_exit(&bd->d_statemutex);
1825 
1826 	bd->d_kerr->bd_capacity.value.ui64 = bd->d_numblks << bd->d_blkshift;
1827 
1828 	if (docmlb) {
1829 		if (state == DKIO_INSERTED) {
1830 			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
1831 		} else {
1832 			cmlb_invalidate(bd->d_cmlbh, 0);
1833 		}
1834 	}
1835 }
1836 
1837 static int
1838 bd_check_state(bd_t *bd, enum dkio_state *state)
1839 {
1840 	clock_t		when;
1841 
1842 	for (;;) {
1843 
1844 		bd_update_state(bd);
1845 
1846 		mutex_enter(&bd->d_statemutex);
1847 
1848 		if (bd->d_state != *state) {
1849 			*state = bd->d_state;
1850 			mutex_exit(&bd->d_statemutex);
1851 			break;
1852 		}
1853 
1854 		when = drv_usectohz(1000000);
1855 		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1856 		    when, TR_CLOCK_TICK) == 0) {
1857 			mutex_exit(&bd->d_statemutex);
1858 			return (EINTR);
1859 		}
1860 
1861 		mutex_exit(&bd->d_statemutex);
1862 	}
1863 
1864 	return (0);
1865 }
1866 
1867 static int
1868 bd_flush_write_cache_done(struct buf *bp)
1869 {
1870 	struct dk_callback *dc = (void *)bp->b_private;
1871 
1872 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1873 	kmem_free(dc, sizeof (*dc));
1874 	freerbuf(bp);
1875 	return (0);
1876 }
1877 
1878 static int
1879 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
1880 {
1881 	buf_t			*bp;
1882 	struct dk_callback	*dc;
1883 	bd_xfer_impl_t		*xi;
1884 	int			rv;
1885 
1886 	if (bd->d_ops.o_sync_cache == NULL) {
1887 		return (ENOTSUP);
1888 	}
1889 	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
1890 		return (ENOMEM);
1891 	}
1892 	bp->b_resid = 0;
1893 	bp->b_bcount = 0;
1894 
1895 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
1896 	if (xi == NULL) {
1897 		rv = geterror(bp);
1898 		freerbuf(bp);
1899 		return (rv);
1900 	}
1901 
1902 	/* Make an asynchronous flush, but only if there is a callback */
1903 	if (dkc != NULL && dkc->dkc_callback != NULL) {
1904 		/* Make a private copy of the callback structure */
1905 		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
1906 		*dc = *dkc;
1907 		bp->b_private = dc;
1908 		bp->b_iodone = bd_flush_write_cache_done;
1909 
1910 		bd_submit(bd, xi);
1911 		return (0);
1912 	}
1913 
1914 	/* In case there is no callback, perform a synchronous flush */
1915 	bd_submit(bd, xi);
1916 	(void) biowait(bp);
1917 	rv = geterror(bp);
1918 	freerbuf(bp);
1919 
1920 	return (rv);
1921 }
1922 
1923 /*
1924  * Nexus support.
1925  */
1926 int
1927 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1928     void *arg, void *result)
1929 {
1930 	bd_handle_t	hdl;
1931 
1932 	switch (ctlop) {
1933 	case DDI_CTLOPS_REPORTDEV:
1934 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1935 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
1936 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
1937 		return (DDI_SUCCESS);
1938 
1939 	case DDI_CTLOPS_INITCHILD:
1940 		hdl = ddi_get_parent_data((dev_info_t *)arg);
1941 		if (hdl == NULL) {
1942 			return (DDI_NOT_WELL_FORMED);
1943 		}
1944 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1945 		return (DDI_SUCCESS);
1946 
1947 	case DDI_CTLOPS_UNINITCHILD:
1948 		ddi_set_name_addr((dev_info_t *)arg, NULL);
1949 		ndi_prop_remove_all((dev_info_t *)arg);
1950 		return (DDI_SUCCESS);
1951 
1952 	default:
1953 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1954 	}
1955 }
1956 
1957 /*
1958  * Functions for device drivers.
1959  */
1960 bd_handle_t
1961 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1962 {
1963 	bd_handle_t	hdl;
1964 
1965 	/*
1966 	 * There is full compatability between the version 0 API and the
1967 	 * current version.
1968 	 */
1969 	switch (ops->o_version) {
1970 	case BD_OPS_VERSION_0:
1971 	case BD_OPS_CURRENT_VERSION:
1972 		break;
1973 
1974 	default:
1975 		return (NULL);
1976 	}
1977 
1978 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1979 	if (hdl != NULL) {
1980 		hdl->h_ops = *ops;
1981 		hdl->h_dma = dma;
1982 		hdl->h_private = private;
1983 	}
1984 
1985 	return (hdl);
1986 }
1987 
1988 void
1989 bd_free_handle(bd_handle_t hdl)
1990 {
1991 	kmem_free(hdl, sizeof (*hdl));
1992 }
1993 
1994 int
1995 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
1996 {
1997 	dev_info_t	*child;
1998 	bd_drive_t	drive = { 0 };
1999 
2000 	/*
2001 	 * It's not an error if bd_attach_handle() is called on a handle that
2002 	 * already is attached. We just ignore the request to attach and return.
2003 	 * This way drivers using blkdev don't have to keep track about blkdev
2004 	 * state, they can just call this function to make sure it attached.
2005 	 */
2006 	if (hdl->h_child != NULL) {
2007 		return (DDI_SUCCESS);
2008 	}
2009 
2010 	/* if drivers don't override this, make it assume none */
2011 	drive.d_lun = -1;
2012 	hdl->h_ops.o_drive_info(hdl->h_private, &drive);
2013 
2014 	hdl->h_parent = dip;
2015 	hdl->h_name = "blkdev";
2016 
2017 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
2018 	if (*(uint64_t *)drive.d_eui64 != 0) {
2019 		if (drive.d_lun >= 0) {
2020 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2021 			    "w%02X%02X%02X%02X%02X%02X%02X%02X,%X",
2022 			    drive.d_eui64[0], drive.d_eui64[1],
2023 			    drive.d_eui64[2], drive.d_eui64[3],
2024 			    drive.d_eui64[4], drive.d_eui64[5],
2025 			    drive.d_eui64[6], drive.d_eui64[7], drive.d_lun);
2026 		} else {
2027 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2028 			    "w%02X%02X%02X%02X%02X%02X%02X%02X",
2029 			    drive.d_eui64[0], drive.d_eui64[1],
2030 			    drive.d_eui64[2], drive.d_eui64[3],
2031 			    drive.d_eui64[4], drive.d_eui64[5],
2032 			    drive.d_eui64[6], drive.d_eui64[7]);
2033 		}
2034 	} else {
2035 		if (drive.d_lun >= 0) {
2036 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2037 			    "%X,%X", drive.d_target, drive.d_lun);
2038 		} else {
2039 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2040 			    "%X", drive.d_target);
2041 		}
2042 	}
2043 
2044 	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
2045 	    &child) != NDI_SUCCESS) {
2046 		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
2047 		    ddi_driver_name(dip), ddi_get_instance(dip),
2048 		    "blkdev", hdl->h_addr);
2049 		return (DDI_FAILURE);
2050 	}
2051 
2052 	ddi_set_parent_data(child, hdl);
2053 	hdl->h_child = child;
2054 
2055 	if (ndi_devi_online(child, 0) == NDI_FAILURE) {
2056 		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
2057 		    ddi_driver_name(dip), ddi_get_instance(dip),
2058 		    hdl->h_name, hdl->h_addr);
2059 		(void) ndi_devi_free(child);
2060 		return (DDI_FAILURE);
2061 	}
2062 
2063 	return (DDI_SUCCESS);
2064 }
2065 
2066 int
2067 bd_detach_handle(bd_handle_t hdl)
2068 {
2069 	int	circ;
2070 	int	rv;
2071 	char	*devnm;
2072 
2073 	/*
2074 	 * It's not an error if bd_detach_handle() is called on a handle that
2075 	 * already is detached. We just ignore the request to detach and return.
2076 	 * This way drivers using blkdev don't have to keep track about blkdev
2077 	 * state, they can just call this function to make sure it detached.
2078 	 */
2079 	if (hdl->h_child == NULL) {
2080 		return (DDI_SUCCESS);
2081 	}
2082 	ndi_devi_enter(hdl->h_parent, &circ);
2083 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
2084 		rv = ddi_remove_child(hdl->h_child, 0);
2085 	} else {
2086 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
2087 		(void) ddi_deviname(hdl->h_child, devnm);
2088 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
2089 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
2090 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
2091 		kmem_free(devnm, MAXNAMELEN + 1);
2092 	}
2093 	if (rv == 0) {
2094 		hdl->h_child = NULL;
2095 	}
2096 
2097 	ndi_devi_exit(hdl->h_parent, circ);
2098 	return (rv == NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
2099 }
2100 
2101 void
2102 bd_xfer_done(bd_xfer_t *xfer, int err)
2103 {
2104 	bd_xfer_impl_t	*xi = (void *)xfer;
2105 	buf_t		*bp = xi->i_bp;
2106 	int		rv = DDI_SUCCESS;
2107 	bd_t		*bd = xi->i_bd;
2108 	size_t		len;
2109 
2110 	if (err != 0) {
2111 		bd_runq_exit(xi, err);
2112 		atomic_inc_32(&bd->d_kerr->bd_harderrs.value.ui32);
2113 
2114 		bp->b_resid += xi->i_resid;
2115 		bd_xfer_free(xi);
2116 		bioerror(bp, err);
2117 		biodone(bp);
2118 		return;
2119 	}
2120 
2121 	xi->i_cur_win++;
2122 	xi->i_resid -= xi->i_len;
2123 
2124 	if (xi->i_resid == 0) {
2125 		/* Job completed succcessfully! */
2126 		bd_runq_exit(xi, 0);
2127 
2128 		bd_xfer_free(xi);
2129 		biodone(bp);
2130 		return;
2131 	}
2132 
2133 	xi->i_blkno += xi->i_nblks;
2134 
2135 	if (bd->d_use_dma) {
2136 		/* More transfer still pending... advance to next DMA window. */
2137 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
2138 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
2139 	} else {
2140 		/* Advance memory window. */
2141 		xi->i_kaddr += xi->i_len;
2142 		xi->i_offset += xi->i_len;
2143 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
2144 	}
2145 
2146 
2147 	if ((rv != DDI_SUCCESS) ||
2148 	    (P2PHASE(len, (1U << xi->i_blkshift)) != 0)) {
2149 		bd_runq_exit(xi, EFAULT);
2150 
2151 		bp->b_resid += xi->i_resid;
2152 		bd_xfer_free(xi);
2153 		bioerror(bp, EFAULT);
2154 		biodone(bp);
2155 		return;
2156 	}
2157 	xi->i_len = len;
2158 	xi->i_nblks = len >> xi->i_blkshift;
2159 
2160 	/* Submit next window to hardware. */
2161 	rv = xi->i_func(bd->d_private, &xi->i_public);
2162 	if (rv != 0) {
2163 		bd_runq_exit(xi, rv);
2164 
2165 		atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
2166 
2167 		bp->b_resid += xi->i_resid;
2168 		bd_xfer_free(xi);
2169 		bioerror(bp, rv);
2170 		biodone(bp);
2171 	}
2172 }
2173 
2174 void
2175 bd_error(bd_xfer_t *xfer, int error)
2176 {
2177 	bd_xfer_impl_t	*xi = (void *)xfer;
2178 	bd_t		*bd = xi->i_bd;
2179 
2180 	switch (error) {
2181 	case BD_ERR_MEDIA:
2182 		atomic_inc_32(&bd->d_kerr->bd_rq_media_err.value.ui32);
2183 		break;
2184 	case BD_ERR_NTRDY:
2185 		atomic_inc_32(&bd->d_kerr->bd_rq_ntrdy_err.value.ui32);
2186 		break;
2187 	case BD_ERR_NODEV:
2188 		atomic_inc_32(&bd->d_kerr->bd_rq_nodev_err.value.ui32);
2189 		break;
2190 	case BD_ERR_RECOV:
2191 		atomic_inc_32(&bd->d_kerr->bd_rq_recov_err.value.ui32);
2192 		break;
2193 	case BD_ERR_ILLRQ:
2194 		atomic_inc_32(&bd->d_kerr->bd_rq_illrq_err.value.ui32);
2195 		break;
2196 	case BD_ERR_PFA:
2197 		atomic_inc_32(&bd->d_kerr->bd_rq_pfa_err.value.ui32);
2198 		break;
2199 	default:
2200 		cmn_err(CE_PANIC, "bd_error: unknown error type %d", error);
2201 		break;
2202 	}
2203 }
2204 
2205 void
2206 bd_state_change(bd_handle_t hdl)
2207 {
2208 	bd_t		*bd;
2209 
2210 	if ((bd = hdl->h_bd) != NULL) {
2211 		bd_update_state(bd);
2212 	}
2213 }
2214 
2215 void
2216 bd_mod_init(struct dev_ops *devops)
2217 {
2218 	static struct bus_ops bd_bus_ops = {
2219 		BUSO_REV,		/* busops_rev */
2220 		nullbusmap,		/* bus_map */
2221 		NULL,			/* bus_get_intrspec (OBSOLETE) */
2222 		NULL,			/* bus_add_intrspec (OBSOLETE) */
2223 		NULL,			/* bus_remove_intrspec (OBSOLETE) */
2224 		i_ddi_map_fault,	/* bus_map_fault */
2225 		NULL,			/* bus_dma_map (OBSOLETE) */
2226 		ddi_dma_allochdl,	/* bus_dma_allochdl */
2227 		ddi_dma_freehdl,	/* bus_dma_freehdl */
2228 		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
2229 		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
2230 		ddi_dma_flush,		/* bus_dma_flush */
2231 		ddi_dma_win,		/* bus_dma_win */
2232 		ddi_dma_mctl,		/* bus_dma_ctl */
2233 		bd_bus_ctl,		/* bus_ctl */
2234 		ddi_bus_prop_op,	/* bus_prop_op */
2235 		NULL,			/* bus_get_eventcookie */
2236 		NULL,			/* bus_add_eventcall */
2237 		NULL,			/* bus_remove_eventcall */
2238 		NULL,			/* bus_post_event */
2239 		NULL,			/* bus_intr_ctl (OBSOLETE) */
2240 		NULL,			/* bus_config */
2241 		NULL,			/* bus_unconfig */
2242 		NULL,			/* bus_fm_init */
2243 		NULL,			/* bus_fm_fini */
2244 		NULL,			/* bus_fm_access_enter */
2245 		NULL,			/* bus_fm_access_exit */
2246 		NULL,			/* bus_power */
2247 		NULL,			/* bus_intr_op */
2248 	};
2249 
2250 	devops->devo_bus_ops = &bd_bus_ops;
2251 
2252 	/*
2253 	 * NB: The device driver is free to supply its own
2254 	 * character entry device support.
2255 	 */
2256 }
2257 
2258 void
2259 bd_mod_fini(struct dev_ops *devops)
2260 {
2261 	devops->devo_bus_ops = NULL;
2262 }
2263