xref: /illumos-gate/usr/src/uts/common/io/blkdev/blkdev.c (revision 510a6847)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/ksynch.h>
30 #include <sys/kmem.h>
31 #include <sys/file.h>
32 #include <sys/errno.h>
33 #include <sys/open.h>
34 #include <sys/buf.h>
35 #include <sys/uio.h>
36 #include <sys/aio_req.h>
37 #include <sys/cred.h>
38 #include <sys/modctl.h>
39 #include <sys/cmlb.h>
40 #include <sys/conf.h>
41 #include <sys/devops.h>
42 #include <sys/list.h>
43 #include <sys/sysmacros.h>
44 #include <sys/dkio.h>
45 #include <sys/vtoc.h>
46 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
47 #include <sys/kstat.h>
48 #include <sys/fs/dv_node.h>
49 #include <sys/ddi.h>
50 #include <sys/sunddi.h>
51 #include <sys/note.h>
52 #include <sys/blkdev.h>
53 #include <sys/scsi/impl/inquiry.h>
54 
55 #define	BD_MAXPART	64
56 #define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
57 #define	BDPART(dev)	(getminor(dev) % BD_MAXPART)
58 
59 typedef struct bd bd_t;
60 typedef struct bd_xfer_impl bd_xfer_impl_t;
61 
/*
 * Per-instance soft state for a blkdev-managed disk.  One of these is
 * allocated per child instance in bd_attach() and freed in bd_detach().
 */
struct bd {
	void		*d_private;	/* parent driver's private cookie */
	dev_info_t	*d_dip;		/* our dev_info node */
	kmutex_t	d_ocmutex;	/* guards open/close accounting below */
	kmutex_t	d_iomutex;	/* guards run/wait queues and kstats */
	kmutex_t	d_statemutex;	/* guards d_state / d_statecv */
	kcondvar_t	d_statecv;	/* signalled on media state change */
	enum dkio_state	d_state;	/* last observed DKIO media state */
	cmlb_handle_t	d_cmlbh;	/* common label (cmlb) handle */
	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
	uint64_t	d_open_excl;	/* bit mask indexed by partition */
	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */

	uint32_t	d_qsize;	/* max outstanding commands */
	uint32_t	d_qactive;	/* commands currently on the run queue */
	uint32_t	d_maxxfer;	/* max transfer size, bytes */
	uint32_t	d_blkshift;	/* log2 of logical block size */
	uint32_t	d_pblkshift;	/* log2 of physical block size */
	uint64_t	d_numblks;	/* device capacity in logical blocks */
	ddi_devid_t	d_devid;	/* device id, if parent provides one */

	kmem_cache_t	*d_cache;	/* bd_xfer_impl_t allocation cache */
	list_t		d_runq;		/* transfers submitted to hardware */
	list_t		d_waitq;	/* transfers waiting for a queue slot */
	kstat_t		*d_ksp;		/* disk I/O kstat (may be NULL) */
	kstat_io_t	*d_kiop;	/* kstat data; scratch if d_ksp NULL */

	boolean_t	d_rdonly;	/* media is read-only */
	boolean_t	d_ssd;		/* media is solid state */
	boolean_t	d_removable;	/* media is removable */
	boolean_t	d_hotpluggable;	/* device is hotpluggable */
	boolean_t	d_use_dma;	/* parent supplied DMA attributes */

	ddi_dma_attr_t	d_dma;		/* DMA attributes (if d_use_dma) */
	bd_ops_t	d_ops;		/* parent driver entry points */
	bd_handle_t	d_handle;	/* back pointer to the bd_handle */
};
99 
/*
 * Handle given to the parent (HBA-style) driver when it registers a
 * drive with blkdev; links the parent's ops and the child bd instance.
 */
struct bd_handle {
	bd_ops_t	h_ops;		/* copy of parent's entry points */
	ddi_dma_attr_t	*h_dma;		/* parent's DMA attrs, or NULL */
	dev_info_t	*h_parent;	/* parent's dev_info node */
	dev_info_t	*h_child;	/* child (blkdev) dev_info node */
	void		*h_private;	/* parent's per-drive cookie */
	bd_t		*h_bd;		/* soft state, set at attach time */
	char		*h_name;	/* child node name */
	char		h_addr[20];	/* enough for %X,%X */
};
110 
/*
 * Internal representation of a single transfer.  The leading i_public
 * member is what the parent driver sees (bd_xfer_t); the rest tracks
 * queue linkage and DMA windowing state for multi-window transfers.
 */
struct bd_xfer_impl {
	bd_xfer_t	i_public;	/* portion exposed to parent driver */
	list_node_t	i_linkage;	/* d_waitq / d_runq linkage */
	bd_t		*i_bd;		/* owning device */
	buf_t		*i_bp;		/* originating buf(9s) */
	uint_t		i_num_win;	/* total DMA windows */
	uint_t		i_cur_win;	/* window currently in flight */
	off_t		i_offset;	/* byte offset of current window */
	int		(*i_func)(void *, bd_xfer_t *);	/* o_read/o_write */
	uint32_t	i_blkshift;	/* cached d_blkshift at alloc time */
	size_t		i_len;		/* length of current window */
	size_t		i_resid;	/* bytes remaining overall */
};
124 
125 #define	i_dmah		i_public.x_dmah
126 #define	i_dmac		i_public.x_dmac
127 #define	i_ndmac		i_public.x_ndmac
128 #define	i_kaddr		i_public.x_kaddr
129 #define	i_nblks		i_public.x_nblks
130 #define	i_blkno		i_public.x_blkno
131 #define	i_flags		i_public.x_flags
132 
133 
134 /*
135  * Private prototypes.
136  */
137 
138 static void bd_prop_update_inqstring(dev_info_t *, char *, char *, size_t);
139 static void bd_create_inquiry_props(dev_info_t *, bd_drive_t *);
140 
141 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
142 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
143 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
144 
145 static int bd_open(dev_t *, int, int, cred_t *);
146 static int bd_close(dev_t, int, int, cred_t *);
147 static int bd_strategy(struct buf *);
148 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
149 static int bd_dump(dev_t, caddr_t, daddr_t, int);
150 static int bd_read(dev_t, struct uio *, cred_t *);
151 static int bd_write(dev_t, struct uio *, cred_t *);
152 static int bd_aread(dev_t, struct aio_req *, cred_t *);
153 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
154 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
155     caddr_t, int *);
156 
157 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
158     void *);
159 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
160 static int bd_xfer_ctor(void *, void *, int);
161 static void bd_xfer_dtor(void *, void *);
162 static void bd_sched(bd_t *);
163 static void bd_submit(bd_t *, bd_xfer_impl_t *);
164 static void bd_runq_exit(bd_xfer_impl_t *, int);
165 static void bd_update_state(bd_t *);
166 static int bd_check_state(bd_t *, enum dkio_state *);
167 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
168 
/* Target ops vector handed to cmlb so it can read/write media and
 * query geometry while building/validating the disk label. */
struct cmlb_tg_ops bd_tg_ops = {
	TG_DK_OPS_VERSION_1,
	bd_tg_rdwr,
	bd_tg_getinfo,
};
174 
/* Character/block entry points; unsupported entries are nodev/nochpoll. */
static struct cb_ops bd_cb_ops = {
	bd_open, 		/* open */
	bd_close, 		/* close */
	bd_strategy, 		/* strategy */
	nodev, 			/* print */
	bd_dump,		/* dump */
	bd_read, 		/* read */
	bd_write, 		/* write */
	bd_ioctl, 		/* ioctl */
	nodev, 			/* devmap */
	nodev, 			/* mmap */
	nodev, 			/* segmap */
	nochpoll, 		/* poll */
	bd_prop_op, 		/* cb_prop_op */
	0, 			/* streamtab  */
	D_64BIT | D_MP,		/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	bd_aread,		/* async read */
	bd_awrite		/* async write */
};
195 
/* Driver autoconfiguration entry points. */
struct dev_ops bd_dev_ops = {
	DEVO_REV, 		/* devo_rev, */
	0, 			/* refcnt  */
	bd_getinfo,		/* getinfo */
	nulldev, 		/* identify */
	nulldev, 		/* probe */
	bd_attach, 		/* attach */
	bd_detach,		/* detach */
	nodev, 			/* reset */
	&bd_cb_ops, 		/* driver operations */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};
210 
/* Module linkage for the loadable-module framework. */
static struct modldrv modldrv = {
	&mod_driverops,
	"Generic Block Device",
	&bd_dev_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, { &modldrv, NULL }
};
220 
221 static void *bd_state;
222 static krwlock_t bd_lock;
223 
224 int
225 _init(void)
226 {
227 	int	rv;
228 
229 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
230 	if (rv != DDI_SUCCESS) {
231 		return (rv);
232 	}
233 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
234 	rv = mod_install(&modlinkage);
235 	if (rv != DDI_SUCCESS) {
236 		rw_destroy(&bd_lock);
237 		ddi_soft_state_fini(&bd_state);
238 	}
239 	return (rv);
240 }
241 
242 int
243 _fini(void)
244 {
245 	int	rv;
246 
247 	rv = mod_remove(&modlinkage);
248 	if (rv == DDI_SUCCESS) {
249 		rw_destroy(&bd_lock);
250 		ddi_soft_state_fini(&bd_state);
251 	}
252 	return (rv);
253 }
254 
/* Report module information; standard mod_info passthrough. */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
260 
261 static int
262 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
263 {
264 	bd_t	*bd;
265 	minor_t	inst;
266 
267 	_NOTE(ARGUNUSED(dip));
268 
269 	inst = BDINST((dev_t)arg);
270 
271 	switch (cmd) {
272 	case DDI_INFO_DEVT2DEVINFO:
273 		bd = ddi_get_soft_state(bd_state, inst);
274 		if (bd == NULL) {
275 			return (DDI_FAILURE);
276 		}
277 		*resultp = (void *)bd->d_dip;
278 		break;
279 
280 	case DDI_INFO_DEVT2INSTANCE:
281 		*resultp = (void *)(intptr_t)inst;
282 		break;
283 
284 	default:
285 		return (DDI_FAILURE);
286 	}
287 	return (DDI_SUCCESS);
288 }
289 
290 static void
291 bd_prop_update_inqstring(dev_info_t *dip, char *name, char *data, size_t len)
292 {
293 	int	ilen;
294 	char	*data_string;
295 
296 	ilen = scsi_ascii_inquiry_len(data, len);
297 	ASSERT3U(ilen, <=, len);
298 	if (ilen <= 0)
299 		return;
300 	/* ensure null termination */
301 	data_string = kmem_zalloc(ilen + 1, KM_SLEEP);
302 	bcopy(data, data_string, ilen);
303 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, name, data_string);
304 	kmem_free(data_string, ilen + 1);
305 }
306 
307 static void
308 bd_create_inquiry_props(dev_info_t *dip, bd_drive_t *drive)
309 {
310 	if (drive->d_vendor_len > 0)
311 		bd_prop_update_inqstring(dip, INQUIRY_VENDOR_ID,
312 		    drive->d_vendor, drive->d_vendor_len);
313 
314 	if (drive->d_product_len > 0)
315 		bd_prop_update_inqstring(dip, INQUIRY_PRODUCT_ID,
316 		    drive->d_product, drive->d_product_len);
317 
318 	if (drive->d_serial_len > 0)
319 		bd_prop_update_inqstring(dip, INQUIRY_SERIAL_NO,
320 		    drive->d_serial, drive->d_serial_len);
321 
322 	if (drive->d_revision_len > 0)
323 		bd_prop_update_inqstring(dip, INQUIRY_REVISION_ID,
324 		    drive->d_revision, drive->d_revision_len);
325 }
326 
/*
 * attach(9e) entry point.  Allocates the per-instance soft state,
 * inherits DMA/transfer constraints from the parent handle, creates
 * locks, queues, the transfer cache and kstats, attaches cmlb for
 * labeling, registers a devid if the parent provides one, and exports
 * standard properties.  Teardown on the cmlb failure path mirrors the
 * setup order.
 */
static int
bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int		inst;
	bd_handle_t	hdl;
	bd_t		*bd;
	bd_drive_t	drive;
	int		rv;
	char		name[16];
	char		kcache[32];

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* We don't do anything native for suspend/resume */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	inst = ddi_get_instance(dip);
	/* The parent stashed its bd_handle on our node via bd_attach_handle */
	hdl = ddi_get_parent_data(dip);

	(void) snprintf(name, sizeof (name), "%s%d",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);

	if (hdl == NULL) {
		cmn_err(CE_WARN, "%s: missing parent data!", name);
		return (DDI_FAILURE);
	}

	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
		return (DDI_FAILURE);
	}
	bd = ddi_get_soft_state(bd_state, inst);

	if (hdl->h_dma) {
		bd->d_dma = *(hdl->h_dma);
		/* Never allow granularity smaller than a disk block. */
		bd->d_dma.dma_attr_granular =
		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
		bd->d_use_dma = B_TRUE;

		/*
		 * NOTE(review): bd was just zalloc'd, so d_maxxfer is 0
		 * here and this warning branch appears unreachable --
		 * confirm before relying on it.
		 */
		if (bd->d_maxxfer &&
		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
			cmn_err(CE_WARN,
			    "%s: inconsistent maximum transfer size!",
			    name);
			/* We force it */
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		} else {
			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
		}
	} else {
		bd->d_use_dma = B_FALSE;
		if (bd->d_maxxfer == 0) {
			/* No DMA attributes: default to 1 MB transfers. */
			bd->d_maxxfer = 1024 * 1024;
		}
	}
	bd->d_ops = hdl->h_ops;
	bd->d_private = hdl->h_private;
	bd->d_blkshift = 9;	/* 512 bytes, to start */

	/* Transfer size must be a nonzero multiple of the block size. */
	if (bd->d_maxxfer % DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
	}
	if (bd->d_maxxfer < DEV_BSIZE) {
		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
		ddi_soft_state_free(bd_state, inst);
		return (DDI_FAILURE);
	}

	bd->d_dip = dip;
	bd->d_handle = hdl;
	hdl->h_bd = bd;
	ddi_set_driver_private(dip, bd);

	mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);

	list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
	    offsetof(struct bd_xfer_impl, i_linkage));
	list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
	    offsetof(struct bd_xfer_impl, i_linkage));

	/* Per-device cache: the ctor preallocates a DMA handle per xfer. */
	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);

	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (bd->d_ksp != NULL) {
		bd->d_ksp->ks_lock = &bd->d_iomutex;
		kstat_install(bd->d_ksp);
		bd->d_kiop = bd->d_ksp->ks_data;
	} else {
		/*
		 * Even if we cannot create the kstat, we create a
		 * scratch kstat.  The reason for this is to ensure
		 * that we can update the kstat all of the time,
		 * without adding an extra branch instruction.
		 */
		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
	}

	cmlb_alloc_handle(&bd->d_cmlbh);

	bd->d_state = DKIO_NONE;

	/* Query the parent for drive characteristics. */
	bzero(&drive, sizeof (drive));
	bd->d_ops.o_drive_info(bd->d_private, &drive);
	bd->d_qsize = drive.d_qsize;
	bd->d_removable = drive.d_removable;
	bd->d_hotpluggable = drive.d_hotpluggable;

	/* The drive itself may further restrict the transfer size. */
	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
		bd->d_maxxfer = drive.d_maxxfer;

	bd_create_inquiry_props(dip, &drive);

	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
	    bd->d_removable, bd->d_hotpluggable,
	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
	if (rv != 0) {
		/* Unwind everything created above, in reverse order. */
		cmlb_free_handle(&bd->d_cmlbh);
		kmem_cache_destroy(bd->d_cache);
		mutex_destroy(&bd->d_iomutex);
		mutex_destroy(&bd->d_ocmutex);
		mutex_destroy(&bd->d_statemutex);
		cv_destroy(&bd->d_statecv);
		list_destroy(&bd->d_waitq);
		list_destroy(&bd->d_runq);
		if (bd->d_ksp != NULL) {
			kstat_delete(bd->d_ksp);
			bd->d_ksp = NULL;
		} else {
			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
		}
		ddi_soft_state_free(bd_state, inst);
		return (DDI_FAILURE);
	}

	/* A devid is optional; attach still succeeds if it can't register. */
	if (bd->d_ops.o_devid_init != NULL) {
		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
		if (rv == DDI_SUCCESS) {
			if (ddi_devid_register(dip, bd->d_devid) !=
			    DDI_SUCCESS) {
				cmn_err(CE_WARN,
				    "%s: unable to register devid", name);
			}
		}
	}

	/*
	 * Add a zero-length attribute to tell the world we support
	 * kernel ioctls (for layered drivers).  Also set up properties
	 * used by HAL to identify removable media.
	 */
	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0);
	if (bd->d_removable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "removable-media", NULL, 0);
	}
	if (bd->d_hotpluggable) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    "hotpluggable", NULL, 0);
	}

	ddi_report_dev(dip);

	return (DDI_SUCCESS);
}
505 
/*
 * detach(9e) entry point.  Tears down everything bd_attach() created:
 * kstats, cmlb, devid, the transfer cache, locks and queues, and
 * finally the soft state itself.
 */
static int
bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	bd_t	*bd;

	bd = ddi_get_driver_private(dip);

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
		/* We don't suspend, but our parent does */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
	/* If no real kstat exists, d_kiop is the scratch buffer we alloc'd. */
	if (bd->d_ksp != NULL) {
		kstat_delete(bd->d_ksp);
		bd->d_ksp = NULL;
	} else {
		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
	}
	cmlb_detach(bd->d_cmlbh, 0);
	cmlb_free_handle(&bd->d_cmlbh);
	if (bd->d_devid)
		ddi_devid_free(bd->d_devid);
	kmem_cache_destroy(bd->d_cache);
	mutex_destroy(&bd->d_iomutex);
	mutex_destroy(&bd->d_ocmutex);
	mutex_destroy(&bd->d_statemutex);
	cv_destroy(&bd->d_statecv);
	list_destroy(&bd->d_waitq);
	list_destroy(&bd->d_runq);
	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
	return (DDI_SUCCESS);
}
542 
543 static int
544 bd_xfer_ctor(void *buf, void *arg, int kmflag)
545 {
546 	bd_xfer_impl_t	*xi;
547 	bd_t		*bd = arg;
548 	int		(*dcb)(caddr_t);
549 
550 	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
551 		dcb = DDI_DMA_SLEEP;
552 	} else {
553 		dcb = DDI_DMA_DONTWAIT;
554 	}
555 
556 	xi = buf;
557 	bzero(xi, sizeof (*xi));
558 	xi->i_bd = bd;
559 
560 	if (bd->d_use_dma) {
561 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
562 		    &xi->i_dmah) != DDI_SUCCESS) {
563 			return (-1);
564 		}
565 	}
566 
567 	return (0);
568 }
569 
570 static void
571 bd_xfer_dtor(void *buf, void *arg)
572 {
573 	bd_xfer_impl_t	*xi = buf;
574 
575 	_NOTE(ARGUNUSED(arg));
576 
577 	if (xi->i_dmah)
578 		ddi_dma_free_handle(&xi->i_dmah);
579 	xi->i_dmah = NULL;
580 }
581 
/*
 * Allocate and initialize a transfer context for `bp'.  For non-DMA
 * devices the buffer is mapped into the kernel and windowed manually
 * by d_maxxfer; for DMA devices the buffer is bound to the context's
 * preallocated handle, possibly with multiple DMA windows.  On failure
 * the buf is bioerror()'d and NULL is returned.
 */
static bd_xfer_impl_t *
bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
    int kmflag)
{
	bd_xfer_impl_t		*xi;
	int			rv = 0;
	int			status;
	unsigned		dir;
	int			(*cb)(caddr_t);
	size_t			len;
	uint32_t		shift;

	if (kmflag == KM_SLEEP) {
		cb = DDI_DMA_SLEEP;
	} else {
		cb = DDI_DMA_DONTWAIT;
	}

	xi = kmem_cache_alloc(bd->d_cache, kmflag);
	if (xi == NULL) {
		bioerror(bp, ENOMEM);
		return (NULL);
	}

	ASSERT(bp);

	xi->i_bp = bp;
	xi->i_func = func;
	xi->i_blkno = bp->b_lblkno;

	/* Zero-length requests (e.g. cache flush) carry no data mapping. */
	if (bp->b_bcount == 0) {
		xi->i_len = 0;
		xi->i_nblks = 0;
		xi->i_kaddr = NULL;
		xi->i_resid = 0;
		xi->i_num_win = 0;
		goto done;
	}

	/*
	 * NOTE(review): this overrides the caller-supplied `func' with
	 * o_read/o_write based on B_READ whenever b_bcount != 0.
	 */
	if (bp->b_flags & B_READ) {
		dir = DDI_DMA_READ;
		xi->i_func = bd->d_ops.o_read;
	} else {
		dir = DDI_DMA_WRITE;
		xi->i_func = bd->d_ops.o_write;
	}

	shift = bd->d_blkshift;
	xi->i_blkshift = shift;

	if (!bd->d_use_dma) {
		/* PIO path: map the buffer and window it by d_maxxfer. */
		bp_mapin(bp);
		rv = 0;
		xi->i_offset = 0;
		xi->i_num_win =
		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
		xi->i_cur_win = 0;
		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
		xi->i_nblks = xi->i_len >> shift;
		xi->i_kaddr = bp->b_un.b_addr;
		xi->i_resid = bp->b_bcount;
	} else {

		/*
		 * We have to use consistent DMA if the address is misaligned.
		 */
		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
		} else {
			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
		}

		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
		    NULL, &xi->i_dmac, &xi->i_ndmac);
		switch (status) {
		case DDI_DMA_MAPPED:
			/* Entire buffer fits in a single DMA window. */
			xi->i_num_win = 1;
			xi->i_cur_win = 0;
			xi->i_offset = 0;
			xi->i_len = bp->b_bcount;
			xi->i_nblks = xi->i_len >> shift;
			xi->i_resid = bp->b_bcount;
			rv = 0;
			break;
		case DDI_DMA_PARTIAL_MAP:
			/* Multiple windows: activate the first one now. */
			xi->i_cur_win = 0;

			/* Windows must be whole multiples of the block size. */
			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
			    DDI_SUCCESS) ||
			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
			    &len, &xi->i_dmac, &xi->i_ndmac) !=
			    DDI_SUCCESS) ||
			    (P2PHASE(len, shift) != 0)) {
				(void) ddi_dma_unbind_handle(xi->i_dmah);
				rv = EFAULT;
				goto done;
			}
			xi->i_len = len;
			xi->i_nblks = xi->i_len >> shift;
			xi->i_resid = bp->b_bcount;
			rv = 0;
			break;
		case DDI_DMA_NORESOURCES:
			rv = EAGAIN;
			goto done;
		case DDI_DMA_TOOBIG:
			rv = EINVAL;
			goto done;
		case DDI_DMA_NOMAPPING:
		case DDI_DMA_INUSE:
		default:
			rv = EFAULT;
			goto done;
		}
	}

done:
	if (rv != 0) {
		kmem_cache_free(bd->d_cache, xi);
		bioerror(bp, rv);
		return (NULL);
	}

	return (xi);
}
708 
709 static void
710 bd_xfer_free(bd_xfer_impl_t *xi)
711 {
712 	if (xi->i_dmah) {
713 		(void) ddi_dma_unbind_handle(xi->i_dmah);
714 	}
715 	kmem_cache_free(xi->i_bd->d_cache, xi);
716 }
717 
718 static int
719 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
720 {
721 	dev_t		dev = *devp;
722 	bd_t		*bd;
723 	minor_t		part;
724 	minor_t		inst;
725 	uint64_t	mask;
726 	boolean_t	ndelay;
727 	int		rv;
728 	diskaddr_t	nblks;
729 	diskaddr_t	lba;
730 
731 	_NOTE(ARGUNUSED(credp));
732 
733 	part = BDPART(dev);
734 	inst = BDINST(dev);
735 
736 	if (otyp >= OTYPCNT)
737 		return (EINVAL);
738 
739 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
740 
741 	/*
742 	 * Block any DR events from changing the set of registered
743 	 * devices while we function.
744 	 */
745 	rw_enter(&bd_lock, RW_READER);
746 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
747 		rw_exit(&bd_lock);
748 		return (ENXIO);
749 	}
750 
751 	mutex_enter(&bd->d_ocmutex);
752 
753 	ASSERT(part < 64);
754 	mask = (1U << part);
755 
756 	bd_update_state(bd);
757 
758 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
759 
760 		/* non-blocking opens are allowed to succeed */
761 		if (!ndelay) {
762 			rv = ENXIO;
763 			goto done;
764 		}
765 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
766 	    NULL, NULL, 0) == 0) {
767 
768 		/*
769 		 * We read the partinfo, verify valid ranges.  If the
770 		 * partition is invalid, and we aren't blocking or
771 		 * doing a raw access, then fail. (Non-blocking and
772 		 * raw accesses can still succeed to allow a disk with
773 		 * bad partition data to opened by format and fdisk.)
774 		 */
775 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
776 			rv = ENXIO;
777 			goto done;
778 		}
779 	} else if (!ndelay) {
780 		/*
781 		 * cmlb_partinfo failed -- invalid partition or no
782 		 * disk label.
783 		 */
784 		rv = ENXIO;
785 		goto done;
786 	}
787 
788 	if ((flag & FWRITE) && bd->d_rdonly) {
789 		rv = EROFS;
790 		goto done;
791 	}
792 
793 	if ((bd->d_open_excl) & (mask)) {
794 		rv = EBUSY;
795 		goto done;
796 	}
797 	if (flag & FEXCL) {
798 		if (bd->d_open_lyr[part]) {
799 			rv = EBUSY;
800 			goto done;
801 		}
802 		for (int i = 0; i < OTYP_LYR; i++) {
803 			if (bd->d_open_reg[i] & mask) {
804 				rv = EBUSY;
805 				goto done;
806 			}
807 		}
808 	}
809 
810 	if (otyp == OTYP_LYR) {
811 		bd->d_open_lyr[part]++;
812 	} else {
813 		bd->d_open_reg[otyp] |= mask;
814 	}
815 	if (flag & FEXCL) {
816 		bd->d_open_excl |= mask;
817 	}
818 
819 	rv = 0;
820 done:
821 	mutex_exit(&bd->d_ocmutex);
822 	rw_exit(&bd_lock);
823 
824 	return (rv);
825 }
826 
827 static int
828 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
829 {
830 	bd_t		*bd;
831 	minor_t		inst;
832 	minor_t		part;
833 	uint64_t	mask;
834 	boolean_t	last = B_TRUE;
835 
836 	_NOTE(ARGUNUSED(flag));
837 	_NOTE(ARGUNUSED(credp));
838 
839 	part = BDPART(dev);
840 	inst = BDINST(dev);
841 
842 	ASSERT(part < 64);
843 	mask = (1U << part);
844 
845 	rw_enter(&bd_lock, RW_READER);
846 
847 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
848 		rw_exit(&bd_lock);
849 		return (ENXIO);
850 	}
851 
852 	mutex_enter(&bd->d_ocmutex);
853 	if (bd->d_open_excl & mask) {
854 		bd->d_open_excl &= ~mask;
855 	}
856 	if (otyp == OTYP_LYR) {
857 		bd->d_open_lyr[part]--;
858 	} else {
859 		bd->d_open_reg[otyp] &= ~mask;
860 	}
861 	for (int i = 0; i < 64; i++) {
862 		if (bd->d_open_lyr[part]) {
863 			last = B_FALSE;
864 		}
865 	}
866 	for (int i = 0; last && (i < OTYP_LYR); i++) {
867 		if (bd->d_open_reg[i]) {
868 			last = B_FALSE;
869 		}
870 	}
871 	mutex_exit(&bd->d_ocmutex);
872 
873 	if (last) {
874 		cmlb_invalidate(bd->d_cmlbh, 0);
875 	}
876 	rw_exit(&bd_lock);
877 
878 	return (0);
879 }
880 
/*
 * dump(9e) entry point, used for crash dumps.  Runs the write in
 * polled mode (BD_XFER_POLL) since normal interrupt-driven completion
 * may not be available at dump time.
 */
static int
bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
{
	minor_t		inst;
	minor_t		part;
	diskaddr_t	pstart;
	diskaddr_t	psize;
	bd_t		*bd;
	bd_xfer_impl_t	*xi;
	buf_t		*bp;
	int		rv;

	rw_enter(&bd_lock, RW_READER);

	part = BDPART(dev);
	inst = BDINST(dev);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}
	/*
	 * do cmlb, but do it synchronously unless we already have the
	 * partition (which we probably should.)
	 */
	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
	    (void *)1)) {
		rw_exit(&bd_lock);
		return (ENXIO);
	}

	/* Reject writes that would run past the end of the partition. */
	if ((blkno + nblk) > psize) {
		rw_exit(&bd_lock);
		return (EINVAL);
	}
	bp = getrbuf(KM_NOSLEEP);
	if (bp == NULL) {
		rw_exit(&bd_lock);
		return (ENOMEM);
	}

	bp->b_bcount = nblk << bd->d_blkshift;
	bp->b_resid = bp->b_bcount;
	bp->b_lblkno = blkno;
	bp->b_un.b_addr = caddr;

	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
	if (xi == NULL) {
		rw_exit(&bd_lock);
		freerbuf(bp);
		return (ENOMEM);
	}
	/* Translate the partition-relative block to an absolute LBA. */
	xi->i_blkno = blkno + pstart;
	xi->i_flags = BD_XFER_POLL;
	bd_submit(bd, xi);
	rw_exit(&bd_lock);

	/*
	 * Generally, we should have run this entirely synchronously
	 * at this point and the biowait call should be a no-op.  If
	 * it didn't happen this way, it's a bug in the underlying
	 * driver not honoring BD_XFER_POLL.
	 */
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);
	return (rv);
}
949 
950 void
951 bd_minphys(struct buf *bp)
952 {
953 	minor_t inst;
954 	bd_t	*bd;
955 	inst = BDINST(bp->b_edev);
956 
957 	bd = ddi_get_soft_state(bd_state, inst);
958 
959 	/*
960 	 * In a non-debug kernel, bd_strategy will catch !bd as
961 	 * well, and will fail nicely.
962 	 */
963 	ASSERT(bd);
964 
965 	if (bp->b_bcount > bd->d_maxxfer)
966 		bp->b_bcount = bd->d_maxxfer;
967 }
968 
/* read(9e): route character-device reads through the strategy path. */
static int
bd_read(dev_t dev, struct uio *uio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
}
975 
/* write(9e): route character-device writes through the strategy path. */
static int
bd_write(dev_t dev, struct uio *uio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
}
982 
/* aread(9e): asynchronous read via aphysio; cancellation unsupported. */
static int
bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
}
989 
/* awrite(9e): asynchronous write via aphysio; cancellation unsupported. */
static int
bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	_NOTE(ARGUNUSED(credp));
	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
}
996 
/*
 * strategy(9e) entry point.  Validates the request against the
 * partition map, clips transfers that run past the end of the
 * partition (reporting the clipped bytes in b_resid), builds a
 * transfer context and queues it.  Errors are reported via
 * bioerror()/biodone(); the return value is always 0 per convention.
 */
static int
bd_strategy(struct buf *bp)
{
	minor_t		inst;
	minor_t		part;
	bd_t		*bd;
	diskaddr_t	p_lba;
	diskaddr_t	p_nblks;
	diskaddr_t	b_nblks;
	bd_xfer_impl_t	*xi;
	uint32_t	shift;
	int		(*func)(void *, bd_xfer_t *);

	part = BDPART(bp->b_edev);
	inst = BDINST(bp->b_edev);

	ASSERT(bp);

	bp->b_resid = bp->b_bcount;

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
	    NULL, NULL, 0)) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	shift = bd->d_blkshift;

	/* Request must be block-aligned and start within the partition. */
	if ((P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
	    (bp->b_lblkno > p_nblks)) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}
	b_nblks = bp->b_bcount >> shift;
	/* Start exactly at EOP, or zero length: nothing to do. */
	if ((bp->b_lblkno == p_nblks) || (bp->b_bcount == 0)) {
		biodone(bp);
		return (0);
	}

	/* Clip a transfer that extends past the end of the partition. */
	if ((b_nblks + bp->b_lblkno) > p_nblks) {
		bp->b_resid = ((bp->b_lblkno + b_nblks - p_nblks) << shift);
		bp->b_bcount -= bp->b_resid;
	} else {
		bp->b_resid = 0;
	}
	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;

	/* Retry allocation with KM_PUSHPAGE so pageout can't deadlock. */
	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
	if (xi == NULL) {
		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
	}
	if (xi == NULL) {
		/* bd_request_alloc will have done bioerror */
		biodone(bp);
		return (0);
	}
	/* Translate the partition-relative block to an absolute LBA. */
	xi->i_blkno = bp->b_lblkno + p_lba;

	bd_submit(bd, xi);

	return (0);
}
1067 
/*
 * ioctl(9e) entry point.  Label/partition ioctls are delegated to
 * cmlb first; anything cmlb rejects with ENOTTY is handled here
 * (media info, drive flags, media state, cache flush).  Unknown
 * commands return ENOTTY.
 */
static int
bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
{
	minor_t		inst;
	uint16_t	part;
	bd_t		*bd;
	void		*ptr = (void *)arg;
	int		rv;

	part = BDPART(dev);
	inst = BDINST(dev);

	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
		return (ENXIO);
	}

	/* Give cmlb first crack; it owns label/geometry ioctls. */
	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
	if (rv != ENOTTY)
		return (rv);

	if (rvalp != NULL) {
		/* the return value of the ioctl is 0 by default */
		*rvalp = 0;
	}

	switch (cmd) {
	case DKIOCGMEDIAINFO: {
		struct dk_minfo minfo;

		/* make sure our state information is current */
		bd_update_state(bd);
		bzero(&minfo, sizeof (minfo));
		minfo.dki_media_type = DK_FIXED_DISK;
		minfo.dki_lbsize = (1U << bd->d_blkshift);
		minfo.dki_capacity = bd->d_numblks;
		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCGMEDIAINFOEXT: {
		struct dk_minfo_ext miext;

		/* make sure our state information is current */
		bd_update_state(bd);
		bzero(&miext, sizeof (miext));
		miext.dki_media_type = DK_FIXED_DISK;
		miext.dki_lbsize = (1U << bd->d_blkshift);
		miext.dki_pbsize = (1U << bd->d_pblkshift);
		miext.dki_capacity = bd->d_numblks;
		if (ddi_copyout(&miext, ptr, sizeof (miext), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCINFO: {
		/* Controller info: report our parent as the controller. */
		struct dk_cinfo cinfo;
		bzero(&cinfo, sizeof (cinfo));
		cinfo.dki_ctype = DKC_BLKDEV;
		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
		    "%s", ddi_driver_name(bd->d_dip));
		cinfo.dki_unit = inst;
		cinfo.dki_flags = DKI_FMTVOL;
		cinfo.dki_partition = part;
		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
		cinfo.dki_addr = 0;
		cinfo.dki_slave = 0;
		cinfo.dki_space = 0;
		cinfo.dki_prio = 0;
		cinfo.dki_vec = 0;
		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCREMOVABLE: {
		int i;
		i = bd->d_removable ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCHOTPLUGGABLE: {
		int i;
		i = bd->d_hotpluggable ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCREADONLY: {
		int i;
		i = bd->d_rdonly ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCSOLIDSTATE: {
		int i;
		i = bd->d_ssd ? 1 : 0;
		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCSTATE: {
		/* Block until the media state differs from the one passed in. */
		enum dkio_state	state;
		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
			return (EFAULT);
		}
		if ((rv = bd_check_state(bd, &state)) != 0) {
			return (rv);
		}
		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
			return (EFAULT);
		}
		return (0);
	}
	case DKIOCFLUSHWRITECACHE: {
		struct dk_callback *dkc = NULL;

		/* The callback pointer is only trustworthy from kernel callers. */
		if (flag & FKIOCTL)
			dkc = (void *)arg;

		rv = bd_flush_write_cache(bd, dkc);
		return (rv);
	}

	default:
		break;

	}
	return (ENOTTY);
}
1207 
1208 static int
1209 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1210     char *name, caddr_t valuep, int *lengthp)
1211 {
1212 	bd_t	*bd;
1213 
1214 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1215 	if (bd == NULL)
1216 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1217 		    name, valuep, lengthp));
1218 
1219 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1220 	    valuep, lengthp, BDPART(dev), 0));
1221 }
1222 
1223 
/*
 * cmlb target read/write entry point, used for label and partition
 * table I/O.  Synchronously transfers `length' bytes (which must be a
 * whole number of device blocks) starting at block address `start',
 * using the parent driver's read or write routine.  A non-NULL
 * tg_cookie means polled mode (e.g. dump(9e)): allocations must not
 * sleep and the transfer is flagged BD_XFER_POLL.
 */
static int
bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
    size_t length, void *tg_cookie)
{
	bd_t		*bd;
	buf_t		*bp;
	bd_xfer_impl_t	*xi;
	int		rv;
	int		(*func)(void *, bd_xfer_t *);
	int		kmflag;

	/*
	 * If we are running in polled mode (such as during dump(9e)
	 * execution), then we cannot sleep for kernel allocations.
	 */
	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;

	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));

	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
		/* We can only transfer whole blocks at a time! */
		return (EINVAL);
	}

	if ((bp = getrbuf(kmflag)) == NULL) {
		return (ENOMEM);
	}

	/* Map the direction onto the driver's read or write entry point. */
	switch (cmd) {
	case TG_READ:
		bp->b_flags = B_READ;
		func = bd->d_ops.o_read;
		break;
	case TG_WRITE:
		bp->b_flags = B_WRITE;
		func = bd->d_ops.o_write;
		break;
	default:
		freerbuf(bp);
		return (EINVAL);
	}

	bp->b_un.b_addr = bufaddr;
	bp->b_bcount = length;
	xi = bd_xfer_alloc(bd, bp, func, kmflag);
	if (xi == NULL) {
		/* bd_xfer_alloc() records its error on the buf. */
		rv = geterror(bp);
		freerbuf(bp);
		return (rv);
	}
	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
	xi->i_blkno = start;
	bd_submit(bd, xi);
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);

	return (rv);
}
1283 
1284 static int
1285 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1286 {
1287 	bd_t		*bd;
1288 
1289 	_NOTE(ARGUNUSED(tg_cookie));
1290 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1291 
1292 	switch (cmd) {
1293 	case TG_GETPHYGEOM:
1294 	case TG_GETVIRTGEOM:
1295 		/*
1296 		 * We don't have any "geometry" as such, let cmlb
1297 		 * fabricate something.
1298 		 */
1299 		return (ENOTTY);
1300 
1301 	case TG_GETCAPACITY:
1302 		bd_update_state(bd);
1303 		*(diskaddr_t *)arg = bd->d_numblks;
1304 		return (0);
1305 
1306 	case TG_GETBLOCKSIZE:
1307 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1308 		return (0);
1309 
1310 	case TG_GETATTR:
1311 		/*
1312 		 * It turns out that cmlb really doesn't do much for
1313 		 * non-writable media, but lets make the information
1314 		 * available for it in case it does more in the
1315 		 * future.  (The value is currently used for
1316 		 * triggering special behavior for CD-ROMs.)
1317 		 */
1318 		bd_update_state(bd);
1319 		((tg_attribute_t *)arg)->media_is_writable =
1320 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1321 		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1322 		return (0);
1323 
1324 	default:
1325 		return (EINVAL);
1326 	}
1327 }
1328 
1329 
/*
 * Move transfers from the wait queue to the run queue and hand them to
 * the parent driver, honoring the device's queue depth (d_qsize).
 * Called whenever a slot may have opened up (on submission and on
 * completion).
 */
static void
bd_sched(bd_t *bd)
{
	bd_xfer_impl_t	*xi;
	struct buf	*bp;
	int		rv;

	mutex_enter(&bd->d_iomutex);

	while ((bd->d_qactive < bd->d_qsize) &&
	    ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
		bd->d_qactive++;
		kstat_waitq_to_runq(bd->d_kiop);
		list_insert_tail(&bd->d_runq, xi);

		/*
		 * Submit the job to the driver.  We drop the I/O mutex
		 * so that we can deal with the case where the driver
		 * completion routine calls back into us synchronously.
		 */

		mutex_exit(&bd->d_iomutex);

		rv = xi->i_func(bd->d_private, &xi->i_public);
		if (rv != 0) {
			/*
			 * Immediate failure: fail the buf here and
			 * undo the run-queue accounting done above,
			 * since the driver will never call
			 * bd_xfer_done() for this transfer.
			 */
			bp = xi->i_bp;
			bioerror(bp, rv);
			biodone(bp);

			mutex_enter(&bd->d_iomutex);
			bd->d_qactive--;
			kstat_runq_exit(bd->d_kiop);
			list_remove(&bd->d_runq, xi);
			bd_xfer_free(xi);
		} else {
			mutex_enter(&bd->d_iomutex);
		}
	}

	mutex_exit(&bd->d_iomutex);
}
1371 
/*
 * Enqueue a prepared transfer on the wait queue (with wait-queue kstat
 * accounting), then let bd_sched() dispatch it if a queue slot is free.
 */
static void
bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
{
	mutex_enter(&bd->d_iomutex);
	list_insert_tail(&bd->d_waitq, xi);
	kstat_waitq_enter(bd->d_kiop);
	mutex_exit(&bd->d_iomutex);

	bd_sched(bd);
}
1382 
/*
 * Retire a transfer from the run queue when it has completed (for good
 * or ill).  On success, credit the completed bytes and operation count
 * to the I/O kstats; errors are accounted by the caller.  Reschedules
 * afterwards, since a queue slot has just freed up.
 */
static void
bd_runq_exit(bd_xfer_impl_t *xi, int err)
{
	bd_t	*bd = xi->i_bd;
	buf_t	*bp = xi->i_bp;

	mutex_enter(&bd->d_iomutex);
	bd->d_qactive--;
	kstat_runq_exit(bd->d_kiop);
	list_remove(&bd->d_runq, xi);
	mutex_exit(&bd->d_iomutex);

	if (err == 0) {
		if (bp->b_flags & B_READ) {
			bd->d_kiop->reads++;
			bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
		} else {
			bd->d_kiop->writes++;
			bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
		}
	}
	bd_sched(bd);
}
1406 
/*
 * Refresh the cached media state (block size, capacity, read-only and
 * solid-state flags) from the driver's o_media_info entry point.  If
 * the media is absent or reports an unusable block size, the state
 * becomes DKIO_EJECTED and the capacity is zeroed.  On any state
 * change, waiters on d_statecv are woken; on state or geometry change
 * the cmlb label is revalidated (or invalidated).
 */
static void
bd_update_state(bd_t *bd)
{
	enum	dkio_state	state = DKIO_INSERTED;
	boolean_t		docmlb = B_FALSE;
	bd_media_t		media;

	bzero(&media, sizeof (media));

	mutex_enter(&bd->d_statemutex);
	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
		bd->d_numblks = 0;
		state = DKIO_EJECTED;
		goto done;
	}

	/*
	 * The logical block size must be at least 512 bytes, a power of
	 * two, and must evenly divide the maximum transfer size.
	 */
	if ((media.m_blksize < 512) ||
	    (!ISP2(media.m_blksize)) ||
	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
		cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)",
		    ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip),
		    media.m_blksize);
		/*
		 * We can't use the media, treat it as not present.
		 */
		state = DKIO_EJECTED;
		bd->d_numblks = 0;
		goto done;
	}

	if (((1U << bd->d_blkshift) != media.m_blksize) ||
	    (bd->d_numblks != media.m_nblks)) {
		/* Device size changed */
		docmlb = B_TRUE;
	}

	/* ddi_ffs() of a power of two gives log2 + 1. */
	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
	bd->d_pblkshift = bd->d_blkshift;
	bd->d_numblks = media.m_nblks;
	bd->d_rdonly = media.m_readonly;
	bd->d_ssd = media.m_solidstate;

	/*
	 * Only use the supplied physical block size if it is non-zero,
	 * greater or equal to the block size, and a power of 2. Ignore it
	 * if not, it's just informational and we can still use the media.
	 */
	if ((media.m_pblksize != 0) &&
	    (media.m_pblksize >= media.m_blksize) &&
	    (ISP2(media.m_pblksize)))
		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;

done:
	if (state != bd->d_state) {
		bd->d_state = state;
		cv_broadcast(&bd->d_statecv);
		docmlb = B_TRUE;
	}
	mutex_exit(&bd->d_statemutex);

	/* cmlb calls are made without holding d_statemutex. */
	if (docmlb) {
		if (state == DKIO_INSERTED) {
			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
		} else {
			cmlb_invalidate(bd->d_cmlbh, 0);
		}
	}
}
1475 
/*
 * Back end for DKIOCSTATE: block until the media state differs from
 * *state, then return the new state through *state.  The device is
 * re-polled (bd_update_state()) roughly once a second in addition to
 * waiting on the state-change condition variable.  Returns EINTR if
 * the wait is interrupted by a signal.
 */
static int
bd_check_state(bd_t *bd, enum dkio_state *state)
{
	clock_t		when;

	for (;;) {

		bd_update_state(bd);

		mutex_enter(&bd->d_statemutex);

		if (bd->d_state != *state) {
			*state = bd->d_state;
			mutex_exit(&bd->d_statemutex);
			break;
		}

		/* Wait up to one second for a state-change broadcast. */
		when = drv_usectohz(1000000);
		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
		    when, TR_CLOCK_TICK) == 0) {
			/* A return of 0 means we took a signal. */
			mutex_exit(&bd->d_statemutex);
			return (EINTR);
		}

		mutex_exit(&bd->d_statemutex);
	}

	return (0);
}
1505 
1506 static int
1507 bd_flush_write_cache_done(struct buf *bp)
1508 {
1509 	struct dk_callback *dc = (void *)bp->b_private;
1510 
1511 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1512 	kmem_free(dc, sizeof (*dc));
1513 	freerbuf(bp);
1514 	return (0);
1515 }
1516 
/*
 * Implement DKIOCFLUSHWRITECACHE via the driver's o_sync_cache entry
 * point.  If a dk_callback is supplied (only possible for FKIOCTL
 * callers, see bd_ioctl()), the flush is issued asynchronously and
 * bd_flush_write_cache_done() later invokes the callback and frees the
 * buf; otherwise we wait for completion here.  Returns ENOTSUP when
 * the driver provides no cache-sync support.
 */
static int
bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
{
	buf_t			*bp;
	struct dk_callback	*dc;
	bd_xfer_impl_t		*xi;
	int			rv;

	if (bd->d_ops.o_sync_cache == NULL) {
		return (ENOTSUP);
	}
	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
		return (ENOMEM);
	}
	/* A cache sync transfers no data. */
	bp->b_resid = 0;
	bp->b_bcount = 0;

	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
	if (xi == NULL) {
		rv = geterror(bp);
		freerbuf(bp);
		return (rv);
	}

	/* Make an asynchronous flush, but only if there is a callback */
	if (dkc != NULL && dkc->dkc_callback != NULL) {
		/* Make a private copy of the callback structure */
		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
		*dc = *dkc;
		bp->b_private = dc;
		bp->b_iodone = bd_flush_write_cache_done;

		/* dc and bp are freed by bd_flush_write_cache_done(). */
		bd_submit(bd, xi);
		return (0);
	}

	/* In case there is no callback, perform a synchronous flush */
	bd_submit(bd, xi);
	(void) biowait(bp);
	rv = geterror(bp);
	freerbuf(bp);

	return (rv);
}
1561 
1562 /*
1563  * Nexus support.
1564  */
1565 int
1566 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1567     void *arg, void *result)
1568 {
1569 	bd_handle_t	hdl;
1570 
1571 	switch (ctlop) {
1572 	case DDI_CTLOPS_REPORTDEV:
1573 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1574 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
1575 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
1576 		return (DDI_SUCCESS);
1577 
1578 	case DDI_CTLOPS_INITCHILD:
1579 		hdl = ddi_get_parent_data((dev_info_t *)arg);
1580 		if (hdl == NULL) {
1581 			return (DDI_NOT_WELL_FORMED);
1582 		}
1583 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1584 		return (DDI_SUCCESS);
1585 
1586 	case DDI_CTLOPS_UNINITCHILD:
1587 		ddi_set_name_addr((dev_info_t *)arg, NULL);
1588 		ndi_prop_remove_all((dev_info_t *)arg);
1589 		return (DDI_SUCCESS);
1590 
1591 	default:
1592 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1593 	}
1594 }
1595 
1596 /*
1597  * Functions for device drivers.
1598  */
1599 bd_handle_t
1600 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1601 {
1602 	bd_handle_t	hdl;
1603 
1604 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1605 	if (hdl != NULL) {
1606 		hdl->h_ops = *ops;
1607 		hdl->h_dma = dma;
1608 		hdl->h_private = private;
1609 	}
1610 
1611 	return (hdl);
1612 }
1613 
/*
 * Release a handle obtained from bd_alloc_handle().
 *
 * NOTE(review): this does not touch h_child; presumably callers are
 * expected to call bd_detach_handle() first — verify against callers.
 */
void
bd_free_handle(bd_handle_t hdl)
{
	kmem_free(hdl, sizeof (*hdl));
}
1619 
/*
 * Create and online the "blkdev" child devinfo node for a drive that a
 * parent driver registered via bd_alloc_handle().  The unit address is
 * derived from the target (and LUN, if the driver reports one) in the
 * bd_drive_t filled in by o_drive_info.
 */
int
bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
{
	dev_info_t	*child;
	bd_drive_t	drive = { 0 };

	/* if drivers don't override this, make it assume none */
	drive.d_lun = -1;
	hdl->h_ops.o_drive_info(hdl->h_private, &drive);

	hdl->h_parent = dip;
	hdl->h_name = "blkdev";

	if (drive.d_lun >= 0) {
		/* Address is "<target>,<lun>" when a LUN is reported. */
		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X,%X",
		    drive.d_target, drive.d_lun);
	} else {
		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X",
		    drive.d_target);
	}
	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
	    &child) != NDI_SUCCESS) {
		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
		    ddi_driver_name(dip), ddi_get_instance(dip),
		    "blkdev", hdl->h_addr);
		return (DDI_FAILURE);
	}

	/* Lets bd_bus_ctl() find the handle during DDI_CTLOPS_INITCHILD. */
	ddi_set_parent_data(child, hdl);
	hdl->h_child = child;

	if (ndi_devi_online(child, 0) == NDI_FAILURE) {
		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
		    ddi_driver_name(dip), ddi_get_instance(dip),
		    hdl->h_name, hdl->h_addr);
		/*
		 * NOTE(review): h_child is left pointing at the node
		 * freed below; a later bd_detach_handle() would see a
		 * stale pointer — confirm callers never detach after a
		 * failed attach.
		 */
		(void) ndi_devi_free(child);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}
1661 
1662 int
1663 bd_detach_handle(bd_handle_t hdl)
1664 {
1665 	int	circ;
1666 	int	rv;
1667 	char	*devnm;
1668 
1669 	if (hdl->h_child == NULL) {
1670 		return (DDI_SUCCESS);
1671 	}
1672 	ndi_devi_enter(hdl->h_parent, &circ);
1673 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
1674 		rv = ddi_remove_child(hdl->h_child, 0);
1675 	} else {
1676 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
1677 		(void) ddi_deviname(hdl->h_child, devnm);
1678 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
1679 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
1680 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
1681 		kmem_free(devnm, MAXNAMELEN + 1);
1682 	}
1683 	if (rv == 0) {
1684 		hdl->h_child = NULL;
1685 	}
1686 
1687 	ndi_devi_exit(hdl->h_parent, circ);
1688 	return (rv = NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
1689 }
1690 
1691 void
1692 bd_xfer_done(bd_xfer_t *xfer, int err)
1693 {
1694 	bd_xfer_impl_t	*xi = (void *)xfer;
1695 	buf_t		*bp = xi->i_bp;
1696 	int		rv = DDI_SUCCESS;
1697 	bd_t		*bd = xi->i_bd;
1698 	size_t		len;
1699 
1700 	if (err != 0) {
1701 		bd_runq_exit(xi, err);
1702 
1703 		bp->b_resid += xi->i_resid;
1704 		bd_xfer_free(xi);
1705 		bioerror(bp, err);
1706 		biodone(bp);
1707 		return;
1708 	}
1709 
1710 	xi->i_cur_win++;
1711 	xi->i_resid -= xi->i_len;
1712 
1713 	if (xi->i_resid == 0) {
1714 		/* Job completed succcessfully! */
1715 		bd_runq_exit(xi, 0);
1716 
1717 		bd_xfer_free(xi);
1718 		biodone(bp);
1719 		return;
1720 	}
1721 
1722 	xi->i_blkno += xi->i_nblks;
1723 
1724 	if (bd->d_use_dma) {
1725 		/* More transfer still pending... advance to next DMA window. */
1726 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
1727 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
1728 	} else {
1729 		/* Advance memory window. */
1730 		xi->i_kaddr += xi->i_len;
1731 		xi->i_offset += xi->i_len;
1732 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
1733 	}
1734 
1735 
1736 	if ((rv != DDI_SUCCESS) ||
1737 	    (P2PHASE(len, (1U << xi->i_blkshift) != 0))) {
1738 		bd_runq_exit(xi, EFAULT);
1739 
1740 		bp->b_resid += xi->i_resid;
1741 		bd_xfer_free(xi);
1742 		bioerror(bp, EFAULT);
1743 		biodone(bp);
1744 		return;
1745 	}
1746 	xi->i_len = len;
1747 	xi->i_nblks = len >> xi->i_blkshift;
1748 
1749 	/* Submit next window to hardware. */
1750 	rv = xi->i_func(bd->d_private, &xi->i_public);
1751 	if (rv != 0) {
1752 		bd_runq_exit(xi, rv);
1753 
1754 		bp->b_resid += xi->i_resid;
1755 		bd_xfer_free(xi);
1756 		bioerror(bp, rv);
1757 		biodone(bp);
1758 	}
1759 }
1760 
1761 void
1762 bd_state_change(bd_handle_t hdl)
1763 {
1764 	bd_t		*bd;
1765 
1766 	if ((bd = hdl->h_bd) != NULL) {
1767 		bd_update_state(bd);
1768 	}
1769 }
1770 
/*
 * Install the blkdev bus_ops vector into a parent driver's dev_ops so
 * that its "blkdev" child nodes can be managed (see bd_bus_ctl()).
 * Intended to be called from the parent driver's module setup.
 */
void
bd_mod_init(struct dev_ops *devops)
{
	static struct bus_ops bd_bus_ops = {
		BUSO_REV,		/* busops_rev */
		nullbusmap,		/* bus_map */
		NULL,			/* bus_get_intrspec (OBSOLETE) */
		NULL,			/* bus_add_intrspec (OBSOLETE) */
		NULL,			/* bus_remove_intrspec (OBSOLETE) */
		i_ddi_map_fault,	/* bus_map_fault */
		NULL,			/* bus_dma_map (OBSOLETE) */
		ddi_dma_allochdl,	/* bus_dma_allochdl */
		ddi_dma_freehdl,	/* bus_dma_freehdl */
		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
		ddi_dma_flush,		/* bus_dma_flush */
		ddi_dma_win,		/* bus_dma_win */
		ddi_dma_mctl,		/* bus_dma_ctl */
		bd_bus_ctl,		/* bus_ctl */
		ddi_bus_prop_op,	/* bus_prop_op */
		NULL,			/* bus_get_eventcookie */
		NULL,			/* bus_add_eventcall */
		NULL,			/* bus_remove_eventcall */
		NULL,			/* bus_post_event */
		NULL,			/* bus_intr_ctl (OBSOLETE) */
		NULL,			/* bus_config */
		NULL,			/* bus_unconfig */
		NULL,			/* bus_fm_init */
		NULL,			/* bus_fm_fini */
		NULL,			/* bus_fm_access_enter */
		NULL,			/* bus_fm_access_exit */
		NULL,			/* bus_power */
		NULL,			/* bus_intr_op */
	};

	devops->devo_bus_ops = &bd_bus_ops;

	/*
	 * NB: The device driver is free to supply its own
	 * character entry device support.
	 */
}
1813 
/*
 * Undo bd_mod_init(): detach the blkdev bus_ops vector from the parent
 * driver's dev_ops.
 */
void
bd_mod_fini(struct dev_ops *devops)
{
	devops->devo_bus_ops = NULL;
}
1819