1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24 * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25 * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26 * Copyright 2017 The MathWorks, Inc.  All rights reserved.
27 * Copyright 2019 Western Digital Corporation.
28 * Copyright 2020 Joyent, Inc.
29 */
30
31#include <sys/types.h>
32#include <sys/ksynch.h>
33#include <sys/kmem.h>
34#include <sys/file.h>
35#include <sys/errno.h>
36#include <sys/open.h>
37#include <sys/buf.h>
38#include <sys/uio.h>
39#include <sys/aio_req.h>
40#include <sys/cred.h>
41#include <sys/modctl.h>
42#include <sys/cmlb.h>
43#include <sys/conf.h>
44#include <sys/devops.h>
45#include <sys/list.h>
46#include <sys/sysmacros.h>
47#include <sys/dkio.h>
48#include <sys/dkioc_free_util.h>
49#include <sys/vtoc.h>
50#include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
51#include <sys/kstat.h>
52#include <sys/fs/dv_node.h>
53#include <sys/ddi.h>
54#include <sys/sunddi.h>
55#include <sys/note.h>
56#include <sys/blkdev.h>
57#include <sys/scsi/impl/inquiry.h>
58
59/*
60 * blkdev is a driver which provides a lot of the common functionality
61 * a block device driver may need and helps by removing code which
62 * is frequently duplicated in block device drivers.
63 *
64 * Within this driver all the struct cb_ops functions required for a
65 * block device driver are written with appropriate call back functions
66 * to be provided by the parent driver.
67 *
68 * To use blkdev, a driver needs to:
69 *	1. Create a bd_ops_t structure which has the call back operations
70 *	   blkdev will use.
71 *	2. Create a handle by calling bd_alloc_handle(). One of the
72 *	   arguments to this function is the bd_ops_t.
73 *	3. Call bd_attach_handle(). This will instantiate a blkdev device
74 *	   as a child device node of the calling driver.
75 *
76 * A parent driver is not restricted to just allocating and attaching a
77 * single instance, it may attach as many as it wishes. For each handle
78 * attached, appropriate entries in /dev/[r]dsk are created.
79 *
 * The bd_ops_t routines that a parent of blkdev needs to provide are:
81 *
82 * o_drive_info: Provide information to blkdev such as how many I/O queues
83 *		 to create and the size of those queues. Also some device
84 *		 specifics such as EUI, vendor, product, model, serial
85 *		 number ....
86 *
 * o_media_info: Provide information about the media, e.g. size and block
 *		 size.
88 *
89 * o_devid_init: Creates and initializes the device id. Typically calls
90 *		 ddi_devid_init().
91 *
92 * o_sync_cache: Issues a device appropriate command to flush any write
93 *		 caches.
94 *
95 * o_read:	 Read data as described by bd_xfer_t argument.
96 *
97 * o_write:	 Write data as described by bd_xfer_t argument.
98 *
99 * o_free_space: Free the space described by bd_xfer_t argument (optional).
100 *
101 * Queues
102 * ------
103 * Part of the drive_info data is a queue count. blkdev will create
104 * "queue count" number of waitq/runq pairs. Each waitq/runq pair
105 * operates independently. As an I/O is scheduled up to the parent
106 * driver via o_read or o_write its queue number is given. If the
107 * parent driver supports multiple hardware queues it can then select
108 * where to submit the I/O request.
109 *
110 * Currently blkdev uses a simplistic round-robin queue selection method.
111 * It has the advantage that it is lockless. In the future it will be
112 * worthwhile reviewing this strategy for something which prioritizes queues
113 * depending on how busy they are.
114 *
115 * Each waitq/runq pair is protected by its mutex (q_iomutex). Incoming
116 * I/O requests are initially added to the waitq. They are taken off the
117 * waitq, added to the runq and submitted, providing the runq is less
118 * than the qsize as specified in the drive_info. As an I/O request
119 * completes, the parent driver is required to call bd_xfer_done(), which
120 * will remove the I/O request from the runq and pass I/O completion
121 * status up the stack.
122 *
123 * Locks
124 * -----
 * There are 4 instance global locks: d_ocmutex, d_ksmutex, d_errmutex and
 * d_statemutex. In addition there is one q_iomutex per waitq/runq pair.
127 *
128 * Lock Hierarchy
129 * --------------
130 * The only two locks which may be held simultaneously are q_iomutex and
131 * d_ksmutex. In all cases q_iomutex must be acquired before d_ksmutex.
132 */
133
/*
 * Minor number layout: each instance owns BD_MAXPART consecutive minor
 * numbers, so the instance is minor / BD_MAXPART and the partition is
 * minor % BD_MAXPART.
 */
#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)

/* Short names for the driver-private structures defined below. */
typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
typedef struct bd_queue bd_queue_t;
141
/*
 * Per-instance soft state.  One of these is zero-allocated through
 * bd_state for every instance in bd_attach() and released in bd_detach().
 */
struct bd {
	/* parent's private data (copied from the handle); passed to d_ops */
	void		*d_private;
	dev_info_t	*d_dip;
	/* held by bd_open() while the d_open_* bookkeeping is updated */
	kmutex_t	d_ocmutex;
	/* installed as ks_lock of the I/O kstat (d_ksp) */
	kmutex_t	d_ksmutex;
	/* installed as ks_lock of the error kstat (d_errstats) */
	kmutex_t	d_errmutex;
	kmutex_t	d_statemutex;
	kcondvar_t	d_statecv;
	enum dkio_state	d_state;	/* starts out as DKIO_NONE */
	cmlb_handle_t	d_cmlbh;
	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
	uint64_t	d_open_excl;	/* bit mask indexed by partition */
	/* one partition bit mask per (non-layered) open type */
	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */
	uint64_t	d_io_counter;

	/* number of entries in d_queues, as reported by o_drive_info */
	uint32_t	d_qcount;
	uint32_t	d_qactive;
	/* largest single transfer in bytes; a multiple of DEV_BSIZE */
	uint32_t	d_maxxfer;
	/* log2 of the block size; initialised to DEV_BSHIFT in bd_attach() */
	uint32_t	d_blkshift;
	uint32_t	d_pblkshift;
	uint64_t	d_numblks;
	ddi_devid_t	d_devid;

	/* free-space request limits copied from bd_drive_t in bd_attach() */
	uint64_t	d_max_free_seg;
	uint64_t	d_max_free_blks;
	uint64_t	d_max_free_seg_blks;
	uint64_t	d_free_align;

	/* kmem cache of bd_xfer_impl_t objects for this instance */
	kmem_cache_t	*d_cache;
	bd_queue_t	*d_queues;
	/* I/O kstat and its data; d_kiop is a scratch buffer if d_ksp is NULL */
	kstat_t		*d_ksp;
	kstat_io_t	*d_kiop;
	/* error kstat and its data; d_kerr is scratch if d_errstats is NULL */
	kstat_t		*d_errstats;
	struct bd_errstats *d_kerr;

	boolean_t	d_rdonly;
	boolean_t	d_ssd;
	boolean_t	d_removable;
	boolean_t	d_hotpluggable;
	/* B_TRUE when the handle supplied DMA attributes */
	boolean_t	d_use_dma;

	/* private copies of the handle's DMA attributes and ops vector */
	ddi_dma_attr_t	d_dma;
	bd_ops_t	d_ops;
	bd_handle_t	d_handle;
};
187
/*
 * Handle shared between the parent driver and blkdev.  bd_attach() finds
 * it through ddi_get_parent_data() and copies h_ops, h_private and (when
 * non-NULL) *h_dma into the instance soft state.
 */
struct bd_handle {
	bd_ops_t	h_ops;
	ddi_dma_attr_t	*h_dma;		/* NULL means "do not use DMA" */
	dev_info_t	*h_parent;
	dev_info_t	*h_child;
	void		*h_private;
	bd_t		*h_bd;		/* set by bd_attach() */
	char		*h_name;
	char		h_addr[30];	/* enough for w%0.16x,%X */
};
198
/*
 * Driver-private wrapper around the public bd_xfer_t.  Objects come from
 * the per-instance d_cache (see bd_xfer_ctor()/bd_xfer_dtor()) and are
 * linked onto a queue's waitq or runq through i_linkage.
 */
struct bd_xfer_impl {
	bd_xfer_t	i_public;	/* accessed via the i_* macros below */
	list_node_t	i_linkage;
	bd_t		*i_bd;		/* owning instance, set by the ctor */
	buf_t		*i_bp;
	bd_queue_t	*i_bq;
	uint_t		i_num_win;	/* number of transfer windows */
	uint_t		i_cur_win;	/* index of the current window */
	off_t		i_offset;
	int		(*i_func)(void *, bd_xfer_t *);
	uint32_t	i_blkshift;
	size_t		i_len;		/* bytes in the current window */
	size_t		i_resid;	/* starts at the buf's b_bcount */
};
213
/*
 * One waitq/runq pair.  bd_attach() creates d_qcount of these per
 * instance; q_qsize is taken from the d_qsize reported by o_drive_info.
 */
struct bd_queue {
	kmutex_t	q_iomutex;
	uint32_t	q_qsize;
	uint32_t	q_qactive;
	list_t		q_runq;
	list_t		q_waitq;
};
221
/*
 * Shorthand so that members of the embedded public bd_xfer_t can be
 * accessed directly on a bd_xfer_impl_t.
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
#define	i_qnum		i_public.x_qnum
#define	i_dfl		i_public.x_dfl

/* B_TRUE when the parent driver supplied the optional o_free_space op. */
#define	CAN_FREESPACE(bd) \
	(((bd)->d_ops.o_free_space == NULL) ? B_FALSE : B_TRUE)
234
/*
 * Private prototypes.
 */

/* Inquiry property and error kstat helpers. */
static void bd_prop_update_inqstring(dev_info_t *, char *, char *, size_t);
static void bd_create_inquiry_props(dev_info_t *, bd_drive_t *);
static void bd_create_errstats(bd_t *, int, bd_drive_t *);
static void bd_destroy_errstats(bd_t *);
static void bd_errstats_setstr(kstat_named_t *, char *, size_t, char *);
static void bd_init_errstats(bd_t *, bd_drive_t *);
static void bd_fini_errstats(bd_t *);

/* Entry points referenced from bd_dev_ops. */
static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
static int bd_detach(dev_info_t *, ddi_detach_cmd_t);

/* Entry points referenced from bd_cb_ops. */
static int bd_open(dev_t *, int, int, cred_t *);
static int bd_close(dev_t, int, int, cred_t *);
static int bd_strategy(struct buf *);
static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int bd_dump(dev_t, caddr_t, daddr_t, int);
static int bd_read(dev_t, struct uio *, cred_t *);
static int bd_write(dev_t, struct uio *, cred_t *);
static int bd_aread(dev_t, struct aio_req *, cred_t *);
static int bd_awrite(dev_t, struct aio_req *, cred_t *);
static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
    caddr_t, int *);

/* cmlb target ops (see bd_tg_ops) and internal helpers. */
static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
    void *);
static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
static int bd_xfer_ctor(void *, void *, int);
static void bd_xfer_dtor(void *, void *);
static void bd_sched(bd_t *, bd_queue_t *);
static void bd_submit(bd_t *, bd_xfer_impl_t *);
static void bd_runq_exit(bd_xfer_impl_t *, int);
static void bd_update_state(bd_t *);
static int bd_check_state(bd_t *, enum dkio_state *);
static int bd_flush_write_cache(bd_t *, struct dk_callback *);
static int bd_check_uio(dev_t, struct uio *);
static int bd_free_space(dev_t, bd_t *, dkioc_free_list_t *);
276
/* Target ops vector handed to cmlb_attach() in bd_attach(). */
struct cmlb_tg_ops bd_tg_ops = {
	TG_DK_OPS_VERSION_1,
	bd_tg_rdwr,
	bd_tg_getinfo,
};
282
/* Character/block entry points; referenced from bd_dev_ops below. */
static struct cb_ops bd_cb_ops = {
	bd_open,		/* open */
	bd_close,		/* close */
	bd_strategy,		/* strategy */
	nodev,			/* print */
	bd_dump,		/* dump */
	bd_read,		/* read */
	bd_write,		/* write */
	bd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	bd_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_64BIT | D_MP,		/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	bd_aread,		/* async read */
	bd_awrite		/* async write */
};
303
/* Device operations; referenced from modldrv below. */
struct dev_ops bd_dev_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	bd_getinfo,		/* getinfo */
	nulldev,		/* identify */
	nulldev,		/* probe */
	bd_attach,		/* attach */
	bd_detach,		/* detach */
	nodev,			/* reset */
	&bd_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};
318
/* Driver linkage: module type, description string and dev_ops. */
static struct modldrv modldrv = {
	&mod_driverops,
	"Generic Block Device",
	&bd_dev_ops,
};
324
/* Module linkage passed to mod_install()/mod_remove()/mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, { &modldrv, NULL }
};
328
/* Soft state anchor; holds one struct bd per instance. */
static void *bd_state;
/* Taken as reader by bd_open() to block DR events while it runs. */
static krwlock_t bd_lock;
331
332int
333_init(void)
334{
335	int	rv;
336
337	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
338	if (rv != DDI_SUCCESS) {
339		return (rv);
340	}
341	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
342	rv = mod_install(&modlinkage);
343	if (rv != DDI_SUCCESS) {
344		rw_destroy(&bd_lock);
345		ddi_soft_state_fini(&bd_state);
346	}
347	return (rv);
348}
349
350int
351_fini(void)
352{
353	int	rv;
354
355	rv = mod_remove(&modlinkage);
356	if (rv == DDI_SUCCESS) {
357		rw_destroy(&bd_lock);
358		ddi_soft_state_fini(&bd_state);
359	}
360	return (rv);
361}
362
363int
364_info(struct modinfo *modinfop)
365{
366	return (mod_info(&modlinkage, modinfop));
367}
368
369static int
370bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
371{
372	bd_t	*bd;
373	minor_t	inst;
374
375	_NOTE(ARGUNUSED(dip));
376
377	inst = BDINST((dev_t)arg);
378
379	switch (cmd) {
380	case DDI_INFO_DEVT2DEVINFO:
381		bd = ddi_get_soft_state(bd_state, inst);
382		if (bd == NULL) {
383			return (DDI_FAILURE);
384		}
385		*resultp = (void *)bd->d_dip;
386		break;
387
388	case DDI_INFO_DEVT2INSTANCE:
389		*resultp = (void *)(intptr_t)inst;
390		break;
391
392	default:
393		return (DDI_FAILURE);
394	}
395	return (DDI_SUCCESS);
396}
397
398static void
399bd_prop_update_inqstring(dev_info_t *dip, char *name, char *data, size_t len)
400{
401	int	ilen;
402	char	*data_string;
403
404	ilen = scsi_ascii_inquiry_len(data, len);
405	ASSERT3U(ilen, <=, len);
406	if (ilen <= 0)
407		return;
408	/* ensure null termination */
409	data_string = kmem_zalloc(ilen + 1, KM_SLEEP);
410	bcopy(data, data_string, ilen);
411	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, name, data_string);
412	kmem_free(data_string, ilen + 1);
413}
414
415static void
416bd_create_inquiry_props(dev_info_t *dip, bd_drive_t *drive)
417{
418	if (drive->d_vendor_len > 0)
419		bd_prop_update_inqstring(dip, INQUIRY_VENDOR_ID,
420		    drive->d_vendor, drive->d_vendor_len);
421
422	if (drive->d_product_len > 0)
423		bd_prop_update_inqstring(dip, INQUIRY_PRODUCT_ID,
424		    drive->d_product, drive->d_product_len);
425
426	if (drive->d_serial_len > 0)
427		bd_prop_update_inqstring(dip, INQUIRY_SERIAL_NO,
428		    drive->d_serial, drive->d_serial_len);
429
430	if (drive->d_revision_len > 0)
431		bd_prop_update_inqstring(dip, INQUIRY_REVISION_ID,
432		    drive->d_revision, drive->d_revision_len);
433}
434
435static void
436bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
437{
438	char	ks_module[KSTAT_STRLEN];
439	char	ks_name[KSTAT_STRLEN];
440	int	ndata = sizeof (struct bd_errstats) / sizeof (kstat_named_t);
441
442	if (bd->d_errstats != NULL)
443		return;
444
445	(void) snprintf(ks_module, sizeof (ks_module), "%serr",
446	    ddi_driver_name(bd->d_dip));
447	(void) snprintf(ks_name, sizeof (ks_name), "%s%d,err",
448	    ddi_driver_name(bd->d_dip), inst);
449
450	bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error",
451	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);
452
453	mutex_init(&bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
454	if (bd->d_errstats == NULL) {
455		/*
456		 * Even if we cannot create the kstat, we create a
457		 * scratch kstat.  The reason for this is to ensure
458		 * that we can update the kstat all of the time,
459		 * without adding an extra branch instruction.
460		 */
461		bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats),
462		    KM_SLEEP);
463	} else {
464		bd->d_errstats->ks_lock = &bd->d_errmutex;
465		bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data;
466	}
467
468	kstat_named_init(&bd->d_kerr->bd_softerrs,	"Soft Errors",
469	    KSTAT_DATA_UINT32);
470	kstat_named_init(&bd->d_kerr->bd_harderrs,	"Hard Errors",
471	    KSTAT_DATA_UINT32);
472	kstat_named_init(&bd->d_kerr->bd_transerrs,	"Transport Errors",
473	    KSTAT_DATA_UINT32);
474
475	if (drive->d_model_len > 0) {
476		kstat_named_init(&bd->d_kerr->bd_model,	"Model",
477		    KSTAT_DATA_STRING);
478	} else {
479		kstat_named_init(&bd->d_kerr->bd_vid,	"Vendor",
480		    KSTAT_DATA_STRING);
481		kstat_named_init(&bd->d_kerr->bd_pid,	"Product",
482		    KSTAT_DATA_STRING);
483	}
484
485	kstat_named_init(&bd->d_kerr->bd_revision,	"Revision",
486	    KSTAT_DATA_STRING);
487	kstat_named_init(&bd->d_kerr->bd_serial,	"Serial No",
488	    KSTAT_DATA_STRING);
489	kstat_named_init(&bd->d_kerr->bd_capacity,	"Size",
490	    KSTAT_DATA_ULONGLONG);
491	kstat_named_init(&bd->d_kerr->bd_rq_media_err,	"Media Error",
492	    KSTAT_DATA_UINT32);
493	kstat_named_init(&bd->d_kerr->bd_rq_ntrdy_err,	"Device Not Ready",
494	    KSTAT_DATA_UINT32);
495	kstat_named_init(&bd->d_kerr->bd_rq_nodev_err,	"No Device",
496	    KSTAT_DATA_UINT32);
497	kstat_named_init(&bd->d_kerr->bd_rq_recov_err,	"Recoverable",
498	    KSTAT_DATA_UINT32);
499	kstat_named_init(&bd->d_kerr->bd_rq_illrq_err,	"Illegal Request",
500	    KSTAT_DATA_UINT32);
501	kstat_named_init(&bd->d_kerr->bd_rq_pfa_err,
502	    "Predictive Failure Analysis", KSTAT_DATA_UINT32);
503
504	bd->d_errstats->ks_private = bd;
505
506	kstat_install(bd->d_errstats);
507	bd_init_errstats(bd, drive);
508}
509
510static void
511bd_destroy_errstats(bd_t *bd)
512{
513	if (bd->d_errstats != NULL) {
514		bd_fini_errstats(bd);
515		kstat_delete(bd->d_errstats);
516		bd->d_errstats = NULL;
517	} else {
518		kmem_free(bd->d_kerr, sizeof (struct bd_errstats));
519		bd->d_kerr = NULL;
520		mutex_destroy(&bd->d_errmutex);
521	}
522}
523
524static void
525bd_errstats_setstr(kstat_named_t *k, char *str, size_t len, char *alt)
526{
527	char	*tmp;
528	size_t	km_len;
529
530	if (KSTAT_NAMED_STR_PTR(k) == NULL) {
531		if (len > 0)
532			km_len = strnlen(str, len);
533		else if (alt != NULL)
534			km_len = strlen(alt);
535		else
536			return;
537
538		tmp = kmem_alloc(km_len + 1, KM_SLEEP);
539		bcopy(len > 0 ? str : alt, tmp, km_len);
540		tmp[km_len] = '\0';
541
542		kstat_named_setstr(k, tmp);
543	}
544}
545
546static void
547bd_errstats_clrstr(kstat_named_t *k)
548{
549	if (KSTAT_NAMED_STR_PTR(k) == NULL)
550		return;
551
552	kmem_free(KSTAT_NAMED_STR_PTR(k), KSTAT_NAMED_STR_BUFLEN(k));
553	kstat_named_setstr(k, NULL);
554}
555
556static void
557bd_init_errstats(bd_t *bd, bd_drive_t *drive)
558{
559	struct bd_errstats	*est = bd->d_kerr;
560
561	mutex_enter(&bd->d_errmutex);
562
563	if (drive->d_model_len > 0 &&
564	    KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) {
565		bd_errstats_setstr(&est->bd_model, drive->d_model,
566		    drive->d_model_len, NULL);
567	} else {
568		bd_errstats_setstr(&est->bd_vid, drive->d_vendor,
569		    drive->d_vendor_len, "Unknown ");
570		bd_errstats_setstr(&est->bd_pid, drive->d_product,
571		    drive->d_product_len, "Unknown         ");
572	}
573
574	bd_errstats_setstr(&est->bd_revision, drive->d_revision,
575	    drive->d_revision_len, "0001");
576	bd_errstats_setstr(&est->bd_serial, drive->d_serial,
577	    drive->d_serial_len, "0               ");
578
579	mutex_exit(&bd->d_errmutex);
580}
581
582static void
583bd_fini_errstats(bd_t *bd)
584{
585	struct bd_errstats	*est = bd->d_kerr;
586
587	mutex_enter(&bd->d_errmutex);
588
589	bd_errstats_clrstr(&est->bd_model);
590	bd_errstats_clrstr(&est->bd_vid);
591	bd_errstats_clrstr(&est->bd_pid);
592	bd_errstats_clrstr(&est->bd_revision);
593	bd_errstats_clrstr(&est->bd_serial);
594
595	mutex_exit(&bd->d_errmutex);
596}
597
598static void
599bd_queues_free(bd_t *bd)
600{
601	uint32_t i;
602
603	for (i = 0; i < bd->d_qcount; i++) {
604		bd_queue_t *bq = &bd->d_queues[i];
605
606		mutex_destroy(&bq->q_iomutex);
607		list_destroy(&bq->q_waitq);
608		list_destroy(&bq->q_runq);
609	}
610
611	kmem_free(bd->d_queues, sizeof (*bd->d_queues) * bd->d_qcount);
612}
613
614static int
615bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
616{
617	int		inst;
618	bd_handle_t	hdl;
619	bd_t		*bd;
620	bd_drive_t	drive;
621	uint32_t	i;
622	int		rv;
623	char		name[16];
624	char		kcache[32];
625
626	switch (cmd) {
627	case DDI_ATTACH:
628		break;
629	case DDI_RESUME:
630		/* We don't do anything native for suspend/resume */
631		return (DDI_SUCCESS);
632	default:
633		return (DDI_FAILURE);
634	}
635
636	inst = ddi_get_instance(dip);
637	hdl = ddi_get_parent_data(dip);
638
639	(void) snprintf(name, sizeof (name), "%s%d",
640	    ddi_driver_name(dip), ddi_get_instance(dip));
641	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
642
643	if (hdl == NULL) {
644		cmn_err(CE_WARN, "%s: missing parent data!", name);
645		return (DDI_FAILURE);
646	}
647
648	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
649		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
650		return (DDI_FAILURE);
651	}
652	bd = ddi_get_soft_state(bd_state, inst);
653
654	if (hdl->h_dma) {
655		bd->d_dma = *(hdl->h_dma);
656		bd->d_dma.dma_attr_granular =
657		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
658		bd->d_use_dma = B_TRUE;
659
660		if (bd->d_maxxfer &&
661		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
662			cmn_err(CE_WARN,
663			    "%s: inconsistent maximum transfer size!",
664			    name);
665			/* We force it */
666			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
667		} else {
668			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
669		}
670	} else {
671		bd->d_use_dma = B_FALSE;
672		if (bd->d_maxxfer == 0) {
673			bd->d_maxxfer = 1024 * 1024;
674		}
675	}
676	bd->d_ops = hdl->h_ops;
677	bd->d_private = hdl->h_private;
678	bd->d_blkshift = DEV_BSHIFT;	/* 512 bytes, to start */
679
680	if (bd->d_maxxfer % DEV_BSIZE) {
681		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
682		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
683	}
684	if (bd->d_maxxfer < DEV_BSIZE) {
685		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
686		ddi_soft_state_free(bd_state, inst);
687		return (DDI_FAILURE);
688	}
689
690	bd->d_dip = dip;
691	bd->d_handle = hdl;
692	hdl->h_bd = bd;
693	ddi_set_driver_private(dip, bd);
694
695	mutex_init(&bd->d_ksmutex, NULL, MUTEX_DRIVER, NULL);
696	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
697	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
698	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
699
700	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
701	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
702
703	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
704	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
705	if (bd->d_ksp != NULL) {
706		bd->d_ksp->ks_lock = &bd->d_ksmutex;
707		kstat_install(bd->d_ksp);
708		bd->d_kiop = bd->d_ksp->ks_data;
709	} else {
710		/*
711		 * Even if we cannot create the kstat, we create a
712		 * scratch kstat.  The reason for this is to ensure
713		 * that we can update the kstat all of the time,
714		 * without adding an extra branch instruction.
715		 */
716		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
717	}
718
719	cmlb_alloc_handle(&bd->d_cmlbh);
720
721	bd->d_state = DKIO_NONE;
722
723	bzero(&drive, sizeof (drive));
724	/*
725	 * Default to one queue, and no restrictions on free space requests
726	 * (if driver provides method) parent driver can override.
727	 */
728	drive.d_qcount = 1;
729	drive.d_free_align = 1;
730	bd->d_ops.o_drive_info(bd->d_private, &drive);
731
732	/*
733	 * Several checks to make sure o_drive_info() didn't return bad
734	 * values:
735	 *
736	 * There must be at least one queue
737	 */
738	if (drive.d_qcount == 0)
739		goto fail_drive_info;
740
741	/* FREE/UNMAP/TRIM alignment needs to be at least 1 block */
742	if (drive.d_free_align == 0)
743		goto fail_drive_info;
744
745	/*
746	 * If d_max_free_blks is not unlimited (not 0), then we cannot allow
747	 * an unlimited segment size. It is however permissible to not impose
748	 * a limit on the total number of blocks freed while limiting the
749	 * amount allowed in an individual segment.
750	 */
751	if ((drive.d_max_free_blks > 0 && drive.d_max_free_seg_blks == 0))
752		goto fail_drive_info;
753
754	/*
755	 * If a limit is set on d_max_free_blks (by the above check, we know
756	 * if there's a limit on d_max_free_blks, d_max_free_seg_blks cannot
757	 * be unlimited), it cannot be smaller than the limit on an individual
758	 * segment.
759	 */
760	if ((drive.d_max_free_blks > 0 &&
761	    drive.d_max_free_seg_blks > drive.d_max_free_blks)) {
762		goto fail_drive_info;
763	}
764
765	bd->d_qcount = drive.d_qcount;
766	bd->d_removable = drive.d_removable;
767	bd->d_hotpluggable = drive.d_hotpluggable;
768
769	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
770		bd->d_maxxfer = drive.d_maxxfer;
771
772	bd->d_free_align = drive.d_free_align;
773	bd->d_max_free_seg = drive.d_max_free_seg;
774	bd->d_max_free_blks = drive.d_max_free_blks;
775	bd->d_max_free_seg_blks = drive.d_max_free_seg_blks;
776
777	bd_create_inquiry_props(dip, &drive);
778	bd_create_errstats(bd, inst, &drive);
779	bd_update_state(bd);
780
781	bd->d_queues = kmem_alloc(sizeof (*bd->d_queues) * bd->d_qcount,
782	    KM_SLEEP);
783	for (i = 0; i < bd->d_qcount; i++) {
784		bd_queue_t *bq = &bd->d_queues[i];
785
786		bq->q_qsize = drive.d_qsize;
787		bq->q_qactive = 0;
788		mutex_init(&bq->q_iomutex, NULL, MUTEX_DRIVER, NULL);
789
790		list_create(&bq->q_waitq, sizeof (bd_xfer_impl_t),
791		    offsetof(struct bd_xfer_impl, i_linkage));
792		list_create(&bq->q_runq, sizeof (bd_xfer_impl_t),
793		    offsetof(struct bd_xfer_impl, i_linkage));
794	}
795
796	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
797	    bd->d_removable, bd->d_hotpluggable,
798	    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
799	    *(uint64_t *)drive.d_eui64 != 0 ? DDI_NT_BLOCK_BLKDEV :
800	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
801	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
802	if (rv != 0) {
803		goto fail_cmlb_attach;
804	}
805
806	if (bd->d_ops.o_devid_init != NULL) {
807		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
808		if (rv == DDI_SUCCESS) {
809			if (ddi_devid_register(dip, bd->d_devid) !=
810			    DDI_SUCCESS) {
811				cmn_err(CE_WARN,
812				    "%s: unable to register devid", name);
813			}
814		}
815	}
816
817	/*
818	 * Add a zero-length attribute to tell the world we support
819	 * kernel ioctls (for layered drivers).  Also set up properties
820	 * used by HAL to identify removable media.
821	 */
822	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
823	    DDI_KERNEL_IOCTL, NULL, 0);
824	if (bd->d_removable) {
825		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
826		    "removable-media", NULL, 0);
827	}
828	if (bd->d_hotpluggable) {
829		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
830		    "hotpluggable", NULL, 0);
831	}
832
833	ddi_report_dev(dip);
834
835	return (DDI_SUCCESS);
836
837fail_cmlb_attach:
838	bd_queues_free(bd);
839	bd_destroy_errstats(bd);
840
841fail_drive_info:
842	cmlb_free_handle(&bd->d_cmlbh);
843
844	if (bd->d_ksp != NULL) {
845		kstat_delete(bd->d_ksp);
846		bd->d_ksp = NULL;
847	} else {
848		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
849	}
850
851	kmem_cache_destroy(bd->d_cache);
852	cv_destroy(&bd->d_statecv);
853	mutex_destroy(&bd->d_statemutex);
854	mutex_destroy(&bd->d_ocmutex);
855	mutex_destroy(&bd->d_ksmutex);
856	ddi_soft_state_free(bd_state, inst);
857	return (DDI_FAILURE);
858}
859
860static int
861bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
862{
863	bd_t	*bd;
864
865	bd = ddi_get_driver_private(dip);
866
867	switch (cmd) {
868	case DDI_DETACH:
869		break;
870	case DDI_SUSPEND:
871		/* We don't suspend, but our parent does */
872		return (DDI_SUCCESS);
873	default:
874		return (DDI_FAILURE);
875	}
876
877	if (bd->d_ksp != NULL) {
878		kstat_delete(bd->d_ksp);
879		bd->d_ksp = NULL;
880	} else {
881		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
882	}
883
884	bd_destroy_errstats(bd);
885	cmlb_detach(bd->d_cmlbh, 0);
886	cmlb_free_handle(&bd->d_cmlbh);
887	if (bd->d_devid)
888		ddi_devid_free(bd->d_devid);
889	kmem_cache_destroy(bd->d_cache);
890	mutex_destroy(&bd->d_ksmutex);
891	mutex_destroy(&bd->d_ocmutex);
892	mutex_destroy(&bd->d_statemutex);
893	cv_destroy(&bd->d_statecv);
894	bd_queues_free(bd);
895	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
896	return (DDI_SUCCESS);
897}
898
899static int
900bd_xfer_ctor(void *buf, void *arg, int kmflag)
901{
902	bd_xfer_impl_t	*xi;
903	bd_t		*bd = arg;
904	int		(*dcb)(caddr_t);
905
906	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
907		dcb = DDI_DMA_SLEEP;
908	} else {
909		dcb = DDI_DMA_DONTWAIT;
910	}
911
912	xi = buf;
913	bzero(xi, sizeof (*xi));
914	xi->i_bd = bd;
915
916	if (bd->d_use_dma) {
917		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
918		    &xi->i_dmah) != DDI_SUCCESS) {
919			return (-1);
920		}
921	}
922
923	return (0);
924}
925
926static void
927bd_xfer_dtor(void *buf, void *arg)
928{
929	bd_xfer_impl_t	*xi = buf;
930
931	_NOTE(ARGUNUSED(arg));
932
933	if (xi->i_dmah)
934		ddi_dma_free_handle(&xi->i_dmah);
935	xi->i_dmah = NULL;
936}
937
938static bd_xfer_impl_t *
939bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
940    int kmflag)
941{
942	bd_xfer_impl_t		*xi;
943	int			rv = 0;
944	int			status;
945	unsigned		dir;
946	int			(*cb)(caddr_t);
947	size_t			len;
948	uint32_t		shift;
949
950	if (kmflag == KM_SLEEP) {
951		cb = DDI_DMA_SLEEP;
952	} else {
953		cb = DDI_DMA_DONTWAIT;
954	}
955
956	xi = kmem_cache_alloc(bd->d_cache, kmflag);
957	if (xi == NULL) {
958		bioerror(bp, ENOMEM);
959		return (NULL);
960	}
961
962	ASSERT(bp);
963
964	xi->i_bp = bp;
965	xi->i_func = func;
966	xi->i_blkno = bp->b_lblkno >> (bd->d_blkshift - DEV_BSHIFT);
967
968	if (bp->b_bcount == 0) {
969		xi->i_len = 0;
970		xi->i_nblks = 0;
971		xi->i_kaddr = NULL;
972		xi->i_resid = 0;
973		xi->i_num_win = 0;
974		goto done;
975	}
976
977	if (bp->b_flags & B_READ) {
978		dir = DDI_DMA_READ;
979		xi->i_func = bd->d_ops.o_read;
980	} else {
981		dir = DDI_DMA_WRITE;
982		xi->i_func = bd->d_ops.o_write;
983	}
984
985	shift = bd->d_blkshift;
986	xi->i_blkshift = shift;
987
988	if (!bd->d_use_dma) {
989		bp_mapin(bp);
990		rv = 0;
991		xi->i_offset = 0;
992		xi->i_num_win =
993		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
994		xi->i_cur_win = 0;
995		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
996		xi->i_nblks = xi->i_len >> shift;
997		xi->i_kaddr = bp->b_un.b_addr;
998		xi->i_resid = bp->b_bcount;
999	} else {
1000
1001		/*
1002		 * We have to use consistent DMA if the address is misaligned.
1003		 */
1004		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
1005		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
1006			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
1007		} else {
1008			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
1009		}
1010
1011		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
1012		    NULL, &xi->i_dmac, &xi->i_ndmac);
1013		switch (status) {
1014		case DDI_DMA_MAPPED:
1015			xi->i_num_win = 1;
1016			xi->i_cur_win = 0;
1017			xi->i_offset = 0;
1018			xi->i_len = bp->b_bcount;
1019			xi->i_nblks = xi->i_len >> shift;
1020			xi->i_resid = bp->b_bcount;
1021			rv = 0;
1022			break;
1023		case DDI_DMA_PARTIAL_MAP:
1024			xi->i_cur_win = 0;
1025
1026			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
1027			    DDI_SUCCESS) ||
1028			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
1029			    &len, &xi->i_dmac, &xi->i_ndmac) !=
1030			    DDI_SUCCESS) ||
1031			    (P2PHASE(len, (1U << shift)) != 0)) {
1032				(void) ddi_dma_unbind_handle(xi->i_dmah);
1033				rv = EFAULT;
1034				goto done;
1035			}
1036			xi->i_len = len;
1037			xi->i_nblks = xi->i_len >> shift;
1038			xi->i_resid = bp->b_bcount;
1039			rv = 0;
1040			break;
1041		case DDI_DMA_NORESOURCES:
1042			rv = EAGAIN;
1043			goto done;
1044		case DDI_DMA_TOOBIG:
1045			rv = EINVAL;
1046			goto done;
1047		case DDI_DMA_NOMAPPING:
1048		case DDI_DMA_INUSE:
1049		default:
1050			rv = EFAULT;
1051			goto done;
1052		}
1053	}
1054
1055done:
1056	if (rv != 0) {
1057		kmem_cache_free(bd->d_cache, xi);
1058		bioerror(bp, rv);
1059		return (NULL);
1060	}
1061
1062	return (xi);
1063}
1064
1065static void
1066bd_xfer_free(bd_xfer_impl_t *xi)
1067{
1068	if (xi->i_dmah) {
1069		(void) ddi_dma_unbind_handle(xi->i_dmah);
1070	}
1071	if (xi->i_dfl != NULL) {
1072		dfl_free((dkioc_free_list_t *)xi->i_dfl);
1073		xi->i_dfl = NULL;
1074	}
1075	kmem_cache_free(xi->i_bd->d_cache, xi);
1076}
1077
1078static int
1079bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1080{
1081	dev_t		dev = *devp;
1082	bd_t		*bd;
1083	minor_t		part;
1084	minor_t		inst;
1085	uint64_t	mask;
1086	boolean_t	ndelay;
1087	int		rv;
1088	diskaddr_t	nblks;
1089	diskaddr_t	lba;
1090
1091	_NOTE(ARGUNUSED(credp));
1092
1093	part = BDPART(dev);
1094	inst = BDINST(dev);
1095
1096	if (otyp >= OTYPCNT)
1097		return (EINVAL);
1098
1099	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
1100
1101	/*
1102	 * Block any DR events from changing the set of registered
1103	 * devices while we function.
1104	 */
1105	rw_enter(&bd_lock, RW_READER);
1106	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1107		rw_exit(&bd_lock);
1108		return (ENXIO);
1109	}
1110
1111	mutex_enter(&bd->d_ocmutex);
1112
1113	ASSERT(part < 64);
1114	mask = (1U << part);
1115
1116	bd_update_state(bd);
1117
1118	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
1119
1120		/* non-blocking opens are allowed to succeed */
1121		if (!ndelay) {
1122			rv = ENXIO;
1123			goto done;
1124		}
1125	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
1126	    NULL, NULL, 0) == 0) {
1127
1128		/*
1129		 * We read the partinfo, verify valid ranges.  If the
1130		 * partition is invalid, and we aren't blocking or
1131		 * doing a raw access, then fail. (Non-blocking and
1132		 * raw accesses can still succeed to allow a disk with
1133		 * bad partition data to opened by format and fdisk.)
1134		 */
1135		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
1136			rv = ENXIO;
1137			goto done;
1138		}
1139	} else if (!ndelay) {
1140		/*
1141		 * cmlb_partinfo failed -- invalid partition or no
1142		 * disk label.
1143		 */
1144		rv = ENXIO;
1145		goto done;
1146	}
1147
1148	if ((flag & FWRITE) && bd->d_rdonly) {
1149		rv = EROFS;
1150		goto done;
1151	}
1152
1153	if ((bd->d_open_excl) & (mask)) {
1154		rv = EBUSY;
1155		goto done;
1156	}
1157	if (flag & FEXCL) {
1158		if (bd->d_open_lyr[part]) {
1159			rv = EBUSY;
1160			goto done;
1161		}
1162		for (int i = 0; i < OTYP_LYR; i++) {
1163			if (bd->d_open_reg[i] & mask) {
1164				rv = EBUSY;
1165				goto done;
1166			}
1167		}
1168	}
1169
1170	if (otyp == OTYP_LYR) {
1171		bd->d_open_lyr[part]++;
1172	} else {
1173		bd->d_open_reg[otyp] |= mask;
1174	}
1175	if (flag & FEXCL) {
1176		bd->d_open_excl |= mask;
1177	}
1178
1179	rv = 0;
1180done:
1181	mutex_exit(&bd->d_ocmutex);
1182	rw_exit(&bd_lock);
1183
1184	return (rv);
1185}
1186
1187static int
1188bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
1189{
1190	bd_t		*bd;
1191	minor_t		inst;
1192	minor_t		part;
1193	uint64_t	mask;
1194	boolean_t	last = B_TRUE;
1195
1196	_NOTE(ARGUNUSED(flag));
1197	_NOTE(ARGUNUSED(credp));
1198
1199	part = BDPART(dev);
1200	inst = BDINST(dev);
1201
1202	ASSERT(part < 64);
1203	mask = (1U << part);
1204
1205	rw_enter(&bd_lock, RW_READER);
1206
1207	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1208		rw_exit(&bd_lock);
1209		return (ENXIO);
1210	}
1211
1212	mutex_enter(&bd->d_ocmutex);
1213	if (bd->d_open_excl & mask) {
1214		bd->d_open_excl &= ~mask;
1215	}
1216	if (otyp == OTYP_LYR) {
1217		bd->d_open_lyr[part]--;
1218	} else {
1219		bd->d_open_reg[otyp] &= ~mask;
1220	}
1221	for (int i = 0; i < 64; i++) {
1222		if (bd->d_open_lyr[part]) {
1223			last = B_FALSE;
1224		}
1225	}
1226	for (int i = 0; last && (i < OTYP_LYR); i++) {
1227		if (bd->d_open_reg[i]) {
1228			last = B_FALSE;
1229		}
1230	}
1231	mutex_exit(&bd->d_ocmutex);
1232
1233	if (last) {
1234		cmlb_invalidate(bd->d_cmlbh, 0);
1235	}
1236	rw_exit(&bd_lock);
1237
1238	return (0);
1239}
1240
1241static int
1242bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
1243{
1244	minor_t		inst;
1245	minor_t		part;
1246	diskaddr_t	pstart;
1247	diskaddr_t	psize;
1248	bd_t		*bd;
1249	bd_xfer_impl_t	*xi;
1250	buf_t		*bp;
1251	int		rv;
1252	uint32_t	shift;
1253	daddr_t		d_blkno;
1254	int	d_nblk;
1255
1256	rw_enter(&bd_lock, RW_READER);
1257
1258	part = BDPART(dev);
1259	inst = BDINST(dev);
1260
1261	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1262		rw_exit(&bd_lock);
1263		return (ENXIO);
1264	}
1265	shift = bd->d_blkshift;
1266	d_blkno = blkno >> (shift - DEV_BSHIFT);
1267	d_nblk = nblk >> (shift - DEV_BSHIFT);
1268	/*
1269	 * do cmlb, but do it synchronously unless we already have the
1270	 * partition (which we probably should.)
1271	 */
1272	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
1273	    (void *)1)) {
1274		rw_exit(&bd_lock);
1275		return (ENXIO);
1276	}
1277
1278	if ((d_blkno + d_nblk) > psize) {
1279		rw_exit(&bd_lock);
1280		return (EINVAL);
1281	}
1282	bp = getrbuf(KM_NOSLEEP);
1283	if (bp == NULL) {
1284		rw_exit(&bd_lock);
1285		return (ENOMEM);
1286	}
1287
1288	bp->b_bcount = nblk << DEV_BSHIFT;
1289	bp->b_resid = bp->b_bcount;
1290	bp->b_lblkno = blkno;
1291	bp->b_un.b_addr = caddr;
1292
1293	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
1294	if (xi == NULL) {
1295		rw_exit(&bd_lock);
1296		freerbuf(bp);
1297		return (ENOMEM);
1298	}
1299	xi->i_blkno = d_blkno + pstart;
1300	xi->i_flags = BD_XFER_POLL;
1301	bd_submit(bd, xi);
1302	rw_exit(&bd_lock);
1303
1304	/*
1305	 * Generally, we should have run this entirely synchronously
1306	 * at this point and the biowait call should be a no-op.  If
1307	 * it didn't happen this way, it's a bug in the underlying
1308	 * driver not honoring BD_XFER_POLL.
1309	 */
1310	(void) biowait(bp);
1311	rv = geterror(bp);
1312	freerbuf(bp);
1313	return (rv);
1314}
1315
1316void
1317bd_minphys(struct buf *bp)
1318{
1319	minor_t inst;
1320	bd_t	*bd;
1321	inst = BDINST(bp->b_edev);
1322
1323	bd = ddi_get_soft_state(bd_state, inst);
1324
1325	/*
1326	 * In a non-debug kernel, bd_strategy will catch !bd as
1327	 * well, and will fail nicely.
1328	 */
1329	ASSERT(bd);
1330
1331	if (bp->b_bcount > bd->d_maxxfer)
1332		bp->b_bcount = bd->d_maxxfer;
1333}
1334
1335static int
1336bd_check_uio(dev_t dev, struct uio *uio)
1337{
1338	bd_t		*bd;
1339	uint32_t	shift;
1340
1341	if ((bd = ddi_get_soft_state(bd_state, BDINST(dev))) == NULL) {
1342		return (ENXIO);
1343	}
1344
1345	shift = bd->d_blkshift;
1346	if ((P2PHASE(uio->uio_loffset, (1U << shift)) != 0) ||
1347	    (P2PHASE(uio->uio_iov->iov_len, (1U << shift)) != 0)) {
1348		return (EINVAL);
1349	}
1350
1351	return (0);
1352}
1353
1354static int
1355bd_read(dev_t dev, struct uio *uio, cred_t *credp)
1356{
1357	_NOTE(ARGUNUSED(credp));
1358	int	ret = bd_check_uio(dev, uio);
1359	if (ret != 0) {
1360		return (ret);
1361	}
1362	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
1363}
1364
1365static int
1366bd_write(dev_t dev, struct uio *uio, cred_t *credp)
1367{
1368	_NOTE(ARGUNUSED(credp));
1369	int	ret = bd_check_uio(dev, uio);
1370	if (ret != 0) {
1371		return (ret);
1372	}
1373	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
1374}
1375
1376static int
1377bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
1378{
1379	_NOTE(ARGUNUSED(credp));
1380	int	ret = bd_check_uio(dev, aio->aio_uio);
1381	if (ret != 0) {
1382		return (ret);
1383	}
1384	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
1385}
1386
1387static int
1388bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1389{
1390	_NOTE(ARGUNUSED(credp));
1391	int	ret = bd_check_uio(dev, aio->aio_uio);
1392	if (ret != 0) {
1393		return (ret);
1394	}
1395	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
1396}
1397
1398static int
1399bd_strategy(struct buf *bp)
1400{
1401	minor_t		inst;
1402	minor_t		part;
1403	bd_t		*bd;
1404	diskaddr_t	p_lba;
1405	diskaddr_t	p_nblks;
1406	diskaddr_t	b_nblks;
1407	bd_xfer_impl_t	*xi;
1408	uint32_t	shift;
1409	int		(*func)(void *, bd_xfer_t *);
1410	diskaddr_t	lblkno;
1411
1412	part = BDPART(bp->b_edev);
1413	inst = BDINST(bp->b_edev);
1414
1415	ASSERT(bp);
1416
1417	bp->b_resid = bp->b_bcount;
1418
1419	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1420		bioerror(bp, ENXIO);
1421		biodone(bp);
1422		return (0);
1423	}
1424
1425	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
1426	    NULL, NULL, 0)) {
1427		bioerror(bp, ENXIO);
1428		biodone(bp);
1429		return (0);
1430	}
1431
1432	shift = bd->d_blkshift;
1433	lblkno = bp->b_lblkno >> (shift - DEV_BSHIFT);
1434	if ((P2PHASE(bp->b_lblkno, (1U << (shift - DEV_BSHIFT))) != 0) ||
1435	    (P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
1436	    (lblkno > p_nblks)) {
1437		bioerror(bp, EINVAL);
1438		biodone(bp);
1439		return (0);
1440	}
1441	b_nblks = bp->b_bcount >> shift;
1442	if ((lblkno == p_nblks) || (bp->b_bcount == 0)) {
1443		biodone(bp);
1444		return (0);
1445	}
1446
1447	if ((b_nblks + lblkno) > p_nblks) {
1448		bp->b_resid = ((lblkno + b_nblks - p_nblks) << shift);
1449		bp->b_bcount -= bp->b_resid;
1450	} else {
1451		bp->b_resid = 0;
1452	}
1453	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1454
1455	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1456	if (xi == NULL) {
1457		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1458	}
1459	if (xi == NULL) {
1460		/* bd_request_alloc will have done bioerror */
1461		biodone(bp);
1462		return (0);
1463	}
1464	xi->i_blkno = lblkno + p_lba;
1465
1466	bd_submit(bd, xi);
1467
1468	return (0);
1469}
1470
1471static int
1472bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1473{
1474	minor_t		inst;
1475	uint16_t	part;
1476	bd_t		*bd;
1477	void		*ptr = (void *)arg;
1478	int		rv;
1479
1480	part = BDPART(dev);
1481	inst = BDINST(dev);
1482
1483	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1484		return (ENXIO);
1485	}
1486
1487	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1488	if (rv != ENOTTY)
1489		return (rv);
1490
1491	if (rvalp != NULL) {
1492		/* the return value of the ioctl is 0 by default */
1493		*rvalp = 0;
1494	}
1495
1496	switch (cmd) {
1497	case DKIOCGMEDIAINFO: {
1498		struct dk_minfo minfo;
1499
1500		/* make sure our state information is current */
1501		bd_update_state(bd);
1502		bzero(&minfo, sizeof (minfo));
1503		minfo.dki_media_type = DK_FIXED_DISK;
1504		minfo.dki_lbsize = (1U << bd->d_blkshift);
1505		minfo.dki_capacity = bd->d_numblks;
1506		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
1507			return (EFAULT);
1508		}
1509		return (0);
1510	}
1511	case DKIOCGMEDIAINFOEXT: {
1512		struct dk_minfo_ext miext;
1513
1514		/* make sure our state information is current */
1515		bd_update_state(bd);
1516		bzero(&miext, sizeof (miext));
1517		miext.dki_media_type = DK_FIXED_DISK;
1518		miext.dki_lbsize = (1U << bd->d_blkshift);
1519		miext.dki_pbsize = (1U << bd->d_pblkshift);
1520		miext.dki_capacity = bd->d_numblks;
1521		if (ddi_copyout(&miext, ptr, sizeof (miext), flag)) {
1522			return (EFAULT);
1523		}
1524		return (0);
1525	}
1526	case DKIOCINFO: {
1527		struct dk_cinfo cinfo;
1528		bzero(&cinfo, sizeof (cinfo));
1529		cinfo.dki_ctype = DKC_BLKDEV;
1530		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1531		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1532		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1533		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1534		    "%s", ddi_driver_name(bd->d_dip));
1535		cinfo.dki_unit = inst;
1536		cinfo.dki_flags = DKI_FMTVOL;
1537		cinfo.dki_partition = part;
1538		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1539		cinfo.dki_addr = 0;
1540		cinfo.dki_slave = 0;
1541		cinfo.dki_space = 0;
1542		cinfo.dki_prio = 0;
1543		cinfo.dki_vec = 0;
1544		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
1545			return (EFAULT);
1546		}
1547		return (0);
1548	}
1549	case DKIOCREMOVABLE: {
1550		int i;
1551		i = bd->d_removable ? 1 : 0;
1552		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1553			return (EFAULT);
1554		}
1555		return (0);
1556	}
1557	case DKIOCHOTPLUGGABLE: {
1558		int i;
1559		i = bd->d_hotpluggable ? 1 : 0;
1560		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1561			return (EFAULT);
1562		}
1563		return (0);
1564	}
1565	case DKIOCREADONLY: {
1566		int i;
1567		i = bd->d_rdonly ? 1 : 0;
1568		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1569			return (EFAULT);
1570		}
1571		return (0);
1572	}
1573	case DKIOCSOLIDSTATE: {
1574		int i;
1575		i = bd->d_ssd ? 1 : 0;
1576		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1577			return (EFAULT);
1578		}
1579		return (0);
1580	}
1581	case DKIOCSTATE: {
1582		enum dkio_state	state;
1583		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1584			return (EFAULT);
1585		}
1586		if ((rv = bd_check_state(bd, &state)) != 0) {
1587			return (rv);
1588		}
1589		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1590			return (EFAULT);
1591		}
1592		return (0);
1593	}
1594	case DKIOCFLUSHWRITECACHE: {
1595		struct dk_callback *dkc = NULL;
1596
1597		if (flag & FKIOCTL)
1598			dkc = (void *)arg;
1599
1600		rv = bd_flush_write_cache(bd, dkc);
1601		return (rv);
1602	}
1603	case DKIOCFREE: {
1604		dkioc_free_list_t *dfl = NULL;
1605
1606		/*
1607		 * Check free space support early to avoid copyin/allocation
1608		 * when unnecessary.
1609		 */
1610		if (!CAN_FREESPACE(bd))
1611			return (ENOTSUP);
1612
1613		rv = dfl_copyin(ptr, &dfl, flag, KM_SLEEP);
1614		if (rv != 0)
1615			return (rv);
1616
1617		/*
1618		 * bd_free_space() consumes 'dfl'. bd_free_space() will
1619		 * call dfl_iter() which will normally try to pass dfl through
1620		 * to bd_free_space_cb() which attaches dfl to the bd_xfer_t
1621		 * that is then queued for the underlying driver. Once the
1622		 * driver processes the request, the bd_xfer_t instance is
1623		 * disposed of, including any attached dkioc_free_list_t.
1624		 *
1625		 * If dfl cannot be processed by the underlying driver due to
1626		 * size or alignment requirements of the driver, dfl_iter()
1627		 * will replace dfl with one or more new dkioc_free_list_t
1628		 * instances with the correct alignment and sizes for the driver
1629		 * (and free the original dkioc_free_list_t).
1630		 */
1631		rv = bd_free_space(dev, bd, dfl);
1632		return (rv);
1633	}
1634
1635	case DKIOC_CANFREE: {
1636		boolean_t supported = CAN_FREESPACE(bd);
1637
1638		if (ddi_copyout(&supported, (void *)arg, sizeof (supported),
1639		    flag) != 0) {
1640			return (EFAULT);
1641		}
1642
1643		return (0);
1644	}
1645
1646	default:
1647		break;
1648
1649	}
1650	return (ENOTTY);
1651}
1652
1653static int
1654bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1655    char *name, caddr_t valuep, int *lengthp)
1656{
1657	bd_t	*bd;
1658
1659	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1660	if (bd == NULL)
1661		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1662		    name, valuep, lengthp));
1663
1664	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1665	    valuep, lengthp, BDPART(dev), 0));
1666}
1667
1668
1669static int
1670bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1671    size_t length, void *tg_cookie)
1672{
1673	bd_t		*bd;
1674	buf_t		*bp;
1675	bd_xfer_impl_t	*xi;
1676	int		rv;
1677	int		(*func)(void *, bd_xfer_t *);
1678	int		kmflag;
1679
1680	/*
1681	 * If we are running in polled mode (such as during dump(9e)
1682	 * execution), then we cannot sleep for kernel allocations.
1683	 */
1684	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1685
1686	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1687
1688	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1689		/* We can only transfer whole blocks at a time! */
1690		return (EINVAL);
1691	}
1692
1693	if ((bp = getrbuf(kmflag)) == NULL) {
1694		return (ENOMEM);
1695	}
1696
1697	switch (cmd) {
1698	case TG_READ:
1699		bp->b_flags = B_READ;
1700		func = bd->d_ops.o_read;
1701		break;
1702	case TG_WRITE:
1703		bp->b_flags = B_WRITE;
1704		func = bd->d_ops.o_write;
1705		break;
1706	default:
1707		freerbuf(bp);
1708		return (EINVAL);
1709	}
1710
1711	bp->b_un.b_addr = bufaddr;
1712	bp->b_bcount = length;
1713	xi = bd_xfer_alloc(bd, bp, func, kmflag);
1714	if (xi == NULL) {
1715		rv = geterror(bp);
1716		freerbuf(bp);
1717		return (rv);
1718	}
1719	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1720	xi->i_blkno = start;
1721	bd_submit(bd, xi);
1722	(void) biowait(bp);
1723	rv = geterror(bp);
1724	freerbuf(bp);
1725
1726	return (rv);
1727}
1728
1729static int
1730bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1731{
1732	bd_t		*bd;
1733
1734	_NOTE(ARGUNUSED(tg_cookie));
1735	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1736
1737	switch (cmd) {
1738	case TG_GETPHYGEOM:
1739	case TG_GETVIRTGEOM:
1740		/*
1741		 * We don't have any "geometry" as such, let cmlb
1742		 * fabricate something.
1743		 */
1744		return (ENOTTY);
1745
1746	case TG_GETCAPACITY:
1747		bd_update_state(bd);
1748		*(diskaddr_t *)arg = bd->d_numblks;
1749		return (0);
1750
1751	case TG_GETBLOCKSIZE:
1752		*(uint32_t *)arg = (1U << bd->d_blkshift);
1753		return (0);
1754
1755	case TG_GETATTR:
1756		/*
1757		 * It turns out that cmlb really doesn't do much for
1758		 * non-writable media, but lets make the information
1759		 * available for it in case it does more in the
1760		 * future.  (The value is currently used for
1761		 * triggering special behavior for CD-ROMs.)
1762		 */
1763		bd_update_state(bd);
1764		((tg_attribute_t *)arg)->media_is_writable =
1765		    bd->d_rdonly ? B_FALSE : B_TRUE;
1766		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1767		((tg_attribute_t *)arg)->media_is_rotational = B_FALSE;
1768		return (0);
1769
1770	default:
1771		return (EINVAL);
1772	}
1773}
1774
1775
/*
 * Move transfers from the queue's wait list to its run list and hand them
 * to the driver, for as long as the number of active transfers is below the
 * queue's limit (q_qsize) and the wait list is not empty.
 *
 * A transfer the driver refuses (non-zero return from i_func) is completed
 * here with that error, counted as a transport error, removed from the run
 * list and freed.
 */
static void
bd_sched(bd_t *bd, bd_queue_t *bq)
{
	bd_xfer_impl_t	*xi;
	struct buf	*bp;
	int		rv;

	mutex_enter(&bq->q_iomutex);

	while ((bq->q_qactive < bq->q_qsize) &&
	    ((xi = list_remove_head(&bq->q_waitq)) != NULL)) {
		/* kstat accounting is protected by its own mutex. */
		mutex_enter(&bd->d_ksmutex);
		kstat_waitq_to_runq(bd->d_kiop);
		mutex_exit(&bd->d_ksmutex);

		bq->q_qactive++;
		list_insert_tail(&bq->q_runq, xi);

		/*
		 * Submit the job to the driver.  We drop the I/O mutex
		 * so that we can deal with the case where the driver
		 * completion routine calls back into us synchronously.
		 */

		mutex_exit(&bq->q_iomutex);

		rv = xi->i_func(bd->d_private, &xi->i_public);
		if (rv != 0) {
			/* The driver rejected the request: fail the buf. */
			bp = xi->i_bp;
			bioerror(bp, rv);
			biodone(bp);

			atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);

			mutex_enter(&bq->q_iomutex);

			mutex_enter(&bd->d_ksmutex);
			kstat_runq_exit(bd->d_kiop);
			mutex_exit(&bd->d_ksmutex);

			/* Undo the run-list bookkeeping done above. */
			bq->q_qactive--;
			list_remove(&bq->q_runq, xi);
			bd_xfer_free(xi);
		} else {
			/* Re-take the I/O mutex for the next loop test. */
			mutex_enter(&bq->q_iomutex);
		}
	}

	mutex_exit(&bq->q_iomutex);
}
1826
1827static void
1828bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1829{
1830	uint64_t	nv = atomic_inc_64_nv(&bd->d_io_counter);
1831	unsigned	q = nv % bd->d_qcount;
1832	bd_queue_t	*bq = &bd->d_queues[q];
1833
1834	xi->i_bq = bq;
1835	xi->i_qnum = q;
1836
1837	mutex_enter(&bq->q_iomutex);
1838
1839	list_insert_tail(&bq->q_waitq, xi);
1840
1841	mutex_enter(&bd->d_ksmutex);
1842	kstat_waitq_enter(bd->d_kiop);
1843	mutex_exit(&bd->d_ksmutex);
1844
1845	mutex_exit(&bq->q_iomutex);
1846
1847	bd_sched(bd, bq);
1848}
1849
1850static void
1851bd_runq_exit(bd_xfer_impl_t *xi, int err)
1852{
1853	bd_t		*bd = xi->i_bd;
1854	buf_t		*bp = xi->i_bp;
1855	bd_queue_t	*bq = xi->i_bq;
1856
1857	mutex_enter(&bq->q_iomutex);
1858	bq->q_qactive--;
1859
1860	mutex_enter(&bd->d_ksmutex);
1861	kstat_runq_exit(bd->d_kiop);
1862	mutex_exit(&bd->d_ksmutex);
1863
1864	list_remove(&bq->q_runq, xi);
1865	mutex_exit(&bq->q_iomutex);
1866
1867	if (err == 0) {
1868		if (bp->b_flags & B_READ) {
1869			atomic_inc_uint(&bd->d_kiop->reads);
1870			atomic_add_64((uint64_t *)&bd->d_kiop->nread,
1871			    bp->b_bcount - xi->i_resid);
1872		} else {
1873			atomic_inc_uint(&bd->d_kiop->writes);
1874			atomic_add_64((uint64_t *)&bd->d_kiop->nwritten,
1875			    bp->b_bcount - xi->i_resid);
1876		}
1877	}
1878	bd_sched(bd, bq);
1879}
1880
/*
 * Refresh the cached media state of the device.
 *
 * Queries the driver's o_media_info op.  If the query fails, or the reported
 * block size is unusable (below 512, not a power of two, or not a divisor
 * of d_maxxfer), the media is treated as ejected and the block count is
 * zeroed.  Otherwise the block shift, block count, read-only and solid-state
 * attributes are updated from the reported values.
 *
 * Waiters on d_statecv are woken when the state changes.  When the state,
 * block size or block count changed, the cmlb label is revalidated (media
 * present) or invalidated (media absent).
 */
static void
bd_update_state(bd_t *bd)
{
	enum	dkio_state	state = DKIO_INSERTED;
	boolean_t		docmlb = B_FALSE;
	bd_media_t		media;

	bzero(&media, sizeof (media));

	mutex_enter(&bd->d_statemutex);
	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
		bd->d_numblks = 0;
		state = DKIO_EJECTED;
		goto done;
	}

	if ((media.m_blksize < 512) ||
	    (!ISP2(media.m_blksize)) ||
	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
		cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)",
		    ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip),
		    media.m_blksize);
		/*
		 * We can't use the media, treat it as not present.
		 */
		state = DKIO_EJECTED;
		bd->d_numblks = 0;
		goto done;
	}

	/* Compare against the cached values before they are overwritten. */
	if (((1U << bd->d_blkshift) != media.m_blksize) ||
	    (bd->d_numblks != media.m_nblks)) {
		/* Device size changed */
		docmlb = B_TRUE;
	}

	/* The block size is a power of two, so ffs - 1 is its log2. */
	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
	bd->d_pblkshift = bd->d_blkshift;
	bd->d_numblks = media.m_nblks;
	bd->d_rdonly = media.m_readonly;
	bd->d_ssd = media.m_solidstate;

	/*
	 * Only use the supplied physical block size if it is non-zero,
	 * greater or equal to the block size, and a power of 2. Ignore it
	 * if not, it's just informational and we can still use the media.
	 */
	if ((media.m_pblksize != 0) &&
	    (media.m_pblksize >= media.m_blksize) &&
	    (ISP2(media.m_pblksize)))
		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;

done:
	if (state != bd->d_state) {
		bd->d_state = state;
		cv_broadcast(&bd->d_statecv);
		docmlb = B_TRUE;
	}
	mutex_exit(&bd->d_statemutex);

	/* Publish the capacity in bytes through the error kstat. */
	bd->d_kerr->bd_capacity.value.ui64 = bd->d_numblks << bd->d_blkshift;

	/* The cmlb calls are made after d_statemutex has been dropped. */
	if (docmlb) {
		if (state == DKIO_INSERTED) {
			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
		} else {
			cmlb_invalidate(bd->d_cmlbh, 0);
		}
	}
}
1951
1952static int
1953bd_check_state(bd_t *bd, enum dkio_state *state)
1954{
1955	clock_t		when;
1956
1957	for (;;) {
1958
1959		bd_update_state(bd);
1960
1961		mutex_enter(&bd->d_statemutex);
1962
1963		if (bd->d_state != *state) {
1964			*state = bd->d_state;
1965			mutex_exit(&bd->d_statemutex);
1966			break;
1967		}
1968
1969		when = drv_usectohz(1000000);
1970		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1971		    when, TR_CLOCK_TICK) == 0) {
1972			mutex_exit(&bd->d_statemutex);
1973			return (EINTR);
1974		}
1975
1976		mutex_exit(&bd->d_statemutex);
1977	}
1978
1979	return (0);
1980}
1981
1982static int
1983bd_flush_write_cache_done(struct buf *bp)
1984{
1985	struct dk_callback *dc = (void *)bp->b_private;
1986
1987	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1988	kmem_free(dc, sizeof (*dc));
1989	freerbuf(bp);
1990	return (0);
1991}
1992
1993static int
1994bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
1995{
1996	buf_t			*bp;
1997	struct dk_callback	*dc;
1998	bd_xfer_impl_t		*xi;
1999	int			rv;
2000
2001	if (bd->d_ops.o_sync_cache == NULL) {
2002		return (ENOTSUP);
2003	}
2004	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
2005		return (ENOMEM);
2006	}
2007	bp->b_resid = 0;
2008	bp->b_bcount = 0;
2009
2010	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
2011	if (xi == NULL) {
2012		rv = geterror(bp);
2013		freerbuf(bp);
2014		return (rv);
2015	}
2016
2017	/* Make an asynchronous flush, but only if there is a callback */
2018	if (dkc != NULL && dkc->dkc_callback != NULL) {
2019		/* Make a private copy of the callback structure */
2020		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
2021		*dc = *dkc;
2022		bp->b_private = dc;
2023		bp->b_iodone = bd_flush_write_cache_done;
2024
2025		bd_submit(bd, xi);
2026		return (0);
2027	}
2028
2029	/* In case there is no callback, perform a synchronous flush */
2030	bd_submit(bd, xi);
2031	(void) biowait(bp);
2032	rv = geterror(bp);
2033	freerbuf(bp);
2034
2035	return (rv);
2036}
2037
/*
 * b_iodone handler installed by bd_free_space_cb() for asynchronous
 * free-space requests: nobody waits on the buf, so it is simply released.
 * Always returns 0.
 */
static int
bd_free_space_done(struct buf *bp)
{
	freerbuf(bp);
	return (0);
}
2044
2045static int
2046bd_free_space_cb(dkioc_free_list_t *dfl, void *arg, int kmflag)
2047{
2048	bd_t		*bd = arg;
2049	buf_t		*bp = NULL;
2050	bd_xfer_impl_t	*xi = NULL;
2051	boolean_t	sync = DFL_ISSYNC(dfl) ?  B_TRUE : B_FALSE;
2052	int		rv = 0;
2053
2054	bp = getrbuf(KM_SLEEP);
2055	bp->b_resid = 0;
2056	bp->b_bcount = 0;
2057	bp->b_lblkno = 0;
2058
2059	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_free_space, kmflag);
2060	xi->i_dfl = dfl;
2061
2062	if (!sync) {
2063		bp->b_iodone = bd_free_space_done;
2064		bd_submit(bd, xi);
2065		return (0);
2066	}
2067
2068	xi->i_flags |= BD_XFER_POLL;
2069	bd_submit(bd, xi);
2070
2071	(void) biowait(bp);
2072	rv = geterror(bp);
2073	freerbuf(bp);
2074
2075	return (rv);
2076}
2077
2078static int
2079bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl)
2080{
2081	diskaddr_t p_len, p_offset;
2082	uint64_t offset_bytes, len_bytes;
2083	minor_t part = BDPART(dev);
2084	const uint_t bshift = bd->d_blkshift;
2085	dkioc_free_info_t dfi = {
2086		.dfi_bshift = bshift,
2087		.dfi_align = bd->d_free_align << bshift,
2088		.dfi_max_bytes = bd->d_max_free_blks << bshift,
2089		.dfi_max_ext = bd->d_max_free_seg,
2090		.dfi_max_ext_bytes = bd->d_max_free_seg_blks << bshift,
2091	};
2092
2093	if (cmlb_partinfo(bd->d_cmlbh, part, &p_len, &p_offset, NULL,
2094	    NULL, 0) != 0) {
2095		dfl_free(dfl);
2096		return (ENXIO);
2097	}
2098
2099	/*
2100	 * bd_ioctl created our own copy of dfl, so we can modify as
2101	 * necessary
2102	 */
2103	offset_bytes = (uint64_t)p_offset << bshift;
2104	len_bytes = (uint64_t)p_len << bshift;
2105
2106	dfl->dfl_offset += offset_bytes;
2107	if (dfl->dfl_offset < offset_bytes) {
2108		dfl_free(dfl);
2109		return (EOVERFLOW);
2110	}
2111
2112	return (dfl_iter(dfl, &dfi, offset_bytes + len_bytes, bd_free_space_cb,
2113	    bd, KM_SLEEP));
2114}
2115
2116/*
2117 * Nexus support.
2118 */
2119int
2120bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
2121    void *arg, void *result)
2122{
2123	bd_handle_t	hdl;
2124
2125	switch (ctlop) {
2126	case DDI_CTLOPS_REPORTDEV:
2127		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
2128		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
2129		    ddi_driver_name(rdip), ddi_get_instance(rdip));
2130		return (DDI_SUCCESS);
2131
2132	case DDI_CTLOPS_INITCHILD:
2133		hdl = ddi_get_parent_data((dev_info_t *)arg);
2134		if (hdl == NULL) {
2135			return (DDI_NOT_WELL_FORMED);
2136		}
2137		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
2138		return (DDI_SUCCESS);
2139
2140	case DDI_CTLOPS_UNINITCHILD:
2141		ddi_set_name_addr((dev_info_t *)arg, NULL);
2142		ndi_prop_remove_all((dev_info_t *)arg);
2143		return (DDI_SUCCESS);
2144
2145	default:
2146		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
2147	}
2148}
2149
2150/*
2151 * Functions for device drivers.
2152 */
2153bd_handle_t
2154bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
2155{
2156	bd_handle_t	hdl;
2157
2158	switch (ops->o_version) {
2159	case BD_OPS_VERSION_0:
2160	case BD_OPS_VERSION_1:
2161	case BD_OPS_VERSION_2:
2162		break;
2163
2164	default:
2165		/* Unsupported version */
2166		return (NULL);
2167	}
2168
2169	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
2170	if (hdl == NULL) {
2171		return (NULL);
2172	}
2173
2174	switch (ops->o_version) {
2175	case BD_OPS_VERSION_2:
2176		hdl->h_ops.o_free_space = ops->o_free_space;
2177		/*FALLTHRU*/
2178	case BD_OPS_VERSION_1:
2179	case BD_OPS_VERSION_0:
2180		hdl->h_ops.o_drive_info = ops->o_drive_info;
2181		hdl->h_ops.o_media_info = ops->o_media_info;
2182		hdl->h_ops.o_devid_init = ops->o_devid_init;
2183		hdl->h_ops.o_sync_cache = ops->o_sync_cache;
2184		hdl->h_ops.o_read = ops->o_read;
2185		hdl->h_ops.o_write = ops->o_write;
2186		break;
2187	}
2188
2189	hdl->h_dma = dma;
2190	hdl->h_private = private;
2191
2192	return (hdl);
2193}
2194
/*
 * Release the memory of a handle obtained from bd_alloc_handle().  Nothing
 * else is torn down here.
 */
void
bd_free_handle(bd_handle_t hdl)
{
	kmem_free(hdl, sizeof (*hdl));
}
2200
/*
 * Create and online the blkdev child node for a handle.
 *
 * dip - the parent (driver) node under which the child is created.
 * hdl - handle from bd_alloc_handle().
 *
 * The child's unit address is built from the drive information reported by
 * the driver's o_drive_info op: "w<EUI-64>[,<lun>]" when an EUI-64 is
 * provided, otherwise "<target>[,<lun>]", all in hexadecimal.
 *
 * Returns DDI_SUCCESS (also when the handle is already attached) or
 * DDI_FAILURE if the node cannot be allocated or brought online.
 */
int
bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
{
	dev_info_t	*child;
	bd_drive_t	drive = { 0 };

	/*
	 * It's not an error if bd_attach_handle() is called on a handle that
	 * already is attached. We just ignore the request to attach and return.
	 * This way drivers using blkdev don't have to keep track about blkdev
	 * state, they can just call this function to make sure it attached.
	 */
	if (hdl->h_child != NULL) {
		return (DDI_SUCCESS);
	}

	/* if drivers don't override this, make it assume none */
	drive.d_lun = -1;
	hdl->h_ops.o_drive_info(hdl->h_private, &drive);

	hdl->h_parent = dip;
	hdl->h_name = "blkdev";

	/* An all-zero EUI-64 means the driver did not supply one. */
	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	if (*(uint64_t *)drive.d_eui64 != 0) {
		if (drive.d_lun >= 0) {
			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
			    "w%02X%02X%02X%02X%02X%02X%02X%02X,%X",
			    drive.d_eui64[0], drive.d_eui64[1],
			    drive.d_eui64[2], drive.d_eui64[3],
			    drive.d_eui64[4], drive.d_eui64[5],
			    drive.d_eui64[6], drive.d_eui64[7], drive.d_lun);
		} else {
			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
			    "w%02X%02X%02X%02X%02X%02X%02X%02X",
			    drive.d_eui64[0], drive.d_eui64[1],
			    drive.d_eui64[2], drive.d_eui64[3],
			    drive.d_eui64[4], drive.d_eui64[5],
			    drive.d_eui64[6], drive.d_eui64[7]);
		}
	} else {
		if (drive.d_lun >= 0) {
			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
			    "%X,%X", drive.d_target, drive.d_lun);
		} else {
			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
			    "%X", drive.d_target);
		}
	}

	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
	    &child) != NDI_SUCCESS) {
		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
		    ddi_driver_name(dip), ddi_get_instance(dip),
		    "blkdev", hdl->h_addr);
		return (DDI_FAILURE);
	}

	/*
	 * The handle must be installed as parent data before the node is
	 * brought online: bd_bus_ctl() reads it back during INITCHILD to
	 * name the child.
	 */
	ddi_set_parent_data(child, hdl);
	hdl->h_child = child;

	if (ndi_devi_online(child, 0) != NDI_SUCCESS) {
		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
		    ddi_driver_name(dip), ddi_get_instance(dip),
		    hdl->h_name, hdl->h_addr);
		(void) ndi_devi_free(child);
		hdl->h_child = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}
2273
2274int
2275bd_detach_handle(bd_handle_t hdl)
2276{
2277	int	circ;
2278	int	rv;
2279	char	*devnm;
2280
2281	/*
2282	 * It's not an error if bd_detach_handle() is called on a handle that
2283	 * already is detached. We just ignore the request to detach and return.
2284	 * This way drivers using blkdev don't have to keep track about blkdev
2285	 * state, they can just call this function to make sure it detached.
2286	 */
2287	if (hdl->h_child == NULL) {
2288		return (DDI_SUCCESS);
2289	}
2290	ndi_devi_enter(hdl->h_parent, &circ);
2291	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
2292		rv = ddi_remove_child(hdl->h_child, 0);
2293	} else {
2294		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
2295		(void) ddi_deviname(hdl->h_child, devnm);
2296		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
2297		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
2298		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
2299		kmem_free(devnm, MAXNAMELEN + 1);
2300	}
2301	if (rv == 0) {
2302		hdl->h_child = NULL;
2303	}
2304
2305	ndi_devi_exit(hdl->h_parent, circ);
2306	return (rv == NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
2307}
2308
/*
 * Completion routine called by the underlying driver for a transfer (or for
 * one window of it).
 *
 * xfer - the public transfer structure handed to the driver.
 * err  - 0 on success, otherwise the error for the transfer.
 *
 * On error the buf is completed with that error and the untransferred
 * residual added to b_resid.  On success the residual is reduced by the
 * window just completed; when nothing remains the buf is completed,
 * otherwise the next window (DMA window, or next slice of the kernel
 * buffer) is set up and resubmitted directly to the driver.
 */
void
bd_xfer_done(bd_xfer_t *xfer, int err)
{
	bd_xfer_impl_t	*xi = (void *)xfer;
	buf_t		*bp = xi->i_bp;
	int		rv = DDI_SUCCESS;
	bd_t		*bd = xi->i_bd;
	size_t		len;

	if (err != 0) {
		bd_runq_exit(xi, err);
		atomic_inc_32(&bd->d_kerr->bd_harderrs.value.ui32);

		/* Read i_resid before the transfer is freed. */
		bp->b_resid += xi->i_resid;
		bd_xfer_free(xi);
		bioerror(bp, err);
		biodone(bp);
		return;
	}

	xi->i_cur_win++;
	xi->i_resid -= xi->i_len;

	if (xi->i_resid == 0) {
		/* Job completed successfully! */
		bd_runq_exit(xi, 0);

		bd_xfer_free(xi);
		biodone(bp);
		return;
	}

	xi->i_blkno += xi->i_nblks;

	if (bd->d_use_dma) {
		/* More transfer still pending... advance to next DMA window. */
		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
	} else {
		/* Advance memory window. */
		xi->i_kaddr += xi->i_len;
		xi->i_offset += xi->i_len;
		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
	}


	/* The next window must also be a whole number of blocks. */
	if ((rv != DDI_SUCCESS) ||
	    (P2PHASE(len, (1U << xi->i_blkshift)) != 0)) {
		bd_runq_exit(xi, EFAULT);

		bp->b_resid += xi->i_resid;
		bd_xfer_free(xi);
		bioerror(bp, EFAULT);
		biodone(bp);
		return;
	}
	xi->i_len = len;
	xi->i_nblks = len >> xi->i_blkshift;

	/* Submit next window to hardware. */
	rv = xi->i_func(bd->d_private, &xi->i_public);
	if (rv != 0) {
		bd_runq_exit(xi, rv);

		atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);

		bp->b_resid += xi->i_resid;
		bd_xfer_free(xi);
		bioerror(bp, rv);
		biodone(bp);
	}
}
2381
2382void
2383bd_error(bd_xfer_t *xfer, int error)
2384{
2385	bd_xfer_impl_t	*xi = (void *)xfer;
2386	bd_t		*bd = xi->i_bd;
2387
2388	switch (error) {
2389	case BD_ERR_MEDIA:
2390		atomic_inc_32(&bd->d_kerr->bd_rq_media_err.value.ui32);
2391		break;
2392	case BD_ERR_NTRDY:
2393		atomic_inc_32(&bd->d_kerr->bd_rq_ntrdy_err.value.ui32);
2394		break;
2395	case BD_ERR_NODEV:
2396		atomic_inc_32(&bd->d_kerr->bd_rq_nodev_err.value.ui32);
2397		break;
2398	case BD_ERR_RECOV:
2399		atomic_inc_32(&bd->d_kerr->bd_rq_recov_err.value.ui32);
2400		break;
2401	case BD_ERR_ILLRQ:
2402		atomic_inc_32(&bd->d_kerr->bd_rq_illrq_err.value.ui32);
2403		break;
2404	case BD_ERR_PFA:
2405		atomic_inc_32(&bd->d_kerr->bd_rq_pfa_err.value.ui32);
2406		break;
2407	default:
2408		cmn_err(CE_PANIC, "bd_error: unknown error type %d", error);
2409		break;
2410	}
2411}
2412
2413void
2414bd_state_change(bd_handle_t hdl)
2415{
2416	bd_t		*bd;
2417
2418	if ((bd = hdl->h_bd) != NULL) {
2419		bd_update_state(bd);
2420	}
2421}
2422
2423void
2424bd_mod_init(struct dev_ops *devops)
2425{
2426	static struct bus_ops bd_bus_ops = {
2427		BUSO_REV,		/* busops_rev */
2428		nullbusmap,		/* bus_map */
2429		NULL,			/* bus_get_intrspec (OBSOLETE) */
2430		NULL,			/* bus_add_intrspec (OBSOLETE) */
2431		NULL,			/* bus_remove_intrspec (OBSOLETE) */
2432		i_ddi_map_fault,	/* bus_map_fault */
2433		NULL,			/* bus_dma_map (OBSOLETE) */
2434		ddi_dma_allochdl,	/* bus_dma_allochdl */
2435		ddi_dma_freehdl,	/* bus_dma_freehdl */
2436		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
2437		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
2438		ddi_dma_flush,		/* bus_dma_flush */
2439		ddi_dma_win,		/* bus_dma_win */
2440		ddi_dma_mctl,		/* bus_dma_ctl */
2441		bd_bus_ctl,		/* bus_ctl */
2442		ddi_bus_prop_op,	/* bus_prop_op */
2443		NULL,			/* bus_get_eventcookie */
2444		NULL,			/* bus_add_eventcall */
2445		NULL,			/* bus_remove_eventcall */
2446		NULL,			/* bus_post_event */
2447		NULL,			/* bus_intr_ctl (OBSOLETE) */
2448		NULL,			/* bus_config */
2449		NULL,			/* bus_unconfig */
2450		NULL,			/* bus_fm_init */
2451		NULL,			/* bus_fm_fini */
2452		NULL,			/* bus_fm_access_enter */
2453		NULL,			/* bus_fm_access_exit */
2454		NULL,			/* bus_power */
2455		NULL,			/* bus_intr_op */
2456	};
2457
2458	devops->devo_bus_ops = &bd_bus_ops;
2459
2460	/*
2461	 * NB: The device driver is free to supply its own
2462	 * character entry device support.
2463	 */
2464}
2465
2466void
2467bd_mod_fini(struct dev_ops *devops)
2468{
2469	devops->devo_bus_ops = NULL;
2470}
2471