xref: /illumos-gate/usr/src/uts/common/xen/io/xdf.c (revision 66f654faf94d77a6760e083cb715592f4a408046)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
29  * Copyright 2017 Nexenta Systems, Inc.
30  */
31 
32 /*
33  * xdf.c - Xen Virtual Block Device Driver
34  * TODO:
35  *	- support alternate block size (currently only DEV_BSIZE supported)
36  *	- revalidate geometry for removable devices
37  *
38  * This driver exports disk device nodes, accepts IO requests from those
39  * nodes, and services those requests by talking to a backend device
40  * in another domain.
41  *
42  * Communication with the backend device is done via a ringbuffer (which is
43  * managed via xvdi interfaces) and dma memory (which is managed via ddi
44  * interfaces).
45  *
 * Communication with the backend device is dependent upon establishing a
47  * connection to the backend device.  This connection process involves
48  * reading device configuration information from xenbus and publishing
49  * some frontend runtime configuration parameters via the xenbus (for
50  * consumption by the backend).  Once we've published runtime configuration
51  * information via the xenbus, the backend device can enter the connected
52  * state and we'll enter the XD_CONNECTED state.  But before we can allow
53  * random IO to begin, we need to do IO to the backend device to determine
54  * the device label and if flush operations are supported.  Once this is
55  * done we enter the XD_READY state and can process any IO operations.
56  *
57  * We receive notifications of xenbus state changes for the backend device
58  * (aka, the "other end") via the xdf_oe_change() callback.  This callback
59  * is single threaded, meaning that we can't receive new notification of
60  * other end state changes while we're processing an outstanding
 * notification of an other end state change.  Therefore we can't do any
 * blocking operations from the xdf_oe_change() callback.  This is why we
 * have a separate taskq (xdf_ready_tq) which exists to do the necessary
 * IO to get us from the XD_CONNECTED to the XD_READY state.  All IO
 * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
 * through xdf_lb_rdwr(), which is a synchronous IO interface.  IOs
67  * generated by the xdf_ready_tq_thread thread have priority over all
68  * other IO requests.
69  *
70  * We also communicate with the backend device via the xenbus "media-req"
71  * (XBP_MEDIA_REQ) property.  For more information on this see the
72  * comments in blkif.h.
73  */
74 
75 #include <io/xdf.h>
76 
77 #include <sys/conf.h>
78 #include <sys/dkio.h>
79 #include <sys/promif.h>
80 #include <sys/sysmacros.h>
81 #include <sys/kstat.h>
82 #include <sys/mach_mmu.h>
83 #ifdef XPV_HVM_DRIVER
84 #include <sys/xpv_support.h>
85 #else /* !XPV_HVM_DRIVER */
86 #include <sys/evtchn_impl.h>
87 #endif /* !XPV_HVM_DRIVER */
88 #include <sys/sunndi.h>
89 #include <public/io/xenbus.h>
90 #include <xen/sys/xenbus_impl.h>
91 #include <sys/scsi/generic/inquiry.h>
92 #include <xen/io/blkif_impl.h>
93 #include <sys/fdio.h>
94 #include <sys/cdio.h>
95 
96 /*
97  * DEBUG_EVAL can be used to include debug only statements without
98  * having to use '#ifdef DEBUG' statements
99  */
100 #ifdef DEBUG
101 #define	DEBUG_EVAL(x)	(x)
102 #else /* !DEBUG */
103 #define	DEBUG_EVAL(x)
104 #endif /* !DEBUG */
105 
/*
 * Ring drain polling parameters.  XDF_DRAIN_MSEC_DELAY is handed to
 * drv_usectohz() by xdf_ring_drain_locked(), so despite its name the
 * value is expressed in microseconds.
 */
#define	XDF_DRAIN_MSEC_DELAY		(50*1000)	/* 00.05 sec */
#define	XDF_DRAIN_RETRY_COUNT		200		/* 10.00 sec */
#define	XDF_STATE_TIMEOUT		(30*1000*1000)	/* 30.00 sec */

#define	INVALID_DOMID	((domid_t)-1)
/* values stored in v_req_t.v_flush_diskcache */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
/*
 * Both flush mechanisms require xdf_feature_barrier; xdf_flush_supported
 * then selects between a flush-diskcache request and a write barrier.
 */
#define	USE_WRITE_BARRIER(vdp)						\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
#define	USE_FLUSH_DISKCACHE(vdp)					\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
/*
 * A write is treated as a write barrier when barriers are in use and its
 * data buffer is the driver's own xdf_cache_flush_block.
 */
#define	IS_WRITE_BARRIER(vdp, bp)					\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&			\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
/*
 * A zero-length write is treated as a flush-diskcache request.
 * NOTE: this macro expands to a reference to a variable named "vdp" that
 * is not one of its parameters; the caller must have an xdf_t *vdp in
 * scope.
 */
#define	IS_FLUSH_DISKCACHE(bp)						\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))

/*
 * A vreq is done once its current DMA window has completed and either
 * it is a flush-diskcache request or that window was the last one.
 */
#define	VREQ_DONE(vreq)							\
	VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&		\
	    (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||		\
	    (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))

/* the v_req_t belonging to a buf is stashed in the buf's av_back field */
#define	BP_VREQ(bp)		((v_req_t *)((bp)->av_back))
#define	BP_VREQ_SET(bp, vreq)	(((bp)->av_back = (buf_t *)(vreq)))
131 
/* when set, xdf_intr() does not restart I/O after processing responses */
extern int		do_polled_io;

/* run-time tunables that we don't want the compiler to optimize away */
volatile int		xdf_debug = 0;
volatile boolean_t	xdf_barrier_flush_disable = B_FALSE;

/* per module globals */
major_t			xdf_major;
static void		*xdf_ssp;
static kmem_cache_t	*xdf_vreq_cache;	/* v_req_t allocations */
static kmem_cache_t	*xdf_gs_cache;		/* ge_slot_t allocations */
static int		xdf_maxphys = XB_MAXPHYS;
/* block number that check_fbwrite() watches for being re-written */
static diskaddr_t	xdf_flush_block = DEFAULT_FLUSH_BLOCK;
static int		xdf_fbrewrites;	/* flush block re-write count */

/* misc public functions */
int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
int xdf_lb_getinfo(dev_info_t *, int, void *, void *);

/*  misc private functions */
static void xdf_io_start(xdf_t *);
static void xdf_devid_setup(xdf_t *);
154 
/*
 * Callbacks from common label (cmlb); handed to cmlb_attach() by
 * xdf_cmlb_attach().
 */
static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1,
	xdf_lb_rdwr,
	xdf_lb_getinfo
};
161 
/*
 * I/O buffer DMA attributes
 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 *
 * vreq_setup() uses these attributes directly for the I/O DMA handle, and
 * a copy with dma_attr_seg and dma_attr_sgllen relaxed for the bounce
 * buffer handle used by unaligned transfers.
 */
static ddi_dma_attr_t xb_dma_attr = {
	DMA_ATTR_V0,
	(uint64_t)0,			/* lowest address */
	(uint64_t)0xffffffffffffffff,	/* highest usable address */
	(uint64_t)0xffffff,		/* DMA counter limit max */
	(uint64_t)XB_BSIZE,		/* alignment in bytes */
	XB_BSIZE - 1,			/* bitmap of burst sizes */
	XB_BSIZE,			/* min transfer */
	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
	XB_BSIZE,			/* granularity */
	0,				/* flags (reserved) */
};
180 
/*
 * Device access attributes passed to ddi_dma_mem_alloc() when vreq_setup()
 * allocates the bounce buffer for an unaligned transfer.
 */
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
186 
187 static void
188 xdf_timeout_handler(void *arg)
189 {
190 	xdf_t *vdp = arg;
191 
192 	mutex_enter(&vdp->xdf_dev_lk);
193 	vdp->xdf_timeout_id = 0;
194 	mutex_exit(&vdp->xdf_dev_lk);
195 
196 	/* new timeout thread could be re-scheduled */
197 	xdf_io_start(vdp);
198 }
199 
200 /*
201  * callback func when DMA/GTE resources is available
202  *
203  * Note: we only register one callback function to grant table subsystem
204  * since we only have one 'struct gnttab_free_callback' in xdf_t.
205  */
206 static int
207 xdf_dmacallback(caddr_t arg)
208 {
209 	xdf_t *vdp = (xdf_t *)arg;
210 	ASSERT(vdp != NULL);
211 
212 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
213 	    vdp->xdf_addr));
214 
215 	ddi_trigger_softintr(vdp->xdf_softintr_id);
216 	return (DDI_DMA_CALLBACK_DONE);
217 }
218 
219 static ge_slot_t *
220 gs_get(xdf_t *vdp, int isread)
221 {
222 	grant_ref_t gh;
223 	ge_slot_t *gs;
224 
225 	/* try to alloc GTEs needed in this slot, first */
226 	if (gnttab_alloc_grant_references(
227 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
228 		if (vdp->xdf_gnt_callback.next == NULL) {
229 			SETDMACBON(vdp);
230 			gnttab_request_free_callback(
231 			    &vdp->xdf_gnt_callback,
232 			    (void (*)(void *))xdf_dmacallback,
233 			    (void *)vdp,
234 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
235 		}
236 		return (NULL);
237 	}
238 
239 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
240 	if (gs == NULL) {
241 		gnttab_free_grant_references(gh);
242 		if (vdp->xdf_timeout_id == 0)
243 			/* restart I/O after one second */
244 			vdp->xdf_timeout_id =
245 			    timeout(xdf_timeout_handler, vdp, hz);
246 		return (NULL);
247 	}
248 
249 	/* init gs_slot */
250 	gs->gs_oeid = vdp->xdf_peer;
251 	gs->gs_isread = isread;
252 	gs->gs_ghead = gh;
253 	gs->gs_ngrefs = 0;
254 
255 	return (gs);
256 }
257 
258 static void
259 gs_free(ge_slot_t *gs)
260 {
261 	int		i;
262 
263 	/* release all grant table entry resources used in this slot */
264 	for (i = 0; i < gs->gs_ngrefs; i++)
265 		gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
266 	gnttab_free_grant_references(gs->gs_ghead);
267 	list_remove(&gs->gs_vreq->v_gs, gs);
268 	kmem_cache_free(xdf_gs_cache, gs);
269 }
270 
271 static grant_ref_t
272 gs_grant(ge_slot_t *gs, mfn_t mfn)
273 {
274 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
275 
276 	ASSERT(gr != -1);
277 	ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
278 	gs->gs_ge[gs->gs_ngrefs++] = gr;
279 	gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
280 
281 	return (gr);
282 }
283 
284 /*
285  * Alloc a vreq for this bp
286  * bp->av_back contains the pointer to the vreq upon return
287  */
288 static v_req_t *
289 vreq_get(xdf_t *vdp, buf_t *bp)
290 {
291 	v_req_t *vreq = NULL;
292 
293 	ASSERT(BP_VREQ(bp) == NULL);
294 
295 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
296 	if (vreq == NULL) {
297 		if (vdp->xdf_timeout_id == 0)
298 			/* restart I/O after one second */
299 			vdp->xdf_timeout_id =
300 			    timeout(xdf_timeout_handler, vdp, hz);
301 		return (NULL);
302 	}
303 	bzero(vreq, sizeof (v_req_t));
304 	list_create(&vreq->v_gs, sizeof (ge_slot_t),
305 	    offsetof(ge_slot_t, gs_vreq_link));
306 	vreq->v_buf = bp;
307 	vreq->v_status = VREQ_INIT;
308 	vreq->v_runq = B_FALSE;
309 	BP_VREQ_SET(bp, vreq);
310 	/* init of other fields in vreq is up to the caller */
311 
312 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
313 
314 	return (vreq);
315 }
316 
317 static void
318 vreq_free(xdf_t *vdp, v_req_t *vreq)
319 {
320 	buf_t	*bp = vreq->v_buf;
321 
322 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
323 	ASSERT(BP_VREQ(bp) == vreq);
324 
325 	list_remove(&vdp->xdf_vreq_act, vreq);
326 
327 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
328 		goto done;
329 
330 	switch (vreq->v_status) {
331 	case VREQ_DMAWIN_DONE:
332 	case VREQ_GS_ALLOCED:
333 	case VREQ_DMABUF_BOUND:
334 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
335 		/*FALLTHRU*/
336 	case VREQ_DMAMEM_ALLOCED:
337 		if (!ALIGNED_XFER(bp)) {
338 			ASSERT(vreq->v_abuf != NULL);
339 			if (!IS_ERROR(bp) && IS_READ(bp))
340 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
341 				    bp->b_bcount);
342 			ddi_dma_mem_free(&vreq->v_align);
343 		}
344 		/*FALLTHRU*/
345 	case VREQ_MEMDMAHDL_ALLOCED:
346 		if (!ALIGNED_XFER(bp))
347 			ddi_dma_free_handle(&vreq->v_memdmahdl);
348 		/*FALLTHRU*/
349 	case VREQ_DMAHDL_ALLOCED:
350 		ddi_dma_free_handle(&vreq->v_dmahdl);
351 		break;
352 	default:
353 		break;
354 	}
355 done:
356 	ASSERT(!vreq->v_runq);
357 	list_destroy(&vreq->v_gs);
358 	kmem_cache_free(xdf_vreq_cache, vreq);
359 }
360 
361 /*
362  * Snarf new data if our flush block was re-written
363  */
364 static void
365 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
366 {
367 	int nblks;
368 	boolean_t mapin;
369 
370 	if (IS_WRITE_BARRIER(vdp, bp))
371 		return; /* write was a flush write */
372 
373 	mapin = B_FALSE;
374 	nblks = bp->b_bcount >> DEV_BSHIFT;
375 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
376 		xdf_fbrewrites++;
377 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
378 			mapin = B_TRUE;
379 			bp_mapin(bp);
380 		}
381 		bcopy(bp->b_un.b_addr +
382 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
383 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
384 		if (mapin)
385 			bp_mapout(bp);
386 	}
387 }
388 
389 /*
390  * Initalize the DMA and grant table resources for the buf
391  */
392 static int
393 vreq_setup(xdf_t *vdp, v_req_t *vreq)
394 {
395 	int rc;
396 	ddi_dma_attr_t dmaattr;
397 	uint_t ndcs, ndws;
398 	ddi_dma_handle_t dh;
399 	ddi_dma_handle_t mdh;
400 	ddi_dma_cookie_t dc;
401 	ddi_acc_handle_t abh;
402 	caddr_t	aba;
403 	ge_slot_t *gs;
404 	size_t bufsz;
405 	off_t off;
406 	size_t sz;
407 	buf_t *bp = vreq->v_buf;
408 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
409 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
410 
411 	switch (vreq->v_status) {
412 	case VREQ_INIT:
413 		if (IS_FLUSH_DISKCACHE(bp)) {
414 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
415 				DPRINTF(DMA_DBG, ("xdf@%s: "
416 				    "get ge_slotfailed\n", vdp->xdf_addr));
417 				return (DDI_FAILURE);
418 			}
419 			vreq->v_blkno = 0;
420 			vreq->v_nslots = 1;
421 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
422 			vreq->v_status = VREQ_GS_ALLOCED;
423 			gs->gs_vreq = vreq;
424 			list_insert_head(&vreq->v_gs, gs);
425 			return (DDI_SUCCESS);
426 		}
427 
428 		if (IS_WRITE_BARRIER(vdp, bp))
429 			vreq->v_flush_diskcache = WRITE_BARRIER;
430 		vreq->v_blkno = bp->b_blkno +
431 		    (diskaddr_t)(uintptr_t)bp->b_private;
432 		/* See if we wrote new data to our flush block */
433 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
434 			check_fbwrite(vdp, bp, vreq->v_blkno);
435 		vreq->v_status = VREQ_INIT_DONE;
436 		/*FALLTHRU*/
437 
438 	case VREQ_INIT_DONE:
439 		/*
440 		 * alloc DMA handle
441 		 */
442 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
443 		    xdf_dmacallback, (caddr_t)vdp, &dh);
444 		if (rc != DDI_SUCCESS) {
445 			SETDMACBON(vdp);
446 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
447 			    vdp->xdf_addr));
448 			return (DDI_FAILURE);
449 		}
450 
451 		vreq->v_dmahdl = dh;
452 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
453 		/*FALLTHRU*/
454 
455 	case VREQ_DMAHDL_ALLOCED:
456 		/*
457 		 * alloc dma handle for 512-byte aligned buf
458 		 */
459 		if (!ALIGNED_XFER(bp)) {
460 			/*
461 			 * XXPV: we need to temporarily enlarge the seg
462 			 * boundary and s/g length to work round CR6381968
463 			 */
464 			dmaattr = xb_dma_attr;
465 			dmaattr.dma_attr_seg = (uint64_t)-1;
466 			dmaattr.dma_attr_sgllen = INT_MAX;
467 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
468 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
469 			if (rc != DDI_SUCCESS) {
470 				SETDMACBON(vdp);
471 				DPRINTF(DMA_DBG, ("xdf@%s: "
472 				    "unaligned buf DMAhandle alloc failed\n",
473 				    vdp->xdf_addr));
474 				return (DDI_FAILURE);
475 			}
476 			vreq->v_memdmahdl = mdh;
477 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
478 		}
479 		/*FALLTHRU*/
480 
481 	case VREQ_MEMDMAHDL_ALLOCED:
482 		/*
483 		 * alloc 512-byte aligned buf
484 		 */
485 		if (!ALIGNED_XFER(bp)) {
486 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
487 				bp_mapin(bp);
488 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
489 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
490 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
491 			    &aba, &bufsz, &abh);
492 			if (rc != DDI_SUCCESS) {
493 				SETDMACBON(vdp);
494 				DPRINTF(DMA_DBG, ("xdf@%s: "
495 				    "DMA mem allocation failed\n",
496 				    vdp->xdf_addr));
497 				return (DDI_FAILURE);
498 			}
499 
500 			vreq->v_abuf = aba;
501 			vreq->v_align = abh;
502 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
503 
504 			ASSERT(bufsz >= bp->b_bcount);
505 			if (!IS_READ(bp))
506 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
507 				    bp->b_bcount);
508 		}
509 		/*FALLTHRU*/
510 
511 	case VREQ_DMAMEM_ALLOCED:
512 		/*
513 		 * dma bind
514 		 */
515 		if (ALIGNED_XFER(bp)) {
516 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
517 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
518 			    &dc, &ndcs);
519 		} else {
520 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
521 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
522 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
523 		}
524 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
525 			/* get num of dma windows */
526 			if (rc == DDI_DMA_PARTIAL_MAP) {
527 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
528 				ASSERT(rc == DDI_SUCCESS);
529 			} else {
530 				ndws = 1;
531 			}
532 		} else {
533 			SETDMACBON(vdp);
534 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
535 			    vdp->xdf_addr));
536 			return (DDI_FAILURE);
537 		}
538 
539 		vreq->v_dmac = dc;
540 		vreq->v_dmaw = 0;
541 		vreq->v_ndmacs = ndcs;
542 		vreq->v_ndmaws = ndws;
543 		vreq->v_nslots = ndws;
544 		vreq->v_status = VREQ_DMABUF_BOUND;
545 		/*FALLTHRU*/
546 
547 	case VREQ_DMABUF_BOUND:
548 		/*
549 		 * get ge_slot, callback is set upon failure from gs_get(),
550 		 * if not set previously
551 		 */
552 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
553 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
554 			    vdp->xdf_addr));
555 			return (DDI_FAILURE);
556 		}
557 
558 		vreq->v_status = VREQ_GS_ALLOCED;
559 		gs->gs_vreq = vreq;
560 		list_insert_head(&vreq->v_gs, gs);
561 		break;
562 
563 	case VREQ_GS_ALLOCED:
564 		/* nothing need to be done */
565 		break;
566 
567 	case VREQ_DMAWIN_DONE:
568 		/*
569 		 * move to the next dma window
570 		 */
571 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
572 
573 		/* get a ge_slot for this DMA window */
574 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
575 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
576 			    vdp->xdf_addr));
577 			return (DDI_FAILURE);
578 		}
579 
580 		vreq->v_dmaw++;
581 		VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
582 		    &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
583 		vreq->v_status = VREQ_GS_ALLOCED;
584 		gs->gs_vreq = vreq;
585 		list_insert_head(&vreq->v_gs, gs);
586 		break;
587 
588 	default:
589 		return (DDI_FAILURE);
590 	}
591 
592 	return (DDI_SUCCESS);
593 }
594 
595 static int
596 xdf_cmlb_attach(xdf_t *vdp)
597 {
598 	dev_info_t	*dip = vdp->xdf_dip;
599 
600 	return (cmlb_attach(dip, &xdf_lb_ops,
601 	    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
602 	    XD_IS_RM(vdp), B_TRUE,
603 	    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
604 	    0, vdp->xdf_vd_lbl, NULL));
605 }
606 
607 static void
608 xdf_io_err(buf_t *bp, int err, size_t resid)
609 {
610 	bioerror(bp, err);
611 	if (resid == 0)
612 		bp->b_resid = bp->b_bcount;
613 	biodone(bp);
614 }
615 
616 static void
617 xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
618 {
619 	v_req_t *vreq = BP_VREQ(bp);
620 
621 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
622 
623 	if (vdp->xdf_xdev_iostat == NULL)
624 		return;
625 	if ((vreq != NULL) && vreq->v_runq) {
626 		kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
627 	} else {
628 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
629 	}
630 }
631 
632 static void
633 xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
634 {
635 	v_req_t *vreq = BP_VREQ(bp);
636 
637 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
638 
639 	if (vdp->xdf_xdev_iostat == NULL)
640 		return;
641 
642 	if ((vreq != NULL) && vreq->v_runq) {
643 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
644 	} else {
645 		kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
646 	}
647 
648 	if (bp->b_flags & B_READ) {
649 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->reads++;
650 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nread += bp->b_bcount;
651 	} else if (bp->b_flags & B_WRITE) {
652 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->writes++;
653 		KSTAT_IO_PTR(vdp->xdf_xdev_iostat)->nwritten += bp->b_bcount;
654 	}
655 }
656 
657 static void
658 xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
659 {
660 	v_req_t *vreq = BP_VREQ(bp);
661 
662 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
663 	ASSERT(!vreq->v_runq);
664 
665 	vreq->v_runq = B_TRUE;
666 	if (vdp->xdf_xdev_iostat == NULL)
667 		return;
668 	kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
669 }
670 
671 static void
672 xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
673 {
674 	v_req_t *vreq = BP_VREQ(bp);
675 
676 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
677 	ASSERT(vreq->v_runq);
678 
679 	vreq->v_runq = B_FALSE;
680 	if (vdp->xdf_xdev_iostat == NULL)
681 		return;
682 	kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
683 }
684 
685 int
686 xdf_kstat_create(dev_info_t *dip)
687 {
688 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
689 	kstat_t		*kstat;
690 	buf_t		*bp;
691 
692 	if ((kstat = kstat_create("xdf", ddi_get_instance(dip), NULL, "disk",
693 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
694 		return (-1);
695 
696 	/* See comment about locking in xdf_kstat_delete(). */
697 	mutex_enter(&vdp->xdf_iostat_lk);
698 	mutex_enter(&vdp->xdf_dev_lk);
699 
700 	/* only one kstat can exist at a time */
701 	if (vdp->xdf_xdev_iostat != NULL) {
702 		mutex_exit(&vdp->xdf_dev_lk);
703 		mutex_exit(&vdp->xdf_iostat_lk);
704 		kstat_delete(kstat);
705 		return (-1);
706 	}
707 
708 	vdp->xdf_xdev_iostat = kstat;
709 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
710 	kstat_install(vdp->xdf_xdev_iostat);
711 
712 	/*
713 	 * Now that we've created a kstat, we need to update the waitq and
714 	 * runq counts for the kstat to reflect our current state.
715 	 *
716 	 * For a buf_t structure to be on the runq, it must have a ring
717 	 * buffer slot associated with it.  To get a ring buffer slot the
718 	 * buf must first have a v_req_t and a ge_slot_t associated with it.
719 	 * Then when it is granted a ring buffer slot, v_runq will be set to
720 	 * true.
721 	 *
722 	 * For a buf_t structure to be on the waitq, it must not be on the
723 	 * runq.  So to find all the buf_t's that should be on waitq, we
724 	 * walk the active buf list and add any buf_t's which aren't on the
725 	 * runq to the waitq.
726 	 */
727 	bp = vdp->xdf_f_act;
728 	while (bp != NULL) {
729 		xdf_kstat_enter(vdp, bp);
730 		bp = bp->av_forw;
731 	}
732 	if (vdp->xdf_ready_tq_bp != NULL)
733 		xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
734 
735 	mutex_exit(&vdp->xdf_dev_lk);
736 	mutex_exit(&vdp->xdf_iostat_lk);
737 	return (0);
738 }
739 
740 void
741 xdf_kstat_delete(dev_info_t *dip)
742 {
743 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
744 	kstat_t		*kstat;
745 	buf_t		*bp;
746 
747 	/*
748 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
749 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
750 	 * and the contents of the our kstat.  xdf_iostat_lk is used
751 	 * to protect the allocation and freeing of the actual kstat.
752 	 * xdf_dev_lk can't be used for this purpose because kstat
753 	 * readers use it to access the contents of the kstat and
754 	 * hence it can't be held when calling kstat_delete().
755 	 */
756 	mutex_enter(&vdp->xdf_iostat_lk);
757 	mutex_enter(&vdp->xdf_dev_lk);
758 
759 	if (vdp->xdf_xdev_iostat == NULL) {
760 		mutex_exit(&vdp->xdf_dev_lk);
761 		mutex_exit(&vdp->xdf_iostat_lk);
762 		return;
763 	}
764 
765 	/*
766 	 * We're about to destroy the kstat structures, so it isn't really
767 	 * necessary to update the runq and waitq counts.  But, since this
768 	 * isn't a hot code path we can afford to be a little pedantic and
769 	 * go ahead and decrement the runq and waitq kstat counters to zero
770 	 * before free'ing them.  This helps us ensure that we've gotten all
771 	 * our accounting correct.
772 	 *
773 	 * For an explanation of how we determine which buffers go on the
774 	 * runq vs which go on the waitq, see the comments in
775 	 * xdf_kstat_create().
776 	 */
777 	bp = vdp->xdf_f_act;
778 	while (bp != NULL) {
779 		xdf_kstat_exit(vdp, bp);
780 		bp = bp->av_forw;
781 	}
782 	if (vdp->xdf_ready_tq_bp != NULL)
783 		xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
784 
785 	kstat = vdp->xdf_xdev_iostat;
786 	vdp->xdf_xdev_iostat = NULL;
787 	mutex_exit(&vdp->xdf_dev_lk);
788 	kstat_delete(kstat);
789 	mutex_exit(&vdp->xdf_iostat_lk);
790 }
791 
792 /*
793  * Add an IO requests onto the active queue.
794  *
795  * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
796  * are used to establish a connection to the backend, so they receive
797  * priority over all other IOs.  Since xdf_ready_tq_thread only does
798  * synchronous IO, there can only be one xdf_ready_tq_thread request at any
799  * given time and we record the buf associated with that request in
800  * xdf_ready_tq_bp.
801  */
802 static void
803 xdf_bp_push(xdf_t *vdp, buf_t *bp)
804 {
805 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
806 	ASSERT(bp->av_forw == NULL);
807 
808 	xdf_kstat_enter(vdp, bp);
809 
810 	if (curthread == vdp->xdf_ready_tq_thread) {
811 		/* new IO requests from the ready thread */
812 		ASSERT(vdp->xdf_ready_tq_bp == NULL);
813 		vdp->xdf_ready_tq_bp = bp;
814 		return;
815 	}
816 
817 	/* this is normal IO request */
818 	ASSERT(bp != vdp->xdf_ready_tq_bp);
819 
820 	if (vdp->xdf_f_act == NULL) {
821 		/* this is only only IO on the active queue */
822 		ASSERT(vdp->xdf_l_act == NULL);
823 		ASSERT(vdp->xdf_i_act == NULL);
824 		vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
825 		return;
826 	}
827 
828 	/* add this IO to the tail of the active queue */
829 	vdp->xdf_l_act->av_forw = bp;
830 	vdp->xdf_l_act = bp;
831 	if (vdp->xdf_i_act == NULL)
832 		vdp->xdf_i_act = bp;
833 }
834 
835 static void
836 xdf_bp_pop(xdf_t *vdp, buf_t *bp)
837 {
838 	buf_t	*bp_iter;
839 
840 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
841 	ASSERT(VREQ_DONE(BP_VREQ(bp)));
842 
843 	if (vdp->xdf_ready_tq_bp == bp) {
844 		/* we're done with a ready thread IO request */
845 		ASSERT(bp->av_forw == NULL);
846 		vdp->xdf_ready_tq_bp = NULL;
847 		return;
848 	}
849 
850 	/* we're done with a normal IO request */
851 	ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
852 	ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
853 	ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
854 	ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
855 
856 	if (bp == vdp->xdf_f_act) {
857 		/* This IO was at the head of our active queue. */
858 		vdp->xdf_f_act = bp->av_forw;
859 		if (bp == vdp->xdf_l_act)
860 			vdp->xdf_l_act = NULL;
861 	} else {
862 		/* There IO finished before some other pending IOs. */
863 		bp_iter = vdp->xdf_f_act;
864 		while (bp != bp_iter->av_forw) {
865 			bp_iter = bp_iter->av_forw;
866 			ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
867 			ASSERT(bp_iter != vdp->xdf_i_act);
868 		}
869 		bp_iter->av_forw = bp->av_forw;
870 		if (bp == vdp->xdf_l_act)
871 			vdp->xdf_l_act = bp_iter;
872 	}
873 	bp->av_forw = NULL;
874 }
875 
876 static buf_t *
877 xdf_bp_next(xdf_t *vdp)
878 {
879 	v_req_t	*vreq;
880 	buf_t	*bp;
881 
882 	if (vdp->xdf_state == XD_CONNECTED) {
883 		/*
884 		 * If we're in the XD_CONNECTED state, we only service IOs
885 		 * from the xdf_ready_tq_thread thread.
886 		 */
887 		if ((bp = vdp->xdf_ready_tq_bp) == NULL)
888 			return (NULL);
889 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
890 			return (bp);
891 		return (NULL);
892 	}
893 
894 	/* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
895 	if (vdp->xdf_state != XD_READY)
896 		return (NULL);
897 
898 	ASSERT(vdp->xdf_ready_tq_bp == NULL);
899 	for (;;) {
900 		if ((bp = vdp->xdf_i_act) == NULL)
901 			return (NULL);
902 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
903 			return (bp);
904 
905 		/* advance the active buf index pointer */
906 		vdp->xdf_i_act = bp->av_forw;
907 	}
908 }
909 
910 static void
911 xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
912 {
913 	ge_slot_t	*gs = (ge_slot_t *)(uintptr_t)id;
914 	v_req_t		*vreq = gs->gs_vreq;
915 	buf_t		*bp = vreq->v_buf;
916 
917 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
918 	ASSERT(BP_VREQ(bp) == vreq);
919 
920 	gs_free(gs);
921 
922 	if (bioerr != 0)
923 		bioerror(bp, bioerr);
924 	ASSERT(vreq->v_nslots > 0);
925 	if (--vreq->v_nslots > 0)
926 		return;
927 
928 	/* remove this IO from our active queue */
929 	xdf_bp_pop(vdp, bp);
930 
931 	ASSERT(vreq->v_runq);
932 	xdf_kstat_exit(vdp, bp);
933 	vreq->v_runq = B_FALSE;
934 	vreq_free(vdp, vreq);
935 
936 	if (IS_ERROR(bp)) {
937 		xdf_io_err(bp, geterror(bp), 0);
938 	} else if (bp->b_resid != 0) {
939 		/* Partial transfers are an error */
940 		xdf_io_err(bp, EIO, bp->b_resid);
941 	} else {
942 		biodone(bp);
943 	}
944 }
945 
946 /*
947  * xdf interrupt handler
948  */
949 static uint_t
950 xdf_intr_locked(xdf_t *vdp)
951 {
952 	xendev_ring_t *xbr;
953 	blkif_response_t *resp;
954 	int bioerr;
955 	uint64_t id;
956 	uint8_t op;
957 	uint16_t status;
958 	ddi_acc_handle_t acchdl;
959 
960 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
961 
962 	if ((xbr = vdp->xdf_xb_ring) == NULL)
963 		return (DDI_INTR_UNCLAIMED);
964 
965 	acchdl = vdp->xdf_xb_ring_hdl;
966 
967 	/*
968 	 * complete all requests which have a response
969 	 */
970 	while (resp = xvdi_ring_get_response(xbr)) {
971 		id = ddi_get64(acchdl, &resp->id);
972 		op = ddi_get8(acchdl, &resp->operation);
973 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
974 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
975 		    op, id, status));
976 
977 		if (status != BLKIF_RSP_OKAY) {
978 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
979 			    vdp->xdf_addr,
980 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
981 			bioerr = EIO;
982 		} else {
983 			bioerr = 0;
984 		}
985 
986 		xdf_io_fini(vdp, id, bioerr);
987 	}
988 	return (DDI_INTR_CLAIMED);
989 }
990 
991 /*
992  * xdf_intr runs at PIL 5, so no one else can grab xdf_dev_lk and
993  * block at a lower pil.
994  */
995 static uint_t
996 xdf_intr(caddr_t arg)
997 {
998 	xdf_t *vdp = (xdf_t *)arg;
999 	int rv;
1000 
1001 	mutex_enter(&vdp->xdf_dev_lk);
1002 	rv = xdf_intr_locked(vdp);
1003 	mutex_exit(&vdp->xdf_dev_lk);
1004 
1005 	if (!do_polled_io)
1006 		xdf_io_start(vdp);
1007 
1008 	return (rv);
1009 }
1010 
1011 static void
1012 xdf_ring_push(xdf_t *vdp)
1013 {
1014 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1015 
1016 	if (vdp->xdf_xb_ring == NULL)
1017 		return;
1018 
1019 	if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
1020 		DPRINTF(IO_DBG, (
1021 		    "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
1022 		    vdp->xdf_addr));
1023 	}
1024 
1025 	if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
1026 		xvdi_notify_oe(vdp->xdf_dip);
1027 }
1028 
1029 static int
1030 xdf_ring_drain_locked(xdf_t *vdp)
1031 {
1032 	int		pollc, rv = 0;
1033 
1034 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1035 
1036 	if (xdf_debug & SUSRES_DBG)
1037 		xen_printf("xdf_ring_drain: start\n");
1038 
1039 	for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
1040 		if (vdp->xdf_xb_ring == NULL)
1041 			goto out;
1042 
1043 		if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1044 			(void) xdf_intr_locked(vdp);
1045 		if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
1046 			goto out;
1047 		xdf_ring_push(vdp);
1048 
1049 		/* file-backed devices can be slow */
1050 		mutex_exit(&vdp->xdf_dev_lk);
1051 #ifdef XPV_HVM_DRIVER
1052 		(void) HYPERVISOR_yield();
1053 #endif /* XPV_HVM_DRIVER */
1054 		delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
1055 		mutex_enter(&vdp->xdf_dev_lk);
1056 	}
1057 	cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
1058 
1059 out:
1060 	if (vdp->xdf_xb_ring != NULL) {
1061 		if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
1062 		    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
1063 			rv = EIO;
1064 	}
1065 	if (xdf_debug & SUSRES_DBG)
1066 		xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
1067 		    vdp->xdf_addr, rv);
1068 	return (rv);
1069 }
1070 
1071 static int
1072 xdf_ring_drain(xdf_t *vdp)
1073 {
1074 	int rv;
1075 	mutex_enter(&vdp->xdf_dev_lk);
1076 	rv = xdf_ring_drain_locked(vdp);
1077 	mutex_exit(&vdp->xdf_dev_lk);
1078 	return (rv);
1079 }
1080 
1081 /*
1082  * Destroy all v_req_t, grant table entries, and our ring buffer.
1083  */
1084 static void
1085 xdf_ring_destroy(xdf_t *vdp)
1086 {
1087 	v_req_t		*vreq;
1088 	buf_t		*bp;
1089 	ge_slot_t	*gs;
1090 
1091 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1092 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1093 
1094 	if ((vdp->xdf_state != XD_INIT) &&
1095 	    (vdp->xdf_state != XD_CONNECTED) &&
1096 	    (vdp->xdf_state != XD_READY)) {
1097 		ASSERT(vdp->xdf_xb_ring == NULL);
1098 		ASSERT(vdp->xdf_xb_ring_hdl == NULL);
1099 		ASSERT(vdp->xdf_peer == INVALID_DOMID);
1100 		ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
1101 		ASSERT(list_is_empty(&vdp->xdf_vreq_act));
1102 		return;
1103 	}
1104 
1105 	/*
1106 	 * We don't want to receive async notifications from the backend
1107 	 * when it finishes processing ring entries.
1108 	 */
1109 #ifdef XPV_HVM_DRIVER
1110 	ec_unbind_evtchn(vdp->xdf_evtchn);
1111 #else /* !XPV_HVM_DRIVER */
1112 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1113 #endif /* !XPV_HVM_DRIVER */
1114 
1115 	/*
1116 	 * Drain any requests in the ring.  We need to do this before we
1117 	 * can free grant table entries, because if active ring entries
1118 	 * point to grants, then the backend could be trying to access
1119 	 * those grants.
1120 	 */
1121 	(void) xdf_ring_drain_locked(vdp);
1122 
1123 	/* We're done talking to the backend so free up our event channel */
1124 	xvdi_free_evtchn(vdp->xdf_dip);
1125 	vdp->xdf_evtchn = INVALID_EVTCHN;
1126 
1127 	while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
1128 		bp = vreq->v_buf;
1129 		ASSERT(BP_VREQ(bp) == vreq);
1130 
1131 		/* Free up any grant table entries associaed with this IO */
1132 		while ((gs = list_head(&vreq->v_gs)) != NULL)
1133 			gs_free(gs);
1134 
1135 		/* If this IO was on the runq, move it back to the waitq. */
1136 		if (vreq->v_runq)
1137 			xdf_kstat_runq_to_waitq(vdp, bp);
1138 
1139 		/*
1140 		 * Reset any buf IO state since we're going to re-issue the
1141 		 * IO when we reconnect.
1142 		 */
1143 		vreq_free(vdp, vreq);
1144 		BP_VREQ_SET(bp, NULL);
1145 		bioerror(bp, 0);
1146 	}
1147 
1148 	/* reset the active queue index pointer */
1149 	vdp->xdf_i_act = vdp->xdf_f_act;
1150 
1151 	/* Destroy the ring */
1152 	xvdi_free_ring(vdp->xdf_xb_ring);
1153 	vdp->xdf_xb_ring = NULL;
1154 	vdp->xdf_xb_ring_hdl = NULL;
1155 	vdp->xdf_peer = INVALID_DOMID;
1156 }
1157 
1158 void
1159 xdfmin(struct buf *bp)
1160 {
1161 	if (bp->b_bcount > xdf_maxphys)
1162 		bp->b_bcount = xdf_maxphys;
1163 }
1164 
1165 /*
1166  * Check if we have a pending "eject" media request.
1167  */
1168 static int
1169 xdf_eject_pending(xdf_t *vdp)
1170 {
1171 	dev_info_t	*dip = vdp->xdf_dip;
1172 	char		*xsname, *str;
1173 
1174 	if (!vdp->xdf_media_req_supported)
1175 		return (B_FALSE);
1176 
1177 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1178 	    (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
1179 		return (B_FALSE);
1180 
1181 	if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
1182 		strfree(str);
1183 		return (B_FALSE);
1184 	}
1185 	strfree(str);
1186 	return (B_TRUE);
1187 }
1188 
1189 /*
1190  * Generate a media request.
1191  */
1192 static int
1193 xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
1194 {
1195 	dev_info_t	*dip = vdp->xdf_dip;
1196 	char		*xsname;
1197 
1198 	/*
1199 	 * we can't be holding xdf_dev_lk because xenbus_printf() can
1200 	 * block while waiting for a PIL 1 interrupt message.  this
1201 	 * would cause a deadlock with xdf_intr() which needs to grab
1202 	 * xdf_dev_lk as well and runs at PIL 5.
1203 	 */
1204 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1205 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1206 
1207 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1208 		return (ENXIO);
1209 
1210 	/* Check if we support media requests */
1211 	if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
1212 		return (ENOTTY);
1213 
1214 	/* If an eject is pending then don't allow any new requests */
1215 	if (xdf_eject_pending(vdp))
1216 		return (ENXIO);
1217 
1218 	/* Make sure that there is media present */
1219 	if (media_required && (vdp->xdf_xdev_nblocks == 0))
1220 		return (ENXIO);
1221 
1222 	/* We only allow operations when the device is ready and connected */
1223 	if (vdp->xdf_state != XD_READY)
1224 		return (EIO);
1225 
1226 	if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
1227 		return (EIO);
1228 
1229 	return (0);
1230 }
1231 
1232 /*
1233  * populate a single blkif_request_t w/ a buf
1234  */
1235 static void
1236 xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1237 {
1238 	grant_ref_t	gr;
1239 	uint8_t		fsect, lsect;
1240 	size_t		bcnt;
1241 	paddr_t		dma_addr;
1242 	off_t		blk_off;
1243 	dev_info_t	*dip = vdp->xdf_dip;
1244 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1245 	v_req_t		*vreq = BP_VREQ(bp);
1246 	uint64_t	blkno = vreq->v_blkno;
1247 	uint_t		ndmacs = vreq->v_ndmacs;
1248 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1249 	int		seg = 0;
1250 	int		isread = IS_READ(bp);
1251 	ge_slot_t	*gs = list_head(&vreq->v_gs);
1252 
1253 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1254 	ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
1255 
1256 	if (isread)
1257 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1258 	else {
1259 		switch (vreq->v_flush_diskcache) {
1260 		case FLUSH_DISKCACHE:
1261 			ddi_put8(acchdl, &rreq->operation,
1262 			    BLKIF_OP_FLUSH_DISKCACHE);
1263 			ddi_put16(acchdl, &rreq->handle, vdev);
1264 			ddi_put64(acchdl, &rreq->id,
1265 			    (uint64_t)(uintptr_t)(gs));
1266 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1267 			vreq->v_status = VREQ_DMAWIN_DONE;
1268 			return;
1269 		case WRITE_BARRIER:
1270 			ddi_put8(acchdl, &rreq->operation,
1271 			    BLKIF_OP_WRITE_BARRIER);
1272 			break;
1273 		default:
1274 			if (!vdp->xdf_wce)
1275 				ddi_put8(acchdl, &rreq->operation,
1276 				    BLKIF_OP_WRITE_BARRIER);
1277 			else
1278 				ddi_put8(acchdl, &rreq->operation,
1279 				    BLKIF_OP_WRITE);
1280 			break;
1281 		}
1282 	}
1283 
1284 	ddi_put16(acchdl, &rreq->handle, vdev);
1285 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1286 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
1287 
1288 	/*
1289 	 * loop until all segments are populated or no more dma cookie in buf
1290 	 */
1291 	for (;;) {
1292 		/*
1293 		 * Each segment of a blkif request can transfer up to
1294 		 * one 4K page of data.
1295 		 */
1296 		bcnt = vreq->v_dmac.dmac_size;
1297 		dma_addr = vreq->v_dmac.dmac_laddress;
1298 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1299 		fsect = blk_off >> XB_BSHIFT;
1300 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1301 
1302 		ASSERT(bcnt <= PAGESIZE);
1303 		ASSERT((bcnt % XB_BSIZE) == 0);
1304 		ASSERT((blk_off & XB_BMASK) == 0);
1305 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1306 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1307 
1308 		gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
1309 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1310 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1311 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1312 
1313 		DPRINTF(IO_DBG, (
1314 		    "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
1315 		    vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
1316 		DPRINTF(IO_DBG, (
1317 		    "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
1318 		    vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
1319 
1320 		blkno += (bcnt >> XB_BSHIFT);
1321 		seg++;
1322 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1323 		if (--ndmacs) {
1324 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1325 			continue;
1326 		}
1327 
1328 		vreq->v_status = VREQ_DMAWIN_DONE;
1329 		vreq->v_blkno = blkno;
1330 		break;
1331 	}
1332 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1333 	DPRINTF(IO_DBG, (
1334 	    "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
1335 	    vdp->xdf_addr, rreq->id));
1336 }
1337 
1338 static void
1339 xdf_io_start(xdf_t *vdp)
1340 {
1341 	struct buf	*bp;
1342 	v_req_t		*vreq;
1343 	blkif_request_t	*rreq;
1344 	boolean_t	rreqready = B_FALSE;
1345 
1346 	mutex_enter(&vdp->xdf_dev_lk);
1347 
1348 	/*
1349 	 * Populate the ring request(s).  Loop until there is no buf to
1350 	 * transfer or no free slot available in I/O ring.
1351 	 */
1352 	for (;;) {
1353 		/* don't start any new IO if we're suspending */
1354 		if (vdp->xdf_suspending)
1355 			break;
1356 		if ((bp = xdf_bp_next(vdp)) == NULL)
1357 			break;
1358 
1359 		/* if the buf doesn't already have a vreq, allocate one */
1360 		if (((vreq = BP_VREQ(bp)) == NULL) &&
1361 		    ((vreq = vreq_get(vdp, bp)) == NULL))
1362 			break;
1363 
1364 		/* alloc DMA/GTE resources */
1365 		if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
1366 			break;
1367 
1368 		/* get next blkif_request in the ring */
1369 		if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
1370 			break;
1371 		bzero(rreq, sizeof (blkif_request_t));
1372 		rreqready = B_TRUE;
1373 
1374 		/* populate blkif_request with this buf */
1375 		xdf_process_rreq(vdp, bp, rreq);
1376 
1377 		/*
1378 		 * This buffer/vreq pair is has been allocated a ring buffer
1379 		 * resources, so if it isn't already in our runq, add it.
1380 		 */
1381 		if (!vreq->v_runq)
1382 			xdf_kstat_waitq_to_runq(vdp, bp);
1383 	}
1384 
1385 	/* Send the request(s) to the backend */
1386 	if (rreqready)
1387 		xdf_ring_push(vdp);
1388 
1389 	mutex_exit(&vdp->xdf_dev_lk);
1390 }
1391 
1392 
1393 /* check if partition is open, -1 - check all partitions on the disk */
1394 static boolean_t
1395 xdf_isopen(xdf_t *vdp, int partition)
1396 {
1397 	int i;
1398 	ulong_t parbit;
1399 	boolean_t rval = B_FALSE;
1400 
1401 	ASSERT((partition == -1) ||
1402 	    ((partition >= 0) || (partition < XDF_PEXT)));
1403 
1404 	if (partition == -1)
1405 		parbit = (ulong_t)-1;
1406 	else
1407 		parbit = 1 << partition;
1408 
1409 	for (i = 0; i < OTYPCNT; i++) {
1410 		if (vdp->xdf_vd_open[i] & parbit)
1411 			rval = B_TRUE;
1412 	}
1413 
1414 	return (rval);
1415 }
1416 
1417 /*
1418  * The connection should never be closed as long as someone is holding
1419  * us open, there is pending IO, or someone is waiting waiting for a
1420  * connection.
1421  */
1422 static boolean_t
1423 xdf_busy(xdf_t *vdp)
1424 {
1425 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1426 
1427 	if ((vdp->xdf_xb_ring != NULL) &&
1428 	    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
1429 		ASSERT(vdp->xdf_state != XD_CLOSED);
1430 		return (B_TRUE);
1431 	}
1432 
1433 	if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
1434 		ASSERT(vdp->xdf_state != XD_CLOSED);
1435 		return (B_TRUE);
1436 	}
1437 
1438 	if (xdf_isopen(vdp, -1)) {
1439 		ASSERT(vdp->xdf_state != XD_CLOSED);
1440 		return (B_TRUE);
1441 	}
1442 
1443 	if (vdp->xdf_connect_req > 0) {
1444 		ASSERT(vdp->xdf_state != XD_CLOSED);
1445 		return (B_TRUE);
1446 	}
1447 
1448 	return (B_FALSE);
1449 }
1450 
1451 static void
1452 xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
1453 {
1454 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1455 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1456 	DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
1457 	    vdp->xdf_addr, vdp->xdf_state, new_state));
1458 	vdp->xdf_state = new_state;
1459 	cv_broadcast(&vdp->xdf_dev_cv);
1460 }
1461 
1462 static void
1463 xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
1464 {
1465 	dev_info_t	*dip = vdp->xdf_dip;
1466 	boolean_t	busy;
1467 
1468 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1469 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1470 	ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
1471 
1472 	/* Check if we're already there. */
1473 	if (vdp->xdf_state == new_state)
1474 		return;
1475 
1476 	mutex_enter(&vdp->xdf_dev_lk);
1477 	busy = xdf_busy(vdp);
1478 
1479 	/* If we're already closed then there's nothing todo. */
1480 	if (vdp->xdf_state == XD_CLOSED) {
1481 		ASSERT(!busy);
1482 		xdf_set_state(vdp, new_state);
1483 		mutex_exit(&vdp->xdf_dev_lk);
1484 		return;
1485 	}
1486 
1487 #ifdef DEBUG
1488 	/* UhOh.  Warn the user that something bad has happened. */
1489 	if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
1490 	    (vdp->xdf_xdev_nblocks != 0)) {
1491 		cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
1492 		    vdp->xdf_addr);
1493 	}
1494 #endif /* DEBUG */
1495 
1496 	xdf_ring_destroy(vdp);
1497 
1498 	/* If we're busy then we can only go into the unknown state */
1499 	xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
1500 	mutex_exit(&vdp->xdf_dev_lk);
1501 
1502 	/* if we're closed now, let the other end know */
1503 	if (vdp->xdf_state == XD_CLOSED)
1504 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1505 }
1506 
1507 
1508 /*
1509  * Kick-off connect process
1510  * Status should be XD_UNKNOWN or XD_CLOSED
1511  * On success, status will be changed to XD_INIT
1512  * On error, it will be changed to XD_UNKNOWN
1513  */
1514 static int
1515 xdf_setstate_init(xdf_t *vdp)
1516 {
1517 	dev_info_t		*dip = vdp->xdf_dip;
1518 	xenbus_transaction_t	xbt;
1519 	grant_ref_t		gref;
1520 	char			*xsname, *str;
1521 	int 			rv;
1522 
1523 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1524 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1525 	ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
1526 	    (vdp->xdf_state == XD_CLOSED));
1527 
1528 	DPRINTF(DDI_DBG,
1529 	    ("xdf@%s: starting connection process\n", vdp->xdf_addr));
1530 
1531 	/*
1532 	 * If an eject is pending then don't allow a new connection.
1533 	 * (Only the backend can clear media request eject request.)
1534 	 */
1535 	if (xdf_eject_pending(vdp))
1536 		return (DDI_FAILURE);
1537 
1538 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
1539 		goto errout;
1540 
1541 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
1542 		goto errout;
1543 
1544 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
1545 
1546 	/*
1547 	 * Sanity check for the existance of the xenbus device-type property.
1548 	 * This property might not exist if our xenbus device nodes were
1549 	 * force destroyed while we were still connected to the backend.
1550 	 */
1551 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
1552 		goto errout;
1553 	strfree(str);
1554 
1555 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
1556 		goto errout;
1557 
1558 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1559 #ifdef XPV_HVM_DRIVER
1560 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1561 #else /* !XPV_HVM_DRIVER */
1562 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1563 	    DDI_SUCCESS) {
1564 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
1565 		    "failed to add intr handler", vdp->xdf_addr);
1566 		goto errout1;
1567 	}
1568 #endif /* !XPV_HVM_DRIVER */
1569 
1570 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1571 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1572 	    DDI_SUCCESS) {
1573 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1574 		    vdp->xdf_addr);
1575 		goto errout2;
1576 	}
1577 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1578 
1579 	/*
1580 	 * Write into xenstore the info needed by backend
1581 	 */
1582 trans_retry:
1583 	if (xenbus_transaction_start(&xbt)) {
1584 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1585 		    vdp->xdf_addr);
1586 		xvdi_fatal_error(dip, EIO, "connect transaction init");
1587 		goto fail_trans;
1588 	}
1589 
1590 	/*
1591 	 * XBP_PROTOCOL is written by the domain builder in the case of PV
1592 	 * domains. However, it is not written for HVM domains, so let's
1593 	 * write it here.
1594 	 */
1595 	if (((rv = xenbus_printf(xbt, xsname,
1596 	    XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
1597 	    ((rv = xenbus_printf(xbt, xsname,
1598 	    XBP_RING_REF, "%u", gref)) != 0) ||
1599 	    ((rv = xenbus_printf(xbt, xsname,
1600 	    XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
1601 	    ((rv = xenbus_printf(xbt, xsname,
1602 	    XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
1603 	    ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
1604 		(void) xenbus_transaction_end(xbt, 1);
1605 		xvdi_fatal_error(dip, rv, "connect transaction setup");
1606 		goto fail_trans;
1607 	}
1608 
1609 	/* kick-off connect process */
1610 	if (rv = xenbus_transaction_end(xbt, 0)) {
1611 		if (rv == EAGAIN)
1612 			goto trans_retry;
1613 		xvdi_fatal_error(dip, rv, "connect transaction commit");
1614 		goto fail_trans;
1615 	}
1616 
1617 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1618 	mutex_enter(&vdp->xdf_dev_lk);
1619 	xdf_set_state(vdp, XD_INIT);
1620 	mutex_exit(&vdp->xdf_dev_lk);
1621 
1622 	return (DDI_SUCCESS);
1623 
1624 fail_trans:
1625 	xvdi_free_ring(vdp->xdf_xb_ring);
1626 errout2:
1627 #ifdef XPV_HVM_DRIVER
1628 	ec_unbind_evtchn(vdp->xdf_evtchn);
1629 #else /* !XPV_HVM_DRIVER */
1630 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1631 #endif /* !XPV_HVM_DRIVER */
1632 errout1:
1633 	xvdi_free_evtchn(dip);
1634 	vdp->xdf_evtchn = INVALID_EVTCHN;
1635 errout:
1636 	xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1637 	cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
1638 	    vdp->xdf_addr);
1639 	return (DDI_FAILURE);
1640 }
1641 
1642 int
1643 xdf_get_flush_block(xdf_t *vdp)
1644 {
1645 	/*
1646 	 * Get a DEV_BSIZE aligned bufer
1647 	 */
1648 	vdp->xdf_flush_mem = kmem_alloc(vdp->xdf_xdev_secsize * 2, KM_SLEEP);
1649 	vdp->xdf_cache_flush_block =
1650 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem),
1651 	    (int)vdp->xdf_xdev_secsize);
1652 
1653 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1654 	    xdf_flush_block, vdp->xdf_xdev_secsize, NULL) != 0)
1655 		return (DDI_FAILURE);
1656 	return (DDI_SUCCESS);
1657 }
1658 
1659 static void
1660 xdf_setstate_ready(void *arg)
1661 {
1662 	xdf_t		*vdp = (xdf_t *)arg;
1663 	dev_info_t	*dip = vdp->xdf_dip;
1664 
1665 	vdp->xdf_ready_tq_thread = curthread;
1666 
1667 	/* Create minor nodes now when we are almost connected */
1668 	mutex_enter(&vdp->xdf_dev_lk);
1669 	if (vdp->xdf_cmlb_reattach) {
1670 		vdp->xdf_cmlb_reattach = B_FALSE;
1671 		mutex_exit(&vdp->xdf_dev_lk);
1672 		if (xdf_cmlb_attach(vdp) != 0) {
1673 			cmn_err(CE_WARN,
1674 			    "xdf@%s: cmlb attach failed",
1675 			    ddi_get_name_addr(dip));
1676 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1677 			return;
1678 		}
1679 		mutex_enter(&vdp->xdf_dev_lk);
1680 	}
1681 
1682 	/* If we're not still trying to get to the ready state, then bail. */
1683 	if (vdp->xdf_state != XD_CONNECTED) {
1684 		mutex_exit(&vdp->xdf_dev_lk);
1685 		return;
1686 	}
1687 	mutex_exit(&vdp->xdf_dev_lk);
1688 
1689 	/*
1690 	 * If backend has feature-barrier, see if it supports disk
1691 	 * cache flush op.
1692 	 */
1693 	vdp->xdf_flush_supported = B_FALSE;
1694 	if (vdp->xdf_feature_barrier) {
1695 		/*
1696 		 * Pretend we already know flush is supported so probe
1697 		 * will attempt the correct op.
1698 		 */
1699 		vdp->xdf_flush_supported = B_TRUE;
1700 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1701 			vdp->xdf_flush_supported = B_TRUE;
1702 		} else {
1703 			vdp->xdf_flush_supported = B_FALSE;
1704 			/*
1705 			 * If the other end does not support the cache flush op
1706 			 * then we must use a barrier-write to force disk
1707 			 * cache flushing.  Barrier writes require that a data
1708 			 * block actually be written.
1709 			 * Cache a block to barrier-write when we are
1710 			 * asked to perform a flush.
1711 			 * XXX - would it be better to just copy 1 block
1712 			 * (512 bytes) from whatever write we did last
1713 			 * and rewrite that block?
1714 			 */
1715 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
1716 				xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1717 				return;
1718 			}
1719 		}
1720 	}
1721 
1722 	mutex_enter(&vdp->xdf_cb_lk);
1723 	mutex_enter(&vdp->xdf_dev_lk);
1724 	if (vdp->xdf_state == XD_CONNECTED)
1725 		xdf_set_state(vdp, XD_READY);
1726 	mutex_exit(&vdp->xdf_dev_lk);
1727 
1728 	/* Restart any currently queued up io */
1729 	xdf_io_start(vdp);
1730 
1731 	mutex_exit(&vdp->xdf_cb_lk);
1732 }
1733 
1734 /*
1735  * synthetic geometry
1736  */
1737 #define	XDF_NSECTS	256
1738 #define	XDF_NHEADS	16
1739 
1740 static void
1741 xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
1742 {
1743 	xdf_t *vdp;
1744 	uint_t ncyl;
1745 
1746 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
1747 
1748 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1749 
1750 	bzero(geomp, sizeof (*geomp));
1751 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1752 	geomp->g_acyl = 0;
1753 	geomp->g_nhead = XDF_NHEADS;
1754 	geomp->g_nsect = XDF_NSECTS;
1755 	geomp->g_secsize = vdp->xdf_xdev_secsize;
1756 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1757 	geomp->g_intrlv = 0;
1758 	geomp->g_rpm = 7200;
1759 }
1760 
1761 /*
1762  * Finish other initialization after we've connected to backend
1763  * Status should be XD_INIT before calling this routine
1764  * On success, status should be changed to XD_CONNECTED.
1765  * On error, status should stay XD_INIT
1766  */
1767 static int
1768 xdf_setstate_connected(xdf_t *vdp)
1769 {
1770 	dev_info_t	*dip = vdp->xdf_dip;
1771 	cmlb_geom_t	pgeom;
1772 	diskaddr_t	nblocks = 0;
1773 	uint_t		secsize = 0;
1774 	char		*oename, *xsname, *str;
1775 	uint_t		dinfo;
1776 
1777 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1778 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
1779 	ASSERT(vdp->xdf_state == XD_INIT);
1780 
1781 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
1782 	    ((oename = xvdi_get_oename(dip)) == NULL))
1783 		return (DDI_FAILURE);
1784 
1785 	/* Make sure the other end is XenbusStateConnected */
1786 	if (xenbus_read_driver_state(oename) != XenbusStateConnected)
1787 		return (DDI_FAILURE);
1788 
1789 	/* Determine if feature barrier is supported by backend */
1790 	if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
1791 		cmn_err(CE_NOTE, "!xdf@%s: feature-barrier not supported",
1792 		    vdp->xdf_addr);
1793 
1794 	/*
1795 	 * Probe backend.  Read the device size into xdf_xdev_nblocks
1796 	 * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
1797 	 * flags in xdf_dinfo.  If the emulated device type is "cdrom",
1798 	 * we always set VDISK_CDROM, regardless of if it's present in
1799 	 * the xenbus info parameter.
1800 	 */
1801 	if (xenbus_gather(XBT_NULL, oename,
1802 	    XBP_SECTORS, "%"SCNu64, &nblocks,
1803 	    XBP_SECTOR_SIZE, "%u", &secsize,
1804 	    XBP_INFO, "%u", &dinfo,
1805 	    NULL) != 0) {
1806 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1807 		    "cannot read backend info", vdp->xdf_addr);
1808 		return (DDI_FAILURE);
1809 	}
1810 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
1811 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
1812 		    vdp->xdf_addr);
1813 		return (DDI_FAILURE);
1814 	}
1815 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
1816 		dinfo |= VDISK_CDROM;
1817 	strfree(str);
1818 
1819 	if (secsize == 0 || !(ISP2(secsize / DEV_BSIZE)))
1820 		secsize = DEV_BSIZE;
1821 	vdp->xdf_xdev_nblocks = nblocks;
1822 	vdp->xdf_xdev_secsize = secsize;
1823 #ifdef _ILP32
1824 	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1825 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
1826 		    "backend disk device too large with %llu blocks for"
1827 		    " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
1828 		xvdi_fatal_error(dip, EFBIG, "reading backend info");
1829 		return (DDI_FAILURE);
1830 	}
1831 #endif
1832 
1833 	/*
1834 	 * If the physical geometry for a fixed disk has been explicity
1835 	 * set then make sure that the specified physical geometry isn't
1836 	 * larger than the device we connected to.
1837 	 */
1838 	if (vdp->xdf_pgeom_fixed &&
1839 	    (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
1840 		cmn_err(CE_WARN,
1841 		    "xdf@%s: connect failed, fixed geometry too large",
1842 		    vdp->xdf_addr);
1843 		return (DDI_FAILURE);
1844 	}
1845 
1846 	vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
1847 
1848 	/* mark vbd is ready for I/O */
1849 	mutex_enter(&vdp->xdf_dev_lk);
1850 	xdf_set_state(vdp, XD_CONNECTED);
1851 
1852 	/* check if the cmlb label should be updated */
1853 	xdf_synthetic_pgeom(dip, &pgeom);
1854 	if ((vdp->xdf_dinfo != dinfo) ||
1855 	    (!vdp->xdf_pgeom_fixed &&
1856 	    (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
1857 		vdp->xdf_cmlb_reattach = B_TRUE;
1858 
1859 		vdp->xdf_dinfo = dinfo;
1860 		if (!vdp->xdf_pgeom_fixed)
1861 			vdp->xdf_pgeom = pgeom;
1862 	}
1863 
1864 	if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
1865 		if (vdp->xdf_xdev_nblocks == 0) {
1866 			vdp->xdf_mstate = DKIO_EJECTED;
1867 			cv_broadcast(&vdp->xdf_mstate_cv);
1868 		} else {
1869 			vdp->xdf_mstate = DKIO_INSERTED;
1870 			cv_broadcast(&vdp->xdf_mstate_cv);
1871 		}
1872 	} else {
1873 		if (vdp->xdf_mstate != DKIO_NONE) {
1874 			vdp->xdf_mstate = DKIO_NONE;
1875 			cv_broadcast(&vdp->xdf_mstate_cv);
1876 		}
1877 	}
1878 
1879 	mutex_exit(&vdp->xdf_dev_lk);
1880 
1881 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
1882 	    (uint64_t)vdp->xdf_xdev_nblocks);
1883 
1884 	/* Restart any currently queued up io */
1885 	xdf_io_start(vdp);
1886 
1887 	/*
1888 	 * To get to the ready state we have to do IO to the backend device,
1889 	 * but we can't initiate IO from the other end change callback thread
1890 	 * (which is the current context we're executing in.)  This is because
1891 	 * if the other end disconnects while we're doing IO from the callback
1892 	 * thread, then we can't receive that disconnect event and we hang
1893 	 * waiting for an IO that can never complete.
1894 	 */
1895 	(void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
1896 	    DDI_SLEEP);
1897 
1898 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1899 	return (DDI_SUCCESS);
1900 }
1901 
1902 /*ARGSUSED*/
1903 static void
1904 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1905 {
1906 	XenbusState new_state = *(XenbusState *)impl_data;
1907 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1908 
1909 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1910 	    vdp->xdf_addr, new_state));
1911 
1912 	mutex_enter(&vdp->xdf_cb_lk);
1913 
1914 	/* We assume that this callback is single threaded */
1915 	ASSERT(vdp->xdf_oe_change_thread == NULL);
1916 	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
1917 
1918 	/* ignore any backend state changes if we're suspending/suspended */
1919 	if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
1920 		DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1921 		mutex_exit(&vdp->xdf_cb_lk);
1922 		return;
1923 	}
1924 
1925 	switch (new_state) {
1926 	case XenbusStateUnknown:
1927 	case XenbusStateInitialising:
1928 	case XenbusStateInitWait:
1929 	case XenbusStateInitialised:
1930 		if (vdp->xdf_state == XD_INIT)
1931 			break;
1932 
1933 		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1934 		if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1935 			break;
1936 		ASSERT(vdp->xdf_state == XD_INIT);
1937 		break;
1938 
1939 	case XenbusStateConnected:
1940 		if ((vdp->xdf_state == XD_CONNECTED) ||
1941 		    (vdp->xdf_state == XD_READY))
1942 			break;
1943 
1944 		if (vdp->xdf_state != XD_INIT) {
1945 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1946 			if (xdf_setstate_init(vdp) != DDI_SUCCESS)
1947 				break;
1948 			ASSERT(vdp->xdf_state == XD_INIT);
1949 		}
1950 
1951 		if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
1952 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
1953 			break;
1954 		}
1955 		ASSERT(vdp->xdf_state == XD_CONNECTED);
1956 		break;
1957 
1958 	case XenbusStateClosing:
1959 		if (xdf_isopen(vdp, -1)) {
1960 			cmn_err(CE_NOTE,
1961 			    "xdf@%s: hot-unplug failed, still in use",
1962 			    vdp->xdf_addr);
1963 			break;
1964 		}
1965 		/*FALLTHROUGH*/
1966 	case XenbusStateClosed:
1967 		xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
1968 		break;
1969 	}
1970 
1971 	/* notify anybody waiting for oe state change */
1972 	cv_broadcast(&vdp->xdf_dev_cv);
1973 	DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
1974 	mutex_exit(&vdp->xdf_cb_lk);
1975 }
1976 
1977 static int
1978 xdf_connect_locked(xdf_t *vdp, boolean_t wait)
1979 {
1980 	int	rv, timeouts = 0, reset = 20;
1981 
1982 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
1983 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
1984 
1985 	/* we can't connect once we're in the closed state */
1986 	if (vdp->xdf_state == XD_CLOSED)
1987 		return (XD_CLOSED);
1988 
1989 	vdp->xdf_connect_req++;
1990 	while (vdp->xdf_state != XD_READY) {
1991 		mutex_exit(&vdp->xdf_dev_lk);
1992 
1993 		/* only one thread at a time can be the connection thread */
1994 		if (vdp->xdf_connect_thread == NULL)
1995 			vdp->xdf_connect_thread = curthread;
1996 
1997 		if (vdp->xdf_connect_thread == curthread) {
1998 			if ((timeouts > 0) && ((timeouts % reset) == 0)) {
1999 				/*
2000 				 * If we haven't establised a connection
2001 				 * within the reset time, then disconnect
2002 				 * so we can try again, and double the reset
2003 				 * time.  The reset time starts at 2 sec.
2004 				 */
2005 				(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2006 				reset *= 2;
2007 			}
2008 			if (vdp->xdf_state == XD_UNKNOWN)
2009 				(void) xdf_setstate_init(vdp);
2010 			if (vdp->xdf_state == XD_INIT)
2011 				(void) xdf_setstate_connected(vdp);
2012 		}
2013 
2014 		mutex_enter(&vdp->xdf_dev_lk);
2015 		if (!wait || (vdp->xdf_state == XD_READY))
2016 			goto out;
2017 
2018 		mutex_exit((&vdp->xdf_cb_lk));
2019 		if (vdp->xdf_connect_thread != curthread) {
2020 			rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
2021 		} else {
2022 			/* delay for 0.1 sec */
2023 			rv = cv_reltimedwait_sig(&vdp->xdf_dev_cv,
2024 			    &vdp->xdf_dev_lk, drv_usectohz(100*1000),
2025 			    TR_CLOCK_TICK);
2026 			if (rv == -1)
2027 				timeouts++;
2028 		}
2029 		mutex_exit((&vdp->xdf_dev_lk));
2030 		mutex_enter((&vdp->xdf_cb_lk));
2031 		mutex_enter((&vdp->xdf_dev_lk));
2032 		if (rv == 0)
2033 			goto out;
2034 	}
2035 
2036 out:
2037 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2038 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
2039 
2040 	if (vdp->xdf_connect_thread == curthread) {
2041 		/*
2042 		 * wake up someone else so they can become the connection
2043 		 * thread.
2044 		 */
2045 		cv_signal(&vdp->xdf_dev_cv);
2046 		vdp->xdf_connect_thread = NULL;
2047 	}
2048 
2049 	/* Try to lock the media */
2050 	mutex_exit((&vdp->xdf_dev_lk));
2051 	(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2052 	mutex_enter((&vdp->xdf_dev_lk));
2053 
2054 	vdp->xdf_connect_req--;
2055 	return (vdp->xdf_state);
2056 }
2057 
2058 static uint_t
2059 xdf_iorestart(caddr_t arg)
2060 {
2061 	xdf_t *vdp = (xdf_t *)arg;
2062 
2063 	ASSERT(vdp != NULL);
2064 
2065 	mutex_enter(&vdp->xdf_dev_lk);
2066 	ASSERT(ISDMACBON(vdp));
2067 	SETDMACBOFF(vdp);
2068 	mutex_exit(&vdp->xdf_dev_lk);
2069 
2070 	xdf_io_start(vdp);
2071 
2072 	return (DDI_INTR_CLAIMED);
2073 }
2074 
2075 #ifdef XPV_HVM_DRIVER
2076 
2077 typedef struct xdf_hvm_entry {
2078 	list_node_t	xdf_he_list;
2079 	char		*xdf_he_path;
2080 	dev_info_t	*xdf_he_dip;
2081 } xdf_hvm_entry_t;
2082 
2083 static list_t xdf_hvm_list;
2084 static kmutex_t xdf_hvm_list_lock;
2085 
2086 static xdf_hvm_entry_t *
2087 i_xdf_hvm_find(const char *path, dev_info_t *dip)
2088 {
2089 	xdf_hvm_entry_t	*i;
2090 
2091 	ASSERT((path != NULL) || (dip != NULL));
2092 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2093 
2094 	i = list_head(&xdf_hvm_list);
2095 	while (i != NULL) {
2096 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2097 			i = list_next(&xdf_hvm_list, i);
2098 			continue;
2099 		}
2100 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2101 			i = list_next(&xdf_hvm_list, i);
2102 			continue;
2103 		}
2104 		break;
2105 	}
2106 	return (i);
2107 }
2108 
2109 dev_info_t *
2110 xdf_hvm_hold(const char *path)
2111 {
2112 	xdf_hvm_entry_t	*i;
2113 	dev_info_t	*dip;
2114 
2115 	mutex_enter(&xdf_hvm_list_lock);
2116 	i = i_xdf_hvm_find(path, NULL);
2117 	if (i == NULL) {
2118 		mutex_exit(&xdf_hvm_list_lock);
2119 		return (B_FALSE);
2120 	}
2121 	ndi_hold_devi(dip = i->xdf_he_dip);
2122 	mutex_exit(&xdf_hvm_list_lock);
2123 	return (dip);
2124 }
2125 
2126 static void
2127 xdf_hvm_add(dev_info_t *dip)
2128 {
2129 	xdf_hvm_entry_t	*i;
2130 	char		*path;
2131 
2132 	/* figure out the path for the dip */
2133 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2134 	(void) ddi_pathname(dip, path);
2135 
2136 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2137 	i->xdf_he_dip = dip;
2138 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2139 
2140 	mutex_enter(&xdf_hvm_list_lock);
2141 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2142 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2143 	list_insert_head(&xdf_hvm_list, i);
2144 	mutex_exit(&xdf_hvm_list_lock);
2145 
2146 	kmem_free(path, MAXPATHLEN);
2147 }
2148 
2149 static void
2150 xdf_hvm_rm(dev_info_t *dip)
2151 {
2152 	xdf_hvm_entry_t	*i;
2153 
2154 	mutex_enter(&xdf_hvm_list_lock);
2155 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2156 	list_remove(&xdf_hvm_list, i);
2157 	mutex_exit(&xdf_hvm_list_lock);
2158 
2159 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2160 	kmem_free(i, sizeof (*i));
2161 }
2162 
2163 static void
2164 xdf_hvm_init(void)
2165 {
2166 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2167 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2168 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2169 }
2170 
2171 static void
2172 xdf_hvm_fini(void)
2173 {
2174 	ASSERT(list_head(&xdf_hvm_list) == NULL);
2175 	list_destroy(&xdf_hvm_list);
2176 	mutex_destroy(&xdf_hvm_list_lock);
2177 }
2178 
2179 boolean_t
2180 xdf_hvm_connect(dev_info_t *dip)
2181 {
2182 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2183 	char	*oename, *str;
2184 	int	rv;
2185 
2186 	mutex_enter(&vdp->xdf_cb_lk);
2187 
2188 	/*
2189 	 * Before try to establish a connection we need to wait for the
2190 	 * backend hotplug scripts to have run.  Once they are run the
2191 	 * "<oename>/hotplug-status" property will be set to "connected".
2192 	 */
2193 	for (;;) {
2194 		ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2195 
2196 		/*
2197 		 * Get the xenbus path to the backend device.  Note that
2198 		 * we can't cache this path (and we look it up on each pass
2199 		 * through this loop) because it could change during
2200 		 * suspend, resume, and migration operations.
2201 		 */
2202 		if ((oename = xvdi_get_oename(dip)) == NULL) {
2203 			mutex_exit(&vdp->xdf_cb_lk);
2204 			return (B_FALSE);
2205 		}
2206 
2207 		str = NULL;
2208 		if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
2209 		    (strcmp(str, XBV_HP_STATUS_CONN) == 0))
2210 			break;
2211 
2212 		if (str != NULL)
2213 			strfree(str);
2214 
2215 		/* wait for an update to "<oename>/hotplug-status" */
2216 		if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
2217 			/* we got interrupted by a signal */
2218 			mutex_exit(&vdp->xdf_cb_lk);
2219 			return (B_FALSE);
2220 		}
2221 	}
2222 
2223 	/* Good news.  The backend hotplug scripts have been run. */
2224 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
2225 	ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
2226 	strfree(str);
2227 
2228 	/*
2229 	 * If we're emulating a cd device and if the backend doesn't support
2230 	 * media request opreations, then we're not going to bother trying
2231 	 * to establish a connection for a couple reasons.  First off, media
2232 	 * requests support is required to support operations like eject and
2233 	 * media locking.  Second, other backend platforms like Linux don't
2234 	 * support hvm pv cdrom access.  They don't even have a backend pv
2235 	 * driver for cdrom device nodes, so we don't want to block forever
2236 	 * waiting for a connection to a backend driver that doesn't exist.
2237 	 */
2238 	if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
2239 		mutex_exit(&vdp->xdf_cb_lk);
2240 		return (B_FALSE);
2241 	}
2242 
2243 	mutex_enter(&vdp->xdf_dev_lk);
2244 	rv = xdf_connect_locked(vdp, B_TRUE);
2245 	mutex_exit(&vdp->xdf_dev_lk);
2246 	mutex_exit(&vdp->xdf_cb_lk);
2247 
2248 	return ((rv == XD_READY) ? B_TRUE : B_FALSE);
2249 }
2250 
2251 int
2252 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2253 {
2254 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2255 
2256 	/* sanity check the requested physical geometry */
2257 	mutex_enter(&vdp->xdf_dev_lk);
2258 	if ((geomp->g_secsize != XB_BSIZE) ||
2259 	    (geomp->g_capacity == 0)) {
2260 		mutex_exit(&vdp->xdf_dev_lk);
2261 		return (EINVAL);
2262 	}
2263 
2264 	/*
2265 	 * If we've already connected to the backend device then make sure
2266 	 * we're not defining a physical geometry larger than our backend
2267 	 * device.
2268 	 */
2269 	if ((vdp->xdf_xdev_nblocks != 0) &&
2270 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2271 		mutex_exit(&vdp->xdf_dev_lk);
2272 		return (EINVAL);
2273 	}
2274 
2275 	bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
2276 	vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
2277 	vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
2278 	vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
2279 	vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
2280 	vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
2281 	vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
2282 	vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
2283 	vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
2284 
2285 	vdp->xdf_pgeom_fixed = B_TRUE;
2286 	mutex_exit(&vdp->xdf_dev_lk);
2287 
2288 	/* force a re-validation */
2289 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2290 
2291 	return (0);
2292 }
2293 
2294 boolean_t
2295 xdf_is_cd(dev_info_t *dip)
2296 {
2297 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2298 	boolean_t	rv;
2299 
2300 	mutex_enter(&vdp->xdf_cb_lk);
2301 	rv = XD_IS_CD(vdp);
2302 	mutex_exit(&vdp->xdf_cb_lk);
2303 	return (rv);
2304 }
2305 
2306 boolean_t
2307 xdf_is_rm(dev_info_t *dip)
2308 {
2309 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2310 	boolean_t	rv;
2311 
2312 	mutex_enter(&vdp->xdf_cb_lk);
2313 	rv = XD_IS_RM(vdp);
2314 	mutex_exit(&vdp->xdf_cb_lk);
2315 	return (rv);
2316 }
2317 
2318 boolean_t
2319 xdf_media_req_supported(dev_info_t *dip)
2320 {
2321 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
2322 	boolean_t	rv;
2323 
2324 	mutex_enter(&vdp->xdf_cb_lk);
2325 	rv = vdp->xdf_media_req_supported;
2326 	mutex_exit(&vdp->xdf_cb_lk);
2327 	return (rv);
2328 }
2329 
2330 #endif /* XPV_HVM_DRIVER */
2331 
2332 static int
2333 xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
2334 {
2335 	xdf_t *vdp;
2336 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2337 
2338 	if (vdp == NULL)
2339 		return (ENXIO);
2340 
2341 	mutex_enter(&vdp->xdf_dev_lk);
2342 	*capp = vdp->xdf_pgeom.g_capacity;
2343 	DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
2344 	mutex_exit(&vdp->xdf_dev_lk);
2345 	return (0);
2346 }
2347 
2348 static int
2349 xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2350 {
2351 	xdf_t *vdp;
2352 
2353 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
2354 		return (ENXIO);
2355 	*geomp = vdp->xdf_pgeom;
2356 	return (0);
2357 }
2358 
2359 /*
2360  * No real HBA, no geometry available from it
2361  */
2362 /*ARGSUSED*/
2363 static int
2364 xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2365 {
2366 	return (EINVAL);
2367 }
2368 
2369 static int
2370 xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
2371 {
2372 	xdf_t *vdp;
2373 
2374 	if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
2375 		return (ENXIO);
2376 
2377 	if (XD_IS_RO(vdp))
2378 		tgattributep->media_is_writable = 0;
2379 	else
2380 		tgattributep->media_is_writable = 1;
2381 	tgattributep->media_is_rotational = 0;
2382 	return (0);
2383 }
2384 
2385 /* ARGSUSED3 */
2386 int
2387 xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
2388 {
2389 	int instance;
2390 	xdf_t   *vdp;
2391 
2392 	instance = ddi_get_instance(dip);
2393 
2394 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
2395 		return (ENXIO);
2396 
2397 	switch (cmd) {
2398 	case TG_GETPHYGEOM:
2399 		return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
2400 	case TG_GETVIRTGEOM:
2401 		return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
2402 	case TG_GETCAPACITY:
2403 		return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
2404 	case TG_GETBLOCKSIZE:
2405 		mutex_enter(&vdp->xdf_cb_lk);
2406 		*(uint32_t *)arg = vdp->xdf_xdev_secsize;
2407 		mutex_exit(&vdp->xdf_cb_lk);
2408 		return (0);
2409 	case TG_GETATTR:
2410 		return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
2411 	default:
2412 		return (ENOTTY);
2413 	}
2414 }
2415 
2416 /* ARGSUSED5 */
2417 int
2418 xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
2419     diskaddr_t start, size_t reqlen, void *tg_cookie)
2420 {
2421 	xdf_t *vdp;
2422 	struct buf *bp;
2423 	int err = 0;
2424 
2425 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
2426 
2427 	/* We don't allow IO from the oe_change callback thread */
2428 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2429 
2430 	/*
2431 	 * Having secsize of 0 means that device isn't connected yet.
2432 	 * FIXME This happens for CD devices, and there's nothing we
2433 	 * can do about it at the moment.
2434 	 */
2435 	if (vdp->xdf_xdev_secsize == 0)
2436 		return (EIO);
2437 
2438 	if ((start + ((reqlen / (vdp->xdf_xdev_secsize / DEV_BSIZE))
2439 	    >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
2440 		return (EINVAL);
2441 
2442 	bp = getrbuf(KM_SLEEP);
2443 	if (cmd == TG_READ)
2444 		bp->b_flags = B_BUSY | B_READ;
2445 	else
2446 		bp->b_flags = B_BUSY | B_WRITE;
2447 
2448 	bp->b_un.b_addr = bufp;
2449 	bp->b_bcount = reqlen;
2450 	bp->b_blkno = start * (vdp->xdf_xdev_secsize / DEV_BSIZE);
2451 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
2452 
2453 	mutex_enter(&vdp->xdf_dev_lk);
2454 	xdf_bp_push(vdp, bp);
2455 	mutex_exit(&vdp->xdf_dev_lk);
2456 	xdf_io_start(vdp);
2457 	if (curthread == vdp->xdf_ready_tq_thread)
2458 		(void) xdf_ring_drain(vdp);
2459 	err = biowait(bp);
2460 	ASSERT(bp->b_flags & B_DONE);
2461 	freerbuf(bp);
2462 	return (err);
2463 }
2464 
2465 /*
2466  * Lock the current media.  Set the media state to "lock".
2467  * (Media locks are only respected by the backend driver.)
2468  */
2469 static int
2470 xdf_ioctl_mlock(xdf_t *vdp)
2471 {
2472 	int rv;
2473 	mutex_enter(&vdp->xdf_cb_lk);
2474 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2475 	mutex_exit(&vdp->xdf_cb_lk);
2476 	return (rv);
2477 }
2478 
2479 /*
2480  * Release a media lock.  Set the media state to "none".
2481  */
2482 static int
2483 xdf_ioctl_munlock(xdf_t *vdp)
2484 {
2485 	int rv;
2486 	mutex_enter(&vdp->xdf_cb_lk);
2487 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
2488 	mutex_exit(&vdp->xdf_cb_lk);
2489 	return (rv);
2490 }
2491 
2492 /*
2493  * Eject the current media.  Ignores any media locks.  (Media locks
2494  * are only for benifit of the the backend.)
2495  */
2496 static int
2497 xdf_ioctl_eject(xdf_t *vdp)
2498 {
2499 	int rv;
2500 
2501 	mutex_enter(&vdp->xdf_cb_lk);
2502 	if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
2503 		mutex_exit(&vdp->xdf_cb_lk);
2504 		return (rv);
2505 	}
2506 
2507 	/*
2508 	 * We've set the media requests xenbus parameter to eject, so now
2509 	 * disconnect from the backend, wait for the backend to clear
2510 	 * the media requets xenbus paramter, and then we can reconnect
2511 	 * to the backend.
2512 	 */
2513 	(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
2514 	mutex_enter(&vdp->xdf_dev_lk);
2515 	if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
2516 		mutex_exit(&vdp->xdf_dev_lk);
2517 		mutex_exit(&vdp->xdf_cb_lk);
2518 		return (EIO);
2519 	}
2520 	mutex_exit(&vdp->xdf_dev_lk);
2521 	mutex_exit(&vdp->xdf_cb_lk);
2522 	return (0);
2523 }
2524 
2525 /*
2526  * Watch for media state changes.  This can be an insertion of a device
2527  * (triggered by a 'xm block-configure' request in another domain) or
2528  * the ejection of a device (triggered by a local "eject" operation).
2529  * For a full description of the DKIOCSTATE ioctl behavior see dkio(7I).
2530  */
2531 static int
2532 xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
2533 {
2534 	enum dkio_state		prev_state;
2535 
2536 	mutex_enter(&vdp->xdf_cb_lk);
2537 	prev_state = vdp->xdf_mstate;
2538 
2539 	if (vdp->xdf_mstate == mstate) {
2540 		while (vdp->xdf_mstate == prev_state) {
2541 			if (cv_wait_sig(&vdp->xdf_mstate_cv,
2542 			    &vdp->xdf_cb_lk) == 0) {
2543 				mutex_exit(&vdp->xdf_cb_lk);
2544 				return (EINTR);
2545 			}
2546 		}
2547 	}
2548 
2549 	if ((prev_state != DKIO_INSERTED) &&
2550 	    (vdp->xdf_mstate == DKIO_INSERTED)) {
2551 		(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
2552 		mutex_exit(&vdp->xdf_cb_lk);
2553 		return (0);
2554 	}
2555 
2556 	mutex_exit(&vdp->xdf_cb_lk);
2557 	return (0);
2558 }
2559 
2560 /*ARGSUSED*/
2561 static int
2562 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2563     int *rvalp)
2564 {
2565 	minor_t		minor = getminor(dev);
2566 	int		part = XDF_PART(minor);
2567 	xdf_t		*vdp;
2568 	int		rv;
2569 
2570 	if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
2571 	    (!xdf_isopen(vdp, part)))
2572 		return (ENXIO);
2573 
2574 	DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
2575 	    vdp->xdf_addr, cmd, cmd));
2576 
2577 	switch (cmd) {
2578 	default:
2579 		return (ENOTTY);
2580 	case DKIOCG_PHYGEOM:
2581 	case DKIOCG_VIRTGEOM:
2582 	case DKIOCGGEOM:
2583 	case DKIOCSGEOM:
2584 	case DKIOCGAPART:
2585 	case DKIOCSAPART:
2586 	case DKIOCGVTOC:
2587 	case DKIOCSVTOC:
2588 	case DKIOCPARTINFO:
2589 	case DKIOCGEXTVTOC:
2590 	case DKIOCSEXTVTOC:
2591 	case DKIOCEXTPARTINFO:
2592 	case DKIOCGMBOOT:
2593 	case DKIOCSMBOOT:
2594 	case DKIOCGETEFI:
2595 	case DKIOCSETEFI:
2596 	case DKIOCSETEXTPART:
2597 	case DKIOCPARTITION:
2598 		rv = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
2599 		    rvalp, NULL);
2600 		if (rv != 0)
2601 			return (rv);
2602 		/*
2603 		 * If we're labelling the disk, we have to update the geometry
2604 		 * in the cmlb data structures, and we also have to write a new
2605 		 * devid to the disk.  Note that writing an EFI label currently
2606 		 * requires 4 ioctls, and devid setup will fail on all but the
2607 		 * last.
2608 		 */
2609 		if (cmd == DKIOCSEXTVTOC || cmd == DKIOCSVTOC ||
2610 		    cmd == DKIOCSETEFI) {
2611 			rv = cmlb_validate(vdp->xdf_vd_lbl, 0, 0);
2612 			if (rv == 0) {
2613 				xdf_devid_setup(vdp);
2614 			} else {
2615 				cmn_err(CE_WARN,
2616 				    "xdf@%s, labeling failed on validate",
2617 				    vdp->xdf_addr);
2618 			}
2619 		}
2620 		return (rv);
2621 	case FDEJECT:
2622 	case DKIOCEJECT:
2623 	case CDROMEJECT:
2624 		return (xdf_ioctl_eject(vdp));
2625 	case DKIOCLOCK:
2626 		return (xdf_ioctl_mlock(vdp));
2627 	case DKIOCUNLOCK:
2628 		return (xdf_ioctl_munlock(vdp));
2629 	case CDROMREADOFFSET: {
2630 		int offset = 0;
2631 		if (!XD_IS_CD(vdp))
2632 			return (ENOTTY);
2633 		if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
2634 			return (EFAULT);
2635 		return (0);
2636 	}
2637 	case DKIOCGMEDIAINFO: {
2638 		struct dk_minfo media_info;
2639 
2640 		media_info.dki_lbsize = vdp->xdf_xdev_secsize;
2641 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
2642 		if (XD_IS_CD(vdp))
2643 			media_info.dki_media_type = DK_CDROM;
2644 		else
2645 			media_info.dki_media_type = DK_FIXED_DISK;
2646 
2647 		if (ddi_copyout(&media_info, (void *)arg,
2648 		    sizeof (struct dk_minfo), mode))
2649 			return (EFAULT);
2650 		return (0);
2651 	}
2652 	case DKIOCINFO: {
2653 		struct dk_cinfo info;
2654 
2655 		/* controller information */
2656 		if (XD_IS_CD(vdp))
2657 			info.dki_ctype = DKC_CDROM;
2658 		else
2659 			info.dki_ctype = DKC_VBD;
2660 
2661 		info.dki_cnum = 0;
2662 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
2663 
2664 		/* unit information */
2665 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
2666 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
2667 		info.dki_flags = DKI_FMTVOL;
2668 		info.dki_partition = part;
2669 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
2670 		info.dki_addr = 0;
2671 		info.dki_space = 0;
2672 		info.dki_prio = 0;
2673 		info.dki_vec = 0;
2674 
2675 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
2676 			return (EFAULT);
2677 		return (0);
2678 	}
2679 	case DKIOCSTATE: {
2680 		enum dkio_state mstate;
2681 
2682 		if (ddi_copyin((void *)arg, &mstate,
2683 		    sizeof (mstate), mode) != 0)
2684 			return (EFAULT);
2685 		if ((rv = xdf_dkstate(vdp, mstate)) != 0)
2686 			return (rv);
2687 		mstate = vdp->xdf_mstate;
2688 		if (ddi_copyout(&mstate, (void *)arg,
2689 		    sizeof (mstate), mode) != 0)
2690 			return (EFAULT);
2691 		return (0);
2692 	}
2693 	case DKIOCREMOVABLE: {
2694 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2695 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
2696 			return (EFAULT);
2697 		return (0);
2698 	}
2699 	case DKIOCGETWCE: {
2700 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
2701 		if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
2702 			return (EFAULT);
2703 		return (0);
2704 	}
2705 	case DKIOCSETWCE: {
2706 		int i;
2707 		if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
2708 			return (EFAULT);
2709 		vdp->xdf_wce = VOID2BOOLEAN(i);
2710 		return (0);
2711 	}
2712 	case DKIOCFLUSHWRITECACHE: {
2713 		struct dk_callback *dkc = (struct dk_callback *)arg;
2714 
2715 		if (vdp->xdf_flush_supported) {
2716 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2717 			    NULL, 0, 0, (void *)dev);
2718 		} else if (vdp->xdf_feature_barrier &&
2719 		    !xdf_barrier_flush_disable) {
2720 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
2721 			    vdp->xdf_cache_flush_block, xdf_flush_block,
2722 			    vdp->xdf_xdev_secsize, (void *)dev);
2723 		} else {
2724 			return (ENOTTY);
2725 		}
2726 		if ((mode & FKIOCTL) && (dkc != NULL) &&
2727 		    (dkc->dkc_callback != NULL)) {
2728 			(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
2729 			/* need to return 0 after calling callback */
2730 			rv = 0;
2731 		}
2732 		return (rv);
2733 	}
2734 	}
2735 	/*NOTREACHED*/
2736 }
2737 
2738 static int
2739 xdf_strategy(struct buf *bp)
2740 {
2741 	xdf_t	*vdp;
2742 	minor_t minor;
2743 	diskaddr_t p_blkct, p_blkst;
2744 	daddr_t blkno;
2745 	ulong_t nblks;
2746 	int part;
2747 
2748 	minor = getminor(bp->b_edev);
2749 	part = XDF_PART(minor);
2750 	vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));
2751 
2752 	mutex_enter(&vdp->xdf_dev_lk);
2753 	if (!xdf_isopen(vdp, part)) {
2754 		mutex_exit(&vdp->xdf_dev_lk);
2755 		xdf_io_err(bp, ENXIO, 0);
2756 		return (0);
2757 	}
2758 
2759 	/* We don't allow IO from the oe_change callback thread */
2760 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2761 
2762 	/* Check for writes to a read only device */
2763 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
2764 		mutex_exit(&vdp->xdf_dev_lk);
2765 		xdf_io_err(bp, EROFS, 0);
2766 		return (0);
2767 	}
2768 
2769 	/* Check if this I/O is accessing a partition or the entire disk */
2770 	if ((long)bp->b_private == XB_SLICE_NONE) {
2771 		/* This I/O is using an absolute offset */
2772 		p_blkct = vdp->xdf_xdev_nblocks;
2773 		p_blkst = 0;
2774 	} else {
2775 		/* This I/O is using a partition relative offset */
2776 		mutex_exit(&vdp->xdf_dev_lk);
2777 		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
2778 		    &p_blkst, NULL, NULL, NULL)) {
2779 			xdf_io_err(bp, ENXIO, 0);
2780 			return (0);
2781 		}
2782 		mutex_enter(&vdp->xdf_dev_lk);
2783 	}
2784 
2785 	/*
2786 	 * Adjust the real blkno and bcount according to the underline
2787 	 * physical sector size.
2788 	 */
2789 	blkno = bp->b_blkno / (vdp->xdf_xdev_secsize / XB_BSIZE);
2790 
2791 	/* check for a starting block beyond the disk or partition limit */
2792 	if (blkno > p_blkct) {
2793 		DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
2794 		    vdp->xdf_addr, (longlong_t)blkno, (uint64_t)p_blkct));
2795 		mutex_exit(&vdp->xdf_dev_lk);
2796 		xdf_io_err(bp, EINVAL, 0);
2797 		return (0);
2798 	}
2799 
2800 	/* Legacy: don't set error flag at this case */
2801 	if (blkno == p_blkct) {
2802 		mutex_exit(&vdp->xdf_dev_lk);
2803 		bp->b_resid = bp->b_bcount;
2804 		biodone(bp);
2805 		return (0);
2806 	}
2807 
2808 	/* sanitize the input buf */
2809 	bioerror(bp, 0);
2810 	bp->b_resid = 0;
2811 	bp->av_back = bp->av_forw = NULL;
2812 
2813 	/* Adjust for partial transfer, this will result in an error later */
2814 	if (vdp->xdf_xdev_secsize != 0 &&
2815 	    vdp->xdf_xdev_secsize != XB_BSIZE) {
2816 		nblks = bp->b_bcount / vdp->xdf_xdev_secsize;
2817 	} else {
2818 		nblks = bp->b_bcount >> XB_BSHIFT;
2819 	}
2820 
2821 	if ((blkno + nblks) > p_blkct) {
2822 		if (vdp->xdf_xdev_secsize != 0 &&
2823 		    vdp->xdf_xdev_secsize != XB_BSIZE) {
2824 			bp->b_resid =
2825 			    ((blkno + nblks) - p_blkct) *
2826 			    vdp->xdf_xdev_secsize;
2827 		} else {
2828 			bp->b_resid =
2829 			    ((blkno + nblks) - p_blkct) <<
2830 			    XB_BSHIFT;
2831 		}
2832 		bp->b_bcount -= bp->b_resid;
2833 	}
2834 
2835 	DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
2836 	    vdp->xdf_addr, (longlong_t)blkno, (ulong_t)bp->b_bcount));
2837 
2838 	/* Fix up the buf struct */
2839 	bp->b_flags |= B_BUSY;
2840 	bp->b_private = (void *)(uintptr_t)p_blkst;
2841 
2842 	xdf_bp_push(vdp, bp);
2843 	mutex_exit(&vdp->xdf_dev_lk);
2844 	xdf_io_start(vdp);
2845 	if (do_polled_io)
2846 		(void) xdf_ring_drain(vdp);
2847 	return (0);
2848 }
2849 
2850 /*ARGSUSED*/
2851 static int
2852 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
2853 {
2854 	xdf_t	*vdp;
2855 	minor_t minor;
2856 	diskaddr_t p_blkcnt;
2857 	int part;
2858 
2859 	minor = getminor(dev);
2860 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2861 		return (ENXIO);
2862 
2863 	DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
2864 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2865 
2866 	part = XDF_PART(minor);
2867 	if (!xdf_isopen(vdp, part))
2868 		return (ENXIO);
2869 
2870 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2871 	    NULL, NULL, NULL, NULL))
2872 		return (ENXIO);
2873 
2874 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2875 		return (ENOSPC);
2876 
2877 	if (U_INVAL(uiop))
2878 		return (EINVAL);
2879 
2880 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
2881 }
2882 
2883 /*ARGSUSED*/
2884 static int
2885 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
2886 {
2887 	xdf_t *vdp;
2888 	minor_t minor;
2889 	diskaddr_t p_blkcnt;
2890 	int part;
2891 
2892 	minor = getminor(dev);
2893 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2894 		return (ENXIO);
2895 
2896 	DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
2897 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
2898 
2899 	part = XDF_PART(minor);
2900 	if (!xdf_isopen(vdp, part))
2901 		return (ENXIO);
2902 
2903 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2904 	    NULL, NULL, NULL, NULL))
2905 		return (ENXIO);
2906 
2907 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2908 		return (ENOSPC);
2909 
2910 	if (U_INVAL(uiop))
2911 		return (EINVAL);
2912 
2913 	return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
2914 }
2915 
2916 /*ARGSUSED*/
2917 static int
2918 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
2919 {
2920 	xdf_t	*vdp;
2921 	minor_t minor;
2922 	struct uio *uiop = aiop->aio_uio;
2923 	diskaddr_t p_blkcnt;
2924 	int part;
2925 
2926 	minor = getminor(dev);
2927 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2928 		return (ENXIO);
2929 
2930 	part = XDF_PART(minor);
2931 	if (!xdf_isopen(vdp, part))
2932 		return (ENXIO);
2933 
2934 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2935 	    NULL, NULL, NULL, NULL))
2936 		return (ENXIO);
2937 
2938 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2939 		return (ENOSPC);
2940 
2941 	if (U_INVAL(uiop))
2942 		return (EINVAL);
2943 
2944 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
2945 }
2946 
2947 /*ARGSUSED*/
2948 static int
2949 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
2950 {
2951 	xdf_t *vdp;
2952 	minor_t minor;
2953 	struct uio *uiop = aiop->aio_uio;
2954 	diskaddr_t p_blkcnt;
2955 	int part;
2956 
2957 	minor = getminor(dev);
2958 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2959 		return (ENXIO);
2960 
2961 	part = XDF_PART(minor);
2962 	if (!xdf_isopen(vdp, part))
2963 		return (ENXIO);
2964 
2965 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
2966 	    NULL, NULL, NULL, NULL))
2967 		return (ENXIO);
2968 
2969 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
2970 		return (ENOSPC);
2971 
2972 	if (U_INVAL(uiop))
2973 		return (EINVAL);
2974 
2975 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
2976 }
2977 
2978 static int
2979 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
2980 {
2981 	struct buf dumpbuf, *dbp = &dumpbuf;
2982 	xdf_t	*vdp;
2983 	minor_t minor;
2984 	int err = 0;
2985 	int part;
2986 	diskaddr_t p_blkcnt, p_blkst;
2987 
2988 	minor = getminor(dev);
2989 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
2990 		return (ENXIO);
2991 
2992 	DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
2993 	    vdp->xdf_addr, (void *)addr, blkno, nblk));
2994 
2995 	/* We don't allow IO from the oe_change callback thread */
2996 	ASSERT(curthread != vdp->xdf_oe_change_thread);
2997 
2998 	part = XDF_PART(minor);
2999 	if (!xdf_isopen(vdp, part))
3000 		return (ENXIO);
3001 
3002 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
3003 	    NULL, NULL, NULL))
3004 		return (ENXIO);
3005 
3006 	if ((blkno + nblk) >
3007 	    (p_blkcnt * (vdp->xdf_xdev_secsize / XB_BSIZE))) {
3008 		cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
3009 		    vdp->xdf_addr, (daddr_t)((blkno + nblk) /
3010 		    (vdp->xdf_xdev_secsize / XB_BSIZE)), (uint64_t)p_blkcnt);
3011 		return (EINVAL);
3012 	}
3013 
3014 	bioinit(dbp);
3015 	dbp->b_flags = B_BUSY;
3016 	dbp->b_un.b_addr = addr;
3017 	dbp->b_bcount = nblk << DEV_BSHIFT;
3018 	dbp->b_blkno = blkno;
3019 	dbp->b_edev = dev;
3020 	dbp->b_private = (void *)(uintptr_t)p_blkst;
3021 
3022 	mutex_enter(&vdp->xdf_dev_lk);
3023 	xdf_bp_push(vdp, dbp);
3024 	mutex_exit(&vdp->xdf_dev_lk);
3025 	xdf_io_start(vdp);
3026 	err = xdf_ring_drain(vdp);
3027 	biofini(dbp);
3028 	return (err);
3029 }
3030 
3031 /*ARGSUSED*/
3032 static int
3033 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
3034 {
3035 	minor_t	minor;
3036 	xdf_t	*vdp;
3037 	int part;
3038 	ulong_t parbit;
3039 
3040 	minor = getminor(dev);
3041 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3042 		return (ENXIO);
3043 
3044 	mutex_enter(&vdp->xdf_dev_lk);
3045 	part = XDF_PART(minor);
3046 	if (!xdf_isopen(vdp, part)) {
3047 		mutex_exit(&vdp->xdf_dev_lk);
3048 		return (ENXIO);
3049 	}
3050 	parbit = 1 << part;
3051 
3052 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
3053 	if (otyp == OTYP_LYR) {
3054 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
3055 		if (--vdp->xdf_vd_lyropen[part] == 0)
3056 			vdp->xdf_vd_open[otyp] &= ~parbit;
3057 	} else {
3058 		vdp->xdf_vd_open[otyp] &= ~parbit;
3059 	}
3060 	vdp->xdf_vd_exclopen &= ~parbit;
3061 
3062 	mutex_exit(&vdp->xdf_dev_lk);
3063 	return (0);
3064 }
3065 
3066 static int
3067 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
3068 {
3069 	minor_t	minor;
3070 	xdf_t	*vdp;
3071 	int part;
3072 	ulong_t parbit;
3073 	diskaddr_t p_blkct = 0;
3074 	boolean_t firstopen;
3075 	boolean_t nodelay;
3076 
3077 	minor = getminor(*devp);
3078 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
3079 		return (ENXIO);
3080 
3081 	nodelay = (flag & (FNDELAY | FNONBLOCK));
3082 
3083 	DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));
3084 
3085 	/* do cv_wait until connected or failed */
3086 	mutex_enter(&vdp->xdf_cb_lk);
3087 	mutex_enter(&vdp->xdf_dev_lk);
3088 	if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
3089 		mutex_exit(&vdp->xdf_dev_lk);
3090 		mutex_exit(&vdp->xdf_cb_lk);
3091 		return (ENXIO);
3092 	}
3093 	mutex_exit(&vdp->xdf_cb_lk);
3094 
3095 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
3096 		mutex_exit(&vdp->xdf_dev_lk);
3097 		return (EROFS);
3098 	}
3099 
3100 	part = XDF_PART(minor);
3101 	parbit = 1 << part;
3102 	if ((vdp->xdf_vd_exclopen & parbit) ||
3103 	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
3104 		mutex_exit(&vdp->xdf_dev_lk);
3105 		return (EBUSY);
3106 	}
3107 
3108 	/* are we the first one to open this node? */
3109 	firstopen = !xdf_isopen(vdp, -1);
3110 
3111 	if (otyp == OTYP_LYR)
3112 		vdp->xdf_vd_lyropen[part]++;
3113 
3114 	vdp->xdf_vd_open[otyp] |= parbit;
3115 
3116 	if (flag & FEXCL)
3117 		vdp->xdf_vd_exclopen |= parbit;
3118 
3119 	mutex_exit(&vdp->xdf_dev_lk);
3120 
3121 	/* force a re-validation */
3122 	if (firstopen)
3123 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
3124 
3125 	/* If this is a non-blocking open then we're done */
3126 	if (nodelay)
3127 		return (0);
3128 
3129 	/*
3130 	 * This is a blocking open, so we require:
3131 	 * - that the disk have a valid label on it
3132 	 * - that the size of the partition that we're opening is non-zero
3133 	 */
3134 	if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
3135 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
3136 		(void) xdf_close(*devp, flag, otyp, credp);
3137 		return (ENXIO);
3138 	}
3139 
3140 	return (0);
3141 }
3142 
3143 /*ARGSUSED*/
3144 static void
3145 xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
3146 {
3147 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
3148 	cv_broadcast(&vdp->xdf_hp_status_cv);
3149 }
3150 
3151 static int
3152 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
3153     char *name, caddr_t valuep, int *lengthp)
3154 {
3155 	xdf_t	*vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
3156 
3157 	/*
3158 	 * Sanity check that if a dev_t or dip were specified that they
3159 	 * correspond to this device driver.  On debug kernels we'll
3160 	 * panic and on non-debug kernels we'll return failure.
3161 	 */
3162 	ASSERT(ddi_driver_major(dip) == xdf_major);
3163 	ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
3164 	if ((ddi_driver_major(dip) != xdf_major) ||
3165 	    ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
3166 		return (DDI_PROP_NOT_FOUND);
3167 
3168 	if (vdp == NULL)
3169 		return (ddi_prop_op(dev, dip, prop_op, flags,
3170 		    name, valuep, lengthp));
3171 
3172 	return (cmlb_prop_op(vdp->xdf_vd_lbl,
3173 	    dev, dip, prop_op, flags, name, valuep, lengthp,
3174 	    XDF_PART(getminor(dev)), NULL));
3175 }
3176 
3177 /*ARGSUSED*/
3178 static int
3179 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
3180 {
3181 	int	instance = XDF_INST(getminor((dev_t)arg));
3182 	xdf_t	*vbdp;
3183 
3184 	switch (cmd) {
3185 	case DDI_INFO_DEVT2DEVINFO:
3186 		if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
3187 			*rp = NULL;
3188 			return (DDI_FAILURE);
3189 		}
3190 		*rp = vbdp->xdf_dip;
3191 		return (DDI_SUCCESS);
3192 
3193 	case DDI_INFO_DEVT2INSTANCE:
3194 		*rp = (void *)(uintptr_t)instance;
3195 		return (DDI_SUCCESS);
3196 
3197 	default:
3198 		return (DDI_FAILURE);
3199 	}
3200 }
3201 
3202 /*ARGSUSED*/
3203 static int
3204 xdf_resume(dev_info_t *dip)
3205 {
3206 	xdf_t	*vdp;
3207 	char	*oename;
3208 
3209 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
3210 		goto err;
3211 
3212 	if (xdf_debug & SUSRES_DBG)
3213 		xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
3214 
3215 	mutex_enter(&vdp->xdf_cb_lk);
3216 
3217 	if (xvdi_resume(dip) != DDI_SUCCESS) {
3218 		mutex_exit(&vdp->xdf_cb_lk);
3219 		goto err;
3220 	}
3221 
3222 	if (((oename = xvdi_get_oename(dip)) == NULL) ||
3223 	    (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3224 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
3225 		mutex_exit(&vdp->xdf_cb_lk);
3226 		goto err;
3227 	}
3228 
3229 	mutex_enter(&vdp->xdf_dev_lk);
3230 	ASSERT(vdp->xdf_state != XD_READY);
3231 	xdf_set_state(vdp, XD_UNKNOWN);
3232 	mutex_exit(&vdp->xdf_dev_lk);
3233 
3234 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3235 		mutex_exit(&vdp->xdf_cb_lk);
3236 		goto err;
3237 	}
3238 
3239 	mutex_exit(&vdp->xdf_cb_lk);
3240 
3241 	if (xdf_debug & SUSRES_DBG)
3242 		xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
3243 	return (DDI_SUCCESS);
3244 err:
3245 	if (xdf_debug & SUSRES_DBG)
3246 		xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
3247 	return (DDI_FAILURE);
3248 }
3249 
3250 /*
3251  * Uses the in-memory devid if one exists.
3252  *
3253  * Create a devid and write it on the first block of the last track of
3254  * the last cylinder.
3255  * Return DDI_SUCCESS or DDI_FAILURE.
3256  */
3257 static int
3258 xdf_devid_fabricate(xdf_t *vdp)
3259 {
3260 	ddi_devid_t	devid = vdp->xdf_tgt_devid; /* null if no devid */
3261 	struct dk_devid *dkdevidp = NULL; /* devid struct stored on disk */
3262 	diskaddr_t	blk;
3263 	uint_t		*ip, chksum;
3264 	int		i, devid_size;
3265 
3266 	if (cmlb_get_devid_block(vdp->xdf_vd_lbl, &blk, NULL) != 0)
3267 		goto err;
3268 
3269 	if (devid == NULL && ddi_devid_init(vdp->xdf_dip, DEVID_FAB, 0,
3270 	    NULL, &devid) != DDI_SUCCESS)
3271 		goto err;
3272 
3273 	/* allocate a buffer */
3274 	dkdevidp = (struct dk_devid *)kmem_zalloc(NBPSCTR, KM_SLEEP);
3275 
3276 	/* Fill in the revision */
3277 	dkdevidp->dkd_rev_hi = DK_DEVID_REV_MSB;
3278 	dkdevidp->dkd_rev_lo = DK_DEVID_REV_LSB;
3279 
3280 	/* Copy in the device id */
3281 	devid_size = ddi_devid_sizeof(devid);
3282 	if (devid_size > DK_DEVID_SIZE)
3283 		goto err;
3284 	bcopy(devid, dkdevidp->dkd_devid, devid_size);
3285 
3286 	/* Calculate the chksum */
3287 	chksum = 0;
3288 	ip = (uint_t *)dkdevidp;
3289 	for (i = 0; i < (NBPSCTR / sizeof (int)) - 1; i++)
3290 		chksum ^= ip[i];
3291 
3292 	/* Fill in the checksum */
3293 	DKD_FORMCHKSUM(chksum, dkdevidp);
3294 
3295 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, dkdevidp, blk,
3296 	    NBPSCTR, NULL) != 0)
3297 		goto err;
3298 
3299 	kmem_free(dkdevidp, NBPSCTR);
3300 
3301 	vdp->xdf_tgt_devid = devid;
3302 	return (DDI_SUCCESS);
3303 
3304 err:
3305 	if (dkdevidp != NULL)
3306 		kmem_free(dkdevidp, NBPSCTR);
3307 	if (devid != NULL && vdp->xdf_tgt_devid == NULL)
3308 		ddi_devid_free(devid);
3309 	return (DDI_FAILURE);
3310 }
3311 
3312 /*
3313  * xdf_devid_read() is a local copy of xdfs_devid_read(), modified to use xdf
3314  * functions.
3315  *
3316  * Read a devid from on the first block of the last track of
3317  * the last cylinder.  Make sure what we read is a valid devid.
3318  * Return DDI_SUCCESS or DDI_FAILURE.
3319  */
3320 static int
3321 xdf_devid_read(xdf_t *vdp)
3322 {
3323 	diskaddr_t	blk;
3324 	struct dk_devid *dkdevidp;
3325 	uint_t		*ip, chksum;
3326 	int		i;
3327 
3328 	if (cmlb_get_devid_block(vdp->xdf_vd_lbl, &blk, NULL) != 0)
3329 		return (DDI_FAILURE);
3330 
3331 	dkdevidp = kmem_zalloc(NBPSCTR, KM_SLEEP);
3332 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, dkdevidp, blk,
3333 	    NBPSCTR, NULL) != 0)
3334 		goto err;
3335 
3336 	/* Validate the revision */
3337 	if ((dkdevidp->dkd_rev_hi != DK_DEVID_REV_MSB) ||
3338 	    (dkdevidp->dkd_rev_lo != DK_DEVID_REV_LSB))
3339 		goto err;
3340 
3341 	/* Calculate the checksum */
3342 	chksum = 0;
3343 	ip = (uint_t *)dkdevidp;
3344 	for (i = 0; i < (NBPSCTR / sizeof (int)) - 1; i++)
3345 		chksum ^= ip[i];
3346 	if (DKD_GETCHKSUM(dkdevidp) != chksum)
3347 		goto err;
3348 
3349 	/* Validate the device id */
3350 	if (ddi_devid_valid((ddi_devid_t)dkdevidp->dkd_devid) != DDI_SUCCESS)
3351 		goto err;
3352 
3353 	/* keep a copy of the device id */
3354 	i = ddi_devid_sizeof((ddi_devid_t)dkdevidp->dkd_devid);
3355 	vdp->xdf_tgt_devid = kmem_alloc(i, KM_SLEEP);
3356 	bcopy(dkdevidp->dkd_devid, vdp->xdf_tgt_devid, i);
3357 	kmem_free(dkdevidp, NBPSCTR);
3358 	return (DDI_SUCCESS);
3359 
3360 err:
3361 	kmem_free(dkdevidp, NBPSCTR);
3362 	return (DDI_FAILURE);
3363 }
3364 
3365 /*
3366  * xdf_devid_setup() is a modified copy of cmdk_devid_setup().
3367  *
3368  * This function creates a devid if we don't already have one, and
3369  * registers it.  If we already have one, we make sure that it can be
3370  * read from the disk, otherwise we write it to the disk ourselves.  If
3371  * we didn't already have a devid, and we create one, we also need to
3372  * register it.
3373  */
3374 void
3375 xdf_devid_setup(xdf_t *vdp)
3376 {
3377 	int rc;
3378 	boolean_t existed = vdp->xdf_tgt_devid != NULL;
3379 
3380 	/* Read devid from the disk, if present */
3381 	rc = xdf_devid_read(vdp);
3382 
3383 	/* Otherwise write a devid (which we create if necessary) on the disk */
3384 	if (rc != DDI_SUCCESS)
3385 		rc = xdf_devid_fabricate(vdp);
3386 
3387 	/* If we created a devid or found it on the disk, register it */
3388 	if (rc == DDI_SUCCESS && !existed)
3389 		(void) ddi_devid_register(vdp->xdf_dip, vdp->xdf_tgt_devid);
3390 }
3391 
3392 static int
3393 xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3394 {
3395 	int			n, instance = ddi_get_instance(dip);
3396 	ddi_iblock_cookie_t	ibc, softibc;
3397 	boolean_t		dev_iscd = B_FALSE;
3398 	xdf_t			*vdp;
3399 	char			*oename, *xsname, *str;
3400 	clock_t			timeout;
3401 	int			err = 0;
3402 
3403 	if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
3404 	    "xdf_debug", 0)) != 0)
3405 		xdf_debug = n;
3406 
3407 	switch (cmd) {
3408 	case DDI_RESUME:
3409 		return (xdf_resume(dip));
3410 	case DDI_ATTACH:
3411 		break;
3412 	default:
3413 		return (DDI_FAILURE);
3414 	}
3415 	/* DDI_ATTACH */
3416 
3417 	if ((xsname = xvdi_get_xsname(dip)) == NULL ||
3418 	    (oename = xvdi_get_oename(dip)) == NULL)
3419 		return (DDI_FAILURE);
3420 
3421 	/*
3422 	 * Disable auto-detach.  This is necessary so that we don't get
3423 	 * detached while we're disconnected from the back end.
3424 	 */
3425 	if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
3426 	    DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
3427 		return (DDI_FAILURE);
3428 
3429 	/* driver handles kernel-issued IOCTLs */
3430 	if (ddi_prop_create(DDI_DEV_T_NONE, dip,
3431 	    DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
3432 		return (DDI_FAILURE);
3433 
3434 	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
3435 		return (DDI_FAILURE);
3436 
3437 	if (ddi_get_soft_iblock_cookie(dip,
3438 	    DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
3439 		return (DDI_FAILURE);
3440 
3441 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
3442 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
3443 		    ddi_get_name_addr(dip));
3444 		return (DDI_FAILURE);
3445 	}
3446 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
3447 		dev_iscd = B_TRUE;
3448 	strfree(str);
3449 
3450 	if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
3451 		return (DDI_FAILURE);
3452 
3453 	DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
3454 	vdp = ddi_get_soft_state(xdf_ssp, instance);
3455 	ddi_set_driver_private(dip, vdp);
3456 	vdp->xdf_dip = dip;
3457 	vdp->xdf_addr = ddi_get_name_addr(dip);
3458 	vdp->xdf_suspending = B_FALSE;
3459 	vdp->xdf_media_req_supported = B_FALSE;
3460 	vdp->xdf_peer = INVALID_DOMID;
3461 	vdp->xdf_evtchn = INVALID_EVTCHN;
3462 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
3463 	    offsetof(v_req_t, v_link));
3464 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
3465 	cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
3466 	cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
3467 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3468 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3469 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
3470 	vdp->xdf_cmlb_reattach = B_TRUE;
3471 	if (dev_iscd) {
3472 		vdp->xdf_dinfo |= VDISK_CDROM;
3473 		vdp->xdf_mstate = DKIO_EJECTED;
3474 	} else {
3475 		vdp->xdf_mstate = DKIO_NONE;
3476 	}
3477 
3478 	if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
3479 	    1, TASKQ_DEFAULTPRI, 0)) == NULL)
3480 		goto errout0;
3481 
3482 	if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
3483 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
3484 		goto errout0;
3485 
3486 	if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
3487 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
3488 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
3489 		    ddi_get_name_addr(dip));
3490 		goto errout0;
3491 	}
3492 
3493 	/*
3494 	 * Initialize the physical geometry stucture.  Note that currently
3495 	 * we don't know the size of the backend device so the number
3496 	 * of blocks on the device will be initialized to zero.  Once
3497 	 * we connect to the backend device we'll update the physical
3498 	 * geometry to reflect the real size of the device.
3499 	 */
3500 	xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
3501 	vdp->xdf_pgeom_fixed = B_FALSE;
3502 
3503 	/*
3504 	 * Allocate the cmlb handle, minor nodes will be created once
3505 	 * the device is connected with backend.
3506 	 */
3507 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
3508 
3509 	/* We ship with cache-enabled disks */
3510 	vdp->xdf_wce = B_TRUE;
3511 
3512 	mutex_enter(&vdp->xdf_cb_lk);
3513 	/* Watch backend XenbusState change */
3514 	if (xvdi_add_event_handler(dip,
3515 	    XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
3516 		mutex_exit(&vdp->xdf_cb_lk);
3517 		goto errout0;
3518 	}
3519 
3520 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
3521 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
3522 		    ddi_get_name_addr(dip));
3523 		mutex_exit(&vdp->xdf_cb_lk);
3524 		goto errout1;
3525 	}
3526 
3527 	/* Nothing else to do for CD devices */
3528 	if (dev_iscd) {
3529 		mutex_exit(&vdp->xdf_cb_lk);
3530 		goto done;
3531 	}
3532 
3533 	/*
3534 	 * In order to do cmlb_validate, we have to wait for the disk to
3535 	 * acknowledge the attach, so we can query the backend for the disk
3536 	 * geometry (see xdf_setstate_connected).
3537 	 *
3538 	 * We only wait 30 seconds; if this is the root disk, the boot
3539 	 * will fail, but it would fail anyway if the device never
3540 	 * connected.  If this is a non-boot disk, that disk will fail
3541 	 * to connect, but again, it would fail anyway.
3542 	 */
3543 	timeout = ddi_get_lbolt() + drv_usectohz(XDF_STATE_TIMEOUT);
3544 	while (vdp->xdf_state != XD_CONNECTED && vdp->xdf_state != XD_READY) {
3545 		if (cv_timedwait(&vdp->xdf_dev_cv, &vdp->xdf_cb_lk,
3546 		    timeout) < 0) {
3547 			cmn_err(CE_WARN, "xdf@%s: disk failed to connect",
3548 			    ddi_get_name_addr(dip));
3549 			mutex_exit(&vdp->xdf_cb_lk);
3550 			goto errout1;
3551 		}
3552 	}
3553 	mutex_exit(&vdp->xdf_cb_lk);
3554 
3555 	/*
3556 	 * We call cmlb_validate so that the geometry information in
3557 	 * vdp->xdf_vd_lbl is correct; this fills out the number of
3558 	 * alternate cylinders so that we have a place to write the
3559 	 * devid.
3560 	 */
3561 	if ((err = cmlb_validate(vdp->xdf_vd_lbl, 0, NULL)) != 0) {
3562 		cmn_err(CE_NOTE,
3563 		    "xdf@%s: cmlb_validate failed: %d",
3564 		    ddi_get_name_addr(dip), err);
3565 		/*
3566 		 * We can carry on even if cmlb_validate() returns EINVAL here,
3567 		 * as we'll rewrite the disk label anyway.
3568 		 */
3569 		if (err != EINVAL)
3570 			goto errout1;
3571 	}
3572 
3573 	/*
3574 	 * xdf_devid_setup will only write a devid if one isn't
3575 	 * already present.  If it fails to find or create one, we
3576 	 * create one in-memory so that when we label the disk later,
3577 	 * it will have a devid to use.  This is helpful to deal with
3578 	 * cases where people use the devids of their disks before
3579 	 * labelling them; note that this does cause problems if
3580 	 * people rely on the devids of unlabelled disks to persist
3581 	 * across reboot.
3582 	 */
3583 	xdf_devid_setup(vdp);
3584 	if (vdp->xdf_tgt_devid == NULL) {
3585 		if (ddi_devid_init(vdp->xdf_dip, DEVID_FAB, 0, NULL,
3586 		    &vdp->xdf_tgt_devid) != DDI_SUCCESS) {
3587 			cmn_err(CE_WARN,
3588 			    "xdf@%s_ attach failed, devid_init failed",
3589 			    ddi_get_name_addr(dip));
3590 			goto errout1;
3591 		} else {
3592 			(void) ddi_devid_register(vdp->xdf_dip,
3593 			    vdp->xdf_tgt_devid);
3594 		}
3595 	}
3596 
3597 done:
3598 #ifdef XPV_HVM_DRIVER
3599 	xdf_hvm_add(dip);
3600 
3601 	/* Report our version to dom0 */
3602 	(void) xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d",
3603 	    HVMPV_XDF_VERS);
3604 #endif /* XPV_HVM_DRIVER */
3605 
3606 	/* Create kstat for iostat(1M) */
3607 	if (xdf_kstat_create(dip) != 0) {
3608 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
3609 		    ddi_get_name_addr(dip));
3610 		goto errout1;
3611 	}
3612 
3613 	/*
3614 	 * Don't bother with getting real device identification
3615 	 * strings (is it even possible?), they are unlikely to
3616 	 * change often (if at all).
3617 	 */
3618 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_VENDOR_ID,
3619 	    "Xen");
3620 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_PRODUCT_ID,
3621 	    dev_iscd ? "Virtual CD" : "Virtual disk");
3622 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, INQUIRY_REVISION_ID,
3623 	    "1.0");
3624 
3625 	ddi_report_dev(dip);
3626 	DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
3627 	return (DDI_SUCCESS);
3628 
3629 errout1:
3630 	(void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
3631 	xvdi_remove_event_handler(dip, XS_OE_STATE);
3632 errout0:
3633 	if (vdp->xdf_vd_lbl != NULL) {
3634 		cmlb_free_handle(&vdp->xdf_vd_lbl);
3635 		vdp->xdf_vd_lbl = NULL;
3636 	}
3637 	if (vdp->xdf_softintr_id != NULL)
3638 		ddi_remove_softintr(vdp->xdf_softintr_id);
3639 	xvdi_remove_xb_watch_handlers(dip);
3640 	if (vdp->xdf_ready_tq != NULL)
3641 		ddi_taskq_destroy(vdp->xdf_ready_tq);
3642 	mutex_destroy(&vdp->xdf_cb_lk);
3643 	mutex_destroy(&vdp->xdf_dev_lk);
3644 	cv_destroy(&vdp->xdf_dev_cv);
3645 	cv_destroy(&vdp->xdf_hp_status_cv);
3646 	ddi_soft_state_free(xdf_ssp, instance);
3647 	ddi_set_driver_private(dip, NULL);
3648 	ddi_prop_remove_all(dip);
3649 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
3650 	return (DDI_FAILURE);
3651 }
3652 
3653 static int
3654 xdf_suspend(dev_info_t *dip)
3655 {
3656 	int		instance = ddi_get_instance(dip);
3657 	xdf_t		*vdp;
3658 
3659 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
3660 		return (DDI_FAILURE);
3661 
3662 	if (xdf_debug & SUSRES_DBG)
3663 		xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
3664 
3665 	xvdi_suspend(dip);
3666 
3667 	mutex_enter(&vdp->xdf_cb_lk);
3668 	mutex_enter(&vdp->xdf_dev_lk);
3669 
3670 	vdp->xdf_suspending = B_TRUE;
3671 	xdf_ring_destroy(vdp);
3672 	xdf_set_state(vdp, XD_SUSPEND);
3673 	vdp->xdf_suspending = B_FALSE;
3674 
3675 	mutex_exit(&vdp->xdf_dev_lk);
3676 	mutex_exit(&vdp->xdf_cb_lk);
3677 
3678 	if (xdf_debug & SUSRES_DBG)
3679 		xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
3680 
3681 	return (DDI_SUCCESS);
3682 }
3683 
3684 static int
3685 xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3686 {
3687 	xdf_t *vdp;
3688 	int instance;
3689 
3690 	switch (cmd) {
3691 
3692 	case DDI_PM_SUSPEND:
3693 		break;
3694 
3695 	case DDI_SUSPEND:
3696 		return (xdf_suspend(dip));
3697 
3698 	case DDI_DETACH:
3699 		break;
3700 
3701 	default:
3702 		return (DDI_FAILURE);
3703 	}
3704 
3705 	instance = ddi_get_instance(dip);
3706 	DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
3707 	vdp = ddi_get_soft_state(xdf_ssp, instance);
3708 
3709 	if (vdp == NULL)
3710 		return (DDI_FAILURE);
3711 
3712 	mutex_enter(&vdp->xdf_cb_lk);
3713 	xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
3714 	if (vdp->xdf_state != XD_CLOSED) {
3715 		mutex_exit(&vdp->xdf_cb_lk);
3716 		return (DDI_FAILURE);
3717 	}
3718 	mutex_exit(&vdp->xdf_cb_lk);
3719 
3720 	ASSERT(!ISDMACBON(vdp));
3721 
3722 #ifdef XPV_HVM_DRIVER
3723 	xdf_hvm_rm(dip);
3724 #endif /* XPV_HVM_DRIVER */
3725 
3726 	if (vdp->xdf_timeout_id != 0)
3727 		(void) untimeout(vdp->xdf_timeout_id);
3728 
3729 	xvdi_remove_event_handler(dip, XS_OE_STATE);
3730 	ddi_taskq_destroy(vdp->xdf_ready_tq);
3731 
3732 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
3733 	cmlb_free_handle(&vdp->xdf_vd_lbl);
3734 
3735 	/* we'll support backend running in domU later */
3736 #ifdef	DOMU_BACKEND
3737 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
3738 #endif
3739 
3740 	list_destroy(&vdp->xdf_vreq_act);
3741 	ddi_prop_remove_all(dip);
3742 	xdf_kstat_delete(dip);
3743 	ddi_remove_softintr(vdp->xdf_softintr_id);
3744 	xvdi_remove_xb_watch_handlers(dip);
3745 	ddi_set_driver_private(dip, NULL);
3746 	cv_destroy(&vdp->xdf_dev_cv);
3747 	mutex_destroy(&vdp->xdf_cb_lk);
3748 	mutex_destroy(&vdp->xdf_dev_lk);
3749 	if (vdp->xdf_cache_flush_block != NULL)
3750 		kmem_free(vdp->xdf_flush_mem, 2 * vdp->xdf_xdev_secsize);
3751 	ddi_soft_state_free(xdf_ssp, instance);
3752 	return (DDI_SUCCESS);
3753 }
3754 
3755 /*
3756  * Driver linkage structures.
3757  */
3758 static struct cb_ops xdf_cbops = {
3759 	xdf_open,
3760 	xdf_close,
3761 	xdf_strategy,
3762 	nodev,
3763 	xdf_dump,
3764 	xdf_read,
3765 	xdf_write,
3766 	xdf_ioctl,
3767 	nodev,
3768 	nodev,
3769 	nodev,
3770 	nochpoll,
3771 	xdf_prop_op,
3772 	NULL,
3773 	D_MP | D_NEW | D_64BIT,
3774 	CB_REV,
3775 	xdf_aread,
3776 	xdf_awrite
3777 };
3778 
3779 struct dev_ops xdf_devops = {
3780 	DEVO_REV,		/* devo_rev */
3781 	0,			/* devo_refcnt */
3782 	xdf_getinfo,		/* devo_getinfo */
3783 	nulldev,		/* devo_identify */
3784 	nulldev,		/* devo_probe */
3785 	xdf_attach,		/* devo_attach */
3786 	xdf_detach,		/* devo_detach */
3787 	nodev,			/* devo_reset */
3788 	&xdf_cbops,		/* devo_cb_ops */
3789 	NULL,			/* devo_bus_ops */
3790 	NULL,			/* devo_power */
3791 	ddi_quiesce_not_supported, /* devo_quiesce */
3792 };
3793 
3794 /*
3795  * Module linkage structures.
3796  */
3797 static struct modldrv modldrv = {
3798 	&mod_driverops,		/* Type of module.  This one is a driver */
3799 	"virtual block driver",	/* short description */
3800 	&xdf_devops		/* driver specific ops */
3801 };
3802 
3803 static struct modlinkage xdf_modlinkage = {
3804 	MODREV_1, (void *)&modldrv, NULL
3805 };
3806 
3807 /*
3808  * standard module entry points
3809  */
3810 int
3811 _init(void)
3812 {
3813 	int rc;
3814 
3815 	xdf_major = ddi_name_to_major("xdf");
3816 	if (xdf_major == (major_t)-1)
3817 		return (EINVAL);
3818 
3819 	if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
3820 		return (rc);
3821 
3822 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
3823 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3824 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
3825 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3826 
3827 #ifdef XPV_HVM_DRIVER
3828 	xdf_hvm_init();
3829 #endif /* XPV_HVM_DRIVER */
3830 
3831 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
3832 #ifdef XPV_HVM_DRIVER
3833 		xdf_hvm_fini();
3834 #endif /* XPV_HVM_DRIVER */
3835 		kmem_cache_destroy(xdf_vreq_cache);
3836 		kmem_cache_destroy(xdf_gs_cache);
3837 		ddi_soft_state_fini(&xdf_ssp);
3838 		return (rc);
3839 	}
3840 
3841 	return (rc);
3842 }
3843 
3844 int
3845 _fini(void)
3846 {
3847 	int err;
3848 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
3849 		return (err);
3850 
3851 #ifdef XPV_HVM_DRIVER
3852 	xdf_hvm_fini();
3853 #endif /* XPV_HVM_DRIVER */
3854 
3855 	kmem_cache_destroy(xdf_vreq_cache);
3856 	kmem_cache_destroy(xdf_gs_cache);
3857 	ddi_soft_state_fini(&xdf_ssp);
3858 
3859 	return (0);
3860 }
3861 
3862 int
3863 _info(struct modinfo *modinfop)
3864 {
3865 	return (mod_info(&xdf_modlinkage, modinfop));
3866 }
3867