/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
 * Copyright 2020 Joyent Inc.
 * Copyright 2019 Western Digital Corporation.
 */

/*
 * VIRTIO BLOCK DRIVER
 *
 * This driver provides support for Virtio Block devices.  Each driver instance
 * attaches to a single underlying block device.
 *
 * REQUEST CHAIN LAYOUT
 *
 * Every request chain sent to the I/O queue has the following structure.  Each
 * box in the diagram represents a descriptor entry (i.e., a DMA cookie) within
 * the chain:
 *
 *    +-0-----------------------------------------+
 *    | struct virtio_blk_hdr                     |-----------------------\
 *    |   (written by driver, read by device)     |                       |
 *    +-1-----------------------------------------+                       |
 *    | optional data payload                     |--\                    |
 *    |   (written by driver for write requests,  |  |                    |
 *    |    or by device for read requests)        |  |                    |
 *    +-2-----------------------------------------+  |                    |
 *    | ,~`           :                              |-cookies loaned     |
 *    |/              :                        ,~`|  | from blkdev        |
 *                    :                       /   |  |                    |
 *    +-(N - 1)-----------------------------------+  |                    |
 *    | ... end of data payload.                  |  |                    |
 *    |                                           |  |                    |
 *    |                                           |--/                    |
 *    +-N-----------------------------------------+                       |
 *    | status byte                               |                       |
 *    |   (written by device, read by driver)     |--------------------\  |
 *    +-------------------------------------------+                    |  |
 *                                                                     |  |
 * The memory for the header and status bytes (i.e., 0 and N above)    |  |
 * is allocated as a single chunk by vioblk_alloc_reqs():              |  |
 *                                                                     |  |
 *    +-------------------------------------------+                    |  |
 *    | struct virtio_blk_hdr                     |<----------------------/
 *    +-------------------------------------------+                    |
 *    | status byte                               |<-------------------/
 *    +-------------------------------------------+
 */

#include <sys/modctl.h>
#include <sys/blkdev.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/debug.h>
#include <sys/pci.h>
#include <sys/containerof.h>
#include <sys/ctype.h>
#include <sys/sysmacros.h>
#include <sys/dkioc_free_util.h>

#include "virtio.h"
#include "vioblk.h"

static void vioblk_get_id(vioblk_t *);
uint_t vioblk_int_handler(caddr_t, caddr_t);
static uint_t vioblk_poll(vioblk_t *);
static int vioblk_quiesce(dev_info_t *);
static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t);
static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t);


static struct dev_ops vioblk_dev_ops = {
	.devo_rev =			DEVO_REV,
	.devo_refcnt =			0,

	.devo_attach =			vioblk_attach,
	.devo_detach =			vioblk_detach,
	.devo_quiesce =			vioblk_quiesce,

	.devo_getinfo =			ddi_no_info,
	.devo_identify =		nulldev,
	.devo_probe =			nulldev,
	.devo_reset =			nodev,
	.devo_cb_ops =			NULL,
	.devo_bus_ops =			NULL,
	.devo_power =			NULL,
};

static struct modldrv vioblk_modldrv = {
	.drv_modops =			&mod_driverops,
	.drv_linkinfo =			"VIRTIO block driver",
	.drv_dev_ops =			&vioblk_dev_ops
};

static struct modlinkage vioblk_modlinkage = {
	.ml_rev =			MODREV_1,
	.ml_linkage =			{ &vioblk_modldrv, NULL }
};

/*
 * DMA attribute template for header and status blocks.  We also make a
 * per-instance copy of this template with negotiated sizes from the device for
 * blkdev.
 */
static const ddi_dma_attr_t vioblk_dma_attr = {
	.dma_attr_version =		DMA_ATTR_V0,
	.dma_attr_addr_lo =		0x0000000000000000,
	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
	.dma_attr_count_max =		0x00000000FFFFFFFF,
	.dma_attr_align =		1,
	.dma_attr_burstsizes =		1,
	.dma_attr_minxfer =		1,
	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
	.dma_attr_seg =			0x00000000FFFFFFFF,
	.dma_attr_sgllen =		1,
	.dma_attr_granular =		1,
	.dma_attr_flags =		0
};

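/*
 * Request contexts are preallocated by vioblk_alloc_reqs() and kept on a
 * per-instance free list.  These routines take and return entries from that
 * list; the instance mutex must be held, and allocation fails (returns NULL)
 * once every preallocated request is in flight.
 */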
static vioblk_req_t *
vioblk_req_alloc(vioblk_t *vib)
{
	vioblk_req_t *vbr;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	if ((vbr = list_remove_head(&vib->vib_reqs)) == NULL) {
		return (NULL);
	}
	vib->vib_nreqs_alloc++;

	VERIFY0(vbr->vbr_status);
	vbr->vbr_status |= VIOBLK_REQSTAT_ALLOCATED;

	VERIFY3P(vbr->vbr_xfer, ==, NULL);
	VERIFY3S(vbr->vbr_error, ==, 0);

	return (vbr);
}

static void
vioblk_req_free(vioblk_t *vib, vioblk_req_t *vbr)
{
	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Check that this request was allocated, then zero the status field to
	 * clear all status bits.
	 */
	VERIFY(vbr->vbr_status & VIOBLK_REQSTAT_ALLOCATED);
	vbr->vbr_status = 0;

	vbr->vbr_xfer = NULL;
	vbr->vbr_error = 0;
	vbr->vbr_type = 0;

	list_insert_head(&vib->vib_reqs, vbr);

	VERIFY3U(vib->vib_nreqs_alloc, >, 0);
	vib->vib_nreqs_alloc--;
}

static void
vioblk_complete(vioblk_t *vib, vioblk_req_t *vbr)
{
	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	VERIFY(!(vbr->vbr_status & VIOBLK_REQSTAT_COMPLETE));
	vbr->vbr_status |= VIOBLK_REQSTAT_COMPLETE;

	if (vbr->vbr_type == VIRTIO_BLK_T_FLUSH) {
		vib->vib_stats->vbs_rw_cacheflush.value.ui64++;
	}

	if (vbr->vbr_xfer != NULL) {
		/*
		 * This is a blkdev framework request.
		 */
		mutex_exit(&vib->vib_mutex);
		bd_xfer_done(vbr->vbr_xfer, vbr->vbr_error);
		mutex_enter(&vib->vib_mutex);
		vbr->vbr_xfer = NULL;
	}
}

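/*
 * Common setup for all request types: allocate a request context and a
 * descriptor chain, populate the virtio block header in the request's DMA
 * memory, and append that header as the first descriptor in the chain.
 * Returns NULL if resources are exhausted.
 */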
static virtio_chain_t *
vioblk_common_start(vioblk_t *vib, int type, uint64_t sector,
    boolean_t polled)
{
	vioblk_req_t *vbr = NULL;
	virtio_chain_t *vic = NULL;

	if ((vbr = vioblk_req_alloc(vib)) == NULL) {
		vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
		return (NULL);
	}
	vbr->vbr_type = type;

	if (polled) {
		/*
		 * Mark this command as polled so that we can wait on it
		 * ourselves.
		 */
		vbr->vbr_status |= VIOBLK_REQSTAT_POLLED;
	}

	if ((vic = virtio_chain_alloc(vib->vib_vq, KM_NOSLEEP)) == NULL) {
		vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
		goto fail;
	}

	struct vioblk_req_hdr vbh;
	vbh.vbh_type = type;
	vbh.vbh_ioprio = 0;
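	/*
	 * blkdev expresses the offset in units of the negotiated block size
	 * (vib_blk_size), but the virtio block protocol always addresses the
	 * device in 512-byte sectors; convert before filling in the header.
	 */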
	vbh.vbh_sector = (sector * vib->vib_blk_size) / DEV_BSIZE;
	bcopy(&vbh, virtio_dma_va(vbr->vbr_dma, 0), sizeof (vbh));

	virtio_chain_data_set(vic, vbr);

	/*
	 * Put the header in the first descriptor.  See the block comment at
	 * the top of the file for more details on the chain layout.
	 */
	if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0),
	    sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS) !=
	    DDI_SUCCESS) {
		goto fail;
	}

	return (vic);

fail:
	vbr->vbr_xfer = NULL;
	vioblk_req_free(vib, vbr);
	if (vic != NULL) {
		virtio_chain_free(vic);
	}
	return (NULL);
}

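/*
 * Finish chain construction by appending the final (device-written) status
 * descriptor, then submit the chain to the device.  Polled requests are
 * waited on here: by spinning on the queue when panicking, or otherwise by
 * sleeping until vioblk_poll() signals completion from interrupt context.
 */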
static int
vioblk_common_submit(vioblk_t *vib, virtio_chain_t *vic)
{
	int r;
	vioblk_req_t *vbr = virtio_chain_data(vic);

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * The device will write the status byte into this last descriptor.
	 * See the block comment at the top of the file for more details on the
	 * chain layout.
	 */
	if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0) +
	    sizeof (struct vioblk_req_hdr), sizeof (uint8_t),
	    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
		r = ENOMEM;
		goto out;
	}

	virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORDEV);
	virtio_chain_submit(vic, B_TRUE);

	if (!(vbr->vbr_status & VIOBLK_REQSTAT_POLLED)) {
		/*
		 * This is not a polled request.  Our request will be freed and
		 * the caller notified later in vioblk_poll().
		 */
		return (0);
	}

	/*
	 * This is a polled request.  We need to block here and wait for the
	 * device to complete request processing.
	 */
	while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) {
		if (ddi_in_panic()) {
			/*
			 * When panicking, interrupts are disabled.  We must
			 * poll the queue manually.
			 */
			drv_usecwait(10);
			(void) vioblk_poll(vib);
			continue;
		}

		/*
		 * When not panicking, the device will interrupt on command
		 * completion and vioblk_poll() will be called to wake us up.
		 */
		cv_wait(&vib->vib_cv, &vib->vib_mutex);
	}

	vioblk_complete(vib, vbr);
	r = vbr->vbr_error;

out:
	vioblk_req_free(vib, vbr);
	virtio_chain_free(vic);
	return (r);
}

static int
vioblk_internal(vioblk_t *vib, int type, virtio_dma_t *dma,
    uint64_t sector, virtio_direction_t dir)
{
	virtio_chain_t *vic;
	vioblk_req_t *vbr;
	int r;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Allocate a polled request.
	 */
	if ((vic = vioblk_common_start(vib, type, sector, B_TRUE)) == NULL) {
		return (ENOMEM);
	}
	vbr = virtio_chain_data(vic);

	/*
	 * If there is a request payload, it goes between the header and the
	 * status byte.  See the block comment at the top of the file for more
	 * detail on the chain layout.
	 */
	if (dma != NULL) {
		for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) {
			if (virtio_chain_append(vic,
			    virtio_dma_cookie_pa(dma, n),
			    virtio_dma_cookie_size(dma, n), dir) !=
			    DDI_SUCCESS) {
				r = ENOMEM;
				goto out;
			}
		}
	}

	return (vioblk_common_submit(vib, vic));

out:
	vioblk_req_free(vib, vbr);
	virtio_chain_free(vic);
	return (r);
}

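/*
 * Build the payload for a DISCARD request: one struct
 * vioblk_discard_write_zeroes entry per extent in the dkioc_free_list_t,
 * assembled in a freshly allocated DMA region that is appended to the chain
 * as a single device-readable descriptor.
 */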
static int
vioblk_map_discard(vioblk_t *vib, virtio_chain_t *vic, const bd_xfer_t *xfer)
{
	const dkioc_free_list_t *dfl = xfer->x_dfl;
	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	virtio_dma_t *dma = NULL;
	struct vioblk_discard_write_zeroes *wzp = NULL;

	dma = virtio_dma_alloc(vib->vib_virtio,
	    dfl->dfl_num_exts * sizeof (*wzp), &vioblk_dma_attr,
	    DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP);
	if (dma == NULL)
		return (ENOMEM);

	wzp = virtio_dma_va(dma, 0);

	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++, wzp++) {
		uint64_t start = dfl->dfl_offset + exts->dfle_start;

		const struct vioblk_discard_write_zeroes vdwz = {
			.vdwz_sector = start >> DEV_BSHIFT,
			.vdwz_num_sectors = exts->dfle_length >> DEV_BSHIFT,
			.vdwz_flags = 0
		};

		bcopy(&vdwz, wzp, sizeof (*wzp));
	}

	if (virtio_chain_append(vic,
	    virtio_dma_cookie_pa(dma, 0),
	    virtio_dma_cookie_size(dma, 0),
	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
		virtio_dma_free(dma);
		return (ENOMEM);
	}

	return (0);
}

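/*
 * Translate a blkdev request (bd_xfer_t) into a descriptor chain and submit
 * it.  Read and write payloads arrive as DMA cookies loaned from blkdev;
 * discard requests instead carry a list of extents that we map separately.
 */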
static int
vioblk_request(vioblk_t *vib, bd_xfer_t *xfer, int type)
{
	virtio_chain_t *vic = NULL;
	vioblk_req_t *vbr = NULL;
	uint_t total_cookies = 2;
	boolean_t polled = (xfer->x_flags & BD_XFER_POLL) != 0;
	int r;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Ensure that this request falls within the advertised size of the
	 * block device.  Be careful to avoid overflow.
	 */
	if (xfer->x_nblks > SIZE_MAX - xfer->x_blkno ||
	    (xfer->x_blkno + xfer->x_nblks) > vib->vib_nblks) {
		vib->vib_stats->vbs_rw_badoffset.value.ui64++;
		return (EINVAL);
	}

	if ((vic = vioblk_common_start(vib, type, xfer->x_blkno, polled)) ==
	    NULL) {
		return (ENOMEM);
	}
	vbr = virtio_chain_data(vic);
	vbr->vbr_xfer = xfer;

	/*
	 * If there is a request payload, it goes between the header and the
	 * status byte.  See the block comment at the top of the file for more
	 * detail on the chain layout.
	 */
	if ((type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) &&
	    xfer->x_nblks > 0) {
		virtio_direction_t dir = (type == VIRTIO_BLK_T_OUT) ?
		    VIRTIO_DIR_DEVICE_READS : VIRTIO_DIR_DEVICE_WRITES;

		for (uint_t n = 0; n < xfer->x_ndmac; n++) {
			ddi_dma_cookie_t dmac;

			if (n == 0) {
				/*
				 * The first cookie is in the blkdev request.
				 */
				dmac = xfer->x_dmac;
			} else {
				ddi_dma_nextcookie(xfer->x_dmah, &dmac);
			}

			if (virtio_chain_append(vic, dmac.dmac_laddress,
			    dmac.dmac_size, dir) != DDI_SUCCESS) {
				r = ENOMEM;
				goto fail;
			}
		}

		total_cookies += xfer->x_ndmac;

	} else if (xfer->x_nblks > 0) {
		dev_err(vib->vib_dip, CE_PANIC,
		    "request of type %d had payload length of %lu blocks", type,
		    xfer->x_nblks);
	} else if (type == VIRTIO_BLK_T_DISCARD) {
		r = vioblk_map_discard(vib, vic, xfer);
		if (r != 0) {
			goto fail;
		}
	}

	if (vib->vib_stats->vbs_rw_cookiesmax.value.ui32 < total_cookies) {
		vib->vib_stats->vbs_rw_cookiesmax.value.ui32 = total_cookies;
	}

	return (vioblk_common_submit(vib, vic));

fail:
	vbr->vbr_xfer = NULL;
	vioblk_req_free(vib, vbr);
	virtio_chain_free(vic);
	return (r);
}

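/*
 * blkdev I/O entry points.  Each routine takes the instance mutex and issues
 * a request of the corresponding virtio type.
 */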
static int
vioblk_bd_read(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_IN);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static int
vioblk_bd_write(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_OUT);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static int
vioblk_bd_flush(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	if (!virtio_feature_present(vib->vib_virtio, VIRTIO_BLK_F_FLUSH)) {
		/*
		 * We don't really expect to get here, because if we did not
		 * negotiate the flush feature we would not have installed this
		 * function in the blkdev ops vector.
		 */
		mutex_exit(&vib->vib_mutex);
		return (ENOTSUP);
	}

	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_FLUSH);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static void
vioblk_bd_driveinfo(void *arg, bd_drive_t *drive)
{
	vioblk_t *vib = arg;

	drive->d_qsize = vib->vib_reqs_capacity;
	drive->d_removable = B_FALSE;
	drive->d_hotpluggable = B_TRUE;
	drive->d_target = 0;
	drive->d_lun = 0;

	drive->d_vendor = "Virtio";
	drive->d_vendor_len = strlen(drive->d_vendor);

	drive->d_product = "Block Device";
	drive->d_product_len = strlen(drive->d_product);

	drive->d_serial = vib->vib_devid;
	drive->d_serial_len = strlen(drive->d_serial);

	drive->d_revision = "0000";
	drive->d_revision_len = strlen(drive->d_revision);

	if (vib->vib_can_discard) {
		drive->d_free_align = vib->vib_discard_sector_align;
		drive->d_max_free_seg = vib->vib_max_discard_seg;
		drive->d_max_free_blks = vib->vib_max_discard_sectors;
		/*
		 * The virtio 1.1 spec doesn't specify a per-segment sector
		 * limit for discards -- only a limit on the total sectors in
		 * a discard request.  Therefore, we assume a vioblk device
		 * must be able to accept a single segment of
		 * vib_max_discard_sectors (when it supports discard requests)
		 * and use vib_max_discard_sectors both as the overall limit
		 * for a discard request and as the limit for a single
		 * segment.  blkdev will ensure we are never called with a
		 * dkioc_free_list_t that violates either limit.
		 */
		drive->d_max_free_seg_blks = vib->vib_max_discard_sectors;
	}
}

static int
vioblk_bd_mediainfo(void *arg, bd_media_t *media)
{
	vioblk_t *vib = (void *)arg;

	/*
	 * The device protocol is specified in terms of 512 byte logical
	 * blocks, regardless of the recommended I/O size which might be
	 * larger.
	 */
	media->m_nblks = vib->vib_nblks;
	media->m_blksize = vib->vib_blk_size;

	media->m_readonly = vib->vib_readonly;
	media->m_pblksize = vib->vib_pblk_size;
	return (0);
}

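/*
 * Fetch the device identity string with a polled VIRTIO_BLK_T_GET_ID
 * request.  The cleaned-up result is later exposed through
 * vioblk_bd_devid().
 */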
static void
vioblk_get_id(vioblk_t *vib)
{
	virtio_dma_t *dma;
	int r;

	if ((dma = virtio_dma_alloc(vib->vib_virtio, VIRTIO_BLK_ID_BYTES,
	    &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_READ,
	    KM_SLEEP)) == NULL) {
		return;
	}

	mutex_enter(&vib->vib_mutex);
	if ((r = vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0,
	    VIRTIO_DIR_DEVICE_WRITES)) == 0) {
		const char *b = virtio_dma_va(dma, 0);
		uint_t pos = 0;

		/*
		 * Save the entire response for debugging purposes.
		 */
		bcopy(virtio_dma_va(dma, 0), vib->vib_rawid,
		    VIRTIO_BLK_ID_BYTES);

		/*
		 * Process the returned ID.
		 */
		bzero(vib->vib_devid, sizeof (vib->vib_devid));
		for (uint_t n = 0; n < VIRTIO_BLK_ID_BYTES; n++) {
			if (isalnum(b[n]) || b[n] == '-' || b[n] == '_') {
				/*
				 * Accept a subset of printable ASCII
				 * characters.
				 */
				vib->vib_devid[pos++] = b[n];
			} else {
				/*
				 * Stop processing at the first sign of
				 * trouble.
				 */
				break;
			}
		}

		vib->vib_devid_fetched = B_TRUE;
	}
	mutex_exit(&vib->vib_mutex);

	virtio_dma_free(dma);
}

static int
vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
{
	vioblk_t *vib = arg;
	size_t len;

	if ((len = strlen(vib->vib_devid)) == 0) {
		/*
		 * The device has no ID.
		 */
		return (DDI_FAILURE);
	}

	return (ddi_devid_init(dip, DEVID_ATA_SERIAL, len, vib->vib_devid,
	    devid));
}

static int
vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r = 0;

	/*
	 * Since vib_can_discard is write-once (set during attach), we can
	 * check whether it's enabled without taking the mutex.
	 */
	if (!vib->vib_can_discard) {
		return (ENOTSUP);
	}

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_DISCARD);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

/*
 * As the device completes processing of a request, it returns the chain for
 * that request to our I/O queue.  This routine is called in two contexts:
 *   - from the interrupt handler, in response to notification from the device
 *   - synchronously in line with request processing when panicking
 */
static uint_t
vioblk_poll(vioblk_t *vib)
{
	virtio_chain_t *vic;
	uint_t count = 0;
	boolean_t wakeup = B_FALSE;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	while ((vic = virtio_queue_poll(vib->vib_vq)) != NULL) {
		vioblk_req_t *vbr = virtio_chain_data(vic);
		uint8_t status;

		virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORCPU);

		bcopy(virtio_dma_va(vbr->vbr_dma,
		    sizeof (struct vioblk_req_hdr)), &status, sizeof (status));

		switch (status) {
		case VIRTIO_BLK_S_OK:
			vbr->vbr_error = 0;
			break;
		case VIRTIO_BLK_S_IOERR:
			vbr->vbr_error = EIO;
			vib->vib_stats->vbs_io_errors.value.ui64++;
			break;
		case VIRTIO_BLK_S_UNSUPP:
			vbr->vbr_error = ENOTTY;
			vib->vib_stats->vbs_unsupp_errors.value.ui64++;
			break;
		default:
			vbr->vbr_error = ENXIO;
			vib->vib_stats->vbs_nxio_errors.value.ui64++;
			break;
		}

		count++;

		if (vbr->vbr_status & VIOBLK_REQSTAT_POLLED) {
			/*
			 * This request must not be freed as it is being held
			 * by a call to vioblk_common_submit().
			 */
			VERIFY(!(vbr->vbr_status &
			    VIOBLK_REQSTAT_POLL_COMPLETE));
			vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE;
			wakeup = B_TRUE;
			continue;
		}

		vioblk_complete(vib, vbr);

		vioblk_req_free(vib, vbr);
		virtio_chain_free(vic);
	}

	if (wakeup) {
		/*
		 * Signal anybody waiting for polled command completion.
		 */
		cv_broadcast(&vib->vib_cv);
	}

	return (count);
}

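/*
 * Interrupt handler: drain completed chains from the queue and track the
 * largest batch we have seen in the kstats.
 */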
uint_t
vioblk_int_handler(caddr_t arg0, caddr_t arg1)
{
	vioblk_t *vib = (vioblk_t *)arg0;
	uint_t count;

	mutex_enter(&vib->vib_mutex);
	if ((count = vioblk_poll(vib)) >
	    vib->vib_stats->vbs_intr_queuemax.value.ui32) {
		vib->vib_stats->vbs_intr_queuemax.value.ui32 = count;
	}

	vib->vib_stats->vbs_intr_total.value.ui64++;
	mutex_exit(&vib->vib_mutex);

	return (DDI_INTR_CLAIMED);
}

static void
vioblk_free_reqs(vioblk_t *vib)
{
	VERIFY3U(vib->vib_nreqs_alloc, ==, 0);

	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
		struct vioblk_req *vbr = &vib->vib_reqs_mem[i];

		VERIFY(list_link_active(&vbr->vbr_link));
		list_remove(&vib->vib_reqs, vbr);

		VERIFY0(vbr->vbr_status);

		if (vbr->vbr_dma != NULL) {
			virtio_dma_free(vbr->vbr_dma);
			vbr->vbr_dma = NULL;
		}
	}
	VERIFY(list_is_empty(&vib->vib_reqs));

	if (vib->vib_reqs_mem != NULL) {
		kmem_free(vib->vib_reqs_mem,
		    sizeof (struct vioblk_req) * vib->vib_reqs_capacity);
		vib->vib_reqs_mem = NULL;
		vib->vib_reqs_capacity = 0;
	}
}

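/*
 * Preallocate a request context for each usable queue slot (capped at
 * VIRTIO_BLK_REQ_BUFS).  Each context gets a small DMA buffer holding the
 * request header and status byte described in the diagram at the top of
 * this file.
 */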
static int
vioblk_alloc_reqs(vioblk_t *vib)
{
	vib->vib_reqs_capacity = MIN(virtio_queue_size(vib->vib_vq),
	    VIRTIO_BLK_REQ_BUFS);
	vib->vib_reqs_mem = kmem_zalloc(
	    sizeof (struct vioblk_req) * vib->vib_reqs_capacity, KM_SLEEP);
	vib->vib_nreqs_alloc = 0;

	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
		list_insert_tail(&vib->vib_reqs, &vib->vib_reqs_mem[i]);
	}

	for (vioblk_req_t *vbr = list_head(&vib->vib_reqs); vbr != NULL;
	    vbr = list_next(&vib->vib_reqs, vbr)) {
		if ((vbr->vbr_dma = virtio_dma_alloc(vib->vib_virtio,
		    sizeof (struct vioblk_req_hdr) + sizeof (uint8_t),
		    &vioblk_dma_attr, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
		    KM_SLEEP)) == NULL) {
			goto fail;
		}
	}

	return (0);

fail:
	vioblk_free_reqs(vib);
	return (ENOMEM);
}

static int
vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int instance = ddi_get_instance(dip);
	vioblk_t *vib;
	virtio_t *vio;
	boolean_t did_mutex = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if ((vio = virtio_init(dip, VIRTIO_BLK_WANTED_FEATURES, B_TRUE)) ==
	    NULL) {
		dev_err(dip, CE_WARN, "failed to start Virtio init");
		return (DDI_FAILURE);
	}

	vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
	vib->vib_dip = dip;
	vib->vib_virtio = vio;
	ddi_set_driver_private(dip, vib);
	list_create(&vib->vib_reqs, sizeof (vioblk_req_t),
	    offsetof(vioblk_req_t, vbr_link));

	/*
	 * Determine how many scatter-gather entries we can use in a single
	 * request.
	 */
	vib->vib_seg_max = VIRTIO_BLK_DEFAULT_MAX_SEG;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_SEG_MAX)) {
		vib->vib_seg_max = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_SEG_MAX);

		if (vib->vib_seg_max == 0 || vib->vib_seg_max == PCI_EINVAL32) {
			/*
			 * We need to be able to use at least one data segment,
			 * so we'll assume that this device is just poorly
			 * implemented and try for one.
			 */
			vib->vib_seg_max = 1;
		}
	}

	if (virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) {
		vib->vib_max_discard_sectors = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_MAX_DISCARD_SECT);
		vib->vib_max_discard_seg = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_MAX_DISCARD_SEG);
		vib->vib_discard_sector_align = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_DISCARD_ALIGN);

		if (vib->vib_max_discard_sectors == 0 ||
		    vib->vib_max_discard_seg == 0 ||
		    vib->vib_discard_sector_align == 0) {
			vib->vib_can_discard = B_FALSE;

			/*
			 * The hypervisor shouldn't be giving us bad values.
			 * If it is, it's probably worth notifying the
			 * operator.
			 */
			dev_err(dip, CE_NOTE,
			    "Host is advertising DISCARD support but with bad "
			    "parameters: max_discard_sectors=%u, "
			    "max_discard_segments=%u, discard_sector_align=%u",
			    vib->vib_max_discard_sectors,
			    vib->vib_max_discard_seg,
			    vib->vib_discard_sector_align);
		} else {
			vib->vib_can_discard = B_TRUE;
		}
	}

	/*
	 * When allocating the request queue, we include two additional
	 * descriptors (beyond those required for request data) to account for
	 * the header and the status byte.
	 */
	if ((vib->vib_vq = virtio_queue_alloc(vio, VIRTIO_BLK_VIRTQ_IO, "io",
	    vioblk_int_handler, vib, B_FALSE, vib->vib_seg_max + 2)) == NULL) {
		goto fail;
	}

	if (virtio_init_complete(vio, 0) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to complete Virtio init");
		goto fail;
	}

	cv_init(&vib->vib_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&vib->vib_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
	did_mutex = B_TRUE;

	if ((vib->vib_kstat = kstat_create("vioblk", instance,
	    "statistics", "controller", KSTAT_TYPE_NAMED,
	    sizeof (struct vioblk_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_PERSISTENT)) == NULL) {
		dev_err(dip, CE_WARN, "kstat_create failed");
		goto fail;
	}
	vib->vib_stats = (vioblk_stats_t *)vib->vib_kstat->ks_data;
	kstat_named_init(&vib->vib_stats->vbs_rw_outofmemory,
	    "total_rw_outofmemory", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_badoffset,
	    "total_rw_badoffset", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_intr_total,
	    "total_intr", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_io_errors,
	    "total_io_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_unsupp_errors,
	    "total_unsupp_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_nxio_errors,
	    "total_nxio_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_cacheflush,
	    "total_rw_cacheflush", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_cookiesmax,
	    "max_rw_cookies", KSTAT_DATA_UINT32);
	kstat_named_init(&vib->vib_stats->vbs_intr_queuemax,
	    "max_intr_queue", KSTAT_DATA_UINT32);
	kstat_install(vib->vib_kstat);

	vib->vib_readonly = virtio_feature_present(vio, VIRTIO_BLK_F_RO);
	if ((vib->vib_nblks = virtio_dev_get64(vio,
	    VIRTIO_BLK_CONFIG_CAPACITY)) == UINT64_MAX) {
		dev_err(dip, CE_WARN, "invalid capacity");
		goto fail;
	}

	/*
	 * Determine the optimal logical block size recommended by the device.
	 * This size is advisory; the protocol always deals in 512 byte blocks.
	 */
	vib->vib_blk_size = DEV_BSIZE;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_BLK_SIZE)) {
		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_BLK_SIZE);

		if (v != 0 && v != PCI_EINVAL32) {
			vib->vib_blk_size = v;
		}
	}

	/*
	 * Device capacity is always reported in 512-byte units; convert it
	 * to native blocks.
	 */
	vib->vib_nblks = (vib->vib_nblks * DEV_BSIZE) / vib->vib_blk_size;

	/*
	 * The device may also provide an advisory physical block size.
	 */
	vib->vib_pblk_size = vib->vib_blk_size;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_TOPOLOGY)) {
		uint8_t v = virtio_dev_get8(vio, VIRTIO_BLK_CONFIG_TOPO_PBEXP);

		if (v != PCI_EINVAL8) {
			vib->vib_pblk_size <<= v;
		}
	}

	/*
	 * The maximum size for a cookie in a request.
	 */
	vib->vib_seg_size_max = VIRTIO_BLK_DEFAULT_MAX_SIZE;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_SIZE_MAX)) {
		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_SIZE_MAX);

		if (v != 0 && v != PCI_EINVAL32) {
			vib->vib_seg_size_max = v;
		}
	}

	/*
	 * Set up the DMA attributes for blkdev to use for request data.  The
	 * specification is not extremely clear about whether DMA-related
	 * parameters include or exclude the header and status descriptors.
	 * For now, we assume they cover only the request data and not the
	 * headers.
	 */
	vib->vib_bd_dma_attr = vioblk_dma_attr;
	vib->vib_bd_dma_attr.dma_attr_sgllen = vib->vib_seg_max;
	vib->vib_bd_dma_attr.dma_attr_count_max = vib->vib_seg_size_max;
	vib->vib_bd_dma_attr.dma_attr_maxxfer = vib->vib_seg_max *
	    vib->vib_seg_size_max;

	if (vioblk_alloc_reqs(vib) != 0) {
		goto fail;
	}

	/*
	 * The blkdev framework does not provide a way to specify that the
	 * device does not support write cache flushing, except by omitting the
	 * "o_sync_cache" member from the ops vector.  As "bd_alloc_handle()"
	 * makes a copy of the ops vector, we can safely assemble one on the
	 * stack based on negotiated features.
	 *
	 * Similarly, the blkdev framework does not provide a way to indicate
	 * if a device supports a TRIM/UNMAP/DISCARD type operation except
	 * by omitting the "o_free_space" member from the ops vector.
	 */
	bd_ops_t vioblk_bd_ops = {
		.o_version =		BD_OPS_CURRENT_VERSION,
		.o_drive_info =		vioblk_bd_driveinfo,
		.o_media_info =		vioblk_bd_mediainfo,
		.o_devid_init =		vioblk_bd_devid,
		.o_sync_cache =		vioblk_bd_flush,
		.o_read =		vioblk_bd_read,
		.o_write =		vioblk_bd_write,
		.o_free_space =		vioblk_bd_free_space,
	};
	if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) {
		vioblk_bd_ops.o_sync_cache = NULL;
	}
	if (!vib->vib_can_discard) {
		vioblk_bd_ops.o_free_space = NULL;
	}

	vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops,
	    &vib->vib_bd_dma_attr, KM_SLEEP);

	/*
	 * Enable interrupts now so that we can request the device identity.
	 */
	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
		goto fail;
	}

	vioblk_get_id(vib);

	if (bd_attach_handle(dip, vib->vib_bd_h) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "Failed to attach blkdev");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	if (vib->vib_bd_h != NULL) {
		(void) bd_detach_handle(vib->vib_bd_h);
		bd_free_handle(vib->vib_bd_h);
	}
	if (vio != NULL) {
		(void) virtio_fini(vio, B_TRUE);
	}
	if (did_mutex) {
		mutex_destroy(&vib->vib_mutex);
		cv_destroy(&vib->vib_cv);
	}
	if (vib->vib_kstat != NULL) {
		kstat_delete(vib->vib_kstat);
	}
	vioblk_free_reqs(vib);
	kmem_free(vib, sizeof (*vib));
	return (DDI_FAILURE);
}

static int
vioblk_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vioblk_t *vib = ddi_get_driver_private(dip);

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vib->vib_mutex);
	if (vib->vib_nreqs_alloc > 0) {
		/*
		 * Cannot detach while there are still outstanding requests.
		 */
		mutex_exit(&vib->vib_mutex);
		return (DDI_FAILURE);
	}

	if (bd_detach_handle(vib->vib_bd_h) != DDI_SUCCESS) {
		mutex_exit(&vib->vib_mutex);
		return (DDI_FAILURE);
	}

	/*
	 * Tear down the Virtio framework before freeing the rest of the
	 * resources.  This will ensure the interrupt handlers are no longer
	 * running.
	 */
	virtio_fini(vib->vib_virtio, B_FALSE);

	vioblk_free_reqs(vib);
	kstat_delete(vib->vib_kstat);

	mutex_exit(&vib->vib_mutex);
	mutex_destroy(&vib->vib_mutex);

	kmem_free(vib, sizeof (*vib));

	return (DDI_SUCCESS);
}

static int
vioblk_quiesce(dev_info_t *dip)
{
	vioblk_t *vib;

	if ((vib = ddi_get_driver_private(dip)) == NULL) {
		return (DDI_FAILURE);
	}

	return (virtio_quiesce(vib->vib_virtio));
}

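/*
 * Module linkage.  bd_mod_init() fills in the blkdev cb_ops for us before
 * the module is installed; bd_mod_fini() undoes this on installation
 * failure or at module removal.
 */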
int
_init(void)
{
	int rv;

	bd_mod_init(&vioblk_dev_ops);

	if ((rv = mod_install(&vioblk_modlinkage)) != 0) {
		bd_mod_fini(&vioblk_dev_ops);
	}

	return (rv);
}

int
_fini(void)
{
	int rv;

	if ((rv = mod_remove(&vioblk_modlinkage)) == 0) {
		bd_mod_fini(&vioblk_dev_ops);
	}

	return (rv);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&vioblk_modlinkage, modinfop));
}