xref: /illumos-gate/usr/src/uts/common/io/virtio/virtio_main.c (revision 89cb8ffb5df88f95defaae8f0f4f0c67ccd9d17e)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  */
15 
16 /*
17  * VIRTIO FRAMEWORK
18  *
19  * For design and usage documentation, see the comments in "virtio.h".
20  */
21 
22 #include <sys/conf.h>
23 #include <sys/kmem.h>
24 #include <sys/debug.h>
25 #include <sys/modctl.h>
26 #include <sys/autoconf.h>
27 #include <sys/ddi_impldefs.h>
28 #include <sys/ddi.h>
29 #include <sys/sunddi.h>
30 #include <sys/sunndi.h>
31 #include <sys/avintr.h>
32 #include <sys/spl.h>
33 #include <sys/promif.h>
34 #include <sys/list.h>
35 #include <sys/bootconf.h>
36 #include <sys/bootsvcs.h>
37 #include <sys/sysmacros.h>
38 #include <sys/pci.h>
39 
40 #include "virtio.h"
41 #include "virtio_impl.h"
42 
43 
44 /*
45  * Linkage structures
46  */
47 static struct modlmisc virtio_modlmisc = {
48 	.misc_modops =			&mod_miscops,
49 	.misc_linkinfo =		"VIRTIO common routines",
50 };
51 
52 static struct modlinkage virtio_modlinkage = {
53 	.ml_rev =			MODREV_1,
54 	.ml_linkage =			{ &virtio_modlmisc, NULL }
55 };
56 
57 int
58 _init(void)
59 {
60 	return (mod_install(&virtio_modlinkage));
61 }
62 
63 int
64 _fini(void)
65 {
66 	return (mod_remove(&virtio_modlinkage));
67 }
68 
69 int
70 _info(struct modinfo *modinfop)
71 {
72 	return (mod_info(&virtio_modlinkage, modinfop));
73 }
74 
75 
76 
77 static void virtio_set_status(virtio_t *, uint8_t);
78 static int virtio_chain_append_impl(virtio_chain_t *, uint64_t, size_t,
79     uint16_t);
80 static int virtio_interrupts_setup(virtio_t *, int);
81 static void virtio_interrupts_teardown(virtio_t *);
82 static void virtio_interrupts_disable_locked(virtio_t *);
83 static void virtio_queue_free(virtio_queue_t *);
84 static void virtio_device_reset_locked(virtio_t *);
85 
86 /*
87  * We use the same device access attributes for BAR mapping and access to the
88  * virtqueue memory.
89  */
90 ddi_device_acc_attr_t virtio_acc_attr = {
91 	.devacc_attr_version =		DDI_DEVICE_ATTR_V1,
92 	.devacc_attr_endian_flags =	DDI_NEVERSWAP_ACC,
93 	.devacc_attr_dataorder =	DDI_STORECACHING_OK_ACC,
94 	.devacc_attr_access =		DDI_DEFAULT_ACC
95 };
96 
97 
98 /*
99  * DMA attributes for the memory given to the device for queue management.
100  */
101 ddi_dma_attr_t virtio_dma_attr_queue = {
102 	.dma_attr_version =		DMA_ATTR_V0,
103 	.dma_attr_addr_lo =		0x0000000000000000,
104 	/*
105 	 * Queue memory is aligned on VIRTIO_PAGE_SIZE with the address shifted
106 	 * down by VIRTIO_PAGE_SHIFT before being passed to the device in a
107 	 * 32-bit register.
108 	 */
109 	.dma_attr_addr_hi =		0x00000FFFFFFFF000,
110 	.dma_attr_count_max =		0x00000000FFFFFFFF,
111 	.dma_attr_align =		VIRTIO_PAGE_SIZE,
112 	.dma_attr_burstsizes =		1,
113 	.dma_attr_minxfer =		1,
114 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
115 	.dma_attr_seg =			0x00000000FFFFFFFF,
116 	.dma_attr_sgllen =		1,
117 	.dma_attr_granular =		1,
118 	.dma_attr_flags =		0
119 };
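
/*
 * The dma_attr_addr_hi limit above follows directly from the register
 * layout: the legacy queue address register is 32 bits wide and is given the
 * physical address shifted down by VIRTIO_PAGE_SHIFT.  Assuming the
 * conventional 4 KiB virtio page (a shift of 12), the largest physical
 * address the device can be told about is:
 *
 *	0xFFFFFFFFULL << 12 == 0x00000FFFFFFFF000
 */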
120 
121 /*
122  * DMA attributes for the allocation of indirect descriptor lists.  The
123  * indirect list is referenced by a regular descriptor entry: the physical
124  * address field is 64 bits wide, but the length field is only 32 bits.  Each
125  * descriptor is 16 bytes long.
126  */
127 ddi_dma_attr_t virtio_dma_attr_indirect = {
128 	.dma_attr_version =		DMA_ATTR_V0,
129 	.dma_attr_addr_lo =		0x0000000000000000,
130 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
131 	.dma_attr_count_max =		0x00000000FFFFFFFF,
132 	.dma_attr_align =		sizeof (struct virtio_vq_desc),
133 	.dma_attr_burstsizes =		1,
134 	.dma_attr_minxfer =		1,
135 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
136 	.dma_attr_seg =			0x00000000FFFFFFFF,
137 	.dma_attr_sgllen =		1,
138 	.dma_attr_granular =		1,
139 	.dma_attr_flags =		0
140 };
141 
142 
143 uint8_t
144 virtio_get8(virtio_t *vio, uintptr_t offset)
145 {
146 	return (ddi_get8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset)));
147 }
148 
149 uint16_t
150 virtio_get16(virtio_t *vio, uintptr_t offset)
151 {
152 	return (ddi_get16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset)));
153 }
154 
155 uint32_t
156 virtio_get32(virtio_t *vio, uintptr_t offset)
157 {
158 	return (ddi_get32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset)));
159 }
160 
161 void
162 virtio_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
163 {
164 	ddi_put8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset), value);
165 }
166 
167 void
168 virtio_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
169 {
170 	ddi_put16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset), value);
171 }
172 
173 void
174 virtio_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
175 {
176 	ddi_put32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset), value);
177 }
178 
179 void
180 virtio_fini(virtio_t *vio, boolean_t failed)
181 {
182 	mutex_enter(&vio->vio_mutex);
183 
184 	virtio_interrupts_teardown(vio);
185 
186 	virtio_queue_t *viq;
187 	while ((viq = list_remove_head(&vio->vio_queues)) != NULL) {
188 		virtio_queue_free(viq);
189 	}
190 	list_destroy(&vio->vio_queues);
191 
192 	if (failed) {
193 		/*
194 		 * Signal to the host that device setup failed.
195 		 */
196 		virtio_set_status(vio, VIRTIO_STATUS_FAILED);
197 	} else {
198 		virtio_device_reset_locked(vio);
199 	}
200 
201 	/*
202 	 * We don't need to do anything for the provider initlevel, as it
203 	 * merely records the fact that virtio_init_complete() was called.
204 	 */
205 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_PROVIDER;
206 
207 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_REGS) {
208 		/*
209 		 * Unmap PCI BAR0.
210 		 */
211 		ddi_regs_map_free(&vio->vio_barh);
212 
213 		vio->vio_initlevel &= ~VIRTIO_INITLEVEL_REGS;
214 	}
215 
216 	/*
217 	 * Ensure we have torn down everything we set up.
218 	 */
219 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_SHUTDOWN;
220 	VERIFY0(vio->vio_initlevel);
221 
222 	mutex_exit(&vio->vio_mutex);
223 	mutex_destroy(&vio->vio_mutex);
224 
225 	kmem_free(vio, sizeof (*vio));
226 }
227 
228 /*
229  * Early device initialisation for legacy (pre-1.0 specification) virtio
230  * devices.
231  */
232 virtio_t *
233 virtio_init(dev_info_t *dip, uint64_t driver_features, boolean_t allow_indirect)
234 {
235 	int r;
236 
237 	/*
238 	 * First, confirm that this is a legacy device.
239 	 */
240 	ddi_acc_handle_t pci;
241 	if (pci_config_setup(dip, &pci) != DDI_SUCCESS) {
242 		dev_err(dip, CE_WARN, "pci_config_setup failed");
243 		return (NULL);
244 	}
245 
246 	uint8_t revid;
247 	if ((revid = pci_config_get8(pci, PCI_CONF_REVID)) == PCI_EINVAL8) {
248 		dev_err(dip, CE_WARN, "could not read config space");
249 		pci_config_teardown(&pci);
250 		return (NULL);
251 	}
252 
253 	pci_config_teardown(&pci);
254 
255 	/*
256 	 * The legacy specification requires that the device advertise as PCI
257 	 * Revision 0.
258 	 */
259 	if (revid != 0) {
260 		dev_err(dip, CE_WARN, "PCI Revision %u incorrect for "
261 		    "legacy virtio device", (uint_t)revid);
262 		return (NULL);
263 	}
264 
265 	virtio_t *vio = kmem_zalloc(sizeof (*vio), KM_SLEEP);
266 	vio->vio_dip = dip;
267 
268 	/*
269 	 * Map PCI BAR0 for legacy device access.
270 	 */
271 	if ((r = ddi_regs_map_setup(dip, VIRTIO_LEGACY_PCI_BAR0,
272 	    (caddr_t *)&vio->vio_bar, 0, 0, &virtio_acc_attr,
273 	    &vio->vio_barh)) != DDI_SUCCESS) {
274 		dev_err(dip, CE_WARN, "ddi_regs_map_setup failure (%d)", r);
275 		kmem_free(vio, sizeof (*vio));
276 		return (NULL);
277 	}
278 	vio->vio_initlevel |= VIRTIO_INITLEVEL_REGS;
279 
280 	/*
281 	 * We initialise the mutex without an interrupt priority to ease the
282 	 * implementation of some of the configuration space access routines.
283 	 * Drivers using the virtio framework MUST make a call to
284 	 * "virtio_init_complete()" prior to spawning other threads or enabling
285 	 * interrupt handlers, at which time we will destroy and reinitialise
286 	 * the mutex for use in our interrupt handlers.
287 	 */
288 	mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, NULL);
289 
290 	list_create(&vio->vio_queues, sizeof (virtio_queue_t),
291 	    offsetof(virtio_queue_t, viq_link));
292 
293 	/*
294 	 * Legacy virtio devices require a few common steps before we can
295 	 * negotiate device features.
296 	 */
297 	virtio_device_reset(vio);
298 	virtio_set_status(vio, VIRTIO_STATUS_ACKNOWLEDGE);
299 	virtio_set_status(vio, VIRTIO_STATUS_DRIVER);
300 
301 	/*
302 	 * Negotiate features with the device.  Record the original supported
303 	 * feature set for debugging purposes.
304 	 */
305 	vio->vio_features_device = virtio_get32(vio,
306 	    VIRTIO_LEGACY_FEATURES_DEVICE);
307 	if (allow_indirect) {
308 		driver_features |= VIRTIO_F_RING_INDIRECT_DESC;
309 	}
310 	vio->vio_features = vio->vio_features_device & driver_features;
311 	virtio_put32(vio, VIRTIO_LEGACY_FEATURES_DRIVER, vio->vio_features);
312 
313 	/*
314 	 * The device-specific configuration begins at an offset into the BAR
315 	 * that depends on whether we have enabled MSI-X interrupts or not.
316 	 * Start out with the offset for pre-MSI-X operation so that we can
317 	 * read device configuration space prior to configuring interrupts.
318 	 */
319 	vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
320 
321 	return (vio);
322 }
323 
324 /*
325  * This function must be called by the driver once it has completed early setup
326  * calls.
327  */
328 int
329 virtio_init_complete(virtio_t *vio, int allowed_interrupt_types)
330 {
331 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER));
332 	vio->vio_initlevel |= VIRTIO_INITLEVEL_PROVIDER;
333 
334 	if (!list_is_empty(&vio->vio_queues)) {
335 		/*
336 		 * Set up interrupts for the queues that have been registered.
337 		 */
338 		if (virtio_interrupts_setup(vio, allowed_interrupt_types) !=
339 		    DDI_SUCCESS) {
340 			return (DDI_FAILURE);
341 		}
342 	}
343 
344 	/*
345 	 * Reinitialise the mutexes now that we know the interrupt priority.
346 	 */
347 	mutex_destroy(&vio->vio_mutex);
348 	mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
349 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
350 	    viq = list_next(&vio->vio_queues, viq)) {
351 		mutex_destroy(&viq->viq_mutex);
352 		mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER,
353 		    virtio_intr_pri(vio));
354 	}
355 
356 	virtio_set_status(vio, VIRTIO_STATUS_DRIVER_OK);
357 
358 	return (DDI_SUCCESS);
359 }
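
/*
 * A minimal sketch of the expected early setup sequence in a client driver's
 * attach(9E) entry point.  The driver names, feature mask, and segment count
 * below are hypothetical placeholders; the framework calls are those
 * provided by this file:
 *
 *	virtio_t *vio;
 *	virtio_queue_t *viq;
 *
 *	if ((vio = virtio_init(dip, MYDRV_WANTED_FEATURES, B_TRUE)) == NULL)
 *		return (DDI_FAILURE);
 *
 *	viq = virtio_queue_alloc(vio, 0, "requests", mydrv_int_handler,
 *	    mydrv, B_FALSE, MYDRV_MAX_SEGS);
 *	if (viq == NULL || virtio_init_complete(vio, 0) != DDI_SUCCESS) {
 *		virtio_fini(vio, B_TRUE);
 *		return (DDI_FAILURE);
 *	}
 *
 *	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
 *		virtio_fini(vio, B_TRUE);
 *		return (DDI_FAILURE);
 *	}
 *
 * Queues must be allocated before virtio_init_complete(), and interrupts are
 * only enabled once the driver is ready to receive queue notifications.
 */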
360 
361 boolean_t
362 virtio_feature_present(virtio_t *vio, uint64_t feature_mask)
363 {
364 	return ((vio->vio_features & feature_mask) != 0);
365 }
366 
367 void *
368 virtio_intr_pri(virtio_t *vio)
369 {
370 	VERIFY(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED);
371 
372 	return (DDI_INTR_PRI(vio->vio_interrupt_priority));
373 }
374 
375 /*
376  * Enable a bit in the device status register.  Each bit signals a level of
377  * guest readiness to the host.  Use the VIRTIO_STATUS_* constants for
378  * "status".  To zero the status field use virtio_device_reset().
379  */
380 static void
381 virtio_set_status(virtio_t *vio, uint8_t status)
382 {
383 	VERIFY3U(status, !=, 0);
384 
385 	mutex_enter(&vio->vio_mutex);
386 
387 	uint8_t old = virtio_get8(vio, VIRTIO_LEGACY_DEVICE_STATUS);
388 	virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, status | old);
389 
390 	mutex_exit(&vio->vio_mutex);
391 }
392 
393 static void
394 virtio_device_reset_locked(virtio_t *vio)
395 {
396 	virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, VIRTIO_STATUS_RESET);
397 }
398 
399 void
400 virtio_device_reset(virtio_t *vio)
401 {
402 	mutex_enter(&vio->vio_mutex);
403 	virtio_device_reset_locked(vio);
404 	mutex_exit(&vio->vio_mutex);
405 }
406 
407 /*
408  * Some queues are effectively long-polled; the driver submits a series of
409  * buffers and the device only returns them when there is data available.
410  * During detach, we need to coordinate the return of these buffers.  Calling
411  * "virtio_shutdown()" will reset the device, then allow the removal of all
412  * buffers that were in flight at the time of shutdown via
413  * "virtio_queue_evacuate()".
414  */
415 void
416 virtio_shutdown(virtio_t *vio)
417 {
418 	mutex_enter(&vio->vio_mutex);
419 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
420 		/*
421 		 * Shutdown has been performed already.
422 		 */
423 		mutex_exit(&vio->vio_mutex);
424 		return;
425 	}
426 
427 	/*
428 	 * First, mark all of the queues as shutdown.  This will prevent any
429 	 * further activity.
430 	 */
431 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
432 	    viq = list_next(&vio->vio_queues, viq)) {
433 		mutex_enter(&viq->viq_mutex);
434 		viq->viq_shutdown = B_TRUE;
435 		mutex_exit(&viq->viq_mutex);
436 	}
437 
438 	/*
439 	 * Now, reset the device.  This removes any queue configuration on the
440 	 * device side.
441 	 */
442 	virtio_device_reset_locked(vio);
443 	vio->vio_initlevel |= VIRTIO_INITLEVEL_SHUTDOWN;
444 	mutex_exit(&vio->vio_mutex);
445 }
446 
447 /*
448  * Common implementation of quiesce(9E) for simple Virtio-based devices.
449  */
450 int
451 virtio_quiesce(virtio_t *vio)
452 {
453 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
454 		/*
455 		 * Device has already been reset.
456 		 */
457 		return (DDI_SUCCESS);
458 	}
459 
460 	/*
461 	 * When we reset the device, it should immediately stop using any DMA
462 	 * memory we've previously passed to it.  All queue configuration is
463 	 * discarded.  This is good enough for quiesce(9E).
464 	 */
465 	virtio_device_reset_locked(vio);
466 
467 	return (DDI_SUCCESS);
468 }
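
/*
 * A client driver's own quiesce(9E) entry point can typically just defer to
 * the function above; a sketch with a hypothetical soft state lookup:
 *
 *	static int
 *	mydrv_quiesce(dev_info_t *dip)
 *	{
 *		mydrv_t *mydrv = ddi_get_driver_private(dip);
 *
 *		return (virtio_quiesce(mydrv->md_virtio));
 *	}
 */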
469 
470 /*
471  * DEVICE-SPECIFIC REGISTER ACCESS
472  *
473  * Note that these functions take the mutex to avoid racing with interrupt
474  * enable/disable, during which the device-specific offset can change.
475  */
476 
477 uint8_t
478 virtio_dev_get8(virtio_t *vio, uintptr_t offset)
479 {
480 	mutex_enter(&vio->vio_mutex);
481 	uint8_t r = virtio_get8(vio, vio->vio_config_offset + offset);
482 	mutex_exit(&vio->vio_mutex);
483 
484 	return (r);
485 }
486 
487 uint16_t
488 virtio_dev_get16(virtio_t *vio, uintptr_t offset)
489 {
490 	mutex_enter(&vio->vio_mutex);
491 	uint16_t r = virtio_get16(vio, vio->vio_config_offset + offset);
492 	mutex_exit(&vio->vio_mutex);
493 
494 	return (r);
495 }
496 
497 uint32_t
498 virtio_dev_get32(virtio_t *vio, uintptr_t offset)
499 {
500 	mutex_enter(&vio->vio_mutex);
501 	uint32_t r = virtio_get32(vio, vio->vio_config_offset + offset);
502 	mutex_exit(&vio->vio_mutex);
503 
504 	return (r);
505 }
506 
507 uint64_t
508 virtio_dev_get64(virtio_t *vio, uintptr_t offset)
509 {
510 	mutex_enter(&vio->vio_mutex);
511 	/*
512 	 * On at least some systems, a 64-bit read or write to this BAR is not
513 	 * possible.  For legacy devices, there is no generation number to use
514 	 * to determine if configuration may have changed half-way through a
515 	 * read.  We need to continue to read both halves of the value until we
516 	 * read the same value at least twice.
517 	 */
518 	uintptr_t o_lo = vio->vio_config_offset + offset;
519 	uintptr_t o_hi = o_lo + 4;
520 
521 	uint64_t val = virtio_get32(vio, o_lo) |
522 	    ((uint64_t)virtio_get32(vio, o_hi) << 32);
523 
524 	for (;;) {
525 		uint64_t tval = virtio_get32(vio, o_lo) |
526 		    ((uint64_t)virtio_get32(vio, o_hi) << 32);
527 
528 		if (tval == val) {
529 			break;
530 		}
531 
532 		val = tval;
533 	}
534 
535 	mutex_exit(&vio->vio_mutex);
536 	return (val);
537 }
538 
539 void
540 virtio_dev_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
541 {
542 	mutex_enter(&vio->vio_mutex);
543 	virtio_put8(vio, vio->vio_config_offset + offset, value);
544 	mutex_exit(&vio->vio_mutex);
545 }
546 
547 void
548 virtio_dev_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
549 {
550 	mutex_enter(&vio->vio_mutex);
551 	virtio_put16(vio, vio->vio_config_offset + offset, value);
552 	mutex_exit(&vio->vio_mutex);
553 }
554 
555 void
556 virtio_dev_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
557 {
558 	mutex_enter(&vio->vio_mutex);
559 	virtio_put32(vio, vio->vio_config_offset + offset, value);
560 	mutex_exit(&vio->vio_mutex);
561 }
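
/*
 * For example, a client driver reading a hypothetical 64-bit capacity field
 * at the start of its device-specific configuration would simply use:
 *
 *	uint64_t capacity = virtio_dev_get64(vio, 0);
 *
 * Offsets are relative to the device-specific region; the framework accounts
 * for the MSI-X-dependent location of that region internally.
 */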
562 
563 /*
564  * VIRTQUEUE MANAGEMENT
565  */
566 
567 static int
568 virtio_inflight_compar(const void *lp, const void *rp)
569 {
570 	const virtio_chain_t *l = lp;
571 	const virtio_chain_t *r = rp;
572 
573 	if (l->vic_head < r->vic_head) {
574 		return (-1);
575 	} else if (l->vic_head > r->vic_head) {
576 		return (1);
577 	} else {
578 		return (0);
579 	}
580 }
581 
582 virtio_queue_t *
583 virtio_queue_alloc(virtio_t *vio, uint16_t qidx, const char *name,
584     ddi_intr_handler_t *func, void *funcarg, boolean_t force_direct,
585     uint_t max_segs)
586 {
587 	uint16_t qsz;
588 	char space_name[256];
589 
590 	if (max_segs < 1) {
591 		/*
592 		 * Every descriptor, direct or indirect, needs to refer to at
593 		 * least one buffer.
594 		 */
595 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
596 		    "segment count must be at least 1", name, (uint_t)qidx);
597 		return (NULL);
598 	}
599 
600 	mutex_enter(&vio->vio_mutex);
601 
602 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER) {
603 		/*
604 		 * Cannot configure any more queues once initial setup is
605 		 * complete and interrupts have been allocated.
606 		 */
607 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
608 		    "alloc after init complete", name, (uint_t)qidx);
609 		mutex_exit(&vio->vio_mutex);
610 		return (NULL);
611 	}
612 
613 	/*
614 	 * There is no way to negotiate a different queue size for legacy
615 	 * devices.  We must read and use the native queue size of the device.
616 	 */
617 	virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
618 	if ((qsz = virtio_get16(vio, VIRTIO_LEGACY_QUEUE_SIZE)) == 0) {
619 		/*
620 		 * A size of zero means the device does not have a queue with
621 		 * this index.
622 		 */
623 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
624 		    "does not exist on device", name, (uint_t)qidx);
625 		mutex_exit(&vio->vio_mutex);
626 		return (NULL);
627 	}
628 
629 	mutex_exit(&vio->vio_mutex);
630 
631 	virtio_queue_t *viq = kmem_zalloc(sizeof (*viq), KM_SLEEP);
632 	viq->viq_virtio = vio;
633 	viq->viq_name = name;
634 	viq->viq_index = qidx;
635 	viq->viq_size = qsz;
636 	viq->viq_func = func;
637 	viq->viq_funcarg = funcarg;
638 	viq->viq_max_segs = max_segs;
639 	avl_create(&viq->viq_inflight, virtio_inflight_compar,
640 	    sizeof (virtio_chain_t), offsetof(virtio_chain_t, vic_node));
641 
642 	/*
643 	 * Initialise the mutex without an interrupt priority for now, as we
644 	 * do with "vio_mutex".  We'll reinitialise it in
645 	 * "virtio_init_complete()".
646 	 */
647 	mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER, NULL);
648 
649 	if (virtio_feature_present(vio, VIRTIO_F_RING_INDIRECT_DESC) &&
650 	    !force_direct) {
651 		/*
652 		 * If we were able to negotiate the indirect descriptor
653 		 * feature, and the caller has not explicitly forced the use of
654 		 * direct descriptors, we'll allocate indirect descriptor lists
655 		 * for each chain.
656 		 */
657 		viq->viq_indirect = B_TRUE;
658 	}
659 
660 	/*
661 	 * Track descriptor usage in an identifier space.
662 	 */
663 	(void) snprintf(space_name, sizeof (space_name), "%s%d_vq_%s",
664 	    ddi_get_name(vio->vio_dip), ddi_get_instance(vio->vio_dip), name);
665 	if ((viq->viq_descmap = id_space_create(space_name, 0, qsz)) == NULL) {
666 		dev_err(vio->vio_dip, CE_WARN, "could not allocate descriptor "
667 		    "ID space");
668 		virtio_queue_free(viq);
669 		return (NULL);
670 	}
671 
672 	/*
673 	 * For legacy devices, memory for the queue has a strict layout
674 	 * determined by the queue size.
675 	 */
676 	size_t sz_descs = sizeof (virtio_vq_desc_t) * qsz;
677 	size_t sz_driver = P2ROUNDUP_TYPED(sz_descs +
678 	    sizeof (virtio_vq_driver_t) +
679 	    sizeof (uint16_t) * qsz,
680 	    VIRTIO_PAGE_SIZE, size_t);
681 	size_t sz_device = P2ROUNDUP_TYPED(sizeof (virtio_vq_device_t) +
682 	    sizeof (virtio_vq_elem_t) * qsz,
683 	    VIRTIO_PAGE_SIZE, size_t);
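	/*
	 * As a worked example (assuming the standard legacy ring layout of
	 * 16-byte descriptors, a 4-byte driver ring header with two-byte ring
	 * entries, and a 4-byte device ring header with 8-byte ring entries),
	 * a queue size of 256 would give:
	 *
	 *	sz_descs  = 16 * 256                            = 4096
	 *	sz_driver = P2ROUNDUP(4096 + 4 + 2 * 256, 4096) = 8192
	 *	sz_device = P2ROUNDUP(4 + 8 * 256, 4096)        = 4096
	 *
	 * i.e. a single 12 KiB allocation below.
	 */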
684 
685 	if (virtio_dma_init(vio, &viq->viq_dma, sz_driver + sz_device,
686 	    &virtio_dma_attr_queue, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
687 	    KM_SLEEP) != DDI_SUCCESS) {
688 		dev_err(vio->vio_dip, CE_WARN, "could not allocate queue "
689 		    "DMA memory");
690 		virtio_queue_free(viq);
691 		return (NULL);
692 	}
693 
694 	/*
695 	 * NOTE: The viq_dma_* members below are used by
696 	 * VIRTQ_DMA_SYNC_FORDEV() and VIRTQ_DMA_SYNC_FORKERNEL() to calculate
697 	 * offsets into the DMA allocation for partial synchronisation.  If the
698 	 * ordering of, or relationship between, these pointers changes, the
699 	 * macros must be kept in sync.
700 	 */
701 	viq->viq_dma_descs = virtio_dma_va(&viq->viq_dma, 0);
702 	viq->viq_dma_driver = virtio_dma_va(&viq->viq_dma, sz_descs);
703 	viq->viq_dma_device = virtio_dma_va(&viq->viq_dma, sz_driver);
704 
705 	/*
706 	 * Install in the per-device list of queues.
707 	 */
708 	mutex_enter(&vio->vio_mutex);
709 	for (virtio_queue_t *chkvq = list_head(&vio->vio_queues); chkvq != NULL;
710 	    chkvq = list_next(&vio->vio_queues, chkvq)) {
711 		if (chkvq->viq_index == qidx) {
712 			dev_err(vio->vio_dip, CE_WARN, "attempt to register "
713 			    "queue \"%s\" with same index (%d) as queue \"%s\"",
714 			    name, qidx, chkvq->viq_name);
715 			mutex_exit(&vio->vio_mutex);
716 			virtio_queue_free(viq);
717 			return (NULL);
718 		}
719 	}
720 	list_insert_tail(&vio->vio_queues, viq);
721 
722 	/*
723 	 * Ensure the zeroing of the queue memory is visible to the host before
724 	 * we inform the device of the queue address.
725 	 */
726 	membar_producer();
727 	VIRTQ_DMA_SYNC_FORDEV(viq);
728 
729 	virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
730 	virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS,
731 	    virtio_dma_cookie_pa(&viq->viq_dma, 0) >> VIRTIO_PAGE_SHIFT);
732 
733 	mutex_exit(&vio->vio_mutex);
734 	return (viq);
735 }
736 
737 static void
738 virtio_queue_free(virtio_queue_t *viq)
739 {
740 	virtio_t *vio = viq->viq_virtio;
741 
742 	/*
743 	 * We are going to destroy the queue mutex.  Make sure we've already
744 	 * removed the interrupt handlers.
745 	 */
746 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED));
747 
748 	mutex_enter(&viq->viq_mutex);
749 
750 	/*
751 	 * If the device has not already been reset as part of a shutdown,
752 	 * detach the queue from the device now.
753 	 */
754 	if (!viq->viq_shutdown) {
755 		virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, viq->viq_index);
756 		virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS, 0);
757 	}
758 
759 	virtio_dma_fini(&viq->viq_dma);
760 
761 	VERIFY(avl_is_empty(&viq->viq_inflight));
762 	avl_destroy(&viq->viq_inflight);
763 	if (viq->viq_descmap != NULL) {
764 		id_space_destroy(viq->viq_descmap);
765 	}
766 
767 	mutex_exit(&viq->viq_mutex);
768 	mutex_destroy(&viq->viq_mutex);
769 
770 	kmem_free(viq, sizeof (*viq));
771 }
772 
773 void
774 virtio_queue_no_interrupt(virtio_queue_t *viq, boolean_t stop_interrupts)
775 {
776 	mutex_enter(&viq->viq_mutex);
777 
778 	if (stop_interrupts) {
779 		viq->viq_dma_driver->vqdr_flags |= VIRTQ_AVAIL_F_NO_INTERRUPT;
780 	} else {
781 		viq->viq_dma_driver->vqdr_flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT;
782 	}
783 	VIRTQ_DMA_SYNC_FORDEV(viq);
784 
785 	mutex_exit(&viq->viq_mutex);
786 }
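
/*
 * A common use of the routine above is to suppress notifications while a
 * queue handler drains completed chains, re-enabling them once the queue
 * looks empty (a sketch only; "mydrv_complete" is a hypothetical per-chain
 * completion routine, and the flag is only a hint to the device):
 *
 *	virtio_queue_no_interrupt(viq, B_TRUE);
 *	while ((vic = virtio_queue_poll(viq)) != NULL)
 *		mydrv_complete(vic);
 *	virtio_queue_no_interrupt(viq, B_FALSE);
 */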
787 
788 static virtio_chain_t *
789 virtio_queue_complete(virtio_queue_t *viq, uint_t index)
790 {
791 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
792 
793 	virtio_chain_t *vic;
794 
795 	virtio_chain_t search;
796 	bzero(&search, sizeof (search));
797 	search.vic_head = index;
798 
799 	if ((vic = avl_find(&viq->viq_inflight, &search, NULL)) == NULL) {
800 		return (NULL);
801 	}
802 	avl_remove(&viq->viq_inflight, vic);
803 
804 	return (vic);
805 }
806 
807 uint_t
808 virtio_queue_size(virtio_queue_t *viq)
809 {
810 	return (viq->viq_size);
811 }
812 
813 uint_t
814 virtio_queue_nactive(virtio_queue_t *viq)
815 {
816 	mutex_enter(&viq->viq_mutex);
817 	uint_t r = avl_numnodes(&viq->viq_inflight);
818 	mutex_exit(&viq->viq_mutex);
819 
820 	return (r);
821 }
822 
823 virtio_chain_t *
824 virtio_queue_poll(virtio_queue_t *viq)
825 {
826 	mutex_enter(&viq->viq_mutex);
827 	if (viq->viq_shutdown) {
828 		/*
829 		 * The device has been reset by virtio_shutdown(), and queue
830 		 * processing has been halted.  Any previously submitted chains
831 		 * will be evacuated using virtio_queue_evacuate().
832 		 */
833 		mutex_exit(&viq->viq_mutex);
834 		return (NULL);
835 	}
836 
837 	VIRTQ_DMA_SYNC_FORKERNEL(viq);
838 	if (viq->viq_device_index == viq->viq_dma_device->vqde_index) {
839 		/*
840 		 * If the device index has not changed since the last poll,
841 		 * there are no new chains to process.
842 		 */
843 		mutex_exit(&viq->viq_mutex);
844 		return (NULL);
845 	}
846 
847 	/*
848 	 * We need to ensure that all reads from the descriptor (vqde_ring[])
849 	 * and any memory referenced by the descriptor occur after we have read
850 	 * the descriptor index value above (vqde_index).
851 	 */
852 	membar_consumer();
853 
854 	uint16_t index = (viq->viq_device_index++) % viq->viq_size;
855 	uint16_t start = viq->viq_dma_device->vqde_ring[index].vqe_start;
856 	uint32_t len = viq->viq_dma_device->vqde_ring[index].vqe_len;
857 
858 	virtio_chain_t *vic;
859 	if ((vic = virtio_queue_complete(viq, start)) == NULL) {
860 		/*
861 		 * We could not locate a chain for this descriptor index, which
862 		 * suggests that something has gone horribly wrong.
863 		 */
864 		dev_err(viq->viq_virtio->vio_dip, CE_PANIC,
865 		    "queue \"%s\" ring entry %u (descriptor %u) has no chain",
866 		    viq->viq_name, (uint16_t)index, (uint16_t)start);
867 	}
868 
869 	vic->vic_received_length = len;
870 
871 	mutex_exit(&viq->viq_mutex);
872 
873 	return (vic);
874 }
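
/*
 * A queue handler registered through virtio_queue_alloc() typically drains
 * the queue with virtio_queue_poll() when it runs.  A minimal sketch, with
 * hypothetical driver types and completion handling:
 *
 *	static uint_t
 *	mydrv_int_handler(caddr_t arg0, caddr_t arg1)
 *	{
 *		mydrv_t *mydrv = (mydrv_t *)arg0;
 *		virtio_chain_t *vic;
 *
 *		while ((vic = virtio_queue_poll(mydrv->md_queue)) != NULL) {
 *			mydrv_req_t *req = virtio_chain_data(vic);
 *			size_t len = virtio_chain_received_length(vic);
 *
 *			mydrv_request_done(req, len);
 *			virtio_chain_free(vic);
 *		}
 *
 *		return (DDI_INTR_CLAIMED);
 *	}
 */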
875 
876 /*
877  * After a call to "virtio_shutdown()", the driver must retrieve any previously
878  * submitted chains and free any associated resources.
879  */
880 virtio_chain_t *
881 virtio_queue_evacuate(virtio_queue_t *viq)
882 {
883 	virtio_t *vio = viq->viq_virtio;
884 
885 	mutex_enter(&vio->vio_mutex);
886 	if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN)) {
887 		dev_err(vio->vio_dip, CE_PANIC,
888 		    "virtio_queue_evacuate() without virtio_shutdown()");
889 	}
890 	mutex_exit(&vio->vio_mutex);
891 
892 	mutex_enter(&viq->viq_mutex);
893 	VERIFY(viq->viq_shutdown);
894 
895 	virtio_chain_t *vic = avl_first(&viq->viq_inflight);
896 	if (vic != NULL) {
897 		avl_remove(&viq->viq_inflight, vic);
898 	}
899 
900 	mutex_exit(&viq->viq_mutex);
901 
902 	return (vic);
903 }
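
/*
 * Putting virtio_shutdown() and virtio_queue_evacuate() together, a client
 * driver's detach path might look roughly like this (per-chain driver
 * resources and error handling elided):
 *
 *	virtio_chain_t *vic;
 *
 *	virtio_shutdown(vio);
 *	while ((vic = virtio_queue_evacuate(viq)) != NULL) {
 *		virtio_chain_free(vic);
 *	}
 *	virtio_fini(vio, B_FALSE);
 */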
904 
905 /*
906  * VIRTQUEUE DESCRIPTOR CHAIN MANAGEMENT
907  */
908 
909 /*
910  * When the device returns a descriptor chain to the driver, it may provide the
911  * length in bytes of data written into the chain.  Client drivers should use
912  * this value with care; the specification suggests some device implementations
913  * have not always provided a useful or correct value.
914  */
915 size_t
916 virtio_chain_received_length(virtio_chain_t *vic)
917 {
918 	return (vic->vic_received_length);
919 }
920 
921 /*
922  * Allocate a descriptor chain for use with this queue.  The "kmflags" value
923  * may be KM_SLEEP or KM_NOSLEEP as per kmem_alloc(9F).
924  */
925 virtio_chain_t *
926 virtio_chain_alloc(virtio_queue_t *viq, int kmflags)
927 {
928 	virtio_t *vio = viq->viq_virtio;
929 	virtio_chain_t *vic;
930 	uint_t cap;
931 
932 	/*
933 	 * Direct descriptors are known by their index in the descriptor table
934 	 * for the queue.  We use the variable-length array member at the end
935 	 * of the chain tracking object to hold the list of direct descriptors
936 	 * assigned to this chain.
937 	 */
938 	if (viq->viq_indirect) {
939 		/*
940 		 * When using indirect descriptors we still need one direct
941 		 * descriptor entry to hold the physical address and length of
942 		 * the indirect descriptor table.
943 		 */
944 		cap = 1;
945 	} else {
946 		/*
947 		 * For direct descriptors we need to be able to track a
948 		 * descriptor for each possible segment in a single chain.
949 		 */
950 		cap = viq->viq_max_segs;
951 	}
952 
953 	size_t vicsz = sizeof (*vic) + sizeof (uint16_t) * cap;
954 	if ((vic = kmem_zalloc(vicsz, kmflags)) == NULL) {
955 		return (NULL);
956 	}
957 	vic->vic_vq = viq;
958 	vic->vic_direct_capacity = cap;
959 
960 	if (viq->viq_indirect) {
961 		/*
962 		 * Allocate an indirect descriptor list with the appropriate
963 		 * number of entries.
964 		 */
965 		if (virtio_dma_init(vio, &vic->vic_indirect_dma,
966 		    sizeof (virtio_vq_desc_t) * viq->viq_max_segs,
967 		    &virtio_dma_attr_indirect,
968 		    DDI_DMA_CONSISTENT | DDI_DMA_WRITE,
969 		    kmflags) != DDI_SUCCESS) {
970 			goto fail;
971 		}
972 
973 		/*
974 		 * Allocate a single descriptor to hold the indirect list.
975 		 * Leave the length as zero for now; it will be set to include
976 		 * any occupied entries at push time.
977 		 */
978 		mutex_enter(&viq->viq_mutex);
979 		if (virtio_chain_append_impl(vic,
980 		    virtio_dma_cookie_pa(&vic->vic_indirect_dma, 0), 0,
981 		    VIRTQ_DESC_F_INDIRECT) != DDI_SUCCESS) {
982 			mutex_exit(&viq->viq_mutex);
983 			goto fail;
984 		}
985 		mutex_exit(&viq->viq_mutex);
986 		VERIFY3U(vic->vic_direct_used, ==, 1);
987 
988 		/*
989 		 * Don't set the indirect capacity until after we've installed
990 		 * the direct descriptor which points at the indirect list, or
991 		 * virtio_chain_append_impl() will be confused.
992 		 */
993 		vic->vic_indirect_capacity = viq->viq_max_segs;
994 	}
995 
996 	return (vic);
997 
998 fail:
999 	virtio_dma_fini(&vic->vic_indirect_dma);
1000 	kmem_free(vic, vicsz);
1001 	return (NULL);
1002 }
1003 
1004 void *
1005 virtio_chain_data(virtio_chain_t *vic)
1006 {
1007 	return (vic->vic_data);
1008 }
1009 
1010 void
1011 virtio_chain_data_set(virtio_chain_t *vic, void *data)
1012 {
1013 	vic->vic_data = data;
1014 }
1015 
1016 void
1017 virtio_chain_clear(virtio_chain_t *vic)
1018 {
1019 	if (vic->vic_indirect_capacity != 0) {
1020 		/*
1021 		 * There should only be one direct descriptor, which points at
1022 		 * our indirect descriptor list.  We don't want to clear it
1023 		 * here.
1024 		 */
1025 		VERIFY3U(vic->vic_direct_capacity, ==, 1);
1026 
1027 		if (vic->vic_indirect_used > 0) {
1028 			/*
1029 			 * Clear out the indirect descriptor table.
1030 			 */
1031 			vic->vic_indirect_used = 0;
1032 			bzero(virtio_dma_va(&vic->vic_indirect_dma, 0),
1033 			    virtio_dma_size(&vic->vic_indirect_dma));
1034 		}
1035 
1036 	} else if (vic->vic_direct_capacity > 0) {
1037 		/*
1038 		 * Release any descriptors that were assigned to us previously.
1039 		 */
1040 		for (uint_t i = 0; i < vic->vic_direct_used; i++) {
1041 			id_free(vic->vic_vq->viq_descmap, vic->vic_direct[i]);
1042 			vic->vic_direct[i] = 0;
1043 		}
1044 		vic->vic_direct_used = 0;
1045 	}
1046 }
1047 
1048 void
1049 virtio_chain_free(virtio_chain_t *vic)
1050 {
1051 	/*
1052 	 * First ensure that we have released any descriptors used by this
1053 	 * chain.
1054 	 */
1055 	virtio_chain_clear(vic);
1056 
1057 	if (vic->vic_indirect_capacity > 0) {
1058 		/*
1059 		 * Release the direct descriptor that points to our indirect
1060 		 * descriptor list.
1061 		 */
1062 		VERIFY3U(vic->vic_direct_capacity, ==, 1);
1063 		id_free(vic->vic_vq->viq_descmap, vic->vic_direct[0]);
1064 
1065 		virtio_dma_fini(&vic->vic_indirect_dma);
1066 	}
1067 
1068 	size_t vicsz = sizeof (*vic) +
1069 	    vic->vic_direct_capacity * sizeof (uint16_t);
1070 
1071 	kmem_free(vic, vicsz);
1072 }
1073 
1074 static inline int
1075 virtio_queue_descmap_alloc(virtio_queue_t *viq, uint_t *indexp)
1076 {
1077 	id_t index;
1078 
1079 	if ((index = id_alloc_nosleep(viq->viq_descmap)) == -1) {
1080 		return (ENOMEM);
1081 	}
1082 
1083 	VERIFY3S(index, >=, 0);
1084 	VERIFY3S(index, <=, viq->viq_size);
1085 
1086 	*indexp = (uint_t)index;
1087 	return (0);
1088 }
1089 
1090 static int
1091 virtio_chain_append_impl(virtio_chain_t *vic, uint64_t pa, size_t len,
1092     uint16_t flags)
1093 {
1094 	virtio_queue_t *viq = vic->vic_vq;
1095 	virtio_vq_desc_t *vqd;
1096 	uint_t index;
1097 
1098 	/*
1099 	 * We're modifying the queue-wide descriptor list so make sure we have
1100 	 * the appropriate lock.
1101 	 */
1102 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
1103 
1104 	if (vic->vic_indirect_capacity != 0) {
1105 		/*
1106 		 * Use indirect descriptors.
1107 		 */
1108 		if (vic->vic_indirect_used >= vic->vic_indirect_capacity) {
1109 			return (DDI_FAILURE);
1110 		}
1111 
1112 		vqd = virtio_dma_va(&vic->vic_indirect_dma, 0);
1113 
1114 		if ((index = vic->vic_indirect_used++) > 0) {
1115 			/*
1116 			 * Chain the current last indirect descriptor to the
1117 			 * new one.
1118 			 */
1119 			vqd[index - 1].vqd_flags |= VIRTQ_DESC_F_NEXT;
1120 			vqd[index - 1].vqd_next = index;
1121 		}
1122 
1123 	} else {
1124 		/*
1125 		 * Use direct descriptors.
1126 		 */
1127 		if (vic->vic_direct_used >= vic->vic_direct_capacity) {
1128 			return (DDI_FAILURE);
1129 		}
1130 
1131 		if (virtio_queue_descmap_alloc(viq, &index) != 0) {
1132 			return (DDI_FAILURE);
1133 		}
1134 
1135 		vqd = virtio_dma_va(&viq->viq_dma, 0);
1136 
1137 		if (vic->vic_direct_used > 0) {
1138 			/*
1139 			 * This is not the first entry.  Chain the previously
1140 			 * appended descriptor to this new one.
1141 			 */
1142 			uint16_t p = vic->vic_direct[vic->vic_direct_used - 1];
1143 
1144 			vqd[p].vqd_flags |= VIRTQ_DESC_F_NEXT;
1145 			vqd[p].vqd_next = index;
1146 		}
1147 		vic->vic_direct[vic->vic_direct_used++] = index;
1148 	}
1149 
1150 	vqd[index].vqd_addr = pa;
1151 	vqd[index].vqd_len = len;
1152 	vqd[index].vqd_flags = flags;
1153 	vqd[index].vqd_next = 0;
1154 
1155 	return (DDI_SUCCESS);
1156 }
1157 
1158 int
1159 virtio_chain_append(virtio_chain_t *vic, uint64_t pa, size_t len,
1160     virtio_direction_t dir)
1161 {
1162 	virtio_queue_t *viq = vic->vic_vq;
1163 	uint16_t flags = 0;
1164 
1165 	switch (dir) {
1166 	case VIRTIO_DIR_DEVICE_WRITES:
1167 		flags |= VIRTQ_DESC_F_WRITE;
1168 		break;
1169 
1170 	case VIRTIO_DIR_DEVICE_READS:
1171 		break;
1172 
1173 	default:
1174 		panic("unknown direction value %u", dir);
1175 	}
1176 
1177 	mutex_enter(&viq->viq_mutex);
1178 	int r = virtio_chain_append_impl(vic, pa, len, flags);
1179 	mutex_exit(&viq->viq_mutex);
1180 
1181 	return (r);
1182 }
1183 
1184 static void
1185 virtio_queue_flush_locked(virtio_queue_t *viq)
1186 {
1187 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
1188 
1189 	/*
1190 	 * Make sure any writes we have just made to the descriptors
1191 	 * (vqdr_ring[]) are visible to the device before we update the ring
1192 	 * pointer (vqdr_index).
1193 	 */
1194 	membar_producer();
1195 	viq->viq_dma_driver->vqdr_index = viq->viq_driver_index;
1196 	VIRTQ_DMA_SYNC_FORDEV(viq);
1197 
1198 	/*
1199 	 * Determine whether the device expects us to notify it of new
1200 	 * descriptors.
1201 	 */
1202 	VIRTQ_DMA_SYNC_FORKERNEL(viq);
1203 	if (!(viq->viq_dma_device->vqde_flags & VIRTQ_USED_F_NO_NOTIFY)) {
1204 		virtio_put16(viq->viq_virtio, VIRTIO_LEGACY_QUEUE_NOTIFY,
1205 		    viq->viq_index);
1206 	}
1207 }
1208 
1209 void
1210 virtio_queue_flush(virtio_queue_t *viq)
1211 {
1212 	mutex_enter(&viq->viq_mutex);
1213 	virtio_queue_flush_locked(viq);
1214 	mutex_exit(&viq->viq_mutex);
1215 }
1216 
1217 void
1218 virtio_chain_submit(virtio_chain_t *vic, boolean_t flush)
1219 {
1220 	virtio_queue_t *viq = vic->vic_vq;
1221 
1222 	mutex_enter(&viq->viq_mutex);
1223 
1224 	if (vic->vic_indirect_capacity != 0) {
1225 		virtio_vq_desc_t *vqd = virtio_dma_va(&viq->viq_dma, 0);
1226 
1227 		VERIFY3U(vic->vic_direct_used, ==, 1);
1228 
1229 		/*
1230 		 * This is an indirect descriptor queue.  The length in bytes
1231 		 * of the descriptor must extend to cover the populated
1232 		 * indirect descriptor entries.
1233 		 */
1234 		vqd[vic->vic_direct[0]].vqd_len =
1235 		    sizeof (virtio_vq_desc_t) * vic->vic_indirect_used;
1236 
1237 		virtio_dma_sync(&vic->vic_indirect_dma, DDI_DMA_SYNC_FORDEV);
1238 	}
1239 
1240 	/*
1241 	 * Populate the next available slot in the driver-owned ring for this
1242 	 * chain.  The updated value of viq_driver_index will not be visible
1243 	 * to the device until a subsequent queue flush.
1244 	 */
1245 	uint16_t index = (viq->viq_driver_index++) % viq->viq_size;
1246 	viq->viq_dma_driver->vqdr_ring[index] = vic->vic_direct[0];
1247 
1248 	vic->vic_head = vic->vic_direct[0];
1249 	avl_add(&viq->viq_inflight, vic);
1250 
1251 	if (flush) {
1252 		virtio_queue_flush_locked(vic->vic_vq);
1253 	}
1254 
1255 	mutex_exit(&viq->viq_mutex);
1256 }
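
/*
 * A typical submission path strings the chain routines together.  A minimal
 * sketch, assuming the driver has already established a DMA-accessible
 * buffer and knows its physical address and length (names hypothetical):
 *
 *	virtio_chain_t *vic;
 *
 *	if ((vic = virtio_chain_alloc(viq, KM_NOSLEEP)) == NULL)
 *		return (ENOMEM);
 *	virtio_chain_data_set(vic, req);
 *
 *	if (virtio_chain_append(vic, buffer_pa, buffer_len,
 *	    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
 *		virtio_chain_free(vic);
 *		return (ENOMEM);
 *	}
 *
 *	virtio_chain_submit(vic, B_TRUE);
 */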
1257 
1258 /*
1259  * INTERRUPTS MANAGEMENT
1260  */
1261 
1262 static const char *
1263 virtio_interrupt_type_name(int type)
1264 {
1265 	switch (type) {
1266 	case DDI_INTR_TYPE_MSIX:
1267 		return ("MSI-X");
1268 	case DDI_INTR_TYPE_MSI:
1269 		return ("MSI");
1270 	case DDI_INTR_TYPE_FIXED:
1271 		return ("fixed");
1272 	default:
1273 		return ("?");
1274 	}
1275 }
1276 
1277 static int
1278 virtio_interrupts_alloc(virtio_t *vio, int type, int nrequired)
1279 {
1280 	dev_info_t *dip = vio->vio_dip;
1281 	int nintrs = 0;
1282 	int navail = 0;
1283 
1284 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1285 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC));
1286 
1287 	if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) {
1288 		dev_err(dip, CE_WARN, "could not count %s interrupts",
1289 		    virtio_interrupt_type_name(type));
1290 		return (DDI_FAILURE);
1291 	}
1292 	if (nintrs < 1) {
1293 		dev_err(dip, CE_WARN, "no %s interrupts supported",
1294 		    virtio_interrupt_type_name(type));
1295 		return (DDI_FAILURE);
1296 	}
1297 
1298 	if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) {
1299 		dev_err(dip, CE_WARN, "could not count available %s interrupts",
1300 		    virtio_interrupt_type_name(type));
1301 		return (DDI_FAILURE);
1302 	}
1303 	if (navail < nrequired) {
1304 		dev_err(dip, CE_WARN, "need %d %s interrupts, but only %d "
1305 		    "available", nrequired, virtio_interrupt_type_name(type),
1306 		    navail);
1307 		return (DDI_FAILURE);
1308 	}
1309 
1310 	VERIFY3P(vio->vio_interrupts, ==, NULL);
1311 	vio->vio_interrupts = kmem_zalloc(
1312 	    sizeof (ddi_intr_handle_t) * nrequired, KM_SLEEP);
1313 
1314 	int r;
1315 	if ((r = ddi_intr_alloc(dip, vio->vio_interrupts, type, 0, nrequired,
1316 	    &vio->vio_ninterrupts, DDI_INTR_ALLOC_STRICT)) != DDI_SUCCESS) {
1317 		dev_err(dip, CE_WARN, "%s interrupt allocation failure (%d)",
1318 		    virtio_interrupt_type_name(type), r);
1319 		kmem_free(vio->vio_interrupts,
1320 		    sizeof (ddi_intr_handle_t) * nrequired);
1321 		vio->vio_interrupts = NULL;
1322 		return (DDI_FAILURE);
1323 	}
1324 
1325 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ALLOC;
1326 	vio->vio_interrupt_type = type;
1327 	return (DDI_SUCCESS);
1328 }
1329 
1330 static uint_t
1331 virtio_shared_isr(caddr_t arg0, caddr_t arg1)
1332 {
1333 	virtio_t *vio = (virtio_t *)arg0;
1334 	uint_t r = DDI_INTR_UNCLAIMED;
1335 	uint8_t isr;
1336 
1337 	mutex_enter(&vio->vio_mutex);
1338 
1339 	/*
1340 	 * Check the ISR status to see if the interrupt applies to us.  Reading
1341 	 * this field resets it to zero.
1342 	 */
1343 	isr = virtio_get8(vio, VIRTIO_LEGACY_ISR_STATUS);
1344 	if ((isr & VIRTIO_ISR_CHECK_QUEUES) == 0) {
1345 		goto done;
1346 	}
1347 
1348 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
1349 	    viq = list_next(&vio->vio_queues, viq)) {
1350 		if (viq->viq_func != NULL) {
1351 			mutex_exit(&vio->vio_mutex);
1352 			if (viq->viq_func(viq->viq_funcarg, arg0) ==
1353 			    DDI_INTR_CLAIMED) {
1354 				r = DDI_INTR_CLAIMED;
1355 			}
1356 			mutex_enter(&vio->vio_mutex);
1357 
1358 			if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
1359 				/*
1360 				 * The device was shut down while in a queue
1361 				 * handler routine.
1362 				 */
1363 				goto done;
1364 			}
1365 		}
1366 	}
1367 
1368 done:
1369 	mutex_exit(&vio->vio_mutex);
1370 	return (r);
1371 }
1372 
1373 static int
1374 virtio_interrupts_setup(virtio_t *vio, int allow_types)
1375 {
1376 	dev_info_t *dip = vio->vio_dip;
1377 	int types;
1378 	int count = 0;
1379 
1380 	mutex_enter(&vio->vio_mutex);
1381 
1382 	/*
1383 	 * Determine the number of interrupts we'd like based on the number of
1384 	 * virtqueues.
1385 	 */
1386 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
1387 	    viq = list_next(&vio->vio_queues, viq)) {
1388 		if (viq->viq_func != NULL) {
1389 			count++;
1390 		}
1391 	}
1392 
1393 	if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) {
1394 		dev_err(dip, CE_WARN, "could not get supported interrupts");
1395 		mutex_exit(&vio->vio_mutex);
1396 		return (DDI_FAILURE);
1397 	}
1398 
1399 	if (allow_types != 0) {
1400 		/*
1401 		 * Restrict the possible interrupt types at the request of the
1402 		 * driver.
1403 		 */
1404 		types &= allow_types;
1405 	}
1406 
1407 	/*
1408 	 * Try each potential interrupt type in descending order of preference.
1409 	 * Note that the specification does not appear to allow for the use of
1410 	 * classical MSI, so we are limited to either MSI-X or fixed
1411 	 * interrupts.
1412 	 */
1413 	if (types & DDI_INTR_TYPE_MSIX) {
1414 		if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_MSIX,
1415 		    count) == DDI_SUCCESS) {
1416 			goto add_handlers;
1417 		}
1418 	}
1419 	if (types & DDI_INTR_TYPE_FIXED) {
1420 		/*
1421 		 * If fixed interrupts are all that are available, we'll just
1422 		 * ask for one.
1423 		 */
1424 		if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_FIXED, 1) ==
1425 		    DDI_SUCCESS) {
1426 			goto add_handlers;
1427 		}
1428 	}
1429 
1430 	dev_err(dip, CE_WARN, "interrupt allocation failed");
1431 	mutex_exit(&vio->vio_mutex);
1432 	return (DDI_FAILURE);
1433 
1434 add_handlers:
1435 	/*
1436 	 * Ensure that we have not been given any high-level interrupts as our
1437 	 * interrupt handlers do not support them.
1438 	 */
1439 	for (int i = 0; i < vio->vio_ninterrupts; i++) {
1440 		uint_t ipri;
1441 
1442 		if (ddi_intr_get_pri(vio->vio_interrupts[i], &ipri) !=
1443 		    DDI_SUCCESS) {
1444 			dev_err(dip, CE_WARN, "could not determine interrupt "
1445 			    "priority");
1446 			goto fail;
1447 		}
1448 
1449 		if (ipri >= ddi_intr_get_hilevel_pri()) {
1450 			dev_err(dip, CE_WARN, "high level interrupts not "
1451 			    "supported");
1452 			goto fail;
1453 		}
1454 
1455 		/*
1456 		 * Record the highest priority we've been allocated for use in
1457 		 * mutex initialisation.
1458 		 */
1459 		if (i == 0 || ipri > vio->vio_interrupt_priority) {
1460 			vio->vio_interrupt_priority = ipri;
1461 		}
1462 	}
1463 
1464 	/*
1465 	 * Get the interrupt capabilities from the first handle to determine
1466 	 * whether we need to use ddi_intr_block_enable(9F).
1467 	 */
1468 	if (ddi_intr_get_cap(vio->vio_interrupts[0],
1469 	    &vio->vio_interrupt_cap) != DDI_SUCCESS) {
1470 		dev_err(dip, CE_WARN, "failed to get interrupt capabilities");
1471 		goto fail;
1472 	}
1473 
1474 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
1475 		VERIFY3S(vio->vio_ninterrupts, ==, 1);
1476 		/*
1477 		 * For fixed interrupts, we need to use our shared handler to
1478 		 * multiplex the per-queue handlers provided by the driver.
1479 		 */
1480 		if (ddi_intr_add_handler(vio->vio_interrupts[0],
1481 		    virtio_shared_isr, (caddr_t)vio, NULL) != DDI_SUCCESS) {
1482 			dev_err(dip, CE_WARN, "adding shared %s interrupt "
1483 			    "handler failed", virtio_interrupt_type_name(
1484 			    vio->vio_interrupt_type));
1485 			goto fail;
1486 		}
1487 
1488 		goto done;
1489 	}
1490 
1491 	VERIFY3S(vio->vio_ninterrupts, ==, count);
1492 
1493 	uint_t n = 0;
1494 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
1495 	    viq = list_next(&vio->vio_queues, viq)) {
1496 		if (viq->viq_func == NULL) {
1497 			continue;
1498 		}
1499 
1500 		if (ddi_intr_add_handler(vio->vio_interrupts[n],
1501 		    viq->viq_func, (caddr_t)viq->viq_funcarg,
1502 		    (caddr_t)vio) != DDI_SUCCESS) {
1503 			dev_err(dip, CE_WARN, "adding interrupt %u (%s) failed",
1504 			    n, viq->viq_name);
1505 			goto fail;
1506 		}
1507 
1508 		viq->viq_handler_index = n;
1509 		viq->viq_handler_added = B_TRUE;
1510 		n++;
1511 	}
1512 
1513 done:
1514 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ADDED;
1515 	mutex_exit(&vio->vio_mutex);
1516 	return (DDI_SUCCESS);
1517 
1518 fail:
1519 	virtio_interrupts_teardown(vio);
1520 	mutex_exit(&vio->vio_mutex);
1521 	return (DDI_FAILURE);
1522 }
1523 
1524 static void
1525 virtio_interrupts_teardown(virtio_t *vio)
1526 {
1527 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1528 
1529 	virtio_interrupts_disable_locked(vio);
1530 
1531 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
1532 		/*
1533 		 * Remove the multiplexing interrupt handler.
1534 		 */
1535 		if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED) {
1536 			int r;
1537 
1538 			VERIFY3S(vio->vio_ninterrupts, ==, 1);
1539 
1540 			if ((r = ddi_intr_remove_handler(
1541 			    vio->vio_interrupts[0])) != DDI_SUCCESS) {
1542 				dev_err(vio->vio_dip, CE_WARN, "removing "
1543 				    "shared interrupt handler failed (%d)", r);
1544 			}
1545 		}
1546 	} else {
1547 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1548 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1549 			int r;
1550 
1551 			if (!viq->viq_handler_added) {
1552 				continue;
1553 			}
1554 
1555 			if ((r = ddi_intr_remove_handler(
1556 			    vio->vio_interrupts[viq->viq_handler_index])) !=
1557 			    DDI_SUCCESS) {
1558 				dev_err(vio->vio_dip, CE_WARN, "removing "
1559 				    "interrupt handler (%s) failed (%d)",
1560 				    viq->viq_name, r);
1561 			}
1562 
1563 			viq->viq_handler_added = B_FALSE;
1564 		}
1565 	}
1566 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ADDED;
1567 
1568 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC) {
1569 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1570 			int r;
1571 
1572 			if ((r = ddi_intr_free(vio->vio_interrupts[i])) !=
1573 			    DDI_SUCCESS) {
1574 				dev_err(vio->vio_dip, CE_WARN, "freeing "
1575 				    "interrupt %u failed (%d)", i, r);
1576 			}
1577 		}
1578 		kmem_free(vio->vio_interrupts,
1579 		    sizeof (ddi_intr_handle_t) * vio->vio_ninterrupts);
1580 		vio->vio_interrupts = NULL;
1581 		vio->vio_ninterrupts = 0;
1582 		vio->vio_interrupt_type = 0;
1583 		vio->vio_interrupt_cap = 0;
1584 		vio->vio_interrupt_priority = 0;
1585 
1586 		vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ALLOC;
1587 	}
1588 }
1589 
1590 static void
1591 virtio_interrupts_unwind(virtio_t *vio)
1592 {
1593 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1594 
1595 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
1596 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1597 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1598 			if (!viq->viq_handler_added) {
1599 				continue;
1600 			}
1601 
1602 			virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT,
1603 			    viq->viq_index);
1604 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE,
1605 			    VIRTIO_LEGACY_MSI_NO_VECTOR);
1606 		}
1607 	}
1608 
1609 	if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
1610 		(void) ddi_intr_block_disable(vio->vio_interrupts,
1611 		    vio->vio_ninterrupts);
1612 	} else {
1613 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1614 			(void) ddi_intr_disable(vio->vio_interrupts[i]);
1615 		}
1616 	}
1617 
1618 	/*
1619 	 * Disabling the interrupts makes the MSI-X fields disappear from the
1620 	 * BAR once more.
1621 	 */
1622 	vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
1623 }
1624 
1625 int
1626 virtio_interrupts_enable(virtio_t *vio)
1627 {
1628 	mutex_enter(&vio->vio_mutex);
1629 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED) {
1630 		mutex_exit(&vio->vio_mutex);
1631 		return (DDI_SUCCESS);
1632 	}
1633 
1634 	int r = DDI_SUCCESS;
1635 	if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
1636 		r = ddi_intr_block_enable(vio->vio_interrupts,
1637 		    vio->vio_ninterrupts);
1638 	} else {
1639 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1640 			if ((r = ddi_intr_enable(vio->vio_interrupts[i])) !=
1641 			    DDI_SUCCESS) {
1642 				/*
1643 				 * Disable the interrupts we have enabled so
1644 				 * far.
1645 				 */
1646 				for (i--; i >= 0; i--) {
1647 					(void) ddi_intr_disable(
1648 					    vio->vio_interrupts[i]);
1649 				}
1650 				break;
1651 			}
1652 		}
1653 	}
1654 
1655 	if (r != DDI_SUCCESS) {
1656 		mutex_exit(&vio->vio_mutex);
1657 		return (r);
1658 	}
1659 
1660 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
1661 		/*
1662 		 * When asked to enable the interrupts, the system enables
1663 		 * MSI-X in the PCI configuration for the device.  While
1664 		 * enabled, the extra MSI-X configuration table fields appear
1665 		 * between the general and the device-specific regions of the
1666 		 * BAR.
1667 		 */
1668 		vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET_MSIX;
1669 
1670 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1671 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1672 			if (!viq->viq_handler_added) {
1673 				continue;
1674 			}
1675 
1676 			uint16_t qi = viq->viq_index;
1677 			uint16_t msi = viq->viq_handler_index;
1678 
1679 			/*
1680 			 * Route interrupts for this queue to the assigned
1681 			 * MSI-X vector number.
1682 			 */
1683 			virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qi);
1684 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE, msi);
1685 
1686 			/*
1687 			 * The device may not actually accept the vector number
1688 			 * we're attempting to program.  We need to confirm
1689 			 * that configuration was successful by re-reading the
1690 			 * configuration we just wrote.
1691 			 */
1692 			if (virtio_get16(vio, VIRTIO_LEGACY_MSIX_QUEUE) !=
1693 			    msi) {
1694 				dev_err(vio->vio_dip, CE_WARN,
1695 				    "failed to configure MSI-X vector %u for "
1696 				    "queue \"%s\" (#%u)", (uint_t)msi,
1697 				    viq->viq_name, (uint_t)qi);
1698 
1699 				virtio_interrupts_unwind(vio);
1700 				mutex_exit(&vio->vio_mutex);
1701 				return (DDI_FAILURE);
1702 			}
1703 		}
1704 	}
1705 
1706 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ENABLED;
1707 
1708 	mutex_exit(&vio->vio_mutex);
1709 	return (DDI_SUCCESS);
1710 }
1711 
1712 static void
1713 virtio_interrupts_disable_locked(virtio_t *vio)
1714 {
1715 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1716 
1717 	if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED)) {
1718 		return;
1719 	}
1720 
1721 	virtio_interrupts_unwind(vio);
1722 
1723 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ENABLED;
1724 }
1725 
1726 void
1727 virtio_interrupts_disable(virtio_t *vio)
1728 {
1729 	mutex_enter(&vio->vio_mutex);
1730 	virtio_interrupts_disable_locked(vio);
1731 	mutex_exit(&vio->vio_mutex);
1732 }
1733