xref: /illumos-gate/usr/src/uts/intel/io/vmm/intel/vtd.c (revision 32640292)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 /*
29  * This file and its contents are supplied under the terms of the
30  * Common Development and Distribution License ("CDDL"), version 1.0.
31  * You may only use this file in accordance with the terms of version
32  * 1.0 of the CDDL.
33  *
34  * A full copy of the text of the CDDL should have accompanied this
35  * source.  A copy of the CDDL is also available via the Internet at
36  * http://www.illumos.org/license/CDDL.
37  *
38  * Copyright 2018 Joyent, Inc.
39  * Copyright 2022 Oxide Computer Company
40  */
41 
42 #include <sys/cdefs.h>
43 
44 #include <sys/param.h>
45 #include <sys/kernel.h>
46 #include <sys/systm.h>
47 #include <sys/kmem.h>
48 
49 #include <dev/pci/pcireg.h>
50 
51 #include <machine/vmparam.h>
52 #include <sys/vmm_vm.h>
53 
54 #include <contrib/dev/acpica/include/acpi.h>
55 
56 #include <sys/sunndi.h>
57 
58 #include "io/iommu.h"
59 
60 /*
61  * Documented in the "Intel Virtualization Technology for Directed I/O",
62  * Architecture Spec, September 2008.
63  */
64 
65 #define	VTD_DRHD_INCLUDE_PCI_ALL(Flags)  (((Flags) >> 0) & 0x1)
66 
67 /* Section 10.4 "Register Descriptions" */
struct vtdmap {
	/* Register layout per Intel VT-d spec, section 10.4. */
	volatile uint32_t	version;	/* architecture version */
	volatile uint32_t	res0;		/* reserved */
	volatile uint64_t	cap;		/* capability register */
	volatile uint64_t	ext_cap;	/* extended capability register */
	volatile uint32_t	gcr;		/* global command register */
	volatile uint32_t	gsr;		/* global status register */
	volatile uint64_t	rta;		/* root table address register */
	volatile uint64_t	ccr;		/* context command register */
};
78 
79 #define	VTD_CAP_SAGAW(cap)	(((cap) >> 8) & 0x1F)
80 #define	VTD_CAP_ND(cap)		((cap) & 0x7)
81 #define	VTD_CAP_CM(cap)		(((cap) >> 7) & 0x1)
82 #define	VTD_CAP_SPS(cap)	(((cap) >> 34) & 0xF)
83 #define	VTD_CAP_RWBF(cap)	(((cap) >> 4) & 0x1)
84 
85 #define	VTD_ECAP_DI(ecap)	(((ecap) >> 2) & 0x1)
86 #define	VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
87 #define	VTD_ECAP_IRO(ecap)	(((ecap) >> 8) & 0x3FF)
88 
89 #define	VTD_GCR_WBF		(1 << 27)
90 #define	VTD_GCR_SRTP		(1 << 30)
91 #define	VTD_GCR_TE		(1U << 31)
92 
93 #define	VTD_GSR_WBFS		(1 << 27)
94 #define	VTD_GSR_RTPS		(1 << 30)
95 #define	VTD_GSR_TES		(1U << 31)
96 
97 #define	VTD_CCR_ICC		(1UL << 63)	/* invalidate context cache */
98 #define	VTD_CCR_CIRG_GLOBAL	(1UL << 61)	/* global invalidation */
99 
100 #define	VTD_IIR_IVT		(1UL << 63)	/* invalidation IOTLB */
101 #define	VTD_IIR_IIRG_GLOBAL	(1ULL << 60)	/* global IOTLB invalidation */
102 #define	VTD_IIR_IIRG_DOMAIN	(2ULL << 60)	/* domain IOTLB invalidation */
103 #define	VTD_IIR_IIRG_PAGE	(3ULL << 60)	/* page IOTLB invalidation */
104 #define	VTD_IIR_DRAIN_READS	(1ULL << 49)	/* drain pending DMA reads */
105 #define	VTD_IIR_DRAIN_WRITES	(1ULL << 48)	/* drain pending DMA writes */
106 #define	VTD_IIR_DOMAIN_P	32
107 
108 #define	VTD_ROOT_PRESENT	0x1
109 #define	VTD_CTX_PRESENT		0x1
110 #define	VTD_CTX_TT_ALL		(1UL << 2)
111 
112 #define	VTD_PTE_RD		(1UL << 0)
113 #define	VTD_PTE_WR		(1UL << 1)
114 #define	VTD_PTE_SUPERPAGE	(1UL << 7)
115 #define	VTD_PTE_ADDR_M		(0x000FFFFFFFFFF000UL)
116 
117 #define	VTD_RID2IDX(rid)	(((rid) & 0xff) * 2)
118 
/*
 * A DMA remapping domain: one second-level page-table hierarchy plus the
 * identifiers needed to reference it from context entries.
 */
struct domain {
	uint64_t	*ptp;		/* first level page table page */
	int		pt_levels;	/* number of page table levels */
	int		addrwidth;	/* 'AW' field in context entry */
	int		spsmask;	/* supported super page sizes */
	uint_t		id;		/* domain id */
	vm_paddr_t	maxaddr;	/* highest address to be mapped */
	SLIST_ENTRY(domain) next;	/* linkage on global 'domhead' list */
};
128 
129 static SLIST_HEAD(, domain) domhead;
130 
131 #define	DRHD_MAX_UNITS	16
132 static ACPI_DMAR_HARDWARE_UNIT	*drhds[DRHD_MAX_UNITS];
133 static int			drhd_num;
134 static struct vtdmap		*vtdmaps[DRHD_MAX_UNITS];
135 static int			max_domains;
136 typedef int			(*drhd_ident_func_t)(void);
137 static dev_info_t		*vtddips[DRHD_MAX_UNITS];
138 
139 static uint64_t root_table[PAGE_SIZE / sizeof (uint64_t)] __aligned(4096);
140 static uint64_t ctx_tables[256][PAGE_SIZE / sizeof (uint64_t)] __aligned(4096);
141 
142 static int
vtd_max_domains(struct vtdmap * vtdmap)143 vtd_max_domains(struct vtdmap *vtdmap)
144 {
145 	int nd;
146 
147 	nd = VTD_CAP_ND(vtdmap->cap);
148 
149 	switch (nd) {
150 	case 0:
151 		return (16);
152 	case 1:
153 		return (64);
154 	case 2:
155 		return (256);
156 	case 3:
157 		return (1024);
158 	case 4:
159 		return (4 * 1024);
160 	case 5:
161 		return (16 * 1024);
162 	case 6:
163 		return (64 * 1024);
164 	default:
165 		panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
166 	}
167 }
168 
169 static uint_t
domain_id(void)170 domain_id(void)
171 {
172 	uint_t id;
173 	struct domain *dom;
174 
175 	/* Skip domain id 0 - it is reserved when Caching Mode field is set */
176 	for (id = 1; id < max_domains; id++) {
177 		SLIST_FOREACH(dom, &domhead, next) {
178 			if (dom->id == id)
179 				break;
180 		}
181 		if (dom == NULL)
182 			break;		/* found it */
183 	}
184 
185 	if (id >= max_domains)
186 		panic("domain ids exhausted");
187 
188 	return (id);
189 }
190 
/*
 * Return the DMA remapping unit whose device scope covers the PCI
 * requester id 'rid', or NULL if no unit claims the device. An
 * INCLUDE_PCI_ALL unit matches everything not claimed earlier.
 */
static struct vtdmap *
vtd_device_scope(uint16_t rid)
{
	int i, remaining, pathrem;
	char *end, *pathend;
	struct vtdmap *vtdmap;
	ACPI_DMAR_HARDWARE_UNIT *drhd;
	ACPI_DMAR_DEVICE_SCOPE *device_scope;
	ACPI_DMAR_PCI_PATH *path;

	for (i = 0; i < drhd_num; i++) {
		drhd = drhds[i];

		if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) {
			/*
			 * From Intel VT-d arch spec, version 3.0:
			 * If a DRHD structure with INCLUDE_PCI_ALL flag Set is
			 * reported for a Segment, it must be enumerated by BIOS
			 * after all other DRHD structures for the same Segment.
			 */
			vtdmap = vtdmaps[i];
			return (vtdmap);
		}

		/* Walk the variable-length device scope entries. */
		end = (char *)drhd + drhd->Header.Length;
		remaining = drhd->Header.Length -
		    sizeof (ACPI_DMAR_HARDWARE_UNIT);
		while (remaining > sizeof (ACPI_DMAR_DEVICE_SCOPE)) {
			device_scope =
			    (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining);
			remaining -= device_scope->Length;

			switch (device_scope->EntryType) {
				/* 0x01 and 0x02 are PCI device entries */
				case 0x01:
				case 0x02:
					break;
				default:
					continue;
			}

			if (PCI_RID2BUS(rid) != device_scope->Bus)
				continue;

			/* Match the trailing PCI dev/func path entries. */
			pathend = (char *)device_scope + device_scope->Length;
			pathrem = device_scope->Length -
			    sizeof (ACPI_DMAR_DEVICE_SCOPE);
			while (pathrem >= sizeof (ACPI_DMAR_PCI_PATH)) {
				path = (ACPI_DMAR_PCI_PATH *)
				    (pathend - pathrem);
				pathrem -= sizeof (ACPI_DMAR_PCI_PATH);

				if (PCI_RID2SLOT(rid) != path->Device)
					continue;
				if (PCI_RID2FUNC(rid) != path->Function)
					continue;

				vtdmap = vtdmaps[i];
				return (vtdmap);
			}
		}
	}

	/* No matching scope */
	return (NULL);
}
257 
258 static void
vtd_wbflush(struct vtdmap * vtdmap)259 vtd_wbflush(struct vtdmap *vtdmap)
260 {
261 
262 	if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
263 		invalidate_cache_all();
264 
265 	if (VTD_CAP_RWBF(vtdmap->cap)) {
266 		vtdmap->gcr = VTD_GCR_WBF;
267 		while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
268 			;
269 	}
270 }
271 
/*
 * Issue a global context-cache invalidation on 'vtdmap' and spin until
 * the hardware clears the ICC bit, indicating completion.
 */
static void
vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
{

	vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
	while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
		;
}
280 
281 static void
vtd_iotlb_global_invalidate(struct vtdmap * vtdmap)282 vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
283 {
284 	int offset;
285 	volatile uint64_t *iotlb_reg, val;
286 
287 	vtd_wbflush(vtdmap);
288 
289 	offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
290 	iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
291 
292 	*iotlb_reg =  VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
293 	    VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
294 
295 	while (1) {
296 		val = *iotlb_reg;
297 		if ((val & VTD_IIR_IVT) == 0)
298 			break;
299 	}
300 }
301 
/*
 * Enable DMA translation on 'vtdmap' and spin until the global status
 * register reports translation enabled (TES).
 */
static void
vtd_translation_enable(struct vtdmap *vtdmap)
{

	vtdmap->gcr = VTD_GCR_TE;
	while ((vtdmap->gsr & VTD_GSR_TES) == 0)
		;
}
310 
/*
 * Disable DMA translation on 'vtdmap' and spin until the global status
 * register reports translation disabled (TES clear).
 */
static void
vtd_translation_disable(struct vtdmap *vtdmap)
{

	vtdmap->gcr = 0;
	while ((vtdmap->gsr & VTD_GSR_TES) != 0)
		;
}
319 
/*
 * Map the MMIO register block of the remapping unit described by 'dip'.
 * The access handle is stashed in the devinfo's driver-private slot so
 * vtd_unmap() can release it later. Returns the mapped virtual address,
 * or NULL on failure.
 */
static void *
vtd_map(dev_info_t *dip)
{
	caddr_t regs;
	ddi_acc_handle_t hdl;
	int error;

	/* Strictly-ordered, never-swapped access for device registers. */
	static ddi_device_acc_attr_t regs_attr = {
		DDI_DEVICE_ATTR_V0,
		DDI_NEVERSWAP_ACC,
		DDI_STRICTORDER_ACC,
	};

	error = ddi_regs_map_setup(dip, 0, &regs, 0, PAGE_SIZE, &regs_attr,
	    &hdl);

	if (error != DDI_SUCCESS)
		return (NULL);

	ddi_set_driver_private(dip, hdl);

	return (regs);
}
343 
344 static void
vtd_unmap(dev_info_t * dip)345 vtd_unmap(dev_info_t *dip)
346 {
347 	ddi_acc_handle_t hdl = ddi_get_driver_private(dip);
348 
349 	if (hdl != NULL)
350 		ddi_regs_map_free(&hdl);
351 }
352 
/*
 * Find or construct a devinfo node for remapping unit 'unit', whose
 * register base comes from the DRHD structure. The node carries a 'reg'
 * property and parent-private regspec so the root nexus can map the
 * registers and allocate DMA handles on its behalf.
 */
static dev_info_t *
vtd_get_dip(ACPI_DMAR_HARDWARE_UNIT *drhd, int unit)
{
	dev_info_t *dip;
	struct ddi_parent_private_data *pdptr;
	struct regspec reg;

	/*
	 * Try to find an existing devinfo node for this vtd unit.
	 */
	ndi_devi_enter(ddi_root_node());
	dip = ddi_find_devinfo("vtd", unit, 0);
	ndi_devi_exit(ddi_root_node());

	if (dip != NULL)
		return (dip);

	/*
	 * None found, construct a devinfo node for this vtd unit.
	 */
	dip = ddi_add_child(ddi_root_node(), "vtd",
	    DEVI_SID_NODEID, unit);

	reg.regspec_bustype = 0;
	reg.regspec_addr = drhd->Address;
	reg.regspec_size = PAGE_SIZE;

	/*
	 * update the reg properties
	 *
	 *   reg property will be used for register
	 *   set access
	 *
	 * refer to the bus_map of root nexus driver
	 * I/O or memory mapping:
	 *
	 * <bustype=0, addr=x, len=x>: memory
	 * <bustype=1, addr=x, len=x>: i/o
	 * <bustype>1, addr=0, len=x>: x86-compatibility i/o
	 */
	(void) ndi_prop_update_int_array(DDI_DEV_T_NONE,
	    dip, "reg", (int *)&reg,
	    sizeof (struct regspec) / sizeof (int));

	/*
	 * This is an artificially constructed dev_info, and we
	 * need to set a few more things to be able to use it
	 * for ddi_dma_alloc_handle/free_handle.
	 */
	/* NOTE(review): delegates DMA handle allocation to the root nexus */
	ddi_set_driver(dip, ddi_get_driver(ddi_root_node()));
	DEVI(dip)->devi_bus_dma_allochdl =
	    DEVI(ddi_get_driver((ddi_root_node())));

	pdptr = kmem_zalloc(sizeof (struct ddi_parent_private_data)
	    + sizeof (struct regspec), KM_SLEEP);
	pdptr->par_nreg = 1;
	pdptr->par_reg = (struct regspec *)(pdptr + 1);
	pdptr->par_reg->regspec_bustype = 0;
	pdptr->par_reg->regspec_addr = drhd->Address;
	pdptr->par_reg->regspec_size = PAGE_SIZE;
	ddi_set_parent_data(dip, pdptr);

	return (dip);
}
417 
/*
 * Discover the DMA remapping units from the ACPI DMAR table (or, on
 * FreeBSD, optionally from loader tunables), map their registers, size
 * the shared domain-id space and point the root table at the 256
 * statically allocated context tables. Returns 0 on success or ENXIO
 * when no usable remapping hardware is found.
 */
static int
vtd_init(void)
{
	int i, units, remaining, tmp;
	struct vtdmap *vtdmap;
	vm_paddr_t ctx_paddr;
	char *end;
#ifdef __FreeBSD__
	char envname[32];
	unsigned long mapaddr;
#endif
	ACPI_STATUS status;
	ACPI_TABLE_DMAR *dmar;
	ACPI_DMAR_HEADER *hdr;
	ACPI_DMAR_HARDWARE_UNIT *drhd;

#ifdef __FreeBSD__
	/*
	 * Allow the user to override the ACPI DMAR table by specifying the
	 * physical address of each remapping unit.
	 *
	 * The following example specifies two remapping units at
	 * physical addresses 0xfed90000 and 0xfeda0000 respectively.
	 * set vtd.regmap.0.addr=0xfed90000
	 * set vtd.regmap.1.addr=0xfeda0000
	 */
	for (units = 0; units < DRHD_MAX_UNITS; units++) {
		snprintf(envname, sizeof (envname), "vtd.regmap.%d.addr",
		    units);
		if (getenv_ulong(envname, &mapaddr) == 0)
			break;
		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr);
	}

	if (units > 0)
		goto skip_dmar;
#else
	units = 0;
#endif
	/* Search for DMAR table. */
	status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar);
	if (ACPI_FAILURE(status))
		return (ENXIO);

	end = (char *)dmar + dmar->Header.Length;
	remaining = dmar->Header.Length - sizeof (ACPI_TABLE_DMAR);
	while (remaining > sizeof (ACPI_DMAR_HEADER)) {
		hdr = (ACPI_DMAR_HEADER *)(end - remaining);
		if (hdr->Length > remaining)
			break;
		/*
		 * From Intel VT-d arch spec, version 1.3:
		 * BIOS implementations must report mapping structures
		 * in numerical order, i.e. All remapping structures of
		 * type 0 (DRHD) enumerated before remapping structures of
		 * type 1 (RMRR) and so forth.
		 */
		if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT)
			break;

		drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
		drhds[units] = drhd;
#ifdef __FreeBSD__
		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
#else
		/* Map the unit's registers through a constructed devinfo. */
		vtddips[units] = vtd_get_dip(drhd, units);
		vtdmaps[units] = (struct vtdmap *)vtd_map(vtddips[units]);
		if (vtdmaps[units] == NULL)
			goto fail;
#endif
		if (++units >= DRHD_MAX_UNITS)
			break;
		remaining -= hdr->Length;
	}

	if (units <= 0)
		return (ENXIO);

#ifdef __FreeBSD__
skip_dmar:
#endif
	drhd_num = units;

	max_domains = 64 * 1024; /* maximum valid value */
	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];

		/*
		 * Caching Mode units are rejected; domain id 0 is reserved
		 * when CM is set (see domain_id()) and this driver does not
		 * implement the extra invalidations CM requires.
		 */
		if (VTD_CAP_CM(vtdmap->cap) != 0)
			panic("vtd_init: invalid caching mode");

		/* take most compatible (minimum) value */
		if ((tmp = vtd_max_domains(vtdmap)) < max_domains)
			max_domains = tmp;
	}

	/*
	 * Set up the root-table to point to the context-entry tables
	 */
	for (i = 0; i < 256; i++) {
		ctx_paddr = vtophys(ctx_tables[i]);
		if (ctx_paddr & PAGE_MASK)
			panic("ctx table (0x%0lx) not page aligned", ctx_paddr);

		root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
	}

	return (0);

#ifndef __FreeBSD__
fail:
	/*
	 * 'i <= units' intentionally includes the unit whose mapping just
	 * failed; vtd_unmap() tolerates a NULL access handle.
	 */
	for (i = 0; i <= units; i++)
		vtd_unmap(vtddips[i]);
	return (ENXIO);
#endif
}
533 
534 static void
vtd_cleanup(void)535 vtd_cleanup(void)
536 {
537 #ifndef __FreeBSD__
538 	int i;
539 
540 	KASSERT(SLIST_EMPTY(&domhead), ("domain list not empty"));
541 
542 	bzero(root_table, sizeof (root_table));
543 
544 	for (i = 0; i <= drhd_num; i++) {
545 		vtdmaps[i] = NULL;
546 		/*
547 		 * Unmap the vtd registers. Note that the devinfo nodes
548 		 * themselves aren't removed, they are considered system state
549 		 * and can be reused when the module is reloaded.
550 		 */
551 		if (vtddips[i] != NULL)
552 			vtd_unmap(vtddips[i]);
553 	}
554 #endif
555 }
556 
/*
 * Program every remapping unit with the root table address, flush its
 * context cache and IOTLB, and turn on DMA translation. Called after
 * vtd_init() has populated the root/context tables.
 */
static void
vtd_enable(void)
{
	int i;
	struct vtdmap *vtdmap;

	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];
		vtd_wbflush(vtdmap);

		/* Update the root table address */
		vtdmap->rta = vtophys(root_table);
		vtdmap->gcr = VTD_GCR_SRTP;
		while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
			;

		/* Start from a clean slate before enabling translation. */
		vtd_ctx_global_invalidate(vtdmap);
		vtd_iotlb_global_invalidate(vtdmap);

		vtd_translation_enable(vtdmap);
	}
}
579 
580 static void
vtd_disable(void)581 vtd_disable(void)
582 {
583 	int i;
584 	struct vtdmap *vtdmap;
585 
586 	for (i = 0; i < drhd_num; i++) {
587 		vtdmap = vtdmaps[i];
588 		vtd_translation_disable(vtdmap);
589 	}
590 }
591 
/*
 * Attach the PCI device with requester-id 'rid' to the domain 'arg' by
 * installing a context entry that points at the domain's page tables.
 * Panics if the device is already owned by a domain or is not covered
 * by any remapping unit's device scope.
 */
static void
vtd_add_device(void *arg, uint16_t rid)
{
	int idx;
	uint64_t *ctxp;
	struct domain *dom = arg;
	vm_paddr_t pt_paddr;
	struct vtdmap *vtdmap;
	uint8_t bus;

	bus = PCI_RID2BUS(rid);
	ctxp = ctx_tables[bus];
	pt_paddr = vtophys(dom->ptp);
	idx = VTD_RID2IDX(rid);

	if (ctxp[idx] & VTD_CTX_PRESENT) {
		panic("vtd_add_device: device %x is already owned by "
		    "domain %d", rid, (uint16_t)(ctxp[idx + 1] >> 8));
	}

	if ((vtdmap = vtd_device_scope(rid)) == NULL)
		panic("vtd_add_device: device %x is not in scope for "
		    "any DMA remapping unit", rid);

	/*
	 * Order is important. The 'present' bit is set only after all fields
	 * of the context pointer are initialized.
	 */
	ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);

	/* Pass-through-all if the unit supports Device-IOTLBs (DI). */
	if (VTD_ECAP_DI(vtdmap->ext_cap))
		ctxp[idx] = VTD_CTX_TT_ALL;
	else
		ctxp[idx] = 0;

	ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;

	/*
	 * 'Not Present' entries are not cached in either the Context Cache
	 * or in the IOTLB, so there is no need to invalidate either of them.
	 */
}
634 
/*
 * Detach the device with requester-id 'rid' from its domain by clearing
 * its context entry, then invalidate the context cache and IOTLB on all
 * remapping units. 'arg' (the domain) is not consulted; the context
 * entry alone identifies the device.
 */
static void
vtd_remove_device(void *arg, uint16_t rid)
{
	int i, idx;
	uint64_t *ctxp;
	struct vtdmap *vtdmap;
	uint8_t bus;

	bus = PCI_RID2BUS(rid);
	ctxp = ctx_tables[bus];
	idx = VTD_RID2IDX(rid);

	/*
	 * Order is important. The 'present' bit is must be cleared first.
	 */
	ctxp[idx] = 0;
	ctxp[idx + 1] = 0;

	/*
	 * Invalidate the Context Cache and the IOTLB.
	 *
	 * XXX use device-selective invalidation for Context Cache
	 * XXX use domain-selective invalidation for IOTLB
	 */
	for (i = 0; i < drhd_num; i++) {
		vtdmap = vtdmaps[i];
		vtd_ctx_global_invalidate(vtdmap);
		vtd_iotlb_global_invalidate(vtdmap);
	}
}
665 
666 #define	CREATE_MAPPING	0
667 #define	REMOVE_MAPPING	1
668 
/*
 * Install or remove one translation for 'gpa' in the domain 'arg'. Only
 * a single page-table entry is updated per call: the return value is
 * the size that entry covers (4KB or a superpage), and the caller is
 * expected to iterate over the full [gpa, gpa + len) range. 'remove'
 * is CREATE_MAPPING or REMOVE_MAPPING.
 */
static uint64_t
vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
    int remove)
{
	struct domain *dom;
	int i, spshift, ptpshift, ptpindex, nlevels;
	uint64_t spsize, *ptp;

	dom = arg;
	ptpindex = 0;
	ptpshift = 0;

	KASSERT(gpa + len > gpa, ("%s: invalid gpa range %lx/%lx", __func__,
	    gpa, len));
	KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %lx/%lx beyond "
	    "domain maxaddr %lx", __func__, gpa, len, dom->maxaddr));

	if (gpa & PAGE_MASK)
		panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);

	if (hpa & PAGE_MASK)
		panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);

	if (len & PAGE_MASK)
		panic("vtd_create_mapping: unaligned len 0x%0lx", len);

	/*
	 * Compute the size of the mapping that we can accommodate.
	 *
	 * This is based on three factors:
	 * - supported super page size
	 * - alignment of the region starting at 'gpa' and 'hpa'
	 * - length of the region 'len'
	 */
	spshift = 48;
	for (i = 3; i >= 0; i--) {
		spsize = 1UL << spshift;
		if ((dom->spsmask & (1 << i)) != 0 &&
		    (gpa & (spsize - 1)) == 0 &&
		    (hpa & (spsize - 1)) == 0 &&
		    (len >= spsize)) {
			break;
		}
		spshift -= 9;
	}

	/* Walk down the hierarchy to the level chosen above. */
	ptp = dom->ptp;
	nlevels = dom->pt_levels;
	while (--nlevels >= 0) {
		ptpshift = 12 + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;

		/* We have reached the leaf mapping */
		if (spshift >= ptpshift) {
			break;
		}

		/*
		 * We are working on a non-leaf page table page.
		 *
		 * Create a downstream page table page if necessary and point
		 * to it from the current page table.
		 */
		if (ptp[ptpindex] == 0) {
			void *nlp = vmm_ptp_alloc();
			ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR;
		}

		ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
	}

	if ((gpa & ((1UL << ptpshift) - 1)) != 0)
		panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);

	/*
	 * Update the 'gpa' -> 'hpa' mapping
	 */
	if (remove) {
		ptp[ptpindex] = 0;
	} else {
		ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;

		/* nlevels > 0 means we stopped above 4KB: it's a superpage */
		if (nlevels > 0)
			ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
	}

	return (1UL << ptpshift);
}
757 
758 static uint64_t
vtd_create_mapping(void * arg,vm_paddr_t gpa,vm_paddr_t hpa,uint64_t len)759 vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
760 {
761 
762 	return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
763 }
764 
765 static uint64_t
vtd_remove_mapping(void * arg,vm_paddr_t gpa,uint64_t len)766 vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
767 {
768 
769 	return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
770 }
771 
772 static void
vtd_invalidate_tlb(void * dom)773 vtd_invalidate_tlb(void *dom)
774 {
775 	int i;
776 	struct vtdmap *vtdmap;
777 
778 	/*
779 	 * Invalidate the IOTLB.
780 	 * XXX use domain-selective invalidation for IOTLB
781 	 */
782 	for (i = 0; i < drhd_num; i++) {
783 		vtdmap = vtdmaps[i];
784 		vtd_iotlb_global_invalidate(vtdmap);
785 	}
786 }
787 
788 static void *
vtd_create_domain(vm_paddr_t maxaddr)789 vtd_create_domain(vm_paddr_t maxaddr)
790 {
791 	struct domain *dom;
792 	vm_paddr_t addr;
793 	int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
794 	struct vtdmap *vtdmap;
795 
796 	if (drhd_num <= 0)
797 		panic("vtd_create_domain: no dma remapping hardware available");
798 
799 	/*
800 	 * Calculate AGAW.
801 	 * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
802 	 */
803 	addr = 0;
804 	for (gaw = 0; addr < maxaddr; gaw++)
805 		addr = 1ULL << gaw;
806 
807 	res = (gaw - 12) % 9;
808 	if (res == 0)
809 		agaw = gaw;
810 	else
811 		agaw = gaw + 9 - res;
812 
813 	if (agaw > 64)
814 		agaw = 64;
815 
816 	/*
817 	 * Select the smallest Supported AGAW and the corresponding number
818 	 * of page table levels.
819 	 */
820 	pt_levels = 2;
821 	sagaw = 30;
822 	addrwidth = 0;
823 
824 	tmp = ~0;
825 	for (i = 0; i < drhd_num; i++) {
826 		vtdmap = vtdmaps[i];
827 		/* take most compatible value */
828 		tmp &= VTD_CAP_SAGAW(vtdmap->cap);
829 	}
830 
831 	for (i = 0; i < 5; i++) {
832 		if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
833 			break;
834 		pt_levels++;
835 		addrwidth++;
836 		sagaw += 9;
837 		if (sagaw > 64)
838 			sagaw = 64;
839 	}
840 
841 	if (i >= 5) {
842 		panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d",
843 		    tmp, agaw);
844 	}
845 
846 	dom = kmem_zalloc(sizeof (struct domain), KM_SLEEP);
847 	dom->pt_levels = pt_levels;
848 	dom->addrwidth = addrwidth;
849 	dom->id = domain_id();
850 	dom->maxaddr = maxaddr;
851 	dom->ptp = vmm_ptp_alloc();
852 	if ((uintptr_t)dom->ptp & PAGE_MASK)
853 		panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
854 
855 #ifdef __FreeBSD__
856 #ifdef notyet
857 	/*
858 	 * XXX superpage mappings for the iommu do not work correctly.
859 	 *
860 	 * By default all physical memory is mapped into the host_domain.
861 	 * When a VM is allocated wired memory the pages belonging to it
862 	 * are removed from the host_domain and added to the vm's domain.
863 	 *
864 	 * If the page being removed was mapped using a superpage mapping
865 	 * in the host_domain then we need to demote the mapping before
866 	 * removing the page.
867 	 *
868 	 * There is not any code to deal with the demotion at the moment
869 	 * so we disable superpage mappings altogether.
870 	 */
871 	dom->spsmask = ~0;
872 	for (i = 0; i < drhd_num; i++) {
873 		vtdmap = vtdmaps[i];
874 		/* take most compatible value */
875 		dom->spsmask &= VTD_CAP_SPS(vtdmap->cap);
876 	}
877 #endif
878 #else
879 	/*
880 	 * On illumos we decidedly do not remove memory mapped to a VM's domain
881 	 * from the host_domain, so we don't have to deal with page demotion and
882 	 * can just use large pages.
883 	 *
884 	 * Since VM memory is currently allocated as 4k pages and mapped into
885 	 * the VM domain page by page, the use of large pages is essentially
886 	 * limited to the host_domain.
887 	 */
888 	dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
889 #endif
890 
891 	SLIST_INSERT_HEAD(&domhead, dom, next);
892 
893 	return (dom);
894 }
895 
/*
 * Recursively free the page-table page 'ptp' at depth 'level' and any
 * downstream page-table pages it references. Superpage entries are
 * leaves (they point at data, not at a lower table) and are skipped.
 */
static void
vtd_free_ptp(uint64_t *ptp, int level)
{
	int i;
	uint64_t *nlp;

	if (level > 1) {
		for (i = 0; i < 512; i++) {
			/* Skip entries that are not present. */
			if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
				continue;
			if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
				continue;
			nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
			vtd_free_ptp(nlp, level - 1);
		}
	}

	vmm_ptp_free(ptp);
}
915 
916 static void
vtd_destroy_domain(void * arg)917 vtd_destroy_domain(void *arg)
918 {
919 	struct domain *dom;
920 
921 	dom = arg;
922 
923 	SLIST_REMOVE(&domhead, dom, domain, next);
924 	vtd_free_ptp(dom->ptp, dom->pt_levels);
925 	kmem_free(dom, sizeof (*dom));
926 }
927 
/* Entry points exported to the generic vmm iommu layer (io/iommu.h). */
const struct iommu_ops vmm_iommu_ops = {
	.init = vtd_init,
	.cleanup = vtd_cleanup,
	.enable = vtd_enable,
	.disable = vtd_disable,
	.create_domain = vtd_create_domain,
	.destroy_domain = vtd_destroy_domain,
	.create_mapping = vtd_create_mapping,
	.remove_mapping = vtd_remove_mapping,
	.add_device = vtd_add_device,
	.remove_device = vtd_remove_device,
	.invalidate_tlb = vtd_invalidate_tlb,
};
941 
942 
/* Loadable-module linkage: a misc module with no device entry points. */
static struct modlmisc modlmisc = {
	&mod_miscops,
	"bhyve vmm vtd",
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modlmisc,
	NULL
};
953 
/* Module load entry point. */
int
_init(void)
{
	return (mod_install(&modlinkage));
}
959 
/* Module unload entry point. */
int
_fini(void)
{
	return (mod_remove(&modlinkage));
}
965 
/* Module information entry point. */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
971