xref: /illumos-gate/usr/src/cmd/bhyve/pci_passthru.c (revision 32640292)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/mman.h>
34 #include <sys/pciio.h>
35 #include <sys/ioctl.h>
36 #include <sys/stat.h>
37 
38 #include <sys/pci.h>
39 
40 #include <dev/io/iodev.h>
41 #include <dev/pci/pcireg.h>
42 
43 #include <machine/iodev.h>
44 
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <err.h>
49 #include <errno.h>
50 #include <fcntl.h>
51 #include <sysexits.h>
52 #include <unistd.h>
53 
54 #include <machine/vmm.h>
55 #include <vmmapi.h>
56 #include <sys/ppt_dev.h>
57 
58 #include "config.h"
59 #include "debug.h"
60 #include "pci_passthru.h"
61 #include "mem.h"
62 
63 #define	LEGACY_SUPPORT	1
64 
65 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
66 #define MSIX_CAPLEN 12
67 
/*
 * Per-instance soft state for a passed-through PCI function.
 */
struct passthru_softc {
	struct pci_devinst *psc_pi;	/* emulated PCI device instance */
	/* ROM is handled like a BAR */
	struct pcibar psc_bar[PCI_BARMAX_WITH_ROM + 1];
	struct {
		int		capoff;		/* MSI cap offset, 0 = absent */
		int		msgctrl;	/* cached MSI Message Control */
		int		emulated;	/* 1 if the cap was synthesized */
	} psc_msi;
	struct {
		int		capoff;		/* MSI-X cap offset, 0 = absent */
	} psc_msix;
	int pptfd;		/* fd on the /dev/ppt* device */
	int msi_limit;		/* host MSI vector limit, -1 = unlimited */
	int msix_limit;		/* host MSI-X vector limit, -1 = unlimited */

	/*
	 * Optional per-register config space access handlers; a NULL slot
	 * falls through to the default passthru read/write behavior.
	 */
	cfgread_handler psc_pcir_rhandler[PCI_REGMAX + 1];
	cfgwrite_handler psc_pcir_whandler[PCI_REGMAX + 1];
};
87 
88 static int
msi_caplen(int msgctrl)89 msi_caplen(int msgctrl)
90 {
91 	int len;
92 
93 	len = 10;		/* minimum length of msi capability */
94 
95 	if (msgctrl & PCIM_MSICTRL_64BIT)
96 		len += 4;
97 
98 #if 0
99 	/*
100 	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
101 	 * We'll let the guest manipulate them directly.
102 	 */
103 	if (msgctrl & PCIM_MSICTRL_VECTOR)
104 		len += 10;
105 #endif
106 
107 	return (len);
108 }
109 
110 static uint32_t
passthru_read_config(const struct passthru_softc * sc,long reg,int width)111 passthru_read_config(const struct passthru_softc *sc, long reg, int width)
112 {
113 	struct ppt_cfg_io pi;
114 
115 	pi.pci_off = reg;
116 	pi.pci_width = width;
117 
118 	if (ioctl(sc->pptfd, PPT_CFG_READ, &pi) != 0) {
119 		return (0);
120 	}
121 	return (pi.pci_data);
122 }
123 
124 static void
passthru_write_config(const struct passthru_softc * sc,long reg,int width,uint32_t data)125 passthru_write_config(const struct passthru_softc *sc, long reg, int width,
126     uint32_t data)
127 {
128 	struct ppt_cfg_io pi;
129 
130 	pi.pci_off = reg;
131 	pi.pci_width = width;
132 	pi.pci_data = data;
133 
134 	(void) ioctl(sc->pptfd, PPT_CFG_WRITE, &pi);
135 }
136 
137 static int
passthru_get_bar(struct passthru_softc * sc,int bar,enum pcibar_type * type,uint64_t * base,uint64_t * size)138 passthru_get_bar(struct passthru_softc *sc, int bar, enum pcibar_type *type,
139     uint64_t *base, uint64_t *size)
140 {
141 	struct ppt_bar_query pb;
142 
143 	pb.pbq_baridx = bar;
144 
145 	if (ioctl(sc->pptfd, PPT_BAR_QUERY, &pb) != 0) {
146 		return (-1);
147 	}
148 
149 	switch (pb.pbq_type) {
150 	case PCI_ADDR_IO:
151 		*type = PCIBAR_IO;
152 		break;
153 	case PCI_ADDR_MEM32:
154 		*type = PCIBAR_MEM32;
155 		break;
156 	case PCI_ADDR_MEM64:
157 		*type = PCIBAR_MEM64;
158 		break;
159 	default:
160 		err(1, "unrecognized BAR type: %u\n", pb.pbq_type);
161 		break;
162 	}
163 
164 	*base = pb.pbq_base;
165 	*size = pb.pbq_size;
166 	return (0);
167 }
168 
/*
 * Open the ppt device node at 'path' read/write.  On success, stores the
 * descriptor through 'pptfdp' and returns 0; on failure returns errno and
 * leaves *pptfdp untouched.
 */
static int
passthru_dev_open(const char *path, int *pptfdp)
{
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return (errno);

	/* XXX: verify fd with ioctl? */
	*pptfdp = fd;
	return (0);
}
182 
#ifdef LEGACY_SUPPORT
/*
 * Synthesize an MSI capability (supporting 'msgnum' messages, chained to
 * 'nextptr') in the emulated config space of 'pi' and return its offset.
 */
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	struct msicap msicap;
	const u_char *bytes;
	int capoff;

	pci_populate_msicap(&msicap, msgnum, nextptr);

	/*
	 * XXX
	 * Copy the msi capability structure in the last 16 bytes of the
	 * config space. This is wrong because it could shadow something
	 * useful to the device.
	 */
	capoff = 256 - roundup(sizeof (msicap), 4);
	bytes = (const u_char *)&msicap;
	for (size_t off = 0; off < sizeof (msicap); off++)
		pci_set_cfgdata8(pi, capoff + off, bytes[off]);

	return (capoff);
}
#endif	/* LEGACY_SUPPORT */
207 
/*
 * Clamp the MSI and MSI-X vector counts advertised in the emulated config
 * space to the host-imposed limits recorded in the softc (a limit of -1
 * means unconstrained).  'msixcap' is the MSI-X capability copied out of
 * the physical device by cfginitmsi(); it is only consulted (and possibly
 * rewritten) when an MSI-X capability was actually found.
 */
static void
passthru_intr_limit(struct passthru_softc *sc, struct msixcap *msixcap)
{
	struct pci_devinst *pi = sc->psc_pi;
	int off;

	/* Reduce the number of MSI vectors if higher than OS limit */
	if ((off = sc->psc_msi.capoff) != 0 && sc->msi_limit != -1) {
		int msi_limit, mmc;

		/*
		 * Translate the numeric limit to the nearest (not greater)
		 * power-of-two MMC encoding used by the Message Control
		 * register.
		 */
		msi_limit =
		    sc->msi_limit > 16 ? PCIM_MSICTRL_MMC_32 :
		    sc->msi_limit > 8 ? PCIM_MSICTRL_MMC_16 :
		    sc->msi_limit > 4 ? PCIM_MSICTRL_MMC_8 :
		    sc->msi_limit > 2 ? PCIM_MSICTRL_MMC_4 :
		    sc->msi_limit > 1 ? PCIM_MSICTRL_MMC_2 :
		    PCIM_MSICTRL_MMC_1;
		mmc = sc->psc_msi.msgctrl & PCIM_MSICTRL_MMC_MASK;

		if (mmc > msi_limit) {
			/* Rewrite the cached and emulated Message Control. */
			sc->psc_msi.msgctrl &= ~PCIM_MSICTRL_MMC_MASK;
			sc->psc_msi.msgctrl |= msi_limit;
			/* offset +2 is the Message Control register */
			pci_set_cfgdata16(pi, off + 2, sc->psc_msi.msgctrl);
		}
	}

	/* Reduce the number of MSI-X vectors if higher than OS limit */
	if ((off = sc->psc_msix.capoff) != 0 && sc->msix_limit != -1) {
		if (MSIX_TABLE_COUNT(msixcap->msgctrl) > sc->msix_limit) {
			/* The table-size field is encoded as N-1. */
			msixcap->msgctrl &= ~PCIM_MSIXCTRL_TABLE_SIZE;
			msixcap->msgctrl |= sc->msix_limit - 1;
			pci_set_cfgdata16(pi, off + 2, msixcap->msgctrl);
		}
	}
}
243 
/*
 * Walk the physical device's capability list, mirror the MSI and MSI-X
 * capabilities into the emulated config space, and record their offsets in
 * the softc.  When the device lacks MSI, a capability is synthesized under
 * LEGACY_SUPPORT.  Returns 0 if at least one of MSI/MSI-X is usable
 * (native or emulated), -1 otherwise.
 */
static int
cfginitmsi(struct passthru_softc *sc)
{
	int i, ptr, capptr, cap, sts, caplen, table_size;
	uint32_t u32;
	struct pci_devinst *pi = sc->psc_pi;
	struct msixcap msixcap;
	char *msixcap_ptr;

	/*
	 * Parse the capabilities and cache the location of the MSI
	 * and MSI-X capabilities.
	 */
	sts = passthru_read_config(sc, PCIR_STATUS, 2);
	if (sts & PCIM_STATUS_CAPPRESENT) {
		ptr = passthru_read_config(sc, PCIR_CAP_PTR, 1);
		while (ptr != 0 && ptr != 0xff) {
			cap = passthru_read_config(sc, ptr + PCICAP_ID, 1);
			if (cap == PCIY_MSI) {
				/*
				 * Copy the MSI capability into the config
				 * space of the emulated pci device
				 */
				sc->psc_msi.capoff = ptr;
				sc->psc_msi.msgctrl = passthru_read_config(sc,
				    ptr + 2, 2);
				sc->psc_msi.emulated = 0;
				caplen = msi_caplen(sc->psc_msi.msgctrl);
				capptr = ptr;
				while (caplen > 0) {
					u32 = passthru_read_config(sc,
					    capptr, 4);
					pci_set_cfgdata32(pi, capptr, u32);
					caplen -= 4;
					capptr += 4;
				}
			} else if (cap == PCIY_MSIX) {
				/*
				 * Copy the MSI-X capability both into the
				 * emulated config space and into the local
				 * 'msixcap' so the table/PBA geometry can be
				 * decoded below.
				 */
				sc->psc_msix.capoff = ptr;
				caplen = 12;
				msixcap_ptr = (char *)&msixcap;
				capptr = ptr;
				while (caplen > 0) {
					u32 = passthru_read_config(sc,
					    capptr, 4);
					memcpy(msixcap_ptr, &u32, 4);
					pci_set_cfgdata32(pi, capptr, u32);
					caplen -= 4;
					capptr += 4;
					msixcap_ptr += 4;
				}
			}
			ptr = passthru_read_config(sc, ptr + PCICAP_NEXTPTR, 1);
		}
	}

	/*
	 * Apply host interrupt limits.  Note 'msixcap' is only examined in
	 * there when psc_msix.capoff is non-zero, i.e. when it was filled in
	 * above.
	 */
	passthru_intr_limit(sc, &msixcap);

	if (sc->psc_msix.capoff != 0) {
		/* Decode the table/PBA BAR indices and offsets. */
		pi->pi_msix.pba_bar =
		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
		pi->pi_msix.pba_offset =
		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_bar =
		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_offset =
		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);

		/* Allocate the emulated MSI-X table array */
		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
		pi->pi_msix.table = calloc(1, table_size);

		/* Mask all table entries */
		for (i = 0; i < pi->pi_msix.table_count; i++) {
			pi->pi_msix.table[i].vector_control |=
						PCIM_MSIX_VCTRL_MASK;
		}
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If the passthrough device does not support MSI then craft a
	 * MSI capability for it. We link the new MSI capability at the
	 * head of the list of capabilities.
	 */
	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
		int origptr, msiptr;
		origptr = passthru_read_config(sc, PCIR_CAP_PTR, 1);
		msiptr = passthru_add_msicap(pi, 1, origptr);
		sc->psc_msi.capoff = msiptr;
		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
		sc->psc_msi.emulated = 1;
		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
	}
#endif

	/* Make sure one of the capabilities is present */
	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
		return (-1);
	else
		return (0);
}
350 
/*
 * Service a read of the BAR page(s) containing the MSI-X table.  Offsets
 * outside the emulated table range are read straight from the device
 * mapping (pi_msix.mapped_addr covers the whole BAR); offsets inside it
 * are satisfied from the emulated table entries.  Returns the value read,
 * or (uint64_t)-1 for an unsupported access size.
 */
static uint64_t
msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *src8;
	uint16_t *src16;
	uint32_t *src32;
	uint64_t *src64;
	uint64_t data;
	size_t entry_offset;
	uint32_t table_offset;
	int index, table_count;

	pi = sc->psc_pi;

	table_offset = pi->pi_msix.table_offset;
	table_count = pi->pi_msix.table_count;
	if (offset < table_offset ||
	    offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) {
		/* Outside the table: pass the read through to the device. */
		switch (size) {
		case 1:
			src8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src8;
			break;
		case 2:
			src16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src16;
			break;
		case 4:
			src32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src32;
			break;
		case 8:
			src64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src64;
			break;
		default:
			return (-1);
		}
		return (data);
	}

	/* Inside the table: locate the emulated entry and field. */
	offset -= table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	assert(index < table_count);

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	switch (size) {
	case 1:
		src8 = (uint8_t *)((uint8_t *)entry + entry_offset);
		data = *src8;
		break;
	case 2:
		src16 = (uint16_t *)((uint8_t *)entry + entry_offset);
		data = *src16;
		break;
	case 4:
		src32 = (uint32_t *)((uint8_t *)entry + entry_offset);
		data = *src32;
		break;
	case 8:
		src64 = (uint64_t *)((uint8_t *)entry + entry_offset);
		data = *src64;
		break;
	default:
		return (-1);
	}

	return (data);
}
424 
/*
 * Service a write to the BAR page(s) containing the MSI-X table.  Writes
 * outside the emulated table range go straight to the device mapping.
 * Writes inside the table update the emulated entry and, when MSI-X is
 * enabled and the entry is (or just was) unmasked, reprogram the host's
 * interrupt routing for that vector.
 */
static void
msix_table_write(struct vmctx *ctx, struct passthru_softc *sc,
		 uint64_t offset, int size, uint64_t data)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *dest8;
	uint16_t *dest16;
	uint32_t *dest32;
	uint64_t *dest64;
	size_t entry_offset;
	uint32_t table_offset, vector_control;
	int index, table_count;

	pi = sc->psc_pi;

	table_offset = pi->pi_msix.table_offset;
	table_count = pi->pi_msix.table_count;
	if (offset < table_offset ||
	    offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) {
		/* Outside the table: pass the write through to the device. */
		switch (size) {
		case 1:
			dest8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset);
			*dest8 = data;
			break;
		case 2:
			dest16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset);
			*dest16 = data;
			break;
		case 4:
			dest32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset);
			*dest32 = data;
			break;
		case 8:
			dest64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset);
			*dest64 = data;
			break;
		}
		return;
	}

	offset -= table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	assert(index < table_count);

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* Only 4 byte naturally-aligned writes are supported */
	assert(size == 4);
	assert(entry_offset % 4 == 0);

	/* Snapshot the pre-write mask state before updating the entry. */
	vector_control = entry->vector_control;
	dest32 = (uint32_t *)((uint8_t *)entry + entry_offset);
	*dest32 = data;
	/* If MSI-X hasn't been enabled, do nothing */
	if (pi->pi_msix.enabled) {
		/*
		 * Reprogram the vector if it is unmasked now, or was
		 * unmasked before this write (i.e. a mask transition).
		 */
		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
			(void) vm_setup_pptdev_msix(ctx, sc->pptfd,
			    index, entry->addr, entry->msg_data,
			    entry->vector_control);
		}
	}
}
491 
/*
 * Map the BAR containing the MSI-X table into this process and protect
 * everything in it except the pages holding the table, so that only
 * table accesses are emulated while stray device accesses fault loudly.
 * Returns 0 on success, -1 on mmap failure.
 */
static int
init_msix_table(struct vmctx *ctx __unused, struct passthru_softc *sc)
{
	struct pci_devinst *pi = sc->psc_pi;
	uint32_t table_size, table_offset;
	int i;

	i = pci_msix_table_bar(pi);
	assert(i >= 0);

	/*
	 * Map the region of the BAR containing the MSI-X table.  This is
	 * necessary for two reasons:
	 * 1. The PBA may reside in the first or last page containing the MSI-X
	 *    table.
	 * 2. While PCI devices are not supposed to use the page(s) containing
	 *    the MSI-X table for other purposes, some do in practice.
	 */

	/*
	 * Mapping pptfd provides access to the BAR containing the MSI-X
	 * table. See ppt_devmap() in usr/src/uts/intel/io/vmm/io/ppt.c
	 *
	 * This maps the whole BAR and then mprotect(PROT_NONE) is used below
	 * to prevent access to pages that don't contain the MSI-X table.
	 * When porting this, it was tempting to just map the MSI-X table pages
	 * but that would mean updating everywhere that assumes that
	 * pi->pi_msix.mapped_addr points to the start of the BAR. For now,
	 * keep closer to upstream.
	 */
	pi->pi_msix.mapped_size = sc->psc_bar[i].size;
	pi->pi_msix.mapped_addr = (uint8_t *)mmap(NULL, pi->pi_msix.mapped_size,
	    PROT_READ | PROT_WRITE, MAP_SHARED, sc->pptfd, 0);
	if (pi->pi_msix.mapped_addr == MAP_FAILED) {
		warn("Failed to map MSI-X table BAR on %d", sc->pptfd);
		return (-1);
	}

	/* Page-align the table span: [table_offset, table_offset+table_size) */
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);

	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);

	/*
	 * Unmap any pages not containing the table, we do not need to emulate
	 * accesses to them.  Avoid releasing address space to help ensure that
	 * a buggy out-of-bounds access causes a crash.
	 */
	if (table_offset != 0)
		if (mprotect((caddr_t)pi->pi_msix.mapped_addr, table_offset,
		    PROT_NONE) != 0)
			warn("Failed to unmap MSI-X table BAR region");
	if (table_offset + table_size != pi->pi_msix.mapped_size)
		if (mprotect((caddr_t)
		    pi->pi_msix.mapped_addr + table_offset + table_size,
		    pi->pi_msix.mapped_size - (table_offset + table_size),
		    PROT_NONE) != 0)
			warn("Failed to unmap MSI-X table BAR region");

	return (0);
}
554 
/*
 * Replicate the physical device's BARs in the emulated device: each real
 * BAR is queried, cached in the softc, and a guest BAR of the same type
 * and size is allocated, with the hardware's low flag bits preserved.
 * Returns 0 on success, -1 on error.
 */
static int
cfginitbar(struct vmctx *ctx __unused, struct passthru_softc *sc)
{
	struct pci_devinst *pi = sc->psc_pi;
	uint_t i;

	/*
	 * Initialize BAR registers
	 */
	for (i = 0; i <= PCI_BARMAX; i++) {
		enum pcibar_type bartype;
		uint64_t base, size;
		int error;

		/* A failed query means the BAR is unimplemented; skip it. */
		if (passthru_get_bar(sc, i, &bartype, &base, &size) != 0) {
			continue;
		}

		/*
		 * Memory BARs are required to be page aligned — presumably
		 * so they can be mapped at page granularity; the code here
		 * only enforces the alignment.
		 */
		if (bartype != PCIBAR_IO) {
			if (((base | size) & PAGE_MASK) != 0) {
				warnx("passthru device %d BAR %d: "
				    "base %#lx or size %#lx not page aligned\n",
				    sc->pptfd, i, base, size);
				return (-1);
			}
		}

		/* Cache information about the "real" BAR */
		sc->psc_bar[i].type = bartype;
		sc->psc_bar[i].size = size;
		sc->psc_bar[i].addr = base;
		sc->psc_bar[i].lobits = 0;

		/* Allocate the BAR in the guest I/O or MMIO space */
		error = pci_emul_alloc_bar(pi, i, bartype, size);
		if (error)
			return (-1);

		/* Use same lobits as physical bar */
		uint8_t lobits = passthru_read_config(sc, PCIR_BAR(i), 0x01);
		if (bartype == PCIBAR_MEM32 || bartype == PCIBAR_MEM64) {
			lobits &= ~PCIM_BAR_MEM_BASE;
		} else {
			lobits &= ~PCIM_BAR_IO_BASE;
		}
		sc->psc_bar[i].lobits = lobits;
		pi->pi_bar[i].lobits = lobits;

		/*
		 * 64-bit BAR takes up two slots so skip the next one.
		 */
		if (bartype == PCIBAR_MEM64) {
			i++;
			assert(i <= PCI_BARMAX);
			sc->psc_bar[i].type = PCIBAR_MEMHI64;
		}
	}
	return (0);
}
614 
/*
 * Initialize the emulated config space from the physical device: copy the
 * PCI header, set up MSI/MSI-X, replicate the BARs, map the MSI-X table,
 * and install the config-space access handlers.  Returns 0 on success,
 * non-zero on failure.
 */
static int
cfginit(struct vmctx *ctx, struct passthru_softc *sc)
{
	int error;
	struct pci_devinst *pi = sc->psc_pi;
	uint8_t intline, intpin;

	/*
	 * Copy physical PCI header to virtual config space.  INTLINE and
	 * INTPIN must not take their physical values — they were already
	 * set by pci_emul_init() — so save them here and restore after the
	 * bulk copy.
	 */
	intline = pci_get_cfgdata8(pi, PCIR_INTLINE);
	intpin = pci_get_cfgdata8(pi, PCIR_INTPIN);
	for (int i = 0; i <= PCIR_MAXLAT; i += 4) {
#ifdef	__FreeBSD__
		pci_set_cfgdata32(pi, i, read_config(&sc->psc_sel, i, 4));
#else
		pci_set_cfgdata32(pi, i, passthru_read_config(sc, i, 4));
#endif
	}

	pci_set_cfgdata8(pi, PCIR_INTLINE, intline);
	pci_set_cfgdata8(pi, PCIR_INTPIN, intpin);

	if (cfginitmsi(sc) != 0) {
		warnx("failed to initialize MSI for PCI %d", sc->pptfd);
		return (-1);
	}

	if (cfginitbar(ctx, sc) != 0) {
		warnx("failed to initialize BARs for PCI %d", sc->pptfd);
		return (-1);
	}

	/* Push the emulated command register back to the hardware. */
	passthru_write_config(sc, PCIR_COMMAND, 2,
	    pci_get_cfgdata16(pi, PCIR_COMMAND));

	/*
	* We need to do this after PCIR_COMMAND got possibly updated, e.g.,
	* a BAR was enabled.
	*/
	if (pci_msix_table_bar(pi) >= 0) {
		error = init_msix_table(ctx, sc);
		if (error != 0) {
			warnx("failed to initialize MSI-X table for PCI %d",
			    sc->pptfd);
			goto done;
		}
	}

	/* Emulate most PCI header register. */
	if ((error = set_pcir_handler(sc, 0, PCIR_MAXLAT + 1,
	    passthru_cfgread_emulate, passthru_cfgwrite_emulate)) != 0)
		goto done;

	/* Allow access to the physical command and status register. */
	if ((error = set_pcir_handler(sc, PCIR_COMMAND, 0x04, NULL, NULL)) != 0)
		goto done;

	error = 0;				/* success */
done:
	return (error);
}
679 
680 int
set_pcir_handler(struct passthru_softc * sc,int reg,int len,cfgread_handler rhandler,cfgwrite_handler whandler)681 set_pcir_handler(struct passthru_softc *sc, int reg, int len,
682     cfgread_handler rhandler, cfgwrite_handler whandler)
683 {
684 	if (reg > PCI_REGMAX || reg + len > PCI_REGMAX + 1)
685 		return (-1);
686 
687 	for (int i = reg; i < reg + len; ++i) {
688 		assert(sc->psc_pcir_rhandler[i] == NULL || rhandler == NULL);
689 		assert(sc->psc_pcir_whandler[i] == NULL || whandler == NULL);
690 		sc->psc_pcir_rhandler[i] = rhandler;
691 		sc->psc_pcir_whandler[i] = whandler;
692 	}
693 
694 	return (0);
695 }
696 
697 static int
passthru_legacy_config(nvlist_t * nvl,const char * opt)698 passthru_legacy_config(nvlist_t *nvl, const char *opt)
699 {
700 	char *config, *name, *tofree, *value;
701 
702 	if (opt == NULL)
703 		return (0);
704 
705 	config = tofree = strdup(opt);
706 	while ((name = strsep(&config, ",")) != NULL) {
707 		value = strchr(name, '=');
708 		if (value != NULL) {
709 			*value++ = '\0';
710 			set_config_value_node(nvl, name, value);
711 		} else {
712 			if (strncmp(name, "/dev/ppt", 8) != 0) {
713 				EPRINTLN("passthru: invalid path \"%s\"", name);
714 				free(tofree);
715 				return (-1);
716 			}
717 			set_config_value_node(nvl, "path", name);
718 		}
719 	}
720 	free(tofree);
721 	return (0);
722 }
723 
724 static int
passthru_init_rom(struct vmctx * const ctx __unused,struct passthru_softc * const sc,const char * const romfile)725 passthru_init_rom(struct vmctx *const ctx __unused,
726     struct passthru_softc *const sc, const char *const romfile)
727 {
728 	if (romfile == NULL) {
729 		return (0);
730 	}
731 
732 	const int fd = open(romfile, O_RDONLY);
733 	if (fd < 0) {
734 		warnx("%s: can't open romfile \"%s\"", __func__, romfile);
735 		return (-1);
736 	}
737 
738 	struct stat sbuf;
739 	if (fstat(fd, &sbuf) < 0) {
740 		warnx("%s: can't fstat romfile \"%s\"", __func__, romfile);
741 		close(fd);
742 		return (-1);
743 	}
744 	const uint64_t rom_size = sbuf.st_size;
745 
746 	void *const rom_data = mmap(NULL, rom_size, PROT_READ, MAP_SHARED, fd,
747 	    0);
748 	if (rom_data == MAP_FAILED) {
749 		warnx("%s: unable to mmap romfile \"%s\" (%d)", __func__,
750 		    romfile, errno);
751 		close(fd);
752 		return (-1);
753 	}
754 
755 	void *rom_addr;
756 	int error = pci_emul_alloc_rom(sc->psc_pi, rom_size, &rom_addr);
757 	if (error) {
758 		warnx("%s: failed to alloc rom segment", __func__);
759 		munmap(rom_data, rom_size);
760 		close(fd);
761 		return (error);
762 	}
763 	memcpy(rom_addr, rom_data, rom_size);
764 
765 	sc->psc_bar[PCI_ROM_IDX].type = PCIBAR_ROM;
766 	sc->psc_bar[PCI_ROM_IDX].addr = (uint64_t)rom_addr;
767 	sc->psc_bar[PCI_ROM_IDX].size = rom_size;
768 
769 	munmap(rom_data, rom_size);
770 	close(fd);
771 
772  	return (0);
773  }
774 
775 static int
passthru_init(struct pci_devinst * pi,nvlist_t * nvl)776 passthru_init(struct pci_devinst *pi, nvlist_t *nvl)
777 {
778 	int error, memflags, pptfd;
779 	struct passthru_softc *sc;
780 	const char *path;
781 	struct vmctx *ctx = pi->pi_vmctx;
782 
783 	pptfd = -1;
784 	sc = NULL;
785 	error = 1;
786 
787 	memflags = vm_get_memflags(ctx);
788 	if (!(memflags & VM_MEM_F_WIRED)) {
789 		warnx("passthru requires guest memory to be wired");
790 		goto done;
791 	}
792 
793 	path = get_config_value_node(nvl, "path");
794 	if (path == NULL || passthru_dev_open(path, &pptfd) != 0) {
795 		warnx("invalid passthru options");
796 		goto done;
797 	}
798 
799 	if (vm_assign_pptdev(ctx, pptfd) != 0) {
800 		warnx("PCI device at %d is not using the ppt driver", pptfd);
801 		goto done;
802 	}
803 
804 	sc = calloc(1, sizeof(struct passthru_softc));
805 
806 	pi->pi_arg = sc;
807 	sc->psc_pi = pi;
808 	sc->pptfd = pptfd;
809 
810 	if ((error = vm_get_pptdev_limits(ctx, pptfd, &sc->msi_limit,
811 	    &sc->msix_limit)) != 0)
812 		goto done;
813 
814 #ifndef	__FreeBSD__
815 	/*
816 	 * If this function uses legacy interrupt messages, then request one for
817 	 * the guest in case drivers expect to see it. Note that nothing in the
818 	 * hypervisor is currently wired up do deliver such an interrupt should
819 	 * the guest actually rely upon it.
820 	 */
821 	uint8_t intpin = passthru_read_config(sc, PCIR_INTPIN, 1);
822 	if (intpin > 0 && intpin < 5)
823 		pci_lintr_request(sc->psc_pi);
824 #endif
825 
826 	/* initialize config space */
827 	if ((error = cfginit(ctx, sc)) != 0)
828 		goto done;
829 
830 	/* initialize ROM */
831 	if ((error = passthru_init_rom(ctx, sc,
832 	    get_config_value_node(nvl, "rom"))) != 0) {
833 		goto done;
834 	}
835 
836 done:
837 	if (error) {
838 		free(sc);
839 		if (pptfd != -1)
840 			vm_unassign_pptdev(ctx, pptfd);
841 	}
842 	return (error);
843 }
844 
845 static int
msicap_access(struct passthru_softc * sc,int coff)846 msicap_access(struct passthru_softc *sc, int coff)
847 {
848 	int caplen;
849 
850 	if (sc->psc_msi.capoff == 0)
851 		return (0);
852 
853 	caplen = msi_caplen(sc->psc_msi.msgctrl);
854 
855 	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
856 		return (1);
857 	else
858 		return (0);
859 }
860 
861 static int
msixcap_access(struct passthru_softc * sc,int coff)862 msixcap_access(struct passthru_softc *sc, int coff)
863 {
864 	if (sc->psc_msix.capoff == 0)
865 		return (0);
866 
867 	return (coff >= sc->psc_msix.capoff &&
868 	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
869 }
870 
871 static int
passthru_cfgread_default(struct passthru_softc * sc,struct pci_devinst * pi __unused,int coff,int bytes,uint32_t * rv)872 passthru_cfgread_default(struct passthru_softc *sc,
873     struct pci_devinst *pi __unused, int coff, int bytes, uint32_t *rv)
874 {
875 	/*
876 	 * MSI capability is emulated.
877 	 */
878 	if (msicap_access(sc, coff) || msixcap_access(sc, coff))
879 		return (-1);
880 
881 	/*
882 	 * MSI-X is also emulated since a limit on interrupts may be imposed by
883 	 * the OS, altering the perceived register state.
884 	 */
885 	if (msixcap_access(sc, coff))
886 		return (-1);
887 
888 	/*
889 	 * Emulate the command register.  If a single read reads both the
890 	 * command and status registers, read the status register from the
891 	 * device's config space.
892 	 */
893 	if (coff == PCIR_COMMAND) {
894 		if (bytes <= 2)
895 			return (-1);
896 		*rv = passthru_read_config(sc, PCIR_STATUS, 2) << 16 |
897 		    pci_get_cfgdata16(pi, PCIR_COMMAND);
898 		return (0);
899 	}
900 
901 	/* Everything else just read from the device's config space */
902 	*rv = passthru_read_config(sc, coff, bytes);
903 
904 	return (0);
905 }
906 
/*
 * Per-register read handler that forces full emulation: always returns -1
 * ("not handled here") so the generic PCI emulation services the access.
 */
int
passthru_cfgread_emulate(struct passthru_softc *sc __unused,
    struct pci_devinst *pi __unused, int coff __unused, int bytes __unused,
    uint32_t *rv __unused)
{
	return (-1);
}
914 
915 static int
passthru_cfgread(struct pci_devinst * pi,int coff,int bytes,uint32_t * rv)916 passthru_cfgread(struct pci_devinst *pi, int coff, int bytes, uint32_t *rv)
917 {
918 	struct passthru_softc *sc;
919 
920 	sc = pi->pi_arg;
921 
922 	if (sc->psc_pcir_rhandler[coff] != NULL)
923 		return (sc->psc_pcir_rhandler[coff](sc, pi, coff, bytes, rv));
924 
925 	return (passthru_cfgread_default(sc, pi, coff, bytes, rv));
926 }
927 
/*
 * Default config-space write path.  MSI/MSI-X capability writes are applied
 * to the emulated capability and then pushed to the host interrupt routing;
 * everything else is written through to the device.  Always returns 0
 * (handled); aborts the process if host interrupt setup fails.
 */
static int
passthru_cfgwrite_default(struct passthru_softc *sc, struct pci_devinst *pi,
    int coff, int bytes, uint32_t val)
{
	int error, msix_table_entries, i;
	uint16_t cmd_old;
	struct vmctx *ctx = pi->pi_vmctx;

	/*
	 * MSI capability is emulated
	 */
	if (msicap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
		    PCIY_MSI);
		/* Reprogram host MSI routing from the updated emulated state. */
		error = vm_setup_pptdev_msi(ctx, sc->pptfd,
		    pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum);
		if (error != 0)
			err(1, "vm_setup_pptdev_msi");
		return (0);
	}

	if (msixcap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
		    PCIY_MSIX);
		if (pi->pi_msix.enabled) {
			/* Push every emulated table entry to the host. */
			msix_table_entries = pi->pi_msix.table_count;
			for (i = 0; i < msix_table_entries; i++) {
				error = vm_setup_pptdev_msix(ctx,
				    sc->pptfd, i,
				    pi->pi_msix.table[i].addr,
				    pi->pi_msix.table[i].msg_data,
				    pi->pi_msix.table[i].vector_control);

				if (error)
					err(1, "vm_setup_pptdev_msix");
			}
		} else {
			error = vm_disable_pptdev_msix(ctx, sc->pptfd);
			if (error)
				err(1, "vm_disable_pptdev_msix");
		}
		return (0);
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If this device does not support MSI natively then we cannot let
	 * the guest disable legacy interrupts from the device. It is the
	 * legacy interrupt that is triggering the virtual MSI to the guest.
	 */
	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
		if (coff == PCIR_COMMAND && bytes == 2)
			val &= ~PCIM_CMD_INTxDIS;
	}
#endif

	passthru_write_config(sc, coff, bytes, val);
	if (coff == PCIR_COMMAND) {
		/*
		 * Mirror command-register writes into the emulated config
		 * space and notify the emulation of the change.
		 */
		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
		if (bytes == 1)
			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
		else if (bytes == 2)
			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
		pci_emul_cmd_changed(pi, cmd_old);
	}

	return (0);
}
996 
/*
 * Per-register write handler that forces full emulation: always returns -1
 * ("not handled here") so the generic PCI emulation services the access.
 */
int
passthru_cfgwrite_emulate(struct passthru_softc *sc __unused,
    struct pci_devinst *pi __unused, int coff __unused, int bytes __unused,
    uint32_t val __unused)
{
	return (-1);
}
1004 
1005 static int
passthru_cfgwrite(struct pci_devinst * pi,int coff,int bytes,uint32_t val)1006 passthru_cfgwrite(struct pci_devinst *pi, int coff, int bytes, uint32_t val)
1007 {
1008 	struct passthru_softc *sc;
1009 
1010 	sc = pi->pi_arg;
1011 
1012 	if (sc->psc_pcir_whandler[coff] != NULL)
1013 		return (sc->psc_pcir_whandler[coff](sc, pi, coff, bytes, val));
1014 
1015 	return (passthru_cfgwrite_default(sc, pi, coff, bytes, val));
1016 }
1017 
1018 static void
passthru_write(struct pci_devinst * pi,int baridx,uint64_t offset,int size,uint64_t value)1019 passthru_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
1020     uint64_t value)
1021 {
1022 	struct passthru_softc *sc = pi->pi_arg;
1023 	struct vmctx *ctx = pi->pi_vmctx;
1024 
1025 	if (baridx == pci_msix_table_bar(pi)) {
1026 		msix_table_write(ctx, sc, offset, size, value);
1027 	} else {
1028 		struct ppt_bar_io pbi;
1029 
1030 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
1031 
1032 		pbi.pbi_bar = baridx;
1033 		pbi.pbi_width = size;
1034 		pbi.pbi_off = offset;
1035 		pbi.pbi_data = value;
1036 		(void) ioctl(sc->pptfd, PPT_BAR_WRITE, &pbi);
1037 	}
1038 }
1039 
1040 static uint64_t
passthru_read(struct pci_devinst * pi,int baridx,uint64_t offset,int size)1041 passthru_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size)
1042 {
1043 	struct passthru_softc *sc = pi->pi_arg;
1044 	uint64_t val;
1045 
1046 	if (baridx == pci_msix_table_bar(pi)) {
1047 		val = msix_table_read(sc, offset, size);
1048 	} else {
1049 		struct ppt_bar_io pbi;
1050 
1051 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
1052 
1053 		pbi.pbi_bar = baridx;
1054 		pbi.pbi_width = size;
1055 		pbi.pbi_off = offset;
1056 		if (ioctl(sc->pptfd, PPT_BAR_READ, &pbi) == 0) {
1057 			val = pbi.pbi_data;
1058 		} else {
1059 			val = 0;
1060 		}
1061 	}
1062 
1063 	return (val);
1064 }
1065 
/*
 * Map (enabled) or unmap (!enabled) the guest-physical regions of the BAR
 * that contains the MSI-X table.  The page-aligned pages holding the table
 * itself are interposed (emulated in msix_table_read/write), so only the
 * portions of the BAR before and after those pages are passed straight
 * through to the device.
 */
static void
passthru_msix_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
		   int enabled, uint64_t address)
{
	struct passthru_softc *sc;
	size_t remaining;
	uint32_t table_size, table_offset;

	sc = pi->pi_arg;
	/* Page-aligned start of the table; the BAR below it is direct. */
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
	if (table_offset > 0) {
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
			    table_offset) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
			    table_offset, sc->psc_bar[baridx].addr) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
	/* Extent of the interposed region, rounded out to whole pages. */
	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);
	/*
	 * NOTE(review): remaining is size_t; this assumes table_offset +
	 * table_size never exceeds the BAR size (an underflow here would
	 * yield a huge bogus length) -- confirm against BAR sizing rules.
	 */
	remaining = pi->pi_bar[baridx].size - table_offset - table_size;
	if (remaining > 0) {
		/* Pass through whatever lies beyond the table pages. */
		address += table_offset + table_size;
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
			    remaining) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
			    remaining, sc->psc_bar[baridx].addr +
			    table_offset + table_size) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
}
1105 
1106 static void
passthru_mmio_addr(struct vmctx * ctx,struct pci_devinst * pi,int baridx,int enabled,uint64_t address)1107 passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
1108 		   int enabled, uint64_t address)
1109 {
1110 	struct passthru_softc *sc;
1111 
1112 	sc = pi->pi_arg;
1113 	if (!enabled) {
1114 		if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
1115 		    sc->psc_bar[baridx].size) != 0)
1116 			warnx("pci_passthru: unmap_pptdev_mmio failed");
1117 	} else {
1118 		if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
1119 		    sc->psc_bar[baridx].size, sc->psc_bar[baridx].addr) != 0)
1120 			warnx("pci_passthru: map_pptdev_mmio failed");
1121 	}
1122 }
1123 
1124 static void
passthru_addr_rom(struct pci_devinst * const pi,const int idx,const int enabled)1125 passthru_addr_rom(struct pci_devinst *const pi, const int idx,
1126     const int enabled)
1127 {
1128 	const uint64_t addr = pi->pi_bar[idx].addr;
1129 	const uint64_t size = pi->pi_bar[idx].size;
1130 
1131 	if (!enabled) {
1132 		if (vm_munmap_memseg(pi->pi_vmctx, addr, size) != 0) {
1133 			errx(4, "%s: munmap_memseg @ [%016lx - %016lx] failed",
1134 			    __func__, addr, addr + size);
1135 		}
1136 
1137 	} else {
1138 		if (vm_mmap_memseg(pi->pi_vmctx, addr, VM_PCIROM,
1139 			pi->pi_romoffset, size, PROT_READ | PROT_EXEC) != 0) {
1140 			errx(4, "%s: mmap_memseg @ [%016lx - %016lx]  failed",
1141 			    __func__, addr, addr + size);
1142 		}
1143 	}
1144 }
1145 
1146 static void
passthru_addr(struct pci_devinst * pi,int baridx,int enabled,uint64_t address)1147 passthru_addr(struct pci_devinst *pi, int baridx,
1148     int enabled, uint64_t address)
1149 {
1150 	struct vmctx *ctx = pi->pi_vmctx;
1151 
1152 	switch (pi->pi_bar[baridx].type) {
1153 	case PCIBAR_IO:
1154 		/* IO BARs are emulated */
1155 		break;
1156 	case PCIBAR_ROM:
1157 		passthru_addr_rom(pi, baridx, enabled);
1158 		break;
1159 	case PCIBAR_MEM32:
1160 	case PCIBAR_MEM64:
1161 		if (baridx == pci_msix_table_bar(pi))
1162 			passthru_msix_addr(ctx, pi, baridx, enabled, address);
1163 		else
1164 			passthru_mmio_addr(ctx, pi, baridx, enabled, address);
1165 		break;
1166 	default:
1167 		errx(4, "%s: invalid BAR type %d", __func__,
1168 		    pi->pi_bar[baridx].type);
1169 	}
1170 }
1171 
1172 static const struct pci_devemu passthru = {
1173 	.pe_emu		= "passthru",
1174 	.pe_init	= passthru_init,
1175 	.pe_legacy_config = passthru_legacy_config,
1176 	.pe_cfgwrite	= passthru_cfgwrite,
1177 	.pe_cfgread	= passthru_cfgread,
1178 	.pe_barwrite 	= passthru_write,
1179 	.pe_barread    	= passthru_read,
1180 	.pe_baraddr	= passthru_addr,
1181 };
1182 PCI_EMUL_SET(passthru);
1183 
/*
 * This isn't the right place for these functions which, on FreeBSD, can
 * read from or write to arbitrary devices. They are not supported on
 * illumos, not least because bhyve is generally run in a non-global zone
 * which doesn't have access to the devinfo tree.
 */
1190 uint32_t
read_config(const struct pcisel * sel __unused,long reg __unused,int width __unused)1191 read_config(const struct pcisel *sel __unused, long reg __unused,
1192     int width __unused)
1193 {
1194 	return (-1);
1195 }
1196 
1197 void
write_config(const struct pcisel * sel __unused,long reg __unused,int width __unused,uint32_t data __unused)1198 write_config(const struct pcisel *sel __unused, long reg __unused,
1199     int width __unused, uint32_t data __unused)
1200 {
1201        errx(4, "write_config() unimplemented on illumos");
1202 }
1203