xref: /illumos-gate/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c (revision 6884664d)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2021 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
62 /*
63  * Locking details:
64  *
65  * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
66  * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
67  * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
68  * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
69  * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70  */
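/*
 * Illustrative sketch (editorial addition, not in the original source):
 * when both locks are required, vmmdev_mtx is entered first, mirroring the
 * pattern used by vmm_drv_hold() below:
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 *	...
 *	mutex_exit(&vmm_mtx);
 */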
71 
72 static kmutex_t		vmmdev_mtx;
73 static dev_info_t	*vmmdev_dip;
74 static hma_reg_t	*vmmdev_hma_reg;
75 static uint_t		vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77 
78 static kmutex_t		vmm_mtx;
79 static list_t		vmm_list;
80 static list_t		vmm_destroy_list;
81 static id_space_t	*vmm_minors;
82 static void		*vmm_statep;
83 
84 static const char *vmmdev_hvm_name = "bhyve";
85 
86 /* For sdev plugin (/dev) */
87 #define	VMM_SDEV_ROOT "/dev/vmm"
88 
89 /* From uts/i86pc/io/vmm/intel/vmx.c */
90 extern int vmx_x86_supported(const char **);
91 
92 /* Holds and hooks from drivers external to vmm */
93 struct vmm_hold {
94 	list_node_t	vmh_node;
95 	vmm_softc_t	*vmh_sc;
96 	boolean_t	vmh_release_req;
97 	uint_t		vmh_ioport_hook_cnt;
98 };
99 
100 struct vmm_lease {
101 	list_node_t		vml_node;
102 	struct vm		*vml_vm;
103 	vm_client_t		*vml_vmclient;
104 	boolean_t		vml_expired;
105 	boolean_t		vml_break_deferred;
106 	boolean_t		(*vml_expire_func)(void *);
107 	void			*vml_expire_arg;
108 	struct vmm_hold		*vml_hold;
109 };
110 
111 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
112 static void vmm_lease_block(vmm_softc_t *);
113 static void vmm_lease_unblock(vmm_softc_t *);
114 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
115 static void vmm_kstat_init(vmm_softc_t *);
116 static void vmm_kstat_fini(vmm_softc_t *);
117 
118 static int
119 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
120 {
121 	int error;
122 	bool sysmem;
123 
124 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
125 	    NULL);
126 	if (error || mseg->len == 0)
127 		return (error);
128 
129 	if (!sysmem) {
130 		vmm_devmem_entry_t *de;
131 		list_t *dl = &sc->vmm_devmem_list;
132 
133 		for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
134 			if (de->vde_segid == mseg->segid) {
135 				break;
136 			}
137 		}
138 		if (de != NULL) {
139 			(void) strlcpy(mseg->name, de->vde_name,
140 			    sizeof (mseg->name));
141 		}
142 	} else {
143 		bzero(mseg->name, sizeof (mseg->name));
144 	}
145 
146 	return (error);
147 }
148 
149 /*
150  * The 'devmem' hack:
151  *
152  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
153  * in the vm which appear with their own name related to the vm under /dev.
154  * Since this would be a hassle from an sdev perspective and would require a
155  * new cdev interface (or complicate the existing one), we choose to implement
156  * this in a different manner.  When 'devmem' mappings are created, an
157  * identifying off_t is communicated back out to userspace.  That off_t,
158  * residing above the normal guest memory space, can be used to mmap the
159  * 'devmem' mapping from the already-open vm device.
160  */
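/*
 * Illustrative userspace sketch (editorial addition, not in the original
 * source; 'vmfd' and 'segid'/'len' are hypothetical values supplied by the
 * consumer): a devmem segment is located and mapped from the already-open
 * vm device roughly as follows:
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *	}
 */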
161 
162 static int
163 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
164 {
165 	off_t map_offset;
166 	vmm_devmem_entry_t *entry;
167 
168 	if (list_is_empty(&sc->vmm_devmem_list)) {
169 		map_offset = VM_DEVMEM_START;
170 	} else {
171 		entry = list_tail(&sc->vmm_devmem_list);
172 		map_offset = entry->vde_off + entry->vde_len;
173 		if (map_offset < entry->vde_off) {
174 			/* Do not tolerate overflow */
175 			return (ERANGE);
176 		}
177 		/*
178 		 * XXXJOY: We could choose to search the list for duplicate
179 		 * names and toss an error.  Since we're using the offset
180 		 * method for now, it does not make much of a difference.
181 		 */
182 	}
183 
184 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
185 	entry->vde_segid = mseg->segid;
186 	entry->vde_len = mseg->len;
187 	entry->vde_off = map_offset;
188 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
189 	list_insert_tail(&sc->vmm_devmem_list, entry);
190 
191 	return (0);
192 }
193 
194 static boolean_t
195 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
196     off_t *map_offp)
197 {
198 	list_t *dl = &sc->vmm_devmem_list;
199 	vmm_devmem_entry_t *de = NULL;
200 	const off_t map_end = off + len;
201 
202 	VERIFY(off >= VM_DEVMEM_START);
203 
204 	if (map_end < off) {
205 		/* No match on overflow */
206 		return (B_FALSE);
207 	}
208 
209 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
210 		const off_t item_end = de->vde_off + de->vde_len;
211 
212 		if (de->vde_off <= off && item_end >= map_end) {
213 			*segidp = de->vde_segid;
214 			*map_offp = off - de->vde_off;
215 			return (B_TRUE);
216 		}
217 	}
218 	return (B_FALSE);
219 }
220 
221 static void
222 vmmdev_devmem_purge(vmm_softc_t *sc)
223 {
224 	vmm_devmem_entry_t *entry;
225 
226 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
227 		kmem_free(entry, sizeof (*entry));
228 	}
229 }
230 
231 static int
232 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
233 {
234 	int error;
235 	bool sysmem = true;
236 
237 	if (VM_MEMSEG_NAME(mseg)) {
238 		sysmem = false;
239 	}
240 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
241 
242 	if (error == 0 && VM_MEMSEG_NAME(mseg)) {
243 		/*
244 		 * Rather than create a whole fresh device from which userspace
245 		 * can mmap this segment, instead make it available at an
246 		 * offset above where the main guest memory resides.
247 		 */
248 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
249 		if (error != 0) {
250 			vm_free_memseg(sc->vmm_vm, mseg->segid);
251 		}
252 	}
253 	return (error);
254 }
255 
256 /*
257  * Resource Locking and Exclusion
258  *
259  * Much of bhyve depends on key portions of VM state, such as the guest memory
260  * map, to remain unchanged while the guest is running.  As ported from
261  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
262  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
263  * performing the work of actually running the guest in VMX/SVM, would lock
264  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
265  * state, all of the vCPUs would be first locked, ensuring that the
266  * operation(s) could complete without any other threads stumbling into
267  * intermediate states.
268  *
269  * This approach is largely effective for bhyve.  Common operations, such as
270  * running the vCPUs, steer clear of lock contention.  The model begins to
271  * break down for operations which do not occur in the context of a specific
272  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
273  * thread in the bhyve process.  In order to properly protect those vCPU-less
274  * operations from encountering invalid states, additional locking is required.
275  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
276  * It does mean that class of operations will be serialized on locking the
277  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
278  * undue contention on the VM_MAXCPU-1 vCPU.
279  *
280  * In order to address the shortcomings of this model, the concept of a
281  * read/write lock has been added to bhyve.  Operations which change
282  * fundamental aspects of a VM (such as the memory map) must acquire the write
283  * lock, which also implies locking all of the vCPUs and waiting for all read
284  * lock holders to release.  While it increases the cost and waiting time for
285  * those few operations, it allows most hot-path operations on the VM (which
286  * depend on its configuration remaining stable) to occur with minimal locking.
287  *
288  * Consumers of the Driver API (see below) are a special case when it comes to
289  * this locking, since they may hold a read lock via the drv_lease mechanism
290  * for an extended period of time.  Rather than forcing those consumers to
291  * continuously poll for a write lock attempt, the lease system forces them to
292  * provide a release callback to trigger their clean-up (and potential later
293  * reacquisition) of the read lock.
294  */
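/*
 * Illustrative sketch (editorial addition, not in the original source): in
 * vmmdev_do_ioctl() below, a VM-wide configuration change holds the write
 * lock for the duration of the operation, e.g.:
 *
 *	vmm_write_lock(sc);
 *	error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
 *	    mm.len, mm.prot, mm.flags);
 *	vmm_write_unlock(sc);
 *
 * while hot-path, vCPU-less operations such as VM_LAPIC_MSI delivery take
 * only the read lock via vmm_read_lock()/vmm_read_unlock().
 */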
295 
296 static void
297 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
298 {
299 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
300 
301 	/*
302 	 * Since this state transition uses from_idle=true, it should not
303 	 * fail, but rather block until it can succeed.
304 	 */
305 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
306 }
307 
308 static void
309 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
310 {
311 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
312 
313 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
314 	vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
315 }
316 
317 static void
318 vmm_read_lock(vmm_softc_t *sc)
319 {
320 	rw_enter(&sc->vmm_rwlock, RW_READER);
321 }
322 
323 static void
324 vmm_read_unlock(vmm_softc_t *sc)
325 {
326 	rw_exit(&sc->vmm_rwlock);
327 }
328 
329 static void
330 vmm_write_lock(vmm_softc_t *sc)
331 {
332 	int maxcpus;
333 
334 	/* First lock all the vCPUs */
335 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
336 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
337 		vcpu_lock_one(sc, vcpu);
338 	}
339 
340 	/*
341 	 * Block vmm_drv leases from being acquired or held while the VM write
342 	 * lock is held.
343 	 */
344 	vmm_lease_block(sc);
345 
346 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
347 	/*
348 	 * For now, the 'maxcpus' value for an instance is fixed at the
349 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
350 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
351 	 * of the write lock will need to be wary of such changes.
352 	 */
353 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
354 }
355 
356 static void
357 vmm_write_unlock(vmm_softc_t *sc)
358 {
359 	int maxcpus;
360 
361 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
362 	vmm_lease_unblock(sc);
363 
364 	/*
365 	 * The VM write lock _must_ be released from the same thread it was
366 	 * acquired in, unlike the read lock.
367 	 */
368 	VERIFY(rw_write_held(&sc->vmm_rwlock));
369 	rw_exit(&sc->vmm_rwlock);
370 
371 	/* Unlock all the vCPUs */
372 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
373 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
374 		vcpu_unlock_one(sc, vcpu);
375 	}
376 }
377 
378 static int
379 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
380     cred_t *credp, int *rvalp)
381 {
382 	int error = 0, vcpu = -1;
383 	void *datap = (void *)arg;
384 	enum vm_lock_type {
385 		LOCK_NONE = 0,
386 		LOCK_VCPU,
387 		LOCK_READ_HOLD,
388 		LOCK_WRITE_HOLD
389 	} lock_type = LOCK_NONE;
390 
391 	/* Acquire any exclusion resources needed for the operation. */
392 	switch (cmd) {
393 	case VM_RUN:
394 	case VM_GET_REGISTER:
395 	case VM_SET_REGISTER:
396 	case VM_GET_SEGMENT_DESCRIPTOR:
397 	case VM_SET_SEGMENT_DESCRIPTOR:
398 	case VM_GET_REGISTER_SET:
399 	case VM_SET_REGISTER_SET:
400 	case VM_INJECT_EXCEPTION:
401 	case VM_GET_CAPABILITY:
402 	case VM_SET_CAPABILITY:
403 	case VM_PPTDEV_MSI:
404 	case VM_PPTDEV_MSIX:
405 	case VM_SET_X2APIC_STATE:
406 	case VM_GLA2GPA:
407 	case VM_GLA2GPA_NOFAULT:
408 	case VM_ACTIVATE_CPU:
409 	case VM_SET_INTINFO:
410 	case VM_GET_INTINFO:
411 	case VM_RESTART_INSTRUCTION:
412 	case VM_SET_KERNEMU_DEV:
413 	case VM_GET_KERNEMU_DEV:
414 	case VM_RESET_CPU:
415 	case VM_GET_RUN_STATE:
416 	case VM_SET_RUN_STATE:
417 		/*
418 		 * Copy in the ID of the vCPU chosen for this operation.
419 		 * Since a nefarious caller could update their struct between
420 		 * this locking and when the rest of the ioctl data is copied
421 		 * in, it is _critical_ that this local 'vcpu' variable be used
422 		 * rather than the in-struct one when performing the ioctl.
423 		 */
424 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
425 			return (EFAULT);
426 		}
427 		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
428 			return (EINVAL);
429 		}
430 		vcpu_lock_one(sc, vcpu);
431 		lock_type = LOCK_VCPU;
432 		break;
433 
434 	case VM_REINIT:
435 	case VM_BIND_PPTDEV:
436 	case VM_UNBIND_PPTDEV:
437 	case VM_MAP_PPTDEV_MMIO:
438 	case VM_UNMAP_PPTDEV_MMIO:
439 	case VM_ALLOC_MEMSEG:
440 	case VM_MMAP_MEMSEG:
441 	case VM_MUNMAP_MEMSEG:
442 	case VM_WRLOCK_CYCLE:
443 	case VM_PMTMR_LOCATE:
444 		vmm_write_lock(sc);
445 		lock_type = LOCK_WRITE_HOLD;
446 		break;
447 
448 	case VM_GET_MEMSEG:
449 	case VM_MMAP_GETNEXT:
450 	case VM_LAPIC_IRQ:
451 	case VM_INJECT_NMI:
452 	case VM_IOAPIC_ASSERT_IRQ:
453 	case VM_IOAPIC_DEASSERT_IRQ:
454 	case VM_IOAPIC_PULSE_IRQ:
455 	case VM_LAPIC_MSI:
456 	case VM_LAPIC_LOCAL_IRQ:
457 	case VM_GET_X2APIC_STATE:
458 	case VM_RTC_READ:
459 	case VM_RTC_WRITE:
460 	case VM_RTC_SETTIME:
461 	case VM_RTC_GETTIME:
462 	case VM_PPTDEV_DISABLE_MSIX:
463 	case VM_DEVMEM_GETOFFSET:
464 		vmm_read_lock(sc);
465 		lock_type = LOCK_READ_HOLD;
466 		break;
467 
468 	case VM_GET_GPA_PMAP:
469 	case VM_IOAPIC_PINCOUNT:
470 	case VM_SUSPEND:
471 	default:
472 		break;
473 	}
474 
475 	/* Execute the primary logic for the ioctl. */
476 	switch (cmd) {
477 	case VM_RUN: {
478 		struct vm_entry entry;
479 
480 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
481 			error = EFAULT;
482 			break;
483 		}
484 
485 		if (!(curthread->t_schedflag & TS_VCPU))
486 			smt_mark_as_vcpu();
487 
488 		error = vm_run(sc->vmm_vm, vcpu, &entry);
489 
490 		/*
491 		 * Unexpected states in vm_run() are expressed through positive
492 		 * errno-oriented return values.  VM states which expect further
493 		 * processing in userspace (necessary context via exitinfo) are
494 		 * expressed through negative return values.  For the time being
495 		 * a return value of 0 is not expected from vm_run().
496 		 */
497 		ASSERT(error != 0);
498 		if (error < 0) {
499 			const struct vm_exit *vme;
500 			void *outp = entry.exit_data;
501 
502 			error = 0;
503 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
504 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
505 				error = EFAULT;
506 			}
507 		}
508 		break;
509 	}
510 	case VM_SUSPEND: {
511 		struct vm_suspend vmsuspend;
512 
513 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
514 			error = EFAULT;
515 			break;
516 		}
517 		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
518 		break;
519 	}
520 	case VM_REINIT: {
521 		struct vm_reinit reinit;
522 
523 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
524 			error = EFAULT;
525 			break;
526 		}
527 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
528 			/*
529 			 * The VM instance should be free of driver-attached
530 			 * hooks during the reinitialization process.
531 			 */
532 			break;
533 		}
534 		error = vm_reinit(sc->vmm_vm, reinit.flags);
535 		(void) vmm_drv_block_hook(sc, B_FALSE);
536 		break;
537 	}
538 	case VM_STAT_DESC: {
539 		struct vm_stat_desc statdesc;
540 
541 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
542 			error = EFAULT;
543 			break;
544 		}
545 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
546 		    sizeof (statdesc.desc));
547 		if (error == 0 &&
548 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
549 			error = EFAULT;
550 			break;
551 		}
552 		break;
553 	}
554 	case VM_STATS_IOC: {
555 		struct vm_stats vmstats;
556 
557 		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
558 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
559 			error = EFAULT;
560 			break;
561 		}
562 		hrt2tv(gethrtime(), &vmstats.tv);
563 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
564 		    &vmstats.num_entries, vmstats.statbuf);
565 		if (error == 0 &&
566 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
567 			error = EFAULT;
568 			break;
569 		}
570 		break;
571 	}
572 
573 	case VM_PPTDEV_MSI: {
574 		struct vm_pptdev_msi pptmsi;
575 
576 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
577 			error = EFAULT;
578 			break;
579 		}
580 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
581 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
582 		break;
583 	}
584 	case VM_PPTDEV_MSIX: {
585 		struct vm_pptdev_msix pptmsix;
586 
587 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
588 			error = EFAULT;
589 			break;
590 		}
591 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
592 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
593 		    pptmsix.vector_control);
594 		break;
595 	}
596 	case VM_PPTDEV_DISABLE_MSIX: {
597 		struct vm_pptdev pptdev;
598 
599 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
600 			error = EFAULT;
601 			break;
602 		}
603 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
604 		break;
605 	}
606 	case VM_MAP_PPTDEV_MMIO: {
607 		struct vm_pptdev_mmio pptmmio;
608 
609 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
610 			error = EFAULT;
611 			break;
612 		}
613 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
614 		    pptmmio.len, pptmmio.hpa);
615 		break;
616 	}
617 	case VM_UNMAP_PPTDEV_MMIO: {
618 		struct vm_pptdev_mmio pptmmio;
619 
620 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
621 			error = EFAULT;
622 			break;
623 		}
624 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
625 		    pptmmio.len);
626 		break;
627 	}
628 	case VM_BIND_PPTDEV: {
629 		struct vm_pptdev pptdev;
630 
631 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
632 			error = EFAULT;
633 			break;
634 		}
635 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
636 		break;
637 	}
638 	case VM_UNBIND_PPTDEV: {
639 		struct vm_pptdev pptdev;
640 
641 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
642 			error = EFAULT;
643 			break;
644 		}
645 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
646 		break;
647 	}
648 	case VM_GET_PPTDEV_LIMITS: {
649 		struct vm_pptdev_limits pptlimits;
650 
651 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
652 			error = EFAULT;
653 			break;
654 		}
655 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
656 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
657 		if (error == 0 &&
658 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
659 			error = EFAULT;
660 			break;
661 		}
662 		break;
663 	}
664 	case VM_INJECT_EXCEPTION: {
665 		struct vm_exception vmexc;
666 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
667 			error = EFAULT;
668 			break;
669 		}
670 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
671 		    vmexc.error_code_valid, vmexc.error_code,
672 		    vmexc.restart_instruction);
673 		break;
674 	}
675 	case VM_INJECT_NMI: {
676 		struct vm_nmi vmnmi;
677 
678 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
679 			error = EFAULT;
680 			break;
681 		}
682 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
683 		break;
684 	}
685 	case VM_LAPIC_IRQ: {
686 		struct vm_lapic_irq vmirq;
687 
688 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
689 			error = EFAULT;
690 			break;
691 		}
692 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
693 		break;
694 	}
695 	case VM_LAPIC_LOCAL_IRQ: {
696 		struct vm_lapic_irq vmirq;
697 
698 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
699 			error = EFAULT;
700 			break;
701 		}
702 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
703 		    vmirq.vector);
704 		break;
705 	}
706 	case VM_LAPIC_MSI: {
707 		struct vm_lapic_msi vmmsi;
708 
709 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
710 			error = EFAULT;
711 			break;
712 		}
713 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
714 		break;
715 	}
716 
717 	case VM_IOAPIC_ASSERT_IRQ: {
718 		struct vm_ioapic_irq ioapic_irq;
719 
720 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
721 			error = EFAULT;
722 			break;
723 		}
724 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
725 		break;
726 	}
727 	case VM_IOAPIC_DEASSERT_IRQ: {
728 		struct vm_ioapic_irq ioapic_irq;
729 
730 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
731 			error = EFAULT;
732 			break;
733 		}
734 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
735 		break;
736 	}
737 	case VM_IOAPIC_PULSE_IRQ: {
738 		struct vm_ioapic_irq ioapic_irq;
739 
740 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
741 			error = EFAULT;
742 			break;
743 		}
744 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
745 		break;
746 	}
747 	case VM_IOAPIC_PINCOUNT: {
748 		int pincount;
749 
750 		pincount = vioapic_pincount(sc->vmm_vm);
751 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
752 			error = EFAULT;
753 			break;
754 		}
755 		break;
756 	}
757 
758 	case VM_ISA_ASSERT_IRQ: {
759 		struct vm_isa_irq isa_irq;
760 
761 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
762 			error = EFAULT;
763 			break;
764 		}
765 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
766 		if (error == 0 && isa_irq.ioapic_irq != -1) {
767 			error = vioapic_assert_irq(sc->vmm_vm,
768 			    isa_irq.ioapic_irq);
769 		}
770 		break;
771 	}
772 	case VM_ISA_DEASSERT_IRQ: {
773 		struct vm_isa_irq isa_irq;
774 
775 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
776 			error = EFAULT;
777 			break;
778 		}
779 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
780 		if (error == 0 && isa_irq.ioapic_irq != -1) {
781 			error = vioapic_deassert_irq(sc->vmm_vm,
782 			    isa_irq.ioapic_irq);
783 		}
784 		break;
785 	}
786 	case VM_ISA_PULSE_IRQ: {
787 		struct vm_isa_irq isa_irq;
788 
789 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
790 			error = EFAULT;
791 			break;
792 		}
793 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
794 		if (error == 0 && isa_irq.ioapic_irq != -1) {
795 			error = vioapic_pulse_irq(sc->vmm_vm,
796 			    isa_irq.ioapic_irq);
797 		}
798 		break;
799 	}
800 	case VM_ISA_SET_IRQ_TRIGGER: {
801 		struct vm_isa_irq_trigger isa_irq_trigger;
802 
803 		if (ddi_copyin(datap, &isa_irq_trigger,
804 		    sizeof (isa_irq_trigger), md)) {
805 			error = EFAULT;
806 			break;
807 		}
808 		error = vatpic_set_irq_trigger(sc->vmm_vm,
809 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
810 		break;
811 	}
812 
813 	case VM_MMAP_GETNEXT: {
814 		struct vm_memmap mm;
815 
816 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
817 			error = EFAULT;
818 			break;
819 		}
820 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
821 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
822 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
823 			error = EFAULT;
824 			break;
825 		}
826 		break;
827 	}
828 	case VM_MMAP_MEMSEG: {
829 		struct vm_memmap mm;
830 
831 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
832 			error = EFAULT;
833 			break;
834 		}
835 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
836 		    mm.len, mm.prot, mm.flags);
837 		break;
838 	}
839 	case VM_MUNMAP_MEMSEG: {
840 		struct vm_munmap mu;
841 
842 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
843 			error = EFAULT;
844 			break;
845 		}
846 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
847 		break;
848 	}
849 	case VM_ALLOC_MEMSEG: {
850 		struct vm_memseg vmseg;
851 
852 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
853 			error = EFAULT;
854 			break;
855 		}
856 		error = vmmdev_alloc_memseg(sc, &vmseg);
857 		break;
858 	}
859 	case VM_GET_MEMSEG: {
860 		struct vm_memseg vmseg;
861 
862 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
863 			error = EFAULT;
864 			break;
865 		}
866 		error = vmmdev_get_memseg(sc, &vmseg);
867 		if (error == 0 &&
868 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
869 			error = EFAULT;
870 			break;
871 		}
872 		break;
873 	}
874 	case VM_GET_REGISTER: {
875 		struct vm_register vmreg;
876 
877 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
878 			error = EFAULT;
879 			break;
880 		}
881 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
882 		    &vmreg.regval);
883 		if (error == 0 &&
884 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
885 			error = EFAULT;
886 			break;
887 		}
888 		break;
889 	}
890 	case VM_SET_REGISTER: {
891 		struct vm_register vmreg;
892 
893 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
894 			error = EFAULT;
895 			break;
896 		}
897 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
898 		    vmreg.regval);
899 		break;
900 	}
901 	case VM_SET_SEGMENT_DESCRIPTOR: {
902 		struct vm_seg_desc vmsegd;
903 
904 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
905 			error = EFAULT;
906 			break;
907 		}
908 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
909 		    &vmsegd.desc);
910 		break;
911 	}
912 	case VM_GET_SEGMENT_DESCRIPTOR: {
913 		struct vm_seg_desc vmsegd;
914 
915 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
916 			error = EFAULT;
917 			break;
918 		}
919 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
920 		    &vmsegd.desc);
921 		if (error == 0 &&
922 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
923 			error = EFAULT;
924 			break;
925 		}
926 		break;
927 	}
928 	case VM_GET_REGISTER_SET: {
929 		struct vm_register_set vrs;
930 		int regnums[VM_REG_LAST];
931 		uint64_t regvals[VM_REG_LAST];
932 
933 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
934 			error = EFAULT;
935 			break;
936 		}
937 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
938 			error = EINVAL;
939 			break;
940 		}
941 		if (ddi_copyin(vrs.regnums, regnums,
942 		    sizeof (int) * vrs.count, md)) {
943 			error = EFAULT;
944 			break;
945 		}
946 
947 		error = 0;
948 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
949 			if (regnums[i] < 0) {
950 				error = EINVAL;
951 				break;
952 			}
953 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
954 			    &regvals[i]);
955 		}
956 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
957 		    sizeof (uint64_t) * vrs.count, md)) {
958 			error = EFAULT;
959 		}
960 		break;
961 	}
962 	case VM_SET_REGISTER_SET: {
963 		struct vm_register_set vrs;
964 		int regnums[VM_REG_LAST];
965 		uint64_t regvals[VM_REG_LAST];
966 
967 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
968 			error = EFAULT;
969 			break;
970 		}
971 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
972 			error = EINVAL;
973 			break;
974 		}
975 		if (ddi_copyin(vrs.regnums, regnums,
976 		    sizeof (int) * vrs.count, md)) {
977 			error = EFAULT;
978 			break;
979 		}
980 		if (ddi_copyin(vrs.regvals, regvals,
981 		    sizeof (uint64_t) * vrs.count, md)) {
982 			error = EFAULT;
983 			break;
984 		}
985 
986 		error = 0;
987 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
988 			/*
989 			 * Setting registers in a set is not atomic, since a
990 			 * failure in the middle of the set will cause a
991 			 * bail-out and inconsistent register state.  Callers
992 			 * should be wary of this.
993 			 */
994 			if (regnums[i] < 0) {
995 				error = EINVAL;
996 				break;
997 			}
998 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
999 			    regvals[i]);
1000 		}
1001 		break;
1002 	}
1003 	case VM_RESET_CPU: {
1004 		struct vm_vcpu_reset vvr;
1005 
1006 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1007 			error = EFAULT;
1008 			break;
1009 		}
1010 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1011 			error = EINVAL;
1012 			break;
1013 		}
1014 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1015 		break;
1016 	}
1017 	case VM_GET_RUN_STATE: {
1018 		struct vm_run_state vrs;
1019 
1020 		bzero(&vrs, sizeof (vrs));
1021 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1022 		    &vrs.sipi_vector);
1023 		if (error == 0) {
1024 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1025 				error = EFAULT;
1026 				break;
1027 			}
1028 		}
1029 		break;
1030 	}
1031 	case VM_SET_RUN_STATE: {
1032 		struct vm_run_state vrs;
1033 
1034 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1035 			error = EFAULT;
1036 			break;
1037 		}
1038 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1039 		    vrs.sipi_vector);
1040 		break;
1041 	}
1042 
1043 	case VM_SET_KERNEMU_DEV:
1044 	case VM_GET_KERNEMU_DEV: {
1045 		struct vm_readwrite_kernemu_device kemu;
1046 		size_t size = 0;
1047 
1048 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1049 			error = EFAULT;
1050 			break;
1051 		}
1052 
1053 		if (kemu.access_width > 3) {
1054 			error = EINVAL;
1055 			break;
1056 		}
1057 		size = (1 << kemu.access_width);
1058 		ASSERT(size >= 1 && size <= 8);
1059 
1060 		if (cmd == VM_SET_KERNEMU_DEV) {
1061 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1062 			    kemu.gpa, kemu.value, size);
1063 		} else {
1064 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1065 			    kemu.gpa, &kemu.value, size);
1066 		}
1067 
1068 		if (error == 0) {
1069 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1070 				error = EFAULT;
1071 				break;
1072 			}
1073 		}
1074 		break;
1075 	}
1076 
1077 	case VM_GET_CAPABILITY: {
1078 		struct vm_capability vmcap;
1079 
1080 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1081 			error = EFAULT;
1082 			break;
1083 		}
1084 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1085 		    &vmcap.capval);
1086 		if (error == 0 &&
1087 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1088 			error = EFAULT;
1089 			break;
1090 		}
1091 		break;
1092 	}
1093 	case VM_SET_CAPABILITY: {
1094 		struct vm_capability vmcap;
1095 
1096 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1097 			error = EFAULT;
1098 			break;
1099 		}
1100 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1101 		    vmcap.capval);
1102 		break;
1103 	}
1104 	case VM_SET_X2APIC_STATE: {
1105 		struct vm_x2apic x2apic;
1106 
1107 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1108 			error = EFAULT;
1109 			break;
1110 		}
1111 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1112 		break;
1113 	}
1114 	case VM_GET_X2APIC_STATE: {
1115 		struct vm_x2apic x2apic;
1116 
1117 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1118 			error = EFAULT;
1119 			break;
1120 		}
1121 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1122 		    &x2apic.state);
1123 		if (error == 0 &&
1124 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1125 			error = EFAULT;
1126 			break;
1127 		}
1128 		break;
1129 	}
1130 	case VM_GET_GPA_PMAP: {
1131 		/*
1132 		 * Until there is a necessity to leak EPT/RVI PTE values to
1133 		 * userspace, this will remain unimplemented
1134 		 */
1135 		error = EINVAL;
1136 		break;
1137 	}
1138 	case VM_GET_HPET_CAPABILITIES: {
1139 		struct vm_hpet_cap hpetcap;
1140 
1141 		error = vhpet_getcap(&hpetcap);
1142 		if (error == 0 &&
1143 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1144 			error = EFAULT;
1145 			break;
1146 		}
1147 		break;
1148 	}
1149 	case VM_GLA2GPA: {
1150 		struct vm_gla2gpa gg;
1151 
1152 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1153 			error = EFAULT;
1154 			break;
1155 		}
1156 		gg.vcpuid = vcpu;
1157 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1158 		    gg.prot, &gg.gpa, &gg.fault);
1159 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1160 			error = EFAULT;
1161 			break;
1162 		}
1163 		break;
1164 	}
1165 	case VM_GLA2GPA_NOFAULT: {
1166 		struct vm_gla2gpa gg;
1167 
1168 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1169 			error = EFAULT;
1170 			break;
1171 		}
1172 		gg.vcpuid = vcpu;
1173 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1174 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1175 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1176 			error = EFAULT;
1177 			break;
1178 		}
1179 		break;
1180 	}
1181 
1182 	case VM_ACTIVATE_CPU:
1183 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1184 		break;
1185 
1186 	case VM_SUSPEND_CPU:
1187 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1188 			error = EFAULT;
1189 		} else {
1190 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1191 		}
1192 		break;
1193 
1194 	case VM_RESUME_CPU:
1195 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1196 			error = EFAULT;
1197 		} else {
1198 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1199 		}
1200 		break;
1201 
1202 	case VM_GET_CPUS: {
1203 		struct vm_cpuset vm_cpuset;
1204 		cpuset_t tempset;
1205 		void *srcp = &tempset;
1206 		int size;
1207 
1208 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1209 			error = EFAULT;
1210 			break;
1211 		}
1212 
1213 		/* Be more generous about sizing since our cpuset_t is large. */
1214 		size = vm_cpuset.cpusetsize;
1215 		if (size <= 0 || size > sizeof (cpuset_t)) {
1216 			error = ERANGE;
1217 		}
1218 		/*
1219 		 * If they want a ulong_t or less, make sure they receive the
1220 		 * low bits with all the useful information.
1221 		 */
1222 		if (size <= sizeof (tempset.cpub[0])) {
1223 			srcp = &tempset.cpub[0];
1224 		}
1225 
1226 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1227 			tempset = vm_active_cpus(sc->vmm_vm);
1228 		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1229 			tempset = vm_suspended_cpus(sc->vmm_vm);
1230 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1231 			tempset = vm_debug_cpus(sc->vmm_vm);
1232 		} else {
1233 			error = EINVAL;
1234 		}
1235 
1236 		ASSERT(size > 0 && size <= sizeof (tempset));
1237 		if (error == 0 &&
1238 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1239 			error = EFAULT;
1240 			break;
1241 		}
1242 		break;
1243 	}
1244 	case VM_SET_INTINFO: {
1245 		struct vm_intinfo vmii;
1246 
1247 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1248 			error = EFAULT;
1249 			break;
1250 		}
1251 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1252 		break;
1253 	}
1254 	case VM_GET_INTINFO: {
1255 		struct vm_intinfo vmii;
1256 
1257 		vmii.vcpuid = vcpu;
1258 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1259 		    &vmii.info2);
1260 		if (error == 0 &&
1261 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1262 			error = EFAULT;
1263 			break;
1264 		}
1265 		break;
1266 	}
1267 	case VM_RTC_WRITE: {
1268 		struct vm_rtc_data rtcdata;
1269 
1270 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1271 			error = EFAULT;
1272 			break;
1273 		}
1274 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1275 		    rtcdata.value);
1276 		break;
1277 	}
1278 	case VM_RTC_READ: {
1279 		struct vm_rtc_data rtcdata;
1280 
1281 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1282 			error = EFAULT;
1283 			break;
1284 		}
1285 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1286 		    &rtcdata.value);
1287 		if (error == 0 &&
1288 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1289 			error = EFAULT;
1290 			break;
1291 		}
1292 		break;
1293 	}
1294 	case VM_RTC_SETTIME: {
1295 		struct vm_rtc_time rtctime;
1296 
1297 		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1298 			error = EFAULT;
1299 			break;
1300 		}
1301 		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1302 		break;
1303 	}
1304 	case VM_RTC_GETTIME: {
1305 		struct vm_rtc_time rtctime;
1306 
1307 		rtctime.secs = vrtc_get_time(sc->vmm_vm);
1308 		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1309 			error = EFAULT;
1310 			break;
1311 		}
1312 		break;
1313 	}
1314 
1315 	case VM_PMTMR_LOCATE: {
1316 		uint16_t port = arg;
1317 		error = vpmtmr_set_location(sc->vmm_vm, port);
1318 		break;
1319 	}
1320 
1321 	case VM_RESTART_INSTRUCTION:
1322 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1323 		break;
1324 
1325 	case VM_SET_TOPOLOGY: {
1326 		struct vm_cpu_topology topo;
1327 
1328 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1329 			error = EFAULT;
1330 			break;
1331 		}
1332 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1333 		    topo.threads, topo.maxcpus);
1334 		break;
1335 	}
1336 	case VM_GET_TOPOLOGY: {
1337 		struct vm_cpu_topology topo;
1338 
1339 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1340 		    &topo.threads, &topo.maxcpus);
1341 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1342 			error = EFAULT;
1343 			break;
1344 		}
1345 		break;
1346 	}
1347 
1348 	case VM_DEVMEM_GETOFFSET: {
1349 		struct vm_devmem_offset vdo;
1350 		list_t *dl = &sc->vmm_devmem_list;
1351 		vmm_devmem_entry_t *de = NULL;
1352 
1353 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1354 			error = EFAULT;
1355 			break;
1356 		}
1357 
1358 		for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
1359 			if (de->vde_segid == vdo.segid) {
1360 				break;
1361 			}
1362 		}
1363 		if (de != NULL) {
1364 			vdo.offset = de->vde_off;
1365 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1366 				error = EFAULT;
1367 			}
1368 		} else {
1369 			error = ENOENT;
1370 		}
1371 		break;
1372 	}
1373 	case VM_WRLOCK_CYCLE: {
1374 		/*
1375 		 * Present a test mechanism to acquire/release the write lock
1376 		 * on the VM without any other effects.
1377 		 */
1378 		break;
1379 	}
1380 
1381 	default:
1382 		error = ENOTTY;
1383 		break;
1384 	}
1385 
1386 	/* Release exclusion resources */
1387 	switch (lock_type) {
1388 	case LOCK_NONE:
1389 		break;
1390 	case LOCK_VCPU:
1391 		vcpu_unlock_one(sc, vcpu);
1392 		break;
1393 	case LOCK_READ_HOLD:
1394 		vmm_read_unlock(sc);
1395 		break;
1396 	case LOCK_WRITE_HOLD:
1397 		vmm_write_unlock(sc);
1398 		break;
1399 	default:
1400 		panic("unexpected lock type");
1401 		break;
1402 	}
1403 
1404 	return (error);
1405 }
1406 
1407 static vmm_softc_t *
1408 vmm_lookup(const char *name)
1409 {
1410 	list_t *vml = &vmm_list;
1411 	vmm_softc_t *sc;
1412 
1413 	ASSERT(MUTEX_HELD(&vmm_mtx));
1414 
1415 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1416 		if (strcmp(sc->vmm_name, name) == 0) {
1417 			break;
1418 		}
1419 	}
1420 
1421 	return (sc);
1422 }
1423 
1424 /*
1425  * Acquire an HMA registration if not already held.
1426  */
1427 static boolean_t
1428 vmm_hma_acquire(void)
1429 {
1430 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1431 
1432 	mutex_enter(&vmmdev_mtx);
1433 
1434 	if (vmmdev_hma_reg == NULL) {
1435 		VERIFY3U(vmmdev_hma_ref, ==, 0);
1436 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1437 		if (vmmdev_hma_reg == NULL) {
1438 			cmn_err(CE_WARN, "%s HMA registration failed.",
1439 			    vmmdev_hvm_name);
1440 			mutex_exit(&vmmdev_mtx);
1441 			return (B_FALSE);
1442 		}
1443 	}
1444 
1445 	vmmdev_hma_ref++;
1446 
1447 	mutex_exit(&vmmdev_mtx);
1448 
1449 	return (B_TRUE);
1450 }
1451 
1452 /*
1453  * Release the HMA registration if held and there are no remaining VMs.
1454  */
1455 static void
1456 vmm_hma_release(void)
1457 {
1458 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1459 
1460 	mutex_enter(&vmmdev_mtx);
1461 
1462 	VERIFY3U(vmmdev_hma_ref, !=, 0);
1463 
1464 	vmmdev_hma_ref--;
1465 
1466 	if (vmmdev_hma_ref == 0) {
1467 		VERIFY(vmmdev_hma_reg != NULL);
1468 		hma_unregister(vmmdev_hma_reg);
1469 		vmmdev_hma_reg = NULL;
1470 	}
1471 	mutex_exit(&vmmdev_mtx);
1472 }
1473 
1474 static int
1475 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1476 {
1477 	vmm_softc_t	*sc = NULL;
1478 	minor_t		minor;
1479 	int		error = ENOMEM;
1480 	size_t		len;
1481 	const char	*name = req->name;
1482 
1483 	len = strnlen(name, VM_MAX_NAMELEN);
1484 	if (len == 0) {
1485 		return (EINVAL);
1486 	}
1487 	if (len >= VM_MAX_NAMELEN) {
1488 		return (ENAMETOOLONG);
1489 	}
1490 	if (strchr(name, '/') != NULL) {
1491 		return (EINVAL);
1492 	}
1493 
1494 	if (!vmm_hma_acquire())
1495 		return (ENXIO);
1496 
1497 	mutex_enter(&vmm_mtx);
1498 
1499 	/* Look for duplicate names */
1500 	if (vmm_lookup(name) != NULL) {
1501 		mutex_exit(&vmm_mtx);
1502 		vmm_hma_release();
1503 		return (EEXIST);
1504 	}
1505 
1506 	/* Allow only one instance per non-global zone. */
1507 	if (!INGLOBALZONE(curproc)) {
1508 		for (sc = list_head(&vmm_list); sc != NULL;
1509 		    sc = list_next(&vmm_list, sc)) {
1510 			if (sc->vmm_zone == curzone) {
1511 				mutex_exit(&vmm_mtx);
1512 				vmm_hma_release();
1513 				return (EINVAL);
1514 			}
1515 		}
1516 	}
1517 
1518 	minor = id_alloc(vmm_minors);
1519 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1520 		goto fail;
1521 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1522 		ddi_soft_state_free(vmm_statep, minor);
1523 		goto fail;
1524 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1525 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
1526 		goto fail;
1527 	}
1528 
1529 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1530 		goto fail;
1531 	}
1532 
1533 	error = vm_create(req->name, req->flags, &sc->vmm_vm);
1534 	if (error == 0) {
1535 		/* Complete VM initialization and report success. */
1536 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1537 		sc->vmm_minor = minor;
1538 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1539 		    offsetof(vmm_devmem_entry_t, vde_node));
1540 
1541 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1542 		    offsetof(vmm_hold_t, vmh_node));
1543 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1544 
1545 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1546 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1547 		    offsetof(vmm_lease_t, vml_node));
1548 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1549 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1550 
1551 		sc->vmm_zone = crgetzone(cr);
1552 		zone_hold(sc->vmm_zone);
1553 		vmm_zsd_add_vm(sc);
1554 		vmm_kstat_init(sc);
1555 
1556 		list_insert_tail(&vmm_list, sc);
1557 		mutex_exit(&vmm_mtx);
1558 		return (0);
1559 	}
1560 
1561 	vmm_kstat_fini(sc);
1562 	ddi_remove_minor_node(vmmdev_dip, name);
1563 fail:
1564 	id_free(vmm_minors, minor);
1565 	if (sc != NULL) {
1566 		ddi_soft_state_free(vmm_statep, minor);
1567 	}
1568 	mutex_exit(&vmm_mtx);
1569 	vmm_hma_release();
1570 
1571 	return (error);
1572 }
1573 
1574 /*
1575  * Bhyve 'Driver' Interface
1576  *
1577  * While many devices are emulated in the bhyve userspace process, there are
1578  * others with performance constraints which require that they run mostly or
1579  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
1580  * needed so they can query/manipulate the portions of VM state needed to
1581  * fulfill their purpose.
1582  *
1583  * This includes:
1584  * - Translating guest-physical addresses to host-virtual pointers
1585  * - Injecting MSIs
1586  * - Hooking IO port addresses
1587  *
1588  * The vmm_drv interface exists to provide that functionality to its consumers.
1589  * (At this time, 'viona' is the only user)
1590  */
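/*
 * Illustrative consumer flow (editorial addition, not in the original
 * source; error handling elided): a driver such as viona would roughly do:
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	if (vmm_drv_hold(fp, cr, &hold) == 0) {
 *		lease = vmm_drv_lease_sign(hold, expire_cb, arg);
 *		void *kva = vmm_drv_gpa2kva(lease, gpa, PAGESIZE);
 *		(void) vmm_drv_msi(lease, msi_addr, msi_msg);
 *		vmm_drv_lease_break(hold, lease);
 *		vmm_drv_rele(hold);
 *	}
 *
 * where 'expire_cb', 'arg', 'gpa', 'msi_addr', and 'msi_msg' are
 * hypothetical placeholders supplied by the consumer.
 */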
1591 int
1592 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1593 {
1594 	vnode_t *vp = fp->f_vnode;
1595 	const dev_t dev = vp->v_rdev;
1596 	vmm_softc_t *sc;
1597 	vmm_hold_t *hold;
1598 	int err = 0;
1599 
1600 	if (vp->v_type != VCHR) {
1601 		return (ENXIO);
1602 	}
1603 	const major_t major = getmajor(dev);
1604 	const minor_t minor = getminor(dev);
1605 
1606 	mutex_enter(&vmmdev_mtx);
1607 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1608 		mutex_exit(&vmmdev_mtx);
1609 		return (ENOENT);
1610 	}
1611 	mutex_enter(&vmm_mtx);
1612 	mutex_exit(&vmmdev_mtx);
1613 
1614 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1615 		err = ENOENT;
1616 		goto out;
1617 	}
1618 	/* XXXJOY: check cred permissions against instance */
1619 
1620 	if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1621 		err = EBUSY;
1622 		goto out;
1623 	}
1624 
1625 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1626 	hold->vmh_sc = sc;
1627 	hold->vmh_release_req = B_FALSE;
1628 
1629 	list_insert_tail(&sc->vmm_holds, hold);
1630 	sc->vmm_flags |= VMM_HELD;
1631 	*holdp = hold;
1632 
1633 out:
1634 	mutex_exit(&vmm_mtx);
1635 	return (err);
1636 }
1637 
1638 void
1639 vmm_drv_rele(vmm_hold_t *hold)
1640 {
1641 	vmm_softc_t *sc;
1642 
1643 	ASSERT(hold != NULL);
1644 	ASSERT(hold->vmh_sc != NULL);
1645 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
1646 
1647 	mutex_enter(&vmm_mtx);
1648 	sc = hold->vmh_sc;
1649 	list_remove(&sc->vmm_holds, hold);
1650 	if (list_is_empty(&sc->vmm_holds)) {
1651 		sc->vmm_flags &= ~VMM_HELD;
1652 		cv_broadcast(&sc->vmm_cv);
1653 	}
1654 	mutex_exit(&vmm_mtx);
1655 	kmem_free(hold, sizeof (*hold));
1656 }
1657 
1658 boolean_t
1659 vmm_drv_release_reqd(vmm_hold_t *hold)
1660 {
1661 	ASSERT(hold != NULL);
1662 
1663 	return (hold->vmh_release_req);
1664 }
1665 
1666 vmm_lease_t *
1667 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1668 {
1669 	vmm_softc_t *sc = hold->vmh_sc;
1670 	vmm_lease_t *lease;
1671 
1672 	ASSERT3P(expiref, !=, NULL);
1673 
1674 	if (hold->vmh_release_req) {
1675 		return (NULL);
1676 	}
1677 
1678 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1679 	list_link_init(&lease->vml_node);
1680 	lease->vml_expire_func = expiref;
1681 	lease->vml_expire_arg = arg;
1682 	lease->vml_expired = B_FALSE;
1683 	lease->vml_break_deferred = B_FALSE;
1684 	lease->vml_hold = hold;
1685 	/* cache the VM pointer for one less pointer chase */
1686 	lease->vml_vm = sc->vmm_vm;
1687 	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
1688 
1689 	mutex_enter(&sc->vmm_lease_lock);
1690 	while (sc->vmm_lease_blocker != 0) {
1691 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1692 	}
1693 	list_insert_tail(&sc->vmm_lease_list, lease);
1694 	vmm_read_lock(sc);
1695 	mutex_exit(&sc->vmm_lease_lock);
1696 
1697 	return (lease);
1698 }
1699 
1700 static void
1701 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1702 {
1703 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1704 
1705 	list_remove(&sc->vmm_lease_list, lease);
1706 	vmm_read_unlock(sc);
1707 	vmc_destroy(lease->vml_vmclient);
1708 	kmem_free(lease, sizeof (*lease));
1709 }
1710 
1711 static void
1712 vmm_lease_block(vmm_softc_t *sc)
1713 {
1714 	mutex_enter(&sc->vmm_lease_lock);
1715 	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
1716 	sc->vmm_lease_blocker++;
1717 	if (sc->vmm_lease_blocker == 1) {
1718 		list_t *list = &sc->vmm_lease_list;
1719 		vmm_lease_t *lease = list_head(list);
1720 
1721 		while (lease != NULL) {
1722 			void *arg = lease->vml_expire_arg;
1723 			boolean_t (*expiref)(void *) = lease->vml_expire_func;
1724 			boolean_t sync_break = B_FALSE;
1725 
1726 			/*
1727 			 * Since the lease expiration notification may
1728 			 * need to take locks which would deadlock with
1729 			 * vmm_lease_lock, drop it across the call.
1730 			 *
1731 			 * We are the only one allowed to manipulate
1732 			 * vmm_lease_list right now, so it is safe to
1733 			 * continue iterating through it after
1734 			 * reacquiring the lock.
1735 			 */
1736 			lease->vml_expired = B_TRUE;
1737 			mutex_exit(&sc->vmm_lease_lock);
1738 			sync_break = expiref(arg);
1739 			mutex_enter(&sc->vmm_lease_lock);
1740 
1741 			if (sync_break) {
1742 				vmm_lease_t *next;
1743 
1744 				/*
1745 				 * These leases which are synchronously broken
1746 				 * result in vmm_read_unlock() calls from a
1747 				 * different thread than the corresponding
1748 				 * vmm_read_lock().  This is acceptable, given
1749 				 * that the rwlock underpinning the whole
1750 				 * mechanism tolerates the behavior.  This
1751 				 * flexibility is _only_ afforded to VM read
1752 				 * lock (RW_READER) holders.
1753 				 */
1754 				next = list_next(list, lease);
1755 				vmm_lease_break_locked(sc, lease);
1756 				lease = next;
1757 			} else {
1758 				lease = list_next(list, lease);
1759 			}
1760 		}
1761 
1762 		/* Process leases which were not broken synchronously. */
1763 		while (!list_is_empty(list)) {
1764 			/*
1765 			 * Although the nested loops are quadratic, the number
1766 			 * of leases is small.
1767 			 */
1768 			lease = list_head(list);
1769 			while (lease != NULL) {
1770 				vmm_lease_t *next = list_next(list, lease);
1771 				if (lease->vml_break_deferred) {
1772 					vmm_lease_break_locked(sc, lease);
1773 				}
1774 				lease = next;
1775 			}
1776 			if (list_is_empty(list)) {
1777 				break;
1778 			}
1779 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1780 		}
1781 		/* Wake anyone else waiting for the lease list to be empty  */
1782 		cv_broadcast(&sc->vmm_lease_cv);
1783 	} else {
1784 		list_t *list = &sc->vmm_lease_list;
1785 
1786 		/*
1787 		 * Some other thread beat us to the duty of lease cleanup.
1788 		 * Wait until that is complete.
1789 		 */
1790 		while (!list_is_empty(list)) {
1791 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1792 		}
1793 	}
1794 	mutex_exit(&sc->vmm_lease_lock);
1795 }
1796 
1797 static void
1798 vmm_lease_unblock(vmm_softc_t *sc)
1799 {
1800 	mutex_enter(&sc->vmm_lease_lock);
1801 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
1802 	sc->vmm_lease_blocker--;
1803 	if (sc->vmm_lease_blocker == 0) {
1804 		cv_broadcast(&sc->vmm_lease_cv);
1805 	}
1806 	mutex_exit(&sc->vmm_lease_lock);
1807 }
1808 
1809 void
1810 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1811 {
1812 	vmm_softc_t *sc = hold->vmh_sc;
1813 
1814 	VERIFY3P(hold, ==, lease->vml_hold);
1815 	VERIFY(!lease->vml_break_deferred);
1816 
1817 	mutex_enter(&sc->vmm_lease_lock);
1818 	if (sc->vmm_lease_blocker == 0) {
1819 		vmm_lease_break_locked(sc, lease);
1820 	} else {
1821 		/*
1822 		 * Defer the lease-breaking to whichever thread is currently
1823 		 * cleaning up all leases as part of a vmm_lease_block() call.
1824 		 */
1825 		lease->vml_break_deferred = B_TRUE;
1826 		cv_broadcast(&sc->vmm_lease_cv);
1827 	}
1828 	mutex_exit(&sc->vmm_lease_lock);
1829 }
1830 
1831 boolean_t
1832 vmm_drv_lease_expired(vmm_lease_t *lease)
1833 {
1834 	return (lease->vml_expired);
1835 }
1836 
1837 void *
1838 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1839 {
1840 	vm_page_t *vmp;
1841 	void *res = NULL;
1842 
1843 	ASSERT(lease != NULL);
1844 	ASSERT3U(sz, ==, PAGESIZE);
1845 	ASSERT0(gpa & PAGEOFFSET);
1846 
1847 	vmp = vmc_hold(lease->vml_vmclient, gpa, PROT_READ | PROT_WRITE);
1848 	/*
1849 	 * Break the rules for now and just extract the pointer.  This is
1850 	 * nominally safe, since holding a driver lease on the VM read-locks it.
1851 	 *
1852 	 * A pointer which would otherwise be at risk of being a use-after-free
1853 	 * vector is made safe since actions such as vmspace_unmap() require
1854 	 * acquisition of the VM write-lock (causing all driver leases to be
1855 	 * broken), allowing the consumers to cease their access prior to
1856 	 * modification of the vmspace.
1857 	 */
1858 	if (vmp != NULL) {
1859 		res = vmp_get_writable(vmp);
1860 		vmp_release(vmp);
1861 	}
1862 
1863 	return (res);
1864 }
1865 
1866 int
1867 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1868 {
1869 	ASSERT(lease != NULL);
1870 
1871 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
1872 }
1873 
1874 int
1875 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1876     void *arg, void **cookie)
1877 {
1878 	vmm_softc_t *sc;
1879 	int err;
1880 
1881 	ASSERT(hold != NULL);
1882 	ASSERT(cookie != NULL);
1883 
1884 	sc = hold->vmh_sc;
1885 	mutex_enter(&vmm_mtx);
1886 	/* Confirm that hook installation is not blocked */
1887 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1888 		mutex_exit(&vmm_mtx);
1889 		return (EBUSY);
1890 	}
1891 	/*
1892 	 * Optimistically record an installed hook which will prevent a block
1893 	 * from being asserted while the mutex is dropped.
1894 	 */
1895 	hold->vmh_ioport_hook_cnt++;
1896 	mutex_exit(&vmm_mtx);
1897 
1898 	vmm_write_lock(sc);
1899 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1900 	    arg, cookie);
1901 	vmm_write_unlock(sc);
1902 
1903 	if (err != 0) {
1904 		mutex_enter(&vmm_mtx);
1905 		/* Walk back optimism about the hook installation */
1906 		hold->vmh_ioport_hook_cnt--;
1907 		mutex_exit(&vmm_mtx);
1908 	}
1909 	return (err);
1910 }
1911 
1912 void
1913 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1914 {
1915 	vmm_softc_t *sc;
1916 
1917 	ASSERT(hold != NULL);
1918 	ASSERT(cookie != NULL);
1919 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
1920 
1921 	sc = hold->vmh_sc;
1922 	vmm_write_lock(sc);
1923 	vm_ioport_unhook(sc->vmm_vm, cookie);
1924 	vmm_write_unlock(sc);
1925 
1926 	mutex_enter(&vmm_mtx);
1927 	hold->vmh_ioport_hook_cnt--;
1928 	mutex_exit(&vmm_mtx);
1929 }
1930 
1931 static int
1932 vmm_drv_purge(vmm_softc_t *sc)
1933 {
1934 	ASSERT(MUTEX_HELD(&vmm_mtx));
1935 
1936 	if ((sc->vmm_flags & VMM_HELD) != 0) {
1937 		vmm_hold_t *hold;
1938 
1939 		sc->vmm_flags |= VMM_CLEANUP;
1940 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
1941 		    hold = list_next(&sc->vmm_holds, hold)) {
1942 			hold->vmh_release_req = B_TRUE;
1943 		}
1944 		while ((sc->vmm_flags & VMM_HELD) != 0) {
1945 			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1946 				return (EINTR);
1947 			}
1948 		}
1949 		sc->vmm_flags &= ~VMM_CLEANUP;
1950 	}
1951 
1952 	VERIFY(list_is_empty(&sc->vmm_holds));
1953 	sc->vmm_flags |= VMM_PURGED;
1954 	return (0);
1955 }
1956 
1957 static int
1958 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1959 {
1960 	int err = 0;
1961 
1962 	mutex_enter(&vmm_mtx);
1963 	if (!enable_block) {
1964 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1965 
1966 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1967 		goto done;
1968 	}
1969 
1970 	/* If any holds have hooks installed, the block is a failure */
1971 	if (!list_is_empty(&sc->vmm_holds)) {
1972 		vmm_hold_t *hold;
1973 
1974 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
1975 		    hold = list_next(&sc->vmm_holds, hold)) {
1976 			if (hold->vmh_ioport_hook_cnt != 0) {
1977 				err = EBUSY;
1978 				goto done;
1979 			}
1980 		}
1981 	}
1982 	sc->vmm_flags |= VMM_BLOCK_HOOK;
1983 
1984 done:
1985 	mutex_exit(&vmm_mtx);
1986 	return (err);
1987 }
1988 
1989 static int
1990 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1991     boolean_t *hma_release)
1992 {
1993 	dev_info_t	*pdip = ddi_get_parent(vmmdev_dip);
1994 	minor_t		minor;
1995 
1996 	ASSERT(MUTEX_HELD(&vmm_mtx));
1997 
1998 	*hma_release = B_FALSE;
1999 
2000 	if (vmm_drv_purge(sc) != 0) {
2001 		return (EINTR);
2002 	}
2003 
2004 	if (clean_zsd) {
2005 		vmm_zsd_rem_vm(sc);
2006 	}
2007 
2008 	/* Clean up devmem entries */
2009 	vmmdev_devmem_purge(sc);
2010 
2011 	list_remove(&vmm_list, sc);
2012 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2013 	minor = sc->vmm_minor;
2014 	zone_rele(sc->vmm_zone);
2015 	if (sc->vmm_is_open) {
2016 		list_insert_tail(&vmm_destroy_list, sc);
2017 		sc->vmm_flags |= VMM_DESTROY;
2018 	} else {
2019 		vmm_kstat_fini(sc);
2020 		vm_destroy(sc->vmm_vm);
2021 		ddi_soft_state_free(vmm_statep, minor);
2022 		id_free(vmm_minors, minor);
2023 		*hma_release = B_TRUE;
2024 	}
2025 	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
2026 
2027 	return (0);
2028 }
2029 
2030 int
2031 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
2032 {
2033 	boolean_t	hma_release = B_FALSE;
2034 	int		err;
2035 
2036 	mutex_enter(&vmm_mtx);
2037 	err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
2038 	mutex_exit(&vmm_mtx);
2039 
2040 	if (hma_release)
2041 		vmm_hma_release();
2042 
2043 	return (err);
2044 }
2045 
2046 /* ARGSUSED */
2047 static int
2048 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2049 {
2050 	boolean_t	hma_release = B_FALSE;
2051 	vmm_softc_t	*sc;
2052 	int		err;
2053 
2054 	if (crgetuid(cr) != 0)
2055 		return (EPERM);
2056 
2057 	mutex_enter(&vmm_mtx);
2058 
2059 	if ((sc = vmm_lookup(req->name)) == NULL) {
2060 		mutex_exit(&vmm_mtx);
2061 		return (ENOENT);
2062 	}
2063 	/*
2064 	 * We don't check this in vmm_lookup() since that function is also used
2065 	 * for validation during create and currently vmm names must be unique.
2066 	 */
2067 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2068 		mutex_exit(&vmm_mtx);
2069 		return (EPERM);
2070 	}
2071 	err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
2072 
2073 	mutex_exit(&vmm_mtx);
2074 
2075 	if (hma_release)
2076 		vmm_hma_release();
2077 
2078 	return (err);
2079 }
2080 
2081 #define	VCPU_NAME_BUFLEN	32
2082 
2083 static int
2084 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2085 {
2086 	zoneid_t zid = crgetzoneid(cr);
2087 	int instance = minor;
2088 	kstat_t *ksp;
2089 
2090 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2091 
2092 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2093 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2094 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2095 
2096 	if (ksp == NULL) {
2097 		return (-1);
2098 	}
2099 	sc->vmm_kstat_vm = ksp;
2100 
2101 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2102 		char namebuf[VCPU_NAME_BUFLEN];
2103 
2104 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2105 
2106 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2107 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2108 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2109 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2110 		    0, zid);
2111 		if (ksp == NULL) {
2112 			goto fail;
2113 		}
2114 
2115 		sc->vmm_kstat_vcpu[i] = ksp;
2116 	}
2117 
2118 	/*
2119 	 * If this instance is associated with a non-global zone, make its
2120 	 * kstats visible from the GZ.
2121 	 */
2122 	if (zid != GLOBAL_ZONEID) {
2123 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2124 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2125 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2126 		}
2127 	}
2128 
2129 	return (0);
2130 
2131 fail:
2132 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2133 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2134 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2135 			sc->vmm_kstat_vcpu[i] = NULL;
2136 		} else {
2137 			break;
2138 		}
2139 	}
2140 	kstat_delete(sc->vmm_kstat_vm);
2141 	sc->vmm_kstat_vm = NULL;
2142 	return (-1);
2143 }
2144 
2145 static void
2146 vmm_kstat_init(vmm_softc_t *sc)
2147 {
2148 	kstat_t *ksp;
2149 
2150 	ASSERT3P(sc->vmm_vm, !=, NULL);
2151 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2152 
2153 	ksp = sc->vmm_kstat_vm;
2154 	vmm_kstats_t *vk = ksp->ks_data;
2155 	ksp->ks_private = sc->vmm_vm;
2156 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2157 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2158 
2159 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2160 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2161 
2162 		ksp = sc->vmm_kstat_vcpu[i];
2163 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2164 
2165 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2166 		vvk->vvk_vcpu.value.ui32 = i;
2167 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2168 		    KSTAT_DATA_UINT64);
2169 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2170 		    KSTAT_DATA_UINT64);
2171 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2172 		    KSTAT_DATA_UINT64);
2173 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2174 		    KSTAT_DATA_UINT64);
2175 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2176 		    KSTAT_DATA_UINT64);
2177 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2178 		    KSTAT_DATA_UINT64);
2179 		ksp->ks_private = sc->vmm_vm;
2180 		ksp->ks_update = vmm_kstat_update_vcpu;
2181 	}
2182 
2183 	kstat_install(sc->vmm_kstat_vm);
2184 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2185 		kstat_install(sc->vmm_kstat_vcpu[i]);
2186 	}
2187 }
2188 
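/*
 * Usage note (assuming VMM_MODULE_NAME expands to "vmm"): once installed,
 * the per-VM and per-vcpu kstats built above can be inspected from the
 * global zone with kstat(1M), e.g.:
 *
 *	# kstat -m vmm -n vm
 *	# kstat -m vmm -n vcpu0
 */
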
2189 static void
2190 vmm_kstat_fini(vmm_softc_t *sc)
2191 {
2192 	ASSERT(sc->vmm_kstat_vm != NULL);
2193 
2194 	kstat_delete(sc->vmm_kstat_vm);
2195 	sc->vmm_kstat_vm = NULL;
2196 
2197 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2198 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2199 
2200 		kstat_delete(sc->vmm_kstat_vcpu[i]);
2201 		sc->vmm_kstat_vcpu[i] = NULL;
2202 	}
2203 }
2204 
2205 static int
2206 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2207 {
2208 	minor_t		minor;
2209 	vmm_softc_t	*sc;
2210 
2211 	/*
2212 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2213 	 * verified to be safe.
2214 	 */
2215 	if (curproc->p_model != DATAMODEL_LP64) {
2216 		return (EFBIG);
2217 	}
2218 
2219 	minor = getminor(*devp);
2220 	if (minor == VMM_CTL_MINOR) {
2221 		/*
2222 		 * Master control device must be opened exclusively.
2223 		 */
2224 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2225 			return (EINVAL);
2226 		}
2227 
2228 		return (0);
2229 	}
2230 
2231 	mutex_enter(&vmm_mtx);
2232 	sc = ddi_get_soft_state(vmm_statep, minor);
2233 	if (sc == NULL) {
2234 		mutex_exit(&vmm_mtx);
2235 		return (ENXIO);
2236 	}
2237 
2238 	sc->vmm_is_open = B_TRUE;
2239 	mutex_exit(&vmm_mtx);
2240 
2241 	return (0);
2242 }
2243 
2244 static int
2245 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2246 {
2247 	minor_t		minor;
2248 	vmm_softc_t	*sc;
2249 	boolean_t	hma_release = B_FALSE;
2250 
2251 	minor = getminor(dev);
2252 	if (minor == VMM_CTL_MINOR)
2253 		return (0);
2254 
2255 	mutex_enter(&vmm_mtx);
2256 	sc = ddi_get_soft_state(vmm_statep, minor);
2257 	if (sc == NULL) {
2258 		mutex_exit(&vmm_mtx);
2259 		return (ENXIO);
2260 	}
2261 
2262 	VERIFY(sc->vmm_is_open);
2263 	sc->vmm_is_open = B_FALSE;
2264 
2265 	/*
2266 	 * If this VM was destroyed while the vmm device was open, then
2267 	 * clean it up now that it is closed.
2268 	 */
2269 	if (sc->vmm_flags & VMM_DESTROY) {
2270 		list_remove(&vmm_destroy_list, sc);
2271 		vmm_kstat_fini(sc);
2272 		vm_destroy(sc->vmm_vm);
2273 		ddi_soft_state_free(vmm_statep, minor);
2274 		id_free(vmm_minors, minor);
2275 		hma_release = B_TRUE;
2276 	}
2277 	mutex_exit(&vmm_mtx);
2278 
2279 	if (hma_release)
2280 		vmm_hma_release();
2281 
2282 	return (0);
2283 }
2284 
2285 static int
2286 vmm_is_supported(intptr_t arg)
2287 {
2288 	int r;
2289 	const char *msg;
2290 
2291 	if (vmm_is_intel()) {
2292 		r = vmx_x86_supported(&msg);
2293 	} else if (vmm_is_svm()) {
2294 		/*
2295 		 * HMA already ensured that the features necessary for SVM
2296 		 * operation were present and online during vmm_attach().
2297 		 */
2298 		r = 0;
2299 	} else {
2300 		r = ENXIO;
2301 		msg = "Unsupported CPU vendor";
2302 	}
2303 
2304 	if (r != 0 && arg != (intptr_t)NULL) {
2305 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2306 			return (EFAULT);
2307 	}
2308 	return (r);
2309 }
2310 
2311 static int
2312 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2313 {
2314 	void *argp = (void *)arg;
2315 
2316 	switch (cmd) {
2317 	case VMM_CREATE_VM: {
2318 		struct vm_create_req req;
2319 
2320 		if ((md & FWRITE) == 0) {
2321 			return (EPERM);
2322 		}
2323 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2324 			return (EFAULT);
2325 		}
2326 		return (vmmdev_do_vm_create(&req, cr));
2327 	}
2328 	case VMM_DESTROY_VM: {
2329 		struct vm_destroy_req req;
2330 
2331 		if ((md & FWRITE) == 0) {
2332 			return (EPERM);
2333 		}
2334 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2335 			return (EFAULT);
2336 		}
2337 		return (vmmdev_do_vm_destroy(&req, cr));
2338 	}
2339 	case VMM_VM_SUPPORTED:
2340 		return (vmm_is_supported(arg));
2341 	case VMM_RESV_QUERY:
2342 	case VMM_RESV_ADD:
2343 	case VMM_RESV_REMOVE:
2344 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2345 	default:
2346 		break;
2347 	}
2348 	/* No other actions are legal on ctl device */
2349 	return (ENOTTY);
2350 }
2351 
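/*
 * Illustrative sketch of exercising the control node from userspace.  The
 * /dev path and the layout of struct vm_destroy_req beyond its name field
 * are assumptions here; libvmmapi is the supported consumer of these
 * ioctls.  Note the exclusive open required by vmm_open() above.
 *
 *	int ctlfd = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *
 *	if (ctlfd >= 0) {
 *		if (ioctl(ctlfd, VMM_VM_SUPPORTED, NULL) == 0) {
 *			struct vm_destroy_req req = { 0 };
 *
 *			(void) strlcpy(req.name, "testvm", sizeof (req.name));
 *			(void) ioctl(ctlfd, VMM_DESTROY_VM, &req);
 *		}
 *		(void) close(ctlfd);
 *	}
 */
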
2352 static int
2353 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2354     int *rvalp)
2355 {
2356 	vmm_softc_t	*sc;
2357 	minor_t		minor;
2358 
2359 	/*
2360 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2361 	 * verified to be safe.
2362 	 */
2363 	if (curproc->p_model != DATAMODEL_LP64) {
2364 		return (EFBIG);
2365 	}
2366 
2367 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
2368 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2369 		return (ENOTSUP);
2370 	}
2371 
2372 	minor = getminor(dev);
2373 
2374 	if (minor == VMM_CTL_MINOR) {
2375 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
2376 	}
2377 
2378 	sc = ddi_get_soft_state(vmm_statep, minor);
2379 	ASSERT(sc);
2380 
2381 	if (sc->vmm_flags & VMM_DESTROY)
2382 		return (ENXIO);
2383 
2384 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2385 }
2386 
2387 static int
2388 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2389     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2390 {
2391 	vmm_softc_t *sc;
2392 	const minor_t minor = getminor(dev);
2393 	int err;
2394 
2395 	if (minor == VMM_CTL_MINOR) {
2396 		return (ENODEV);
2397 	}
2398 	if (off < 0 || (off + len) <= 0) {
2399 		return (EINVAL);
2400 	}
2401 	if ((prot & PROT_USER) == 0) {
2402 		return (EACCES);
2403 	}
2404 
2405 	sc = ddi_get_soft_state(vmm_statep, minor);
2406 	ASSERT(sc);
2407 
2408 	if (sc->vmm_flags & VMM_DESTROY)
2409 		return (ENXIO);
2410 
2411 	/* Grab read lock on the VM to prevent any changes to the memory map */
2412 	vmm_read_lock(sc);
2413 
2414 	if (off >= VM_DEVMEM_START) {
2415 		int segid;
2416 		off_t segoff;
2417 
2418 		/* Mapping a devmem "device" */
2419 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
2420 			err = ENODEV;
2421 		} else {
2422 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
2423 			    addrp, prot, maxprot, flags);
2424 		}
2425 	} else {
2426 		/* Mapping a part of the guest physical space */
2427 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
2428 		    maxprot, flags);
2429 	}
2430 
2431 	vmm_read_unlock(sc);
2432 	return (err);
2433 }
2434 
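/*
 * Illustrative sketch (userspace, hypothetical fd and sizes): offsets below
 * VM_DEVMEM_START map the guest-physical space, while larger offsets map
 * named devmem segments.  A tool holding a VM's descriptor might map the
 * first MiB of guest memory like this:
 *
 *	void *guest_base = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, vmfd, 0);
 */
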
2435 static sdev_plugin_validate_t
2436 vmm_sdev_validate(sdev_ctx_t ctx)
2437 {
2438 	const char *name = sdev_ctx_name(ctx);
2439 	vmm_softc_t *sc;
2440 	sdev_plugin_validate_t ret;
2441 	minor_t minor;
2442 
2443 	if (sdev_ctx_vtype(ctx) != VCHR)
2444 		return (SDEV_VTOR_INVALID);
2445 
2446 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2447 
2448 	mutex_enter(&vmm_mtx);
2449 	if ((sc = vmm_lookup(name)) == NULL)
2450 		ret = SDEV_VTOR_INVALID;
2451 	else if (sc->vmm_minor != minor)
2452 		ret = SDEV_VTOR_STALE;
2453 	else
2454 		ret = SDEV_VTOR_VALID;
2455 	mutex_exit(&vmm_mtx);
2456 
2457 	return (ret);
2458 }
2459 
2460 static int
2461 vmm_sdev_filldir(sdev_ctx_t ctx)
2462 {
2463 	vmm_softc_t *sc;
2464 	int ret;
2465 
2466 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2467 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2468 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2469 		return (EINVAL);
2470 	}
2471 
2472 	mutex_enter(&vmm_mtx);
2473 	ASSERT(vmmdev_dip != NULL);
2474 	for (sc = list_head(&vmm_list); sc != NULL;
2475 	    sc = list_next(&vmm_list, sc)) {
2476 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2477 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2478 			    S_IFCHR | 0600,
2479 			    makedevice(ddi_driver_major(vmmdev_dip),
2480 			    sc->vmm_minor));
2481 		} else {
2482 			continue;
2483 		}
2484 		if (ret != 0 && ret != EEXIST)
2485 			goto out;
2486 	}
2487 
2488 	ret = 0;
2489 
2490 out:
2491 	mutex_exit(&vmm_mtx);
2492 	return (ret);
2493 }
2494 
2495 /* ARGSUSED */
2496 static void
2497 vmm_sdev_inactive(sdev_ctx_t ctx)
2498 {
2499 }
2500 
2501 static sdev_plugin_ops_t vmm_sdev_ops = {
2502 	.spo_version = SDEV_PLUGIN_VERSION,
2503 	.spo_flags = SDEV_PLUGIN_SUBDIR,
2504 	.spo_validate = vmm_sdev_validate,
2505 	.spo_filldir = vmm_sdev_filldir,
2506 	.spo_inactive = vmm_sdev_inactive
2507 };
2508 
2509 /* ARGSUSED */
2510 static int
2511 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2512 {
2513 	int error;
2514 
2515 	switch (cmd) {
2516 	case DDI_INFO_DEVT2DEVINFO:
2517 		*result = (void *)vmmdev_dip;
2518 		error = DDI_SUCCESS;
2519 		break;
2520 	case DDI_INFO_DEVT2INSTANCE:
2521 		*result = (void *)0;
2522 		error = DDI_SUCCESS;
2523 		break;
2524 	default:
2525 		error = DDI_FAILURE;
2526 		break;
2527 	}
2528 	return (error);
2529 }
2530 
2531 static int
2532 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2533 {
2534 	sdev_plugin_hdl_t sph;
2535 	hma_reg_t *reg = NULL;
2536 	boolean_t vmm_loaded = B_FALSE;
2537 
2538 	if (cmd != DDI_ATTACH) {
2539 		return (DDI_FAILURE);
2540 	}
2541 
2542 	mutex_enter(&vmmdev_mtx);
2543 	/* Ensure we are not already attached. */
2544 	if (vmmdev_dip != NULL) {
2545 		mutex_exit(&vmmdev_mtx);
2546 		return (DDI_FAILURE);
2547 	}
2548 
2549 	vmm_sol_glue_init();
2550 
2551 	/*
2552 	 * Perform temporary HMA registration to determine if the system
2553 	 * is capable.
2554 	 */
2555 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2556 		goto fail;
2557 	} else if (vmm_mod_load() != 0) {
2558 		goto fail;
2559 	}
2560 	vmm_loaded = B_TRUE;
2561 	hma_unregister(reg);
2562 	reg = NULL;
2563 
2564 	/* Create control node.  Other nodes will be created on demand. */
2565 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2566 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2567 		goto fail;
2568 	}
2569 
2570 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
2571 	if (sph == (sdev_plugin_hdl_t)NULL) {
2572 		ddi_remove_minor_node(dip, NULL);
2573 		goto fail;
2574 	}
2575 
2576 	ddi_report_dev(dip);
2577 	vmmdev_sdev_hdl = sph;
2578 	vmmdev_dip = dip;
2579 	mutex_exit(&vmmdev_mtx);
2580 	return (DDI_SUCCESS);
2581 
2582 fail:
2583 	if (vmm_loaded) {
2584 		VERIFY0(vmm_mod_unload());
2585 	}
2586 	if (reg != NULL) {
2587 		hma_unregister(reg);
2588 	}
2589 	vmm_sol_glue_cleanup();
2590 	mutex_exit(&vmmdev_mtx);
2591 	return (DDI_FAILURE);
2592 }
2593 
2594 static int
2595 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2596 {
2597 	if (cmd != DDI_DETACH) {
2598 		return (DDI_FAILURE);
2599 	}
2600 
2601 	/*
2602 	 * Ensure that all resources have been cleaned up.
2603 	 *
2604 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2605 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2606 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
2607 	 * devinfo, including our own, while holding vmmdev_mtx.
2608 	 */
2609 	if (mutex_tryenter(&vmmdev_mtx) == 0)
2610 		return (DDI_FAILURE);
2611 
2612 	mutex_enter(&vmm_mtx);
2613 	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2614 		mutex_exit(&vmm_mtx);
2615 		mutex_exit(&vmmdev_mtx);
2616 		return (DDI_FAILURE);
2617 	}
2618 	mutex_exit(&vmm_mtx);
2619 
2620 	if (!vmmr_is_empty()) {
2621 		mutex_exit(&vmmdev_mtx);
2622 		return (DDI_FAILURE);
2623 	}
2624 
2625 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2626 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2627 		mutex_exit(&vmmdev_mtx);
2628 		return (DDI_FAILURE);
2629 	}
2630 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2631 
2632 	/* Remove the control node. */
2633 	ddi_remove_minor_node(dip, "ctl");
2634 	vmmdev_dip = NULL;
2635 
2636 	VERIFY0(vmm_mod_unload());
2637 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
2638 	vmm_sol_glue_cleanup();
2639 
2640 	mutex_exit(&vmmdev_mtx);
2641 
2642 	return (DDI_SUCCESS);
2643 }
2644 
2645 static struct cb_ops vmm_cb_ops = {
2646 	vmm_open,
2647 	vmm_close,
2648 	nodev,		/* strategy */
2649 	nodev,		/* print */
2650 	nodev,		/* dump */
2651 	nodev,		/* read */
2652 	nodev,		/* write */
2653 	vmm_ioctl,
2654 	nodev,		/* devmap */
2655 	nodev,		/* mmap */
2656 	vmm_segmap,
2657 	nochpoll,	/* poll */
2658 	ddi_prop_op,
2659 	NULL,
2660 	D_NEW | D_MP | D_DEVMAP
2661 };
2662 
2663 static struct dev_ops vmm_ops = {
2664 	DEVO_REV,
2665 	0,
2666 	vmm_info,
2667 	nulldev,	/* identify */
2668 	nulldev,	/* probe */
2669 	vmm_attach,
2670 	vmm_detach,
2671 	nodev,		/* reset */
2672 	&vmm_cb_ops,
2673 	(struct bus_ops *)NULL
2674 };
2675 
2676 static struct modldrv modldrv = {
2677 	&mod_driverops,
2678 	"bhyve vmm",
2679 	&vmm_ops
2680 };
2681 
2682 static struct modlinkage modlinkage = {
2683 	MODREV_1,
2684 	&modldrv,
2685 	NULL
2686 };
2687 
2688 int
2689 _init(void)
2690 {
2691 	int	error;
2692 
2693 	sysinit();
2694 
2695 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2696 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2697 	list_create(&vmm_list, sizeof (vmm_softc_t),
2698 	    offsetof(vmm_softc_t, vmm_node));
2699 	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2700 	    offsetof(vmm_softc_t, vmm_node));
2701 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2702 
2703 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2704 	if (error) {
2705 		return (error);
2706 	}
2707 
2708 	vmm_zsd_init();
2709 	vmmr_init();
2710 
2711 	error = mod_install(&modlinkage);
2712 	if (error) {
2713 		ddi_soft_state_fini(&vmm_statep);
2714 		vmm_zsd_fini();
2715 		vmmr_fini();
2716 	}
2717 
2718 	return (error);
2719 }
2720 
2721 int
2722 _fini(void)
2723 {
2724 	int	error;
2725 
2726 	error = mod_remove(&modlinkage);
2727 	if (error) {
2728 		return (error);
2729 	}
2730 
2731 	vmm_zsd_fini();
2732 	vmmr_fini();
2733 
2734 	ddi_soft_state_fini(&vmm_statep);
2735 
2736 	return (0);
2737 }
2738 
2739 int
2740 _info(struct modinfo *modinfop)
2741 {
2742 	return (mod_info(&modlinkage, modinfop));
2743 }
2744