xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision 154972af)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2015 Pluribus Networks Inc.
14  * Copyright 2019 Joyent, Inc.
15  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
16  */
17 
18 #include <sys/types.h>
19 #include <sys/conf.h>
20 #include <sys/cpuvar.h>
21 #include <sys/ioccom.h>
22 #include <sys/stat.h>
23 #include <sys/vmsystm.h>
24 #include <sys/ddi.h>
25 #include <sys/mkdev.h>
26 #include <sys/sunddi.h>
27 #include <sys/fs/dv_node.h>
28 #include <sys/cpuset.h>
29 #include <sys/id_space.h>
30 #include <sys/fs/sdev_plugin.h>
31 #include <sys/smt.h>
32 
33 #include <sys/kernel.h>
34 #include <sys/hma.h>
35 #include <sys/x86_archext.h>
36 #include <x86/apicreg.h>
37 
38 #include <sys/vmm.h>
39 #include <sys/vmm_instruction_emul.h>
40 #include <sys/vmm_dev.h>
41 #include <sys/vmm_impl.h>
42 #include <sys/vmm_drv.h>
43 
44 #include <vm/vm.h>
45 #include <vm/seg_dev.h>
46 
47 #include "io/ppt.h"
48 #include "io/vatpic.h"
49 #include "io/vioapic.h"
50 #include "io/vrtc.h"
51 #include "io/vhpet.h"
52 #include "vmm_lapic.h"
53 #include "vmm_stat.h"
54 #include "vmm_util.h"
55 #include "vm/vm_glue.h"
56 
57 /*
58  * Locking details:
59  *
60  * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
61  * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
62  * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
63  * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
64  * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
65  */
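/*
 * Illustrative sketch (not compiled): the ordering required when a code path
 * needs both the driver-wide and instance-list locks.  The function name is
 * hypothetical and exists only for this example.
 */
#if 0
static void
vmm_example_both_locks(void)
{
	/* Driver-wide state (vmmdev_*) is locked first... */
	mutex_enter(&vmmdev_mtx);
	/* ...followed by the instance list and related data (vmm_*). */
	mutex_enter(&vmm_mtx);

	/* ... examine or update the protected state here ... */

	mutex_exit(&vmm_mtx);
	mutex_exit(&vmmdev_mtx);
}
#endif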
66 
67 static kmutex_t		vmmdev_mtx;
68 static dev_info_t	*vmmdev_dip;
69 static hma_reg_t	*vmmdev_hma_reg;
70 static uint_t		vmmdev_hma_ref;
71 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
72 
73 static kmutex_t		vmm_mtx;
74 static list_t		vmm_list;
75 static list_t		vmm_destroy_list;
76 static id_space_t	*vmm_minors;
77 static void		*vmm_statep;
78 
79 static const char *vmmdev_hvm_name = "bhyve";
80 
81 /* For sdev plugin (/dev) */
82 #define	VMM_SDEV_ROOT "/dev/vmm"
83 
84 /* From uts/i86pc/io/vmm/intel/vmx.c */
85 extern int vmx_x86_supported(const char **);
86 
87 /* Holds and hooks from drivers external to vmm */
88 struct vmm_hold {
89 	list_node_t	vmh_node;
90 	vmm_softc_t	*vmh_sc;
91 	boolean_t	vmh_release_req;
92 	uint_t		vmh_ioport_hook_cnt;
93 };
94 
95 struct vmm_lease {
96 	list_node_t		vml_node;
97 	struct vm		*vml_vm;
98 	boolean_t		vml_expired;
99 	boolean_t		(*vml_expire_func)(void *);
100 	void			*vml_expire_arg;
101 	list_node_t		vml_expire_node;
102 	struct vmm_hold		*vml_hold;
103 };
104 
105 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
106 static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);
107 
108 static int
109 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
110 {
111 	int error;
112 	bool sysmem;
113 
114 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
115 	    NULL);
116 	if (error || mseg->len == 0)
117 		return (error);
118 
119 	if (!sysmem) {
120 		vmm_devmem_entry_t *de;
121 		list_t *dl = &sc->vmm_devmem_list;
122 
123 		for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
124 			if (de->vde_segid == mseg->segid) {
125 				break;
126 			}
127 		}
128 		if (de != NULL) {
129 			(void) strlcpy(mseg->name, de->vde_name,
130 			    sizeof (mseg->name));
131 		}
132 	} else {
133 		bzero(mseg->name, sizeof (mseg->name));
134 	}
135 
136 	return (error);
137 }
138 
139 /*
140  * The 'devmem' hack:
141  *
142  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
143  * in the vm which appear with their own name related to the vm under /dev.
144  * Since this would be a hassle from an sdev perspective and would require a
145  * new cdev interface (or complicate the existing one), we choose to implement
146  * this in a different manner.  When 'devmem' mappings are created, an
147  * identifying off_t is communicated back out to userspace.  That off_t,
148  * residing above the normal guest memory space, can be used to mmap the
149  * 'devmem' mapping from the already-open vm device.
150  */
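/*
 * Illustrative userspace sketch (not compiled here): looking up the off_t for
 * a 'devmem' segment and mmap()-ing it through the already-open vm device.
 * The header list and the way the segid is obtained are assumptions made for
 * the purposes of the example.
 */
#if 0
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/vmm_dev.h>

static void *
map_devmem(int vmfd, int segid, size_t len)
{
	struct vm_devmem_offset vdo;

	vdo.segid = segid;
	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) != 0)
		return (MAP_FAILED);

	/* The returned offset resides above VM_DEVMEM_START. */
	return (mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
	    vmfd, vdo.offset));
}
#endif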
151 
152 static int
153 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
154 {
155 	off_t map_offset;
156 	vmm_devmem_entry_t *entry;
157 
158 	if (list_is_empty(&sc->vmm_devmem_list)) {
159 		map_offset = VM_DEVMEM_START;
160 	} else {
161 		entry = list_tail(&sc->vmm_devmem_list);
162 		map_offset = entry->vde_off + entry->vde_len;
163 		if (map_offset < entry->vde_off) {
164 			/* Do not tolerate overflow */
165 			return (ERANGE);
166 		}
167 		/*
168 		 * XXXJOY: We could choose to search the list for duplicate
169 		 * names and toss an error.  Since we're using the offset
170 		 * method for now, it does not make much of a difference.
171 		 */
172 	}
173 
174 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
175 	entry->vde_segid = mseg->segid;
176 	entry->vde_len = mseg->len;
177 	entry->vde_off = map_offset;
178 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
179 	list_insert_tail(&sc->vmm_devmem_list, entry);
180 
181 	return (0);
182 }
183 
184 static boolean_t
185 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp)
186 {
187 	list_t *dl = &sc->vmm_devmem_list;
188 	vmm_devmem_entry_t *de = NULL;
189 
190 	VERIFY(off >= VM_DEVMEM_START);
191 
192 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
193 		/* XXX: Only hit on direct offset/length matches for now */
194 		if (de->vde_off == off && de->vde_len == len) {
195 			break;
196 		}
197 	}
198 	if (de == NULL) {
199 		return (B_FALSE);
200 	}
201 
202 	*segidp = de->vde_segid;
203 	return (B_TRUE);
204 }
205 
206 static void
207 vmmdev_devmem_purge(vmm_softc_t *sc)
208 {
209 	vmm_devmem_entry_t *entry;
210 
211 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
212 		kmem_free(entry, sizeof (*entry));
213 	}
214 }
215 
216 static int
217 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
218 {
219 	int error;
220 	bool sysmem = true;
221 
222 	if (VM_MEMSEG_NAME(mseg)) {
223 		sysmem = false;
224 	}
225 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
226 
227 	if (error == 0 && VM_MEMSEG_NAME(mseg)) {
228 		/*
229 		 * Rather than create a whole fresh device from which userspace
230 		 * can mmap this segment, instead make it available at an
231 		 * offset above where the main guest memory resides.
232 		 */
233 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
234 		if (error != 0) {
235 			vm_free_memseg(sc->vmm_vm, mseg->segid);
236 		}
237 	}
238 	return (error);
239 }
240 
241 /*
242  * Resource Locking and Exclusion
243  *
244  * Much of bhyve depends on key portions of VM state, such as the guest memory
245  * map, to remain unchanged while the guest is running.  As ported from
246  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
247  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
248  * performing the work of actually running the guest in VMX/SVM, would lock
249  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
250  * state, all of the vCPUs would be first locked, ensuring that the
251  * operation(s) could complete without any other threads stumbling into
252  * intermediate states.
253  *
254  * This approach is largely effective for bhyve.  Common operations, such as
255  * running the vCPUs, steer clear of lock contention.  The model begins to
256  * break down for operations which do not occur in the context of a specific
257  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
258  * thread in the bhyve process.  In order to properly protect those vCPU-less
259  * operations from encountering invalid states, additional locking is required.
260  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
261  * It does mean that class of operations will be serialized on locking the
262  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
263  * undue contention on the VM_MAXCPU-1 vCPU.
264  *
265  * In order to address the shortcomings of this model, the concept of a
266  * read/write lock has been added to bhyve.  Operations which change
267  * fundamental aspects of a VM (such as the memory map) must acquire the write
268  * lock, which also implies locking all of the vCPUs and waiting for all read
269  * lock holders to release.  While it increases the cost and waiting time for
270  * those few operations, it allows most hot-path operations on the VM (which
271  * depend on its configuration remaining stable) to occur with minimal locking.
272  *
273  * Consumers of the Driver API (see below) are a special case when it comes to
274  * this locking, since they may hold a read lock via the drv_lease mechanism
275  * for an extended period of time.  Rather than forcing those consumers to
276  * continuously poll for a write lock attempt, the lease system forces them to
277  * provide a release callback to trigger their clean-up (and potential later
278  * reacquisition) of the read lock.
279  */
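/*
 * Illustrative sketch (not compiled): the shape of a lease consumer's expire
 * callback.  Returning B_TRUE asks vmm_write_lock() to break the lease
 * synchronously on the consumer's behalf; returning B_FALSE means the
 * consumer will notice the expiry itself and call vmm_drv_lease_break() from
 * its own context.  The consumer type below is hypothetical.
 */
#if 0
typedef struct consumer {
	vmm_lease_t	*c_lease;
	boolean_t	c_lease_expired;
} consumer_t;

static boolean_t
consumer_lease_expire_cb(void *arg)
{
	consumer_t *c = arg;

	/* Flag the expiry; the consumer's own worker will drop the lease. */
	c->c_lease_expired = B_TRUE;
	return (B_FALSE);
}
#endif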
280 
281 static void
282 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
283 {
284 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
285 
286 	/*
287 	 * Since this state transition uses from_idle=true, it should not
288 	 * fail, but rather block until it can succeed.
289 	 */
290 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
291 }
292 
293 static void
294 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
295 {
296 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
297 
298 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
299 	vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
300 }
301 
302 static void
303 vmm_read_lock(vmm_softc_t *sc)
304 {
305 	rw_enter(&sc->vmm_rwlock, RW_READER);
306 }
307 
308 static void
309 vmm_read_unlock(vmm_softc_t *sc)
310 {
311 	rw_exit(&sc->vmm_rwlock);
312 }
313 
314 static void
315 vmm_write_lock(vmm_softc_t *sc)
316 {
317 	int maxcpus;
318 
319 	/* First lock all the vCPUs */
320 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
321 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
322 		vcpu_lock_one(sc, vcpu);
323 	}
324 
325 	mutex_enter(&sc->vmm_lease_lock);
326 	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
327 	sc->vmm_lease_blocker++;
328 	if (sc->vmm_lease_blocker == 1) {
329 		list_t *list = &sc->vmm_lease_list;
330 		vmm_lease_t *lease = list_head(list);
331 
332 		while (lease != NULL) {
333 			boolean_t sync_break = B_FALSE;
334 
335 			if (!lease->vml_expired) {
336 				void *arg = lease->vml_expire_arg;
337 				lease->vml_expired = B_TRUE;
338 				sync_break = lease->vml_expire_func(arg);
339 			}
340 
341 			if (sync_break) {
342 				vmm_lease_t *next;
343 
344 				/*
345 				 * These leases which are synchronously broken
346 				 * result in vmm_read_unlock() calls from a
347 				 * different thread than the corresponding
348 				 * vmm_read_lock().  This is acceptable, given
349 				 * that the rwlock underpinning the whole
350 				 * mechanism tolerates the behavior.  This
351 				 * flexibility is _only_ afforded to VM read
352 				 * lock (RW_READER) holders.
353 				 */
354 				next = list_next(list, lease);
355 				vmm_lease_break_locked(sc, lease);
356 				lease = next;
357 			} else {
358 				lease = list_next(list, lease);
359 			}
360 		}
361 	}
362 	mutex_exit(&sc->vmm_lease_lock);
363 
364 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
365 	/*
366 	 * For now, the 'maxcpus' value for an instance is fixed at the
367 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
368 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
369 	 * of the write lock will need to be wary of such changes.
370 	 */
371 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
372 }
373 
374 static void
375 vmm_write_unlock(vmm_softc_t *sc)
376 {
377 	int maxcpus;
378 
379 	mutex_enter(&sc->vmm_lease_lock);
380 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
381 	sc->vmm_lease_blocker--;
382 	if (sc->vmm_lease_blocker == 0) {
383 		cv_broadcast(&sc->vmm_lease_cv);
384 	}
385 	mutex_exit(&sc->vmm_lease_lock);
386 
387 	/*
388 	 * The VM write lock _must_ be released from the same thread it was
389 	 * acquired in, unlike the read lock.
390 	 */
391 	VERIFY(rw_write_held(&sc->vmm_rwlock));
392 	rw_exit(&sc->vmm_rwlock);
393 
394 	/* Unlock all the vCPUs */
395 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
396 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
397 		vcpu_unlock_one(sc, vcpu);
398 	}
399 }
400 
401 static int
402 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
403     cred_t *credp, int *rvalp)
404 {
405 	int error = 0, vcpu = -1;
406 	void *datap = (void *)arg;
407 	enum vm_lock_type {
408 		LOCK_NONE = 0,
409 		LOCK_VCPU,
410 		LOCK_READ_HOLD,
411 		LOCK_WRITE_HOLD
412 	} lock_type = LOCK_NONE;
413 
414 	/* Acquire any exclusion resources needed for the operation. */
415 	switch (cmd) {
416 	case VM_RUN:
417 	case VM_GET_REGISTER:
418 	case VM_SET_REGISTER:
419 	case VM_GET_SEGMENT_DESCRIPTOR:
420 	case VM_SET_SEGMENT_DESCRIPTOR:
421 	case VM_GET_REGISTER_SET:
422 	case VM_SET_REGISTER_SET:
423 	case VM_INJECT_EXCEPTION:
424 	case VM_GET_CAPABILITY:
425 	case VM_SET_CAPABILITY:
426 	case VM_PPTDEV_MSI:
427 	case VM_PPTDEV_MSIX:
428 	case VM_SET_X2APIC_STATE:
429 	case VM_GLA2GPA:
430 	case VM_GLA2GPA_NOFAULT:
431 	case VM_ACTIVATE_CPU:
432 	case VM_SET_INTINFO:
433 	case VM_GET_INTINFO:
434 	case VM_RESTART_INSTRUCTION:
435 	case VM_SET_KERNEMU_DEV:
436 	case VM_GET_KERNEMU_DEV:
437 		/*
438 		 * Copy in the ID of the vCPU chosen for this operation.
439 		 * Since a nefarious caller could update their struct between
440 		 * this locking and when the rest of the ioctl data is copied
441 		 * in, it is _critical_ that this local 'vcpu' variable be used
442 		 * rather than the in-struct one when performing the ioctl.
443 		 */
444 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
445 			return (EFAULT);
446 		}
447 		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
448 			return (EINVAL);
449 		}
450 		vcpu_lock_one(sc, vcpu);
451 		lock_type = LOCK_VCPU;
452 		break;
453 
454 	case VM_REINIT:
455 	case VM_BIND_PPTDEV:
456 	case VM_UNBIND_PPTDEV:
457 	case VM_MAP_PPTDEV_MMIO:
458 	case VM_ALLOC_MEMSEG:
459 	case VM_MMAP_MEMSEG:
460 	case VM_WRLOCK_CYCLE:
461 		vmm_write_lock(sc);
462 		lock_type = LOCK_WRITE_HOLD;
463 		break;
464 
465 	case VM_GET_GPA_PMAP:
466 	case VM_GET_MEMSEG:
467 	case VM_MMAP_GETNEXT:
468 	case VM_LAPIC_IRQ:
469 	case VM_INJECT_NMI:
470 	case VM_IOAPIC_ASSERT_IRQ:
471 	case VM_IOAPIC_DEASSERT_IRQ:
472 	case VM_IOAPIC_PULSE_IRQ:
473 	case VM_LAPIC_MSI:
474 	case VM_LAPIC_LOCAL_IRQ:
475 	case VM_GET_X2APIC_STATE:
476 	case VM_RTC_READ:
477 	case VM_RTC_WRITE:
478 	case VM_RTC_SETTIME:
479 	case VM_RTC_GETTIME:
480 #ifndef __FreeBSD__
481 	case VM_DEVMEM_GETOFFSET:
482 #endif
483 		vmm_read_lock(sc);
484 		lock_type = LOCK_READ_HOLD;
485 		break;
486 
487 	case VM_IOAPIC_PINCOUNT:
488 	default:
489 		break;
490 	}
491 
492 	/* Execute the primary logic for the ioctl. */
493 	switch (cmd) {
494 	case VM_RUN: {
495 		struct vm_run vmrun;
496 
497 		if (ddi_copyin(datap, &vmrun, sizeof (vmrun), md)) {
498 			error = EFAULT;
499 			break;
500 		}
501 		vmrun.cpuid = vcpu;
502 
503 		if (!(curthread->t_schedflag & TS_VCPU))
504 			smt_mark_as_vcpu();
505 
506 		error = vm_run(sc->vmm_vm, &vmrun);
507 		/*
508 		 * XXXJOY: I think it's necessary to do copyout, even in the
509 		 * face of errors, since the exit state is communicated out.
510 		 */
511 		if (ddi_copyout(&vmrun, datap, sizeof (vmrun), md)) {
512 			error = EFAULT;
513 			break;
514 		}
515 		break;
516 	}
517 	case VM_SUSPEND: {
518 		struct vm_suspend vmsuspend;
519 
520 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
521 			error = EFAULT;
522 			break;
523 		}
524 		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
525 		break;
526 	}
527 	case VM_REINIT:
528 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
529 			/*
530 			 * The VM instance should be free of driver-attached
531 			 * hooks during the reinitialization process.
532 			 */
533 			break;
534 		}
535 		error = vm_reinit(sc->vmm_vm);
536 		(void) vmm_drv_block_hook(sc, B_FALSE);
537 		break;
538 	case VM_STAT_DESC: {
539 		struct vm_stat_desc statdesc;
540 
541 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
542 			error = EFAULT;
543 			break;
544 		}
545 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
546 		    sizeof (statdesc.desc));
547 		if (error == 0 &&
548 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
549 			error = EFAULT;
550 			break;
551 		}
552 		break;
553 	}
554 	case VM_STATS_IOC: {
555 		struct vm_stats vmstats;
556 
557 		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
558 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
559 			error = EFAULT;
560 			break;
561 		}
562 		hrt2tv(gethrtime(), &vmstats.tv);
563 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
564 		    &vmstats.num_entries, vmstats.statbuf);
565 		if (error == 0 &&
566 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
567 			error = EFAULT;
568 			break;
569 		}
570 		break;
571 	}
572 
573 	case VM_PPTDEV_MSI: {
574 		struct vm_pptdev_msi pptmsi;
575 
576 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
577 			error = EFAULT;
578 			break;
579 		}
580 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
581 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
582 		break;
583 	}
584 	case VM_PPTDEV_MSIX: {
585 		struct vm_pptdev_msix pptmsix;
586 
587 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
588 			error = EFAULT;
589 			break;
590 		}
591 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
592 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
593 		    pptmsix.vector_control);
594 		break;
595 	}
596 	case VM_MAP_PPTDEV_MMIO: {
597 		struct vm_pptdev_mmio pptmmio;
598 
599 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
600 			error = EFAULT;
601 			break;
602 		}
603 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
604 		    pptmmio.len, pptmmio.hpa);
605 		break;
606 	}
607 	case VM_BIND_PPTDEV: {
608 		struct vm_pptdev pptdev;
609 
610 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
611 			error = EFAULT;
612 			break;
613 		}
614 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
615 		break;
616 	}
617 	case VM_UNBIND_PPTDEV: {
618 		struct vm_pptdev pptdev;
619 
620 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
621 			error = EFAULT;
622 			break;
623 		}
624 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
625 		break;
626 	}
627 	case VM_GET_PPTDEV_LIMITS: {
628 		struct vm_pptdev_limits pptlimits;
629 
630 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
631 			error = EFAULT;
632 			break;
633 		}
634 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
635 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
636 		if (error == 0 &&
637 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
638 			error = EFAULT;
639 			break;
640 		}
641 		break;
642 	}
643 	case VM_INJECT_EXCEPTION: {
644 		struct vm_exception vmexc;
645 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
646 			error = EFAULT;
647 			break;
648 		}
649 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
650 		    vmexc.error_code_valid, vmexc.error_code,
651 		    vmexc.restart_instruction);
652 		break;
653 	}
654 	case VM_INJECT_NMI: {
655 		struct vm_nmi vmnmi;
656 
657 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
658 			error = EFAULT;
659 			break;
660 		}
661 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
662 		break;
663 	}
664 	case VM_LAPIC_IRQ: {
665 		struct vm_lapic_irq vmirq;
666 
667 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
668 			error = EFAULT;
669 			break;
670 		}
671 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
672 		break;
673 	}
674 	case VM_LAPIC_LOCAL_IRQ: {
675 		struct vm_lapic_irq vmirq;
676 
677 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
678 			error = EFAULT;
679 			break;
680 		}
681 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
682 		    vmirq.vector);
683 		break;
684 	}
685 	case VM_LAPIC_MSI: {
686 		struct vm_lapic_msi vmmsi;
687 
688 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
689 			error = EFAULT;
690 			break;
691 		}
692 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
693 		break;
694 	}
695 
696 	case VM_IOAPIC_ASSERT_IRQ: {
697 		struct vm_ioapic_irq ioapic_irq;
698 
699 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
700 			error = EFAULT;
701 			break;
702 		}
703 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
704 		break;
705 	}
706 	case VM_IOAPIC_DEASSERT_IRQ: {
707 		struct vm_ioapic_irq ioapic_irq;
708 
709 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
710 			error = EFAULT;
711 			break;
712 		}
713 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
714 		break;
715 	}
716 	case VM_IOAPIC_PULSE_IRQ: {
717 		struct vm_ioapic_irq ioapic_irq;
718 
719 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
720 			error = EFAULT;
721 			break;
722 		}
723 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
724 		break;
725 	}
726 	case VM_IOAPIC_PINCOUNT: {
727 		int pincount;
728 
729 		pincount = vioapic_pincount(sc->vmm_vm);
730 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
731 			error = EFAULT;
732 			break;
733 		}
734 		break;
735 	}
736 
737 	case VM_ISA_ASSERT_IRQ: {
738 		struct vm_isa_irq isa_irq;
739 
740 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
741 			error = EFAULT;
742 			break;
743 		}
744 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
745 		if (error == 0 && isa_irq.ioapic_irq != -1) {
746 			error = vioapic_assert_irq(sc->vmm_vm,
747 			    isa_irq.ioapic_irq);
748 		}
749 		break;
750 	}
751 	case VM_ISA_DEASSERT_IRQ: {
752 		struct vm_isa_irq isa_irq;
753 
754 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
755 			error = EFAULT;
756 			break;
757 		}
758 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
759 		if (error == 0 && isa_irq.ioapic_irq != -1) {
760 			error = vioapic_deassert_irq(sc->vmm_vm,
761 			    isa_irq.ioapic_irq);
762 		}
763 		break;
764 	}
765 	case VM_ISA_PULSE_IRQ: {
766 		struct vm_isa_irq isa_irq;
767 
768 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
769 			error = EFAULT;
770 			break;
771 		}
772 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
773 		if (error == 0 && isa_irq.ioapic_irq != -1) {
774 			error = vioapic_pulse_irq(sc->vmm_vm,
775 			    isa_irq.ioapic_irq);
776 		}
777 		break;
778 	}
779 	case VM_ISA_SET_IRQ_TRIGGER: {
780 		struct vm_isa_irq_trigger isa_irq_trigger;
781 
782 		if (ddi_copyin(datap, &isa_irq_trigger,
783 		    sizeof (isa_irq_trigger), md)) {
784 			error = EFAULT;
785 			break;
786 		}
787 		error = vatpic_set_irq_trigger(sc->vmm_vm,
788 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
789 		break;
790 	}
791 
792 	case VM_MMAP_GETNEXT: {
793 		struct vm_memmap mm;
794 
795 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
796 			error = EFAULT;
797 			break;
798 		}
799 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
800 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
801 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
802 			error = EFAULT;
803 			break;
804 		}
805 		break;
806 	}
807 	case VM_MMAP_MEMSEG: {
808 		struct vm_memmap mm;
809 
810 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
811 			error = EFAULT;
812 			break;
813 		}
814 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
815 		    mm.len, mm.prot, mm.flags);
816 		break;
817 	}
818 	case VM_ALLOC_MEMSEG: {
819 		struct vm_memseg vmseg;
820 
821 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
822 			error = EFAULT;
823 			break;
824 		}
825 		error = vmmdev_alloc_memseg(sc, &vmseg);
826 		break;
827 	}
828 	case VM_GET_MEMSEG: {
829 		struct vm_memseg vmseg;
830 
831 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
832 			error = EFAULT;
833 			break;
834 		}
835 		error = vmmdev_get_memseg(sc, &vmseg);
836 		if (error == 0 &&
837 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
838 			error = EFAULT;
839 			break;
840 		}
841 		break;
842 	}
843 	case VM_GET_REGISTER: {
844 		struct vm_register vmreg;
845 
846 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
847 			error = EFAULT;
848 			break;
849 		}
850 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
851 		    &vmreg.regval);
852 		if (error == 0 &&
853 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
854 			error = EFAULT;
855 			break;
856 		}
857 		break;
858 	}
859 	case VM_SET_REGISTER: {
860 		struct vm_register vmreg;
861 
862 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
863 			error = EFAULT;
864 			break;
865 		}
866 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
867 		    vmreg.regval);
868 		break;
869 	}
870 	case VM_SET_SEGMENT_DESCRIPTOR: {
871 		struct vm_seg_desc vmsegd;
872 
873 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
874 			error = EFAULT;
875 			break;
876 		}
877 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
878 		    &vmsegd.desc);
879 		break;
880 	}
881 	case VM_GET_SEGMENT_DESCRIPTOR: {
882 		struct vm_seg_desc vmsegd;
883 
884 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
885 			error = EFAULT;
886 			break;
887 		}
888 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
889 		    &vmsegd.desc);
890 		if (error == 0 &&
891 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
892 			error = EFAULT;
893 			break;
894 		}
895 		break;
896 	}
897 	case VM_GET_REGISTER_SET: {
898 		struct vm_register_set vrs;
899 		int regnums[VM_REG_LAST];
900 		uint64_t regvals[VM_REG_LAST];
901 
902 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
903 			error = EFAULT;
904 			break;
905 		}
906 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
907 			error = EINVAL;
908 			break;
909 		}
910 		if (ddi_copyin(vrs.regnums, regnums,
911 		    sizeof (int) * vrs.count, md)) {
912 			error = EFAULT;
913 			break;
914 		}
915 
916 		error = 0;
917 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
918 			if (regnums[i] < 0) {
919 				error = EINVAL;
920 				break;
921 			}
922 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
923 			    &regvals[i]);
924 		}
925 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
926 		    sizeof (uint64_t) * vrs.count, md)) {
927 			error = EFAULT;
928 		}
929 		break;
930 	}
931 	case VM_SET_REGISTER_SET: {
932 		struct vm_register_set vrs;
933 		int regnums[VM_REG_LAST];
934 		uint64_t regvals[VM_REG_LAST];
935 
936 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
937 			error = EFAULT;
938 			break;
939 		}
940 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
941 			error = EINVAL;
942 			break;
943 		}
944 		if (ddi_copyin(vrs.regnums, regnums,
945 		    sizeof (int) * vrs.count, md)) {
946 			error = EFAULT;
947 			break;
948 		}
949 		if (ddi_copyin(vrs.regvals, regvals,
950 		    sizeof (uint64_t) * vrs.count, md)) {
951 			error = EFAULT;
952 			break;
953 		}
954 
955 		error = 0;
956 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
957 			/*
958 			 * Setting registers in a set is not atomic, since a
959 			 * failure in the middle of the set will cause a
960 			 * bail-out and inconsistent register state.  Callers
961 			 * should be wary of this.
962 			 */
963 			if (regnums[i] < 0) {
964 				error = EINVAL;
965 				break;
966 			}
967 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
968 			    regvals[i]);
969 		}
970 		break;
971 	}
972 
973 	case VM_SET_KERNEMU_DEV:
974 	case VM_GET_KERNEMU_DEV: {
975 		struct vm_readwrite_kernemu_device kemu;
976 		size_t size = 0;
977 		mem_region_write_t mwrite = NULL;
978 		mem_region_read_t mread = NULL;
979 		uint64_t ignored = 0;
980 
981 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
982 			error = EFAULT;
983 			break;
984 		}
985 
986 		if (kemu.access_width > 3) {
987 			error = EINVAL;
988 			break;
989 		}
990 		size = (1 << kemu.access_width);
991 		ASSERT(size >= 1 && size <= 8);
992 
993 		if (kemu.gpa >= DEFAULT_APIC_BASE &&
994 		    kemu.gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
995 			mread = lapic_mmio_read;
996 			mwrite = lapic_mmio_write;
997 		} else if (kemu.gpa >= VIOAPIC_BASE &&
998 		    kemu.gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
999 			mread = vioapic_mmio_read;
1000 			mwrite = vioapic_mmio_write;
1001 		} else if (kemu.gpa >= VHPET_BASE &&
1002 		    kemu.gpa < VHPET_BASE + VHPET_SIZE) {
1003 			mread = vhpet_mmio_read;
1004 			mwrite = vhpet_mmio_write;
1005 		} else {
1006 			error = EINVAL;
1007 			break;
1008 		}
1009 
1010 		if (cmd == VM_SET_KERNEMU_DEV) {
1011 			VERIFY(mwrite != NULL);
1012 			error = mwrite(sc->vmm_vm, vcpu, kemu.gpa, kemu.value,
1013 			    size, &ignored);
1014 		} else {
1015 			VERIFY(mread != NULL);
1016 			error = mread(sc->vmm_vm, vcpu, kemu.gpa, &kemu.value,
1017 			    size, &ignored);
1018 		}
1019 
1020 		if (error == 0) {
1021 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1022 				error = EFAULT;
1023 				break;
1024 			}
1025 		}
1026 		break;
1027 	}
1028 
1029 	case VM_GET_CAPABILITY: {
1030 		struct vm_capability vmcap;
1031 
1032 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1033 			error = EFAULT;
1034 			break;
1035 		}
1036 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1037 		    &vmcap.capval);
1038 		if (error == 0 &&
1039 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1040 			error = EFAULT;
1041 			break;
1042 		}
1043 		break;
1044 	}
1045 	case VM_SET_CAPABILITY: {
1046 		struct vm_capability vmcap;
1047 
1048 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1049 			error = EFAULT;
1050 			break;
1051 		}
1052 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1053 		    vmcap.capval);
1054 		break;
1055 	}
1056 	case VM_SET_X2APIC_STATE: {
1057 		struct vm_x2apic x2apic;
1058 
1059 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1060 			error = EFAULT;
1061 			break;
1062 		}
1063 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1064 		break;
1065 	}
1066 	case VM_GET_X2APIC_STATE: {
1067 		struct vm_x2apic x2apic;
1068 
1069 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1070 			error = EFAULT;
1071 			break;
1072 		}
1073 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1074 		    &x2apic.state);
1075 		if (error == 0 &&
1076 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1077 			error = EFAULT;
1078 			break;
1079 		}
1080 		break;
1081 	}
1082 	case VM_GET_GPA_PMAP: {
1083 		struct vm_gpa_pte gpapte;
1084 
1085 		if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
1086 			error = EFAULT;
1087 			break;
1088 		}
1089 #ifdef __FreeBSD__
1090 		/* XXXJOY: add function? */
1091 		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
1092 		    gpapte.gpa, gpapte.pte, &gpapte.ptenum);
1093 #endif
1094 		error = 0;
1095 		break;
1096 	}
1097 	case VM_GET_HPET_CAPABILITIES: {
1098 		struct vm_hpet_cap hpetcap;
1099 
1100 		error = vhpet_getcap(&hpetcap);
1101 		if (error == 0 &&
1102 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1103 			error = EFAULT;
1104 			break;
1105 		}
1106 		break;
1107 	}
1108 	case VM_GLA2GPA: {
1109 		struct vm_gla2gpa gg;
1110 
1111 		CTASSERT(PROT_READ == VM_PROT_READ);
1112 		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
1113 		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
1114 
1115 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1116 			error = EFAULT;
1117 			break;
1118 		}
1119 		gg.vcpuid = vcpu;
1120 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1121 		    gg.prot, &gg.gpa, &gg.fault);
1122 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1123 			error = EFAULT;
1124 			break;
1125 		}
1126 		break;
1127 	}
1128 	case VM_GLA2GPA_NOFAULT: {
1129 		struct vm_gla2gpa gg;
1130 
1131 		CTASSERT(PROT_READ == VM_PROT_READ);
1132 		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
1133 		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
1134 
1135 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1136 			error = EFAULT;
1137 			break;
1138 		}
1139 		gg.vcpuid = vcpu;
1140 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1141 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1142 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1143 			error = EFAULT;
1144 			break;
1145 		}
1146 		break;
1147 	}
1148 
1149 	case VM_ACTIVATE_CPU:
1150 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1151 		break;
1152 
1153 	case VM_SUSPEND_CPU:
1154 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1155 			error = EFAULT;
1156 		} else {
1157 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1158 		}
1159 		break;
1160 
1161 	case VM_RESUME_CPU:
1162 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1163 			error = EFAULT;
1164 		} else {
1165 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1166 		}
1167 		break;
1168 
1169 	case VM_GET_CPUS: {
1170 		struct vm_cpuset vm_cpuset;
1171 		cpuset_t tempset;
1172 		void *srcp = &tempset;
1173 		int size;
1174 
1175 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1176 			error = EFAULT;
1177 			break;
1178 		}
1179 
1180 		/* Be more generous about sizing since our cpuset_t is large. */
1181 		size = vm_cpuset.cpusetsize;
1182 		if (size <= 0 || size > sizeof (cpuset_t)) {
1183 			error = ERANGE;
1184 		}
1185 		/*
1186 		 * If they want a ulong_t or less, make sure they receive the
1187 		 * low bits with all the useful information.
1188 		 */
1189 		if (size <= sizeof (tempset.cpub[0])) {
1190 			srcp = &tempset.cpub[0];
1191 		}
1192 
1193 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1194 			tempset = vm_active_cpus(sc->vmm_vm);
1195 		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1196 			tempset = vm_suspended_cpus(sc->vmm_vm);
1197 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1198 			tempset = vm_debug_cpus(sc->vmm_vm);
1199 		} else {
1200 			error = EINVAL;
1201 		}
1202 
1203 		ASSERT(size > 0 && size <= sizeof (tempset));
1204 		if (error == 0 &&
1205 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1206 			error = EFAULT;
1207 			break;
1208 		}
1209 		break;
1210 	}
1211 	case VM_SET_INTINFO: {
1212 		struct vm_intinfo vmii;
1213 
1214 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1215 			error = EFAULT;
1216 			break;
1217 		}
1218 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1219 		break;
1220 	}
1221 	case VM_GET_INTINFO: {
1222 		struct vm_intinfo vmii;
1223 
1224 		vmii.vcpuid = vcpu;
1225 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1226 		    &vmii.info2);
1227 		if (error == 0 &&
1228 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1229 			error = EFAULT;
1230 			break;
1231 		}
1232 		break;
1233 	}
1234 	case VM_RTC_WRITE: {
1235 		struct vm_rtc_data rtcdata;
1236 
1237 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1238 			error = EFAULT;
1239 			break;
1240 		}
1241 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1242 		    rtcdata.value);
1243 		break;
1244 	}
1245 	case VM_RTC_READ: {
1246 		struct vm_rtc_data rtcdata;
1247 
1248 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1249 			error = EFAULT;
1250 			break;
1251 		}
1252 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1253 		    &rtcdata.value);
1254 		if (error == 0 &&
1255 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1256 			error = EFAULT;
1257 			break;
1258 		}
1259 		break;
1260 	}
1261 	case VM_RTC_SETTIME: {
1262 		struct vm_rtc_time rtctime;
1263 
1264 		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1265 			error = EFAULT;
1266 			break;
1267 		}
1268 		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1269 		break;
1270 	}
1271 	case VM_RTC_GETTIME: {
1272 		struct vm_rtc_time rtctime;
1273 
1274 		rtctime.secs = vrtc_get_time(sc->vmm_vm);
1275 		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1276 			error = EFAULT;
1277 			break;
1278 		}
1279 		break;
1280 	}
1281 
1282 	case VM_RESTART_INSTRUCTION:
1283 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1284 		break;
1285 
1286 	case VM_SET_TOPOLOGY: {
1287 		struct vm_cpu_topology topo;
1288 
1289 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1290 			error = EFAULT;
1291 			break;
1292 		}
1293 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1294 		    topo.threads, topo.maxcpus);
1295 		break;
1296 	}
1297 	case VM_GET_TOPOLOGY: {
1298 		struct vm_cpu_topology topo;
1299 
1300 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1301 		    &topo.threads, &topo.maxcpus);
1302 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1303 			error = EFAULT;
1304 			break;
1305 		}
1306 		break;
1307 	}
1308 
1309 #ifndef __FreeBSD__
1310 	case VM_DEVMEM_GETOFFSET: {
1311 		struct vm_devmem_offset vdo;
1312 		list_t *dl = &sc->vmm_devmem_list;
1313 		vmm_devmem_entry_t *de = NULL;
1314 
1315 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1316 			error = EFAULT;
1317 			break;
1318 		}
1319 
1320 		for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
1321 			if (de->vde_segid == vdo.segid) {
1322 				break;
1323 			}
1324 		}
1325 		if (de != NULL) {
1326 			vdo.offset = de->vde_off;
1327 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1328 				error = EFAULT;
1329 			}
1330 		} else {
1331 			error = ENOENT;
1332 		}
1333 		break;
1334 	}
1335 	case VM_WRLOCK_CYCLE: {
1336 		/*
1337 		 * Present a test mechanism to acquire/release the write lock
1338 		 * on the VM without any other effects.
1339 		 */
1340 		break;
1341 	}
1342 #endif
1343 	default:
1344 		error = ENOTTY;
1345 		break;
1346 	}
1347 
1348 	/* Release exclusion resources */
1349 	switch (lock_type) {
1350 	case LOCK_NONE:
1351 		break;
1352 	case LOCK_VCPU:
1353 		vcpu_unlock_one(sc, vcpu);
1354 		break;
1355 	case LOCK_READ_HOLD:
1356 		vmm_read_unlock(sc);
1357 		break;
1358 	case LOCK_WRITE_HOLD:
1359 		vmm_write_unlock(sc);
1360 		break;
1361 	default:
1362 		panic("unexpected lock type");
1363 		break;
1364 	}
1365 
1366 	return (error);
1367 }
1368 
1369 static vmm_softc_t *
1370 vmm_lookup(const char *name)
1371 {
1372 	list_t *vml = &vmm_list;
1373 	vmm_softc_t *sc;
1374 
1375 	ASSERT(MUTEX_HELD(&vmm_mtx));
1376 
1377 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1378 		if (strcmp(sc->vmm_name, name) == 0) {
1379 			break;
1380 		}
1381 	}
1382 
1383 	return (sc);
1384 }
1385 
1386 /*
1387  * Acquire an HMA registration if not already held.
1388  */
1389 static boolean_t
1390 vmm_hma_acquire(void)
1391 {
1392 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1393 
1394 	mutex_enter(&vmmdev_mtx);
1395 
1396 	if (vmmdev_hma_reg == NULL) {
1397 		VERIFY3U(vmmdev_hma_ref, ==, 0);
1398 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1399 		if (vmmdev_hma_reg == NULL) {
1400 			cmn_err(CE_WARN, "%s HMA registration failed.",
1401 			    vmmdev_hvm_name);
1402 			mutex_exit(&vmmdev_mtx);
1403 			return (B_FALSE);
1404 		}
1405 	}
1406 
1407 	vmmdev_hma_ref++;
1408 
1409 	mutex_exit(&vmmdev_mtx);
1410 
1411 	return (B_TRUE);
1412 }
1413 
1414 /*
1415  * Release the HMA registration if held and there are no remaining VMs.
1416  */
1417 static void
1418 vmm_hma_release(void)
1419 {
1420 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1421 
1422 	mutex_enter(&vmmdev_mtx);
1423 
1424 	VERIFY3U(vmmdev_hma_ref, !=, 0);
1425 
1426 	vmmdev_hma_ref--;
1427 
1428 	if (vmmdev_hma_ref == 0) {
1429 		VERIFY(vmmdev_hma_reg != NULL);
1430 		hma_unregister(vmmdev_hma_reg);
1431 		vmmdev_hma_reg = NULL;
1432 	}
1433 	mutex_exit(&vmmdev_mtx);
1434 }
1435 
1436 static int
1437 vmmdev_do_vm_create(char *name, cred_t *cr)
1438 {
1439 	vmm_softc_t	*sc = NULL;
1440 	minor_t		minor;
1441 	int		error = ENOMEM;
1442 
1443 	if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
1444 		return (EINVAL);
1445 	}
1446 
1447 	if (!vmm_hma_acquire())
1448 		return (ENXIO);
1449 
1450 	mutex_enter(&vmm_mtx);
1451 
1452 	/* Look for duplicate names */
1453 	if (vmm_lookup(name) != NULL) {
1454 		mutex_exit(&vmm_mtx);
1455 		vmm_hma_release();
1456 		return (EEXIST);
1457 	}
1458 
1459 	/* Allow only one instance per non-global zone. */
1460 	if (!INGLOBALZONE(curproc)) {
1461 		for (sc = list_head(&vmm_list); sc != NULL;
1462 		    sc = list_next(&vmm_list, sc)) {
1463 			if (sc->vmm_zone == curzone) {
1464 				mutex_exit(&vmm_mtx);
1465 				vmm_hma_release();
1466 				return (EINVAL);
1467 			}
1468 		}
1469 	}
1470 
1471 	minor = id_alloc(vmm_minors);
1472 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1473 		goto fail;
1474 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1475 		ddi_soft_state_free(vmm_statep, minor);
1476 		goto fail;
1477 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1478 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
1479 		goto fail;
1480 	}
1481 
1482 	error = vm_create(name, &sc->vmm_vm);
1483 	if (error == 0) {
1484 		/* Complete VM initialization and report success. */
1485 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1486 		sc->vmm_minor = minor;
1487 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1488 		    offsetof(vmm_devmem_entry_t, vde_node));
1489 
1490 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1491 		    offsetof(vmm_hold_t, vmh_node));
1492 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1493 
1494 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1495 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1496 		    offsetof(vmm_lease_t, vml_node));
1497 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1498 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1499 
1500 		sc->vmm_zone = crgetzone(cr);
1501 		zone_hold(sc->vmm_zone);
1502 		vmm_zsd_add_vm(sc);
1503 
1504 		list_insert_tail(&vmm_list, sc);
1505 		mutex_exit(&vmm_mtx);
1506 		return (0);
1507 	}
1508 
1509 	ddi_remove_minor_node(vmmdev_dip, name);
1510 fail:
1511 	id_free(vmm_minors, minor);
1512 	if (sc != NULL) {
1513 		ddi_soft_state_free(vmm_statep, minor);
1514 	}
1515 	mutex_exit(&vmm_mtx);
1516 	vmm_hma_release();
1517 
1518 	return (error);
1519 }
1520 
1521 /*
1522  * Bhyve 'Driver' Interface
1523  *
1524  * While many devices are emulated in the bhyve userspace process, there are
1525  * others with performance constraints which require that they run mostly or
1526  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
1527  * needed so they can query/manipulate the portions of VM state needed to
1528  * fulfill their purpose.
1529  *
1530  * This includes:
1531  * - Translating guest-physical addresses to host-virtual pointers
1532  * - Injecting MSIs
1533  * - Hooking IO port addresses
1534  *
1535  * The vmm_drv interface exists to provide that functionality to its consumers.
1536  * (At this time, 'viona' is the only user)
1537  */
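/*
 * Illustrative sketch (not compiled): the life cycle a vmm_drv consumer such
 * as viona would follow.  Error handling is abbreviated, the GPA and MSI
 * address values are placeholders, and the fp/callback arguments are assumed
 * to come from the consumer's own context.
 */
#if 0
static int
consumer_attach(file_t *vm_fp, cred_t *cr, boolean_t (*expiref)(void *),
    void *arg)
{
	vmm_hold_t *hold;
	vmm_lease_t *lease;
	int err;

	/* Take a hold on the VM backing the passed-in vmm device fd. */
	err = vmm_drv_hold(vm_fp, cr, &hold);
	if (err != 0)
		return (err);

	/* Sign a lease (read lock) for stable access to VM state. */
	lease = vmm_drv_lease_sign(hold, expiref, arg);
	if (lease == NULL) {
		vmm_drv_rele(hold);
		return (EBUSY);
	}

	/* Translate GPAs and inject MSIs while the lease remains valid. */
	(void) vmm_drv_gpa2kva(lease, 0x1000, PAGESIZE);
	(void) vmm_drv_msi(lease, 0xfee00000, 0);

	/* Tear down in the reverse order. */
	vmm_drv_lease_break(hold, lease);
	vmm_drv_rele(hold);
	return (0);
}
#endif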
1538 int
1539 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1540 {
1541 	vnode_t *vp = fp->f_vnode;
1542 	const dev_t dev = vp->v_rdev;
1543 	vmm_softc_t *sc;
1544 	vmm_hold_t *hold;
1545 	int err = 0;
1546 
1547 	if (vp->v_type != VCHR) {
1548 		return (ENXIO);
1549 	}
1550 	const major_t major = getmajor(dev);
1551 	const minor_t minor = getminor(dev);
1552 
1553 	mutex_enter(&vmmdev_mtx);
1554 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1555 		mutex_exit(&vmmdev_mtx);
1556 		return (ENOENT);
1557 	}
1558 	mutex_enter(&vmm_mtx);
1559 	mutex_exit(&vmmdev_mtx);
1560 
1561 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1562 		err = ENOENT;
1563 		goto out;
1564 	}
1565 	/* XXXJOY: check cred permissions against instance */
1566 
1567 	if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1568 		err = EBUSY;
1569 		goto out;
1570 	}
1571 
1572 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1573 	hold->vmh_sc = sc;
1574 	hold->vmh_release_req = B_FALSE;
1575 
1576 	list_insert_tail(&sc->vmm_holds, hold);
1577 	sc->vmm_flags |= VMM_HELD;
1578 	*holdp = hold;
1579 
1580 out:
1581 	mutex_exit(&vmm_mtx);
1582 	return (err);
1583 }
1584 
1585 void
1586 vmm_drv_rele(vmm_hold_t *hold)
1587 {
1588 	vmm_softc_t *sc;
1589 
1590 	ASSERT(hold != NULL);
1591 	ASSERT(hold->vmh_sc != NULL);
1592 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
1593 
1594 	mutex_enter(&vmm_mtx);
1595 	sc = hold->vmh_sc;
1596 	list_remove(&sc->vmm_holds, hold);
1597 	if (list_is_empty(&sc->vmm_holds)) {
1598 		sc->vmm_flags &= ~VMM_HELD;
1599 		cv_broadcast(&sc->vmm_cv);
1600 	}
1601 	mutex_exit(&vmm_mtx);
1602 	kmem_free(hold, sizeof (*hold));
1603 }
1604 
1605 boolean_t
1606 vmm_drv_release_reqd(vmm_hold_t *hold)
1607 {
1608 	ASSERT(hold != NULL);
1609 
1610 	return (hold->vmh_release_req);
1611 }
1612 
1613 vmm_lease_t *
1614 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1615 {
1616 	vmm_softc_t *sc = hold->vmh_sc;
1617 	vmm_lease_t *lease;
1618 
1619 	ASSERT3P(expiref, !=, NULL);
1620 
1621 	if (hold->vmh_release_req) {
1622 		return (NULL);
1623 	}
1624 
1625 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1626 	list_link_init(&lease->vml_node);
1627 	lease->vml_expire_func = expiref;
1628 	lease->vml_expire_arg = arg;
1629 	lease->vml_expired = B_FALSE;
1630 	lease->vml_hold = hold;
1631 	/* cache the VM pointer for one less pointer chase */
1632 	lease->vml_vm = sc->vmm_vm;
1633 
1634 	mutex_enter(&sc->vmm_lease_lock);
1635 	while (sc->vmm_lease_blocker != 0) {
1636 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1637 	}
1638 	list_insert_tail(&sc->vmm_lease_list, lease);
1639 	vmm_read_lock(sc);
1640 	mutex_exit(&sc->vmm_lease_lock);
1641 
1642 	return (lease);
1643 }
1644 
1645 static void
1646 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1647 {
1648 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1649 
1650 	list_remove(&sc->vmm_lease_list, lease);
1651 	vmm_read_unlock(sc);
1652 	kmem_free(lease, sizeof (*lease));
1653 }
1654 
1655 void
1656 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1657 {
1658 	vmm_softc_t *sc = hold->vmh_sc;
1659 
1660 	VERIFY3P(hold, ==, lease->vml_hold);
1661 
1662 	mutex_enter(&sc->vmm_lease_lock);
1663 	vmm_lease_break_locked(sc, lease);
1664 	mutex_exit(&sc->vmm_lease_lock);
1665 }
1666 
1667 boolean_t
1668 vmm_drv_lease_expired(vmm_lease_t *lease)
1669 {
1670 	return (lease->vml_expired);
1671 }
1672 
1673 void *
1674 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1675 {
1676 	ASSERT(lease != NULL);
1677 
1678 	return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
1679 }
1680 
1681 int
1682 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1683 {
1684 	ASSERT(lease != NULL);
1685 
1686 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
1687 }
1688 
1689 int
1690 vmm_drv_ioport_hook(vmm_hold_t *hold, uint_t ioport, vmm_drv_rmem_cb_t rfunc,
1691     vmm_drv_wmem_cb_t wfunc, void *arg, void **cookie)
1692 {
1693 	vmm_softc_t *sc;
1694 	int err;
1695 
1696 	ASSERT(hold != NULL);
1697 	ASSERT(cookie != NULL);
1698 
1699 	sc = hold->vmh_sc;
1700 	mutex_enter(&vmm_mtx);
1701 	/* Confirm that hook installation is not blocked */
1702 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1703 		mutex_exit(&vmm_mtx);
1704 		return (EBUSY);
1705 	}
1706 	/*
1707 	 * Optimistically record an installed hook which will prevent a block
1708 	 * from being asserted while the mutex is dropped.
1709 	 */
1710 	hold->vmh_ioport_hook_cnt++;
1711 	mutex_exit(&vmm_mtx);
1712 
1713 	vmm_write_lock(sc);
1714 	err = vm_ioport_hook(sc->vmm_vm, ioport, (vmm_rmem_cb_t)rfunc,
1715 	    (vmm_wmem_cb_t)wfunc, arg, cookie);
1716 	vmm_write_unlock(sc);
1717 
1718 	if (err != 0) {
1719 		mutex_enter(&vmm_mtx);
1720 		/* Walk back optimism about the hook installation */
1721 		hold->vmh_ioport_hook_cnt--;
1722 		mutex_exit(&vmm_mtx);
1723 	}
1724 	return (err);
1725 }
1726 
1727 void
1728 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1729 {
1730 	vmm_softc_t *sc;
1731 
1732 	ASSERT(hold != NULL);
1733 	ASSERT(cookie != NULL);
1734 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
1735 
1736 	sc = hold->vmh_sc;
1737 	vmm_write_lock(sc);
1738 	vm_ioport_unhook(sc->vmm_vm, cookie);
1739 	vmm_write_unlock(sc);
1740 
1741 	mutex_enter(&vmm_mtx);
1742 	hold->vmh_ioport_hook_cnt--;
1743 	mutex_exit(&vmm_mtx);
1744 }
1745 
1746 static int
1747 vmm_drv_purge(vmm_softc_t *sc)
1748 {
1749 	ASSERT(MUTEX_HELD(&vmm_mtx));
1750 
1751 	if ((sc->vmm_flags & VMM_HELD) != 0) {
1752 		vmm_hold_t *hold;
1753 
1754 		sc->vmm_flags |= VMM_CLEANUP;
1755 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
1756 		    hold = list_next(&sc->vmm_holds, hold)) {
1757 			hold->vmh_release_req = B_TRUE;
1758 		}
1759 		while ((sc->vmm_flags & VMM_HELD) != 0) {
1760 			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1761 				return (EINTR);
1762 			}
1763 		}
1764 		sc->vmm_flags &= ~VMM_CLEANUP;
1765 	}
1766 
1767 	VERIFY(list_is_empty(&sc->vmm_holds));
1768 	sc->vmm_flags |= VMM_PURGED;
1769 	return (0);
1770 }
1771 
1772 static int
1773 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1774 {
1775 	int err = 0;
1776 
1777 	mutex_enter(&vmm_mtx);
1778 	if (!enable_block) {
1779 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1780 
1781 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1782 		goto done;
1783 	}
1784 
1785 	/* If any holds have hooks installed, the block is a failure */
1786 	if (!list_is_empty(&sc->vmm_holds)) {
1787 		vmm_hold_t *hold;
1788 
1789 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
1790 		    hold = list_next(&sc->vmm_holds, hold)) {
1791 			if (hold->vmh_ioport_hook_cnt != 0) {
1792 				err = EBUSY;
1793 				goto done;
1794 			}
1795 		}
1796 	}
1797 	sc->vmm_flags |= VMM_BLOCK_HOOK;
1798 
1799 done:
1800 	mutex_exit(&vmm_mtx);
1801 	return (err);
1802 }
1803 
1804 static int
1805 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1806     boolean_t *hma_release)
1807 {
1808 	dev_info_t	*pdip = ddi_get_parent(vmmdev_dip);
1809 	minor_t		minor;
1810 
1811 	ASSERT(MUTEX_HELD(&vmm_mtx));
1812 
1813 	*hma_release = B_FALSE;
1814 
1815 	if (clean_zsd) {
1816 		vmm_zsd_rem_vm(sc);
1817 	}
1818 
1819 	if (vmm_drv_purge(sc) != 0) {
1820 		return (EINTR);
1821 	}
1822 
1823 	/* Clean up devmem entries */
1824 	vmmdev_devmem_purge(sc);
1825 
1826 	list_remove(&vmm_list, sc);
1827 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
1828 	minor = sc->vmm_minor;
1829 	zone_rele(sc->vmm_zone);
1830 	if (sc->vmm_is_open) {
1831 		list_insert_tail(&vmm_destroy_list, sc);
1832 		sc->vmm_flags |= VMM_DESTROY;
1833 	} else {
1834 		vm_destroy(sc->vmm_vm);
1835 		ddi_soft_state_free(vmm_statep, minor);
1836 		id_free(vmm_minors, minor);
1837 		*hma_release = B_TRUE;
1838 	}
1839 	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
1840 
1841 	return (0);
1842 }
1843 
1844 int
1845 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
1846 {
1847 	boolean_t	hma_release = B_FALSE;
1848 	int		err;
1849 
1850 	mutex_enter(&vmm_mtx);
1851 	err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
1852 	mutex_exit(&vmm_mtx);
1853 
1854 	if (hma_release)
1855 		vmm_hma_release();
1856 
1857 	return (err);
1858 }
1859 
1860 /* ARGSUSED */
1861 static int
1862 vmmdev_do_vm_destroy(const char *name, cred_t *cr)
1863 {
1864 	boolean_t	hma_release = B_FALSE;
1865 	vmm_softc_t	*sc;
1866 	int		err;
1867 
1868 	if (crgetuid(cr) != 0)
1869 		return (EPERM);
1870 
1871 	mutex_enter(&vmm_mtx);
1872 
1873 	if ((sc = vmm_lookup(name)) == NULL) {
1874 		mutex_exit(&vmm_mtx);
1875 		return (ENOENT);
1876 	}
1877 	/*
1878 	 * We don't check this in vmm_lookup() since that function is also used
1879 	 * for validation during create and currently vmm names must be unique.
1880 	 */
1881 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
1882 		mutex_exit(&vmm_mtx);
1883 		return (EPERM);
1884 	}
1885 	err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
1886 
1887 	mutex_exit(&vmm_mtx);
1888 
1889 	if (hma_release)
1890 		vmm_hma_release();
1891 
1892 	return (err);
1893 }
1894 
1895 static int
1896 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1897 {
1898 	minor_t		minor;
1899 	vmm_softc_t	*sc;
1900 
1901 	minor = getminor(*devp);
1902 	if (minor == VMM_CTL_MINOR) {
1903 		/*
1904 		 * Master control device must be opened exclusively.
1905 		 */
1906 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
1907 			return (EINVAL);
1908 		}
1909 
1910 		return (0);
1911 	}
1912 
1913 	mutex_enter(&vmm_mtx);
1914 	sc = ddi_get_soft_state(vmm_statep, minor);
1915 	if (sc == NULL) {
1916 		mutex_exit(&vmm_mtx);
1917 		return (ENXIO);
1918 	}
1919 
1920 	sc->vmm_is_open = B_TRUE;
1921 	mutex_exit(&vmm_mtx);
1922 
1923 	return (0);
1924 }
1925 
1926 static int
1927 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
1928 {
1929 	minor_t		minor;
1930 	vmm_softc_t	*sc;
1931 	boolean_t	hma_release = B_FALSE;
1932 
1933 	minor = getminor(dev);
1934 	if (minor == VMM_CTL_MINOR)
1935 		return (0);
1936 
1937 	mutex_enter(&vmm_mtx);
1938 	sc = ddi_get_soft_state(vmm_statep, minor);
1939 	if (sc == NULL) {
1940 		mutex_exit(&vmm_mtx);
1941 		return (ENXIO);
1942 	}
1943 
1944 	VERIFY(sc->vmm_is_open);
1945 	sc->vmm_is_open = B_FALSE;
1946 
1947 	/*
1948 	 * If this VM was destroyed while the vmm device was open, then
1949 	 * clean it up now that it is closed.
1950 	 */
1951 	if (sc->vmm_flags & VMM_DESTROY) {
1952 		list_remove(&vmm_destroy_list, sc);
1953 		vm_destroy(sc->vmm_vm);
1954 		ddi_soft_state_free(vmm_statep, minor);
1955 		id_free(vmm_minors, minor);
1956 		hma_release = B_TRUE;
1957 	}
1958 	mutex_exit(&vmm_mtx);
1959 
1960 	if (hma_release)
1961 		vmm_hma_release();
1962 
1963 	return (0);
1964 }
1965 
1966 static int
1967 vmm_is_supported(intptr_t arg)
1968 {
1969 	int r;
1970 	const char *msg;
1971 
1972 	if (vmm_is_intel()) {
1973 		r = vmx_x86_supported(&msg);
1974 	} else if (vmm_is_svm()) {
1975 		/*
1976 		 * HMA already ensured that the features necessary for SVM
1977 		 * operation were present and online during vmm_attach().
1978 		 */
1979 		r = 0;
1980 	} else {
1981 		r = ENXIO;
1982 		msg = "Unsupported CPU vendor";
1983 	}
1984 
1985 	if (r != 0 && arg != (intptr_t)NULL) {
1986 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
1987 			return (EFAULT);
1988 	}
1989 	return (r);
1990 }
1991 
1992 static int
1993 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1994     int *rvalp)
1995 {
1996 	vmm_softc_t	*sc;
1997 	minor_t		minor;
1998 
1999 	minor = getminor(dev);
2000 
2001 	if (minor == VMM_CTL_MINOR) {
2002 		void *argp = (void *)arg;
2003 		char name[VM_MAX_NAMELEN] = { 0 };
2004 		size_t len = 0;
2005 
2006 		if ((mode & FKIOCTL) != 0) {
2007 			len = strlcpy(name, argp, sizeof (name));
2008 		} else {
2009 			if (copyinstr(argp, name, sizeof (name), &len) != 0) {
2010 				return (EFAULT);
2011 			}
2012 		}
2013 		if (len >= VM_MAX_NAMELEN) {
2014 			return (ENAMETOOLONG);
2015 		}
2016 
2017 		switch (cmd) {
2018 		case VMM_CREATE_VM:
2019 			if ((mode & FWRITE) == 0)
2020 				return (EPERM);
2021 			return (vmmdev_do_vm_create(name, credp));
2022 		case VMM_DESTROY_VM:
2023 			if ((mode & FWRITE) == 0)
2024 				return (EPERM);
2025 			return (vmmdev_do_vm_destroy(name, credp));
2026 		case VMM_VM_SUPPORTED:
2027 			return (vmm_is_supported(arg));
2028 		default:
2029 			/* No other actions are legal on ctl device */
2030 			return (ENOTTY);
2031 		}
2032 	}
2033 
2034 	sc = ddi_get_soft_state(vmm_statep, minor);
2035 	ASSERT(sc);
2036 
2037 	if (sc->vmm_flags & VMM_DESTROY)
2038 		return (ENXIO);
2039 
2040 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2041 }
2042 
2043 static int
2044 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2045     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2046 {
2047 	vmm_softc_t *sc;
2048 	const minor_t minor = getminor(dev);
2049 	struct vm *vm;
2050 	int err;
2051 	vm_object_t vmo = NULL;
2052 	struct vmspace *vms;
2053 
2054 	if (minor == VMM_CTL_MINOR) {
2055 		return (ENODEV);
2056 	}
2057 	if (off < 0 || (off + len) <= 0) {
2058 		return (EINVAL);
2059 	}
2060 	if ((prot & PROT_USER) == 0) {
2061 		return (EACCES);
2062 	}
2063 
2064 	sc = ddi_get_soft_state(vmm_statep, minor);
2065 	ASSERT(sc);
2066 
2067 	if (sc->vmm_flags & VMM_DESTROY)
2068 		return (ENXIO);
2069 
2070 	/* Grab read lock on the VM to prevent any changes to the memory map */
2071 	vmm_read_lock(sc);
2072 
2073 	vm = sc->vmm_vm;
2074 	vms = vm_get_vmspace(vm);
2075 	if (off >= VM_DEVMEM_START) {
2076 		int segid;
2077 
2078 		/* Mapping a devmem "device" */
2079 		if (!vmmdev_devmem_segid(sc, off, len, &segid)) {
2080 			err = ENODEV;
2081 			goto out;
2082 		}
2083 		err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
2084 		if (err != 0) {
2085 			goto out;
2086 		}
2087 		err = vm_segmap_obj(vms, vmo, as, addrp, prot, maxprot, flags);
2088 	} else {
2089 		/* Mapping a part of the guest physical space */
2090 		err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
2091 		    flags);
2092 	}
2093 
2095 out:
2096 	vmm_read_unlock(sc);
2097 	return (err);
2098 }
2099 
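/*
 * sdev plugin callback: decide whether an existing node under /dev/vmm is
 * still valid.  A node is valid only if a VM of that name exists and its
 * minor number still matches; a name that has been reused with a different
 * minor is reported as stale so sdev will refresh it.
 */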
2100 static sdev_plugin_validate_t
2101 vmm_sdev_validate(sdev_ctx_t ctx)
2102 {
2103 	const char *name = sdev_ctx_name(ctx);
2104 	vmm_softc_t *sc;
2105 	sdev_plugin_validate_t ret;
2106 	minor_t minor;
2107 
2108 	if (sdev_ctx_vtype(ctx) != VCHR)
2109 		return (SDEV_VTOR_INVALID);
2110 
2111 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2112 
2113 	mutex_enter(&vmm_mtx);
2114 	if ((sc = vmm_lookup(name)) == NULL)
2115 		ret = SDEV_VTOR_INVALID;
2116 	else if (sc->vmm_minor != minor)
2117 		ret = SDEV_VTOR_STALE;
2118 	else
2119 		ret = SDEV_VTOR_VALID;
2120 	mutex_exit(&vmm_mtx);
2121 
2122 	return (ret);
2123 }
2124 
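/*
 * sdev plugin callback: populate /dev/vmm.  A character node is created for
 * each VM, but non-global-zone callers only see VMs belonging to their own
 * zone.  EEXIST from sdev_plugin_mknod() is ignored, since the node may
 * already be present from a previous fill.
 */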
2125 static int
2126 vmm_sdev_filldir(sdev_ctx_t ctx)
2127 {
2128 	vmm_softc_t *sc;
2129 	int ret;
2130 
2131 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2132 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2133 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2134 		return (EINVAL);
2135 	}
2136 
2137 	mutex_enter(&vmm_mtx);
2138 	ASSERT(vmmdev_dip != NULL);
2139 	for (sc = list_head(&vmm_list); sc != NULL;
2140 	    sc = list_next(&vmm_list, sc)) {
2141 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2142 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2143 			    S_IFCHR | 0600,
2144 			    makedevice(ddi_driver_major(vmmdev_dip),
2145 			    sc->vmm_minor));
2146 		} else {
2147 			continue;
2148 		}
2149 		if (ret != 0 && ret != EEXIST)
2150 			goto out;
2151 	}
2152 
2153 	ret = 0;
2154 
2155 out:
2156 	mutex_exit(&vmm_mtx);
2157 	return (ret);
2158 }
2159 
2160 /* ARGSUSED */
2161 static void
2162 vmm_sdev_inactive(sdev_ctx_t ctx)
2163 {
2164 }
2165 
2166 static sdev_plugin_ops_t vmm_sdev_ops = {
2167 	.spo_version = SDEV_PLUGIN_VERSION,
2168 	.spo_flags = SDEV_PLUGIN_SUBDIR,
2169 	.spo_validate = vmm_sdev_validate,
2170 	.spo_filldir = vmm_sdev_filldir,
2171 	.spo_inactive = vmm_sdev_inactive
2172 };
2173 
2174 /* ARGSUSED */
2175 static int
2176 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2177 {
2178 	int error;
2179 
2180 	switch (cmd) {
2181 	case DDI_INFO_DEVT2DEVINFO:
2182 		*result = (void *)vmmdev_dip;
2183 		error = DDI_SUCCESS;
2184 		break;
2185 	case DDI_INFO_DEVT2INSTANCE:
2186 		*result = (void *)0;
2187 		error = DDI_SUCCESS;
2188 		break;
2189 	default:
2190 		error = DDI_FAILURE;
2191 		break;
2192 	}
2193 	return (error);
2194 }
2195 
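/*
 * attach(9E): bring up the (single) vmm pseudo-device.  A temporary HMA
 * registration probes host capability before any state is published; only
 * the "ctl" minor node is created here, with per-VM minors created on
 * demand.  Failure unwinds the module load, HMA registration, arena, and
 * glue initialization in reverse order.
 */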
2196 static int
2197 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2198 {
2199 	sdev_plugin_hdl_t sph;
2200 	hma_reg_t *reg = NULL;
2201 	boolean_t vmm_loaded = B_FALSE;
2202 
2203 	if (cmd != DDI_ATTACH) {
2204 		return (DDI_FAILURE);
2205 	}
2206 
2207 	mutex_enter(&vmmdev_mtx);
2208 	/* Ensure we are not already attached. */
2209 	if (vmmdev_dip != NULL) {
2210 		mutex_exit(&vmmdev_mtx);
2211 		return (DDI_FAILURE);
2212 	}
2213 
2214 	vmm_sol_glue_init();
2215 	vmm_arena_init();
2216 
2217 	/*
2218 	 * Perform a temporary HMA registration to determine whether the
2219 	 * system is capable of supporting hardware-assisted virtualization.
2220 	 */
2221 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2222 		goto fail;
2223 	} else if (vmm_mod_load() != 0) {
2224 		goto fail;
2225 	}
2226 	vmm_loaded = B_TRUE;
2227 	hma_unregister(reg);
2228 	reg = NULL;
2229 
2230 	/* Create control node.  Other nodes will be created on demand. */
2231 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2232 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2233 		goto fail;
2234 	}
2235 
2236 	if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) ==
2237 	    (sdev_plugin_hdl_t)NULL) {
2238 		ddi_remove_minor_node(dip, NULL);
2239 		goto fail;
2240 	}
2241 
2242 	ddi_report_dev(dip);
2243 	vmmdev_sdev_hdl = sph;
2244 	vmmdev_dip = dip;
2245 	mutex_exit(&vmmdev_mtx);
2246 	return (DDI_SUCCESS);
2247 
2248 fail:
2249 	if (vmm_loaded) {
2250 		VERIFY0(vmm_mod_unload());
2251 	}
2252 	if (reg != NULL) {
2253 		hma_unregister(reg);
2254 	}
2255 	vmm_arena_fini();
2256 	vmm_sol_glue_cleanup();
2257 	mutex_exit(&vmmdev_mtx);
2258 	return (DDI_FAILURE);
2259 }
2260 
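/*
 * detach(9E): tear down the vmm pseudo-device.  Detach is refused while any
 * VM exists (active or pending destruction), while the sdev plugin cannot be
 * unregistered, or when vmmdev_mtx cannot be acquired without risking the
 * iommu_cleanup() deadlock described below.
 */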
2261 static int
2262 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2263 {
2264 	if (cmd != DDI_DETACH) {
2265 		return (DDI_FAILURE);
2266 	}
2267 
2268 	/*
2269 	 * Ensure that all resources have been cleaned up.
2270 	 *
2271 	 * To prevent a deadlock with iommu_cleanup(), we fail the detach if
2272 	 * vmmdev_mtx is already held.  We cannot wait for vmmdev_mtx with our
2273 	 * devinfo locked, as iommu_cleanup() tries to recursively lock each
2274 	 * devinfo, including our own, while holding vmmdev_mtx.
2275 	 */
2276 	if (mutex_tryenter(&vmmdev_mtx) == 0)
2277 		return (DDI_FAILURE);
2278 
2279 	mutex_enter(&vmm_mtx);
2280 	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2281 		mutex_exit(&vmm_mtx);
2282 		mutex_exit(&vmmdev_mtx);
2283 		return (DDI_FAILURE);
2284 	}
2285 	mutex_exit(&vmm_mtx);
2286 
2287 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2288 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2289 		mutex_exit(&vmmdev_mtx);
2290 		return (DDI_FAILURE);
2291 	}
2292 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2293 
2294 	/* Remove the control node. */
2295 	ddi_remove_minor_node(dip, "ctl");
2296 	vmmdev_dip = NULL;
2297 
2298 	VERIFY0(vmm_mod_unload());
2299 	VERIFY3P(vmmdev_hma_reg, ==, NULL);
2300 	vmm_arena_fini();
2301 	vmm_sol_glue_cleanup();
2302 
2303 	mutex_exit(&vmmdev_mtx);
2304 
2305 	return (DDI_SUCCESS);
2306 }
2307 
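/*
 * Character device and driver operations wiring.  Only open, close, ioctl,
 * and segmap are implemented; the remaining cb_ops entries are stubbed with
 * nodev.  D_MP marks the driver as MT-safe.
 */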
2308 static struct cb_ops vmm_cb_ops = {
2309 	vmm_open,
2310 	vmm_close,
2311 	nodev,		/* strategy */
2312 	nodev,		/* print */
2313 	nodev,		/* dump */
2314 	nodev,		/* read */
2315 	nodev,		/* write */
2316 	vmm_ioctl,
2317 	nodev,		/* devmap */
2318 	nodev,		/* mmap */
2319 	vmm_segmap,
2320 	nochpoll,	/* poll */
2321 	ddi_prop_op,
2322 	NULL,
2323 	D_NEW | D_MP | D_DEVMAP
2324 };
2325 
2326 static struct dev_ops vmm_ops = {
2327 	DEVO_REV,
2328 	0,
2329 	vmm_info,
2330 	nulldev,	/* identify */
2331 	nulldev,	/* probe */
2332 	vmm_attach,
2333 	vmm_detach,
2334 	nodev,		/* reset */
2335 	&vmm_cb_ops,
2336 	(struct bus_ops *)NULL
2337 };
2338 
2339 static struct modldrv modldrv = {
2340 	&mod_driverops,
2341 	"bhyve vmm",
2342 	&vmm_ops
2343 };
2344 
2345 static struct modlinkage modlinkage = {
2346 	MODREV_1,
2347 	&modldrv,
2348 	NULL
2349 };
2350 
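/*
 * _init(9E): set up the driver-wide locks, VM lists, minor-number id space,
 * soft state, and per-zone (ZSD) hooks before registering the module.  If
 * mod_install() fails, the soft state and ZSD registration are torn down
 * again.
 */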
2351 int
2352 _init(void)
2353 {
2354 	int	error;
2355 
2356 	sysinit();
2357 
2358 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2359 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2360 	list_create(&vmm_list, sizeof (vmm_softc_t),
2361 	    offsetof(vmm_softc_t, vmm_node));
2362 	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2363 	    offsetof(vmm_softc_t, vmm_node));
2364 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2365 
2366 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2367 	if (error) {
2368 		return (error);
2369 	}
2370 
2371 	vmm_zsd_init();
2372 
2373 	error = mod_install(&modlinkage);
2374 	if (error) {
2375 		ddi_soft_state_fini(&vmm_statep);
2376 		vmm_zsd_fini();
2377 	}
2378 
2379 	return (error);
2380 }
2381 
2382 int
2383 _fini(void)
2384 {
2385 	int	error;
2386 
2387 	error = mod_remove(&modlinkage);
2388 	if (error) {
2389 		return (error);
2390 	}
2391 
2392 	vmm_zsd_fini();
2393 
2394 	ddi_soft_state_fini(&vmm_statep);
2395 
2396 	return (0);
2397 }
2398 
2399 int
2400 _info(struct modinfo *modinfop)
2401 {
2402 	return (mod_info(&modlinkage, modinfop));
2403 }
2404