1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12
13 /*
14 * Copyright 2015 Pluribus Networks Inc.
15 * Copyright 2019 Joyent, Inc.
16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17 * Copyright 2021 Oxide Computer Company
18 */
19
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49
50 #include <vm/seg_dev.h>
51
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61
62 /*
63 * Locking details:
64 *
65 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
66 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
67 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
68 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
69 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70 */
71
72 static kmutex_t vmmdev_mtx;
73 static dev_info_t *vmmdev_dip;
74 static hma_reg_t *vmmdev_hma_reg;
75 static uint_t vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77
78 static kmutex_t vmm_mtx;
79 static list_t vmm_list;
80 static list_t vmm_destroy_list;
81 static id_space_t *vmm_minors;
82 static void *vmm_statep;
83
84 static const char *vmmdev_hvm_name = "bhyve";
85
86 /* For sdev plugin (/dev) */
87 #define VMM_SDEV_ROOT "/dev/vmm"
88
89 /* From uts/i86pc/io/vmm/intel/vmx.c */
90 extern int vmx_x86_supported(const char **);
91
92 /* Holds and hooks from drivers external to vmm */
93 struct vmm_hold {
94 list_node_t vmh_node;
95 vmm_softc_t *vmh_sc;
96 boolean_t vmh_release_req;
97 uint_t vmh_ioport_hook_cnt;
98 };
99
100 struct vmm_lease {
101 list_node_t vml_node;
102 struct vm *vml_vm;
103 vm_client_t *vml_vmclient;
104 boolean_t vml_expired;
105 boolean_t vml_break_deferred;
106 boolean_t (*vml_expire_func)(void *);
107 void *vml_expire_arg;
108 struct vmm_hold *vml_hold;
109 };
110
111 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
112 static void vmm_lease_block(vmm_softc_t *);
113 static void vmm_lease_unblock(vmm_softc_t *);
114 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
115 static void vmm_kstat_init(vmm_softc_t *);
116 static void vmm_kstat_fini(vmm_softc_t *);
117
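/*
 * Copy out the length (and, for devmem segments, the name) of an existing
 * memory segment on behalf of the VM_GET_MEMSEG ioctl.
 */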
118 static int
119 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
120 {
121 int error;
122 bool sysmem;
123
124 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
125 NULL);
126 if (error || mseg->len == 0)
127 return (error);
128
129 if (!sysmem) {
130 vmm_devmem_entry_t *de;
131 list_t *dl = &sc->vmm_devmem_list;
132
133 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
134 if (de->vde_segid == mseg->segid) {
135 break;
136 }
137 }
138 if (de != NULL) {
139 (void) strlcpy(mseg->name, de->vde_name,
140 sizeof (mseg->name));
141 }
142 } else {
143 bzero(mseg->name, sizeof (mseg->name));
144 }
145
146 return (error);
147 }
148
149 /*
150 * The 'devmem' hack:
151 *
152 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
153 * in the vm which appear with their own name related to the vm under /dev.
154 * Since this would be a hassle from an sdev perspective and would require a
155 * new cdev interface (or complicate the existing one), we choose to implement
156 * this in a different manner. When 'devmem' mappings are created, an
157 * identifying off_t is communicated back out to userspace. That off_t,
158 * residing above the normal guest memory space, can be used to mmap the
159 * 'devmem' mapping from the already-open vm device.
160 */
161
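/*
 * Record a new devmem segment for the instance, assigning it the next mmap
 * offset above any existing entries (starting at VM_DEVMEM_START).
 */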
162 static int
163 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
164 {
165 off_t map_offset;
166 vmm_devmem_entry_t *entry;
167
168 if (list_is_empty(&sc->vmm_devmem_list)) {
169 map_offset = VM_DEVMEM_START;
170 } else {
171 entry = list_tail(&sc->vmm_devmem_list);
172 map_offset = entry->vde_off + entry->vde_len;
173 if (map_offset < entry->vde_off) {
174 /* Do not tolerate overflow */
175 return (ERANGE);
176 }
177 /*
178 * XXXJOY: We could choose to search the list for duplicate
179 * names and toss an error. Since we're using the offset
180 * method for now, it does not make much of a difference.
181 */
182 }
183
184 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
185 entry->vde_segid = mseg->segid;
186 entry->vde_len = mseg->len;
187 entry->vde_off = map_offset;
188 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
189 list_insert_tail(&sc->vmm_devmem_list, entry);
190
191 return (0);
192 }
193
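/*
 * Translate an mmap offset in the devmem region back to the segment ID and
 * intra-segment offset it covers.  Returns B_TRUE only if [off, off + len)
 * falls entirely within a single recorded devmem entry.
 */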
194 static boolean_t
195 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
196 off_t *map_offp)
197 {
198 list_t *dl = &sc->vmm_devmem_list;
199 vmm_devmem_entry_t *de = NULL;
200 const off_t map_end = off + len;
201
202 VERIFY(off >= VM_DEVMEM_START);
203
204 if (map_end < off) {
205 /* No match on overflow */
206 return (B_FALSE);
207 }
208
209 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
210 const off_t item_end = de->vde_off + de->vde_len;
211
212 if (de->vde_off <= off && item_end >= map_end) {
213 *segidp = de->vde_segid;
214 *map_offp = off - de->vde_off;
215 return (B_TRUE);
216 }
217 }
218 return (B_FALSE);
219 }
220
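/* Discard all devmem entries recorded against the instance. */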
221 static void
222 vmmdev_devmem_purge(vmm_softc_t *sc)
223 {
224 vmm_devmem_entry_t *entry;
225
226 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
227 kmem_free(entry, sizeof (*entry));
228 }
229 }
230
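/*
 * Allocate a memory segment for the VM.  Named segments are treated as
 * devmem and made available for mmap at an offset above guest memory.
 */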
231 static int
232 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
233 {
234 int error;
235 bool sysmem = true;
236
237 if (VM_MEMSEG_NAME(mseg)) {
238 sysmem = false;
239 }
240 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
241
242 if (error == 0 && VM_MEMSEG_NAME(mseg)) {
243 /*
244 * Rather than create a whole fresh device from which userspace
245 * can mmap this segment, instead make it available at an
246 * offset above where the main guest memory resides.
247 */
248 error = vmmdev_devmem_create(sc, mseg, mseg->name);
249 if (error != 0) {
250 vm_free_memseg(sc->vmm_vm, mseg->segid);
251 }
252 }
253 return (error);
254 }
255
256 /*
257 * Resource Locking and Exclusion
258 *
259 * Much of bhyve depends on key portions of VM state, such as the guest memory
260 * map, to remain unchanged while the guest is running. As ported from
261 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
262 * access to the instance vCPUs. Threads acting on a single vCPU, like those
263 * performing the work of actually running the guest in VMX/SVM, would lock
264 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
265 * state, all of the vCPUs would be first locked, ensuring that the
266 * operation(s) could complete without any other threads stumbling into
267 * intermediate states.
268 *
269 * This approach is largely effective for bhyve. Common operations, such as
270 * running the vCPUs, steer clear of lock contention. The model begins to
271 * break down for operations which do not occur in the context of a specific
272 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
273 * thread in the bhyve process. In order to properly protect those vCPU-less
274 * operations from encountering invalid states, additional locking is required.
275 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
276 * It does mean that class of operations will be serialized on locking the
277 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
278 * undue contention on the VM_MAXCPU-1 vCPU.
279 *
280 * In order to address the shortcomings of this model, the concept of a
281 * read/write lock has been added to bhyve. Operations which change
282 * fundamental aspects of a VM (such as the memory map) must acquire the write
283 * lock, which also implies locking all of the vCPUs and waiting for all read
284 * lock holders to release. While it increases the cost and waiting time for
285 * those few operations, it allows most hot-path operations on the VM (which
286 * depend on its configuration remaining stable) to occur with minimal locking.
287 *
288 * Consumers of the Driver API (see below) are a special case when it comes to
289 * this locking, since they may hold a read lock via the drv_lease mechanism
290 * for an extended period of time. Rather than forcing those consumers to
291 * continuously poll for a write lock attempt, the lease system forces them to
292 * provide a release callback to trigger their clean-up (and potential later
293 * reacquisition) of the read lock.
294 */
295
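/* Freeze a single vCPU, blocking until it is idle and the transition succeeds. */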
296 static void
297 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
298 {
299 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
300
301 /*
302 * Since this state transition is performed with from_idle=true, it
303 * should not fail, but rather block until it can succeed.
304 */
305 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
306 }
307
308 static void
309 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
310 {
311 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
312
313 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
314 vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
315 }
316
317 static void
318 vmm_read_lock(vmm_softc_t *sc)
319 {
320 rw_enter(&sc->vmm_rwlock, RW_READER);
321 }
322
323 static void
324 vmm_read_unlock(vmm_softc_t *sc)
325 {
326 rw_exit(&sc->vmm_rwlock);
327 }
328
329 static void
330 vmm_write_lock(vmm_softc_t *sc)
331 {
332 int maxcpus;
333
334 /* First lock all the vCPUs */
335 maxcpus = vm_get_maxcpus(sc->vmm_vm);
336 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
337 vcpu_lock_one(sc, vcpu);
338 }
339
340 /*
341 * Block vmm_drv leases from being acquired or held while the VM write
342 * lock is held.
343 */
344 vmm_lease_block(sc);
345
346 rw_enter(&sc->vmm_rwlock, RW_WRITER);
347 /*
348 * For now, the 'maxcpus' value for an instance is fixed at the
349 * compile-time constant of VM_MAXCPU at creation. If this changes in
350 * the future, allowing for dynamic vCPU resource sizing, acquisition
351 * of the write lock will need to be wary of such changes.
352 */
353 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
354 }
355
356 static void
357 vmm_write_unlock(vmm_softc_t *sc)
358 {
359 int maxcpus;
360
361 /* Allow vmm_drv leases to be acquired once write lock is dropped */
362 vmm_lease_unblock(sc);
363
364 /*
365 * The VM write lock _must_ be released from the same thread it was
366 * acquired in, unlike the read lock.
367 */
368 VERIFY(rw_write_held(&sc->vmm_rwlock));
369 rw_exit(&sc->vmm_rwlock);
370
371 /* Unlock all the vCPUs */
372 maxcpus = vm_get_maxcpus(sc->vmm_vm);
373 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
374 vcpu_unlock_one(sc, vcpu);
375 }
376 }
377
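/*
 * Primary ioctl handler for a VM instance: acquire whatever exclusion the
 * command requires (a single vCPU, the read lock, or the write lock),
 * dispatch to the appropriate handler, then release that exclusion.
 */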
378 static int
379 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
380 cred_t *credp, int *rvalp)
381 {
382 int error = 0, vcpu = -1;
383 void *datap = (void *)arg;
384 enum vm_lock_type {
385 LOCK_NONE = 0,
386 LOCK_VCPU,
387 LOCK_READ_HOLD,
388 LOCK_WRITE_HOLD
389 } lock_type = LOCK_NONE;
390
391 /* Acquire any exclusion resources needed for the operation. */
392 switch (cmd) {
393 case VM_RUN:
394 case VM_GET_REGISTER:
395 case VM_SET_REGISTER:
396 case VM_GET_SEGMENT_DESCRIPTOR:
397 case VM_SET_SEGMENT_DESCRIPTOR:
398 case VM_GET_REGISTER_SET:
399 case VM_SET_REGISTER_SET:
400 case VM_INJECT_EXCEPTION:
401 case VM_GET_CAPABILITY:
402 case VM_SET_CAPABILITY:
403 case VM_PPTDEV_MSI:
404 case VM_PPTDEV_MSIX:
405 case VM_SET_X2APIC_STATE:
406 case VM_GLA2GPA:
407 case VM_GLA2GPA_NOFAULT:
408 case VM_ACTIVATE_CPU:
409 case VM_SET_INTINFO:
410 case VM_GET_INTINFO:
411 case VM_RESTART_INSTRUCTION:
412 case VM_SET_KERNEMU_DEV:
413 case VM_GET_KERNEMU_DEV:
414 case VM_RESET_CPU:
415 case VM_GET_RUN_STATE:
416 case VM_SET_RUN_STATE:
417 /*
418 * Copy in the ID of the vCPU chosen for this operation.
419 * Since a nefarious caller could update their struct between
420 * this locking and when the rest of the ioctl data is copied
421 * in, it is _critical_ that this local 'vcpu' variable be used
422 * rather than the in-struct one when performing the ioctl.
423 */
424 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
425 return (EFAULT);
426 }
427 if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
428 return (EINVAL);
429 }
430 vcpu_lock_one(sc, vcpu);
431 lock_type = LOCK_VCPU;
432 break;
433
434 case VM_REINIT:
435 case VM_BIND_PPTDEV:
436 case VM_UNBIND_PPTDEV:
437 case VM_MAP_PPTDEV_MMIO:
438 case VM_UNMAP_PPTDEV_MMIO:
439 case VM_ALLOC_MEMSEG:
440 case VM_MMAP_MEMSEG:
441 case VM_MUNMAP_MEMSEG:
442 case VM_WRLOCK_CYCLE:
443 case VM_PMTMR_LOCATE:
444 vmm_write_lock(sc);
445 lock_type = LOCK_WRITE_HOLD;
446 break;
447
448 case VM_GET_MEMSEG:
449 case VM_MMAP_GETNEXT:
450 case VM_LAPIC_IRQ:
451 case VM_INJECT_NMI:
452 case VM_IOAPIC_ASSERT_IRQ:
453 case VM_IOAPIC_DEASSERT_IRQ:
454 case VM_IOAPIC_PULSE_IRQ:
455 case VM_LAPIC_MSI:
456 case VM_LAPIC_LOCAL_IRQ:
457 case VM_GET_X2APIC_STATE:
458 case VM_RTC_READ:
459 case VM_RTC_WRITE:
460 case VM_RTC_SETTIME:
461 case VM_RTC_GETTIME:
462 case VM_PPTDEV_DISABLE_MSIX:
463 case VM_DEVMEM_GETOFFSET:
464 vmm_read_lock(sc);
465 lock_type = LOCK_READ_HOLD;
466 break;
467
468 case VM_GET_GPA_PMAP:
469 case VM_IOAPIC_PINCOUNT:
470 case VM_SUSPEND:
471 default:
472 break;
473 }
474
475 /* Execute the primary logic for the ioctl. */
476 switch (cmd) {
477 case VM_RUN: {
478 struct vm_entry entry;
479
480 if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
481 error = EFAULT;
482 break;
483 }
484
485 if (!(curthread->t_schedflag & TS_VCPU))
486 smt_mark_as_vcpu();
487
488 error = vm_run(sc->vmm_vm, vcpu, &entry);
489
490 /*
491 * Unexpected states in vm_run() are expressed through positive
492 * errno-oriented return values. VM states which expect further
493 * processing in userspace (necessary context via exitinfo) are
494 * expressed through negative return values. For the time being
495 * a return value of 0 is not expected from vm_run().
496 */
497 ASSERT(error != 0);
498 if (error < 0) {
499 const struct vm_exit *vme;
500 void *outp = entry.exit_data;
501
502 error = 0;
503 vme = vm_exitinfo(sc->vmm_vm, vcpu);
504 if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
505 error = EFAULT;
506 }
507 }
508 break;
509 }
510 case VM_SUSPEND: {
511 struct vm_suspend vmsuspend;
512
513 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
514 error = EFAULT;
515 break;
516 }
517 error = vm_suspend(sc->vmm_vm, vmsuspend.how);
518 break;
519 }
520 case VM_REINIT: {
521 struct vm_reinit reinit;
522
523 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
524 error = EFAULT;
525 break;
526 }
527 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
528 /*
529 * The VM instance should be free of driver-attached
530 * hooks during the reinitialization process.
531 */
532 break;
533 }
534 error = vm_reinit(sc->vmm_vm, reinit.flags);
535 (void) vmm_drv_block_hook(sc, B_FALSE);
536 break;
537 }
538 case VM_STAT_DESC: {
539 struct vm_stat_desc statdesc;
540
541 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
542 error = EFAULT;
543 break;
544 }
545 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
546 sizeof (statdesc.desc));
547 if (error == 0 &&
548 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
549 error = EFAULT;
550 break;
551 }
552 break;
553 }
554 case VM_STATS_IOC: {
555 struct vm_stats vmstats;
556
557 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
558 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
559 error = EFAULT;
560 break;
561 }
562 hrt2tv(gethrtime(), &vmstats.tv);
563 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
564 &vmstats.num_entries, vmstats.statbuf);
565 if (error == 0 &&
566 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
567 error = EFAULT;
568 break;
569 }
570 break;
571 }
572
573 case VM_PPTDEV_MSI: {
574 struct vm_pptdev_msi pptmsi;
575
576 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
577 error = EFAULT;
578 break;
579 }
580 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
581 pptmsi.addr, pptmsi.msg, pptmsi.numvec);
582 break;
583 }
584 case VM_PPTDEV_MSIX: {
585 struct vm_pptdev_msix pptmsix;
586
587 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
588 error = EFAULT;
589 break;
590 }
591 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
592 pptmsix.idx, pptmsix.addr, pptmsix.msg,
593 pptmsix.vector_control);
594 break;
595 }
596 case VM_PPTDEV_DISABLE_MSIX: {
597 struct vm_pptdev pptdev;
598
599 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
600 error = EFAULT;
601 break;
602 }
603 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
604 break;
605 }
606 case VM_MAP_PPTDEV_MMIO: {
607 struct vm_pptdev_mmio pptmmio;
608
609 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
610 error = EFAULT;
611 break;
612 }
613 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
614 pptmmio.len, pptmmio.hpa);
615 break;
616 }
617 case VM_UNMAP_PPTDEV_MMIO: {
618 struct vm_pptdev_mmio pptmmio;
619
620 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
621 error = EFAULT;
622 break;
623 }
624 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
625 pptmmio.len);
626 break;
627 }
628 case VM_BIND_PPTDEV: {
629 struct vm_pptdev pptdev;
630
631 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
632 error = EFAULT;
633 break;
634 }
635 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
636 break;
637 }
638 case VM_UNBIND_PPTDEV: {
639 struct vm_pptdev pptdev;
640
641 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
642 error = EFAULT;
643 break;
644 }
645 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
646 break;
647 }
648 case VM_GET_PPTDEV_LIMITS: {
649 struct vm_pptdev_limits pptlimits;
650
651 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
652 error = EFAULT;
653 break;
654 }
655 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
656 &pptlimits.msi_limit, &pptlimits.msix_limit);
657 if (error == 0 &&
658 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
659 error = EFAULT;
660 break;
661 }
662 break;
663 }
664 case VM_INJECT_EXCEPTION: {
665 struct vm_exception vmexc;
666 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
667 error = EFAULT;
668 break;
669 }
670 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
671 vmexc.error_code_valid, vmexc.error_code,
672 vmexc.restart_instruction);
673 break;
674 }
675 case VM_INJECT_NMI: {
676 struct vm_nmi vmnmi;
677
678 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
679 error = EFAULT;
680 break;
681 }
682 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
683 break;
684 }
685 case VM_LAPIC_IRQ: {
686 struct vm_lapic_irq vmirq;
687
688 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
689 error = EFAULT;
690 break;
691 }
692 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
693 break;
694 }
695 case VM_LAPIC_LOCAL_IRQ: {
696 struct vm_lapic_irq vmirq;
697
698 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
699 error = EFAULT;
700 break;
701 }
702 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
703 vmirq.vector);
704 break;
705 }
706 case VM_LAPIC_MSI: {
707 struct vm_lapic_msi vmmsi;
708
709 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
710 error = EFAULT;
711 break;
712 }
713 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
714 break;
715 }
716
717 case VM_IOAPIC_ASSERT_IRQ: {
718 struct vm_ioapic_irq ioapic_irq;
719
720 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
721 error = EFAULT;
722 break;
723 }
724 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
725 break;
726 }
727 case VM_IOAPIC_DEASSERT_IRQ: {
728 struct vm_ioapic_irq ioapic_irq;
729
730 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
731 error = EFAULT;
732 break;
733 }
734 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
735 break;
736 }
737 case VM_IOAPIC_PULSE_IRQ: {
738 struct vm_ioapic_irq ioapic_irq;
739
740 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
741 error = EFAULT;
742 break;
743 }
744 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
745 break;
746 }
747 case VM_IOAPIC_PINCOUNT: {
748 int pincount;
749
750 pincount = vioapic_pincount(sc->vmm_vm);
751 if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
752 error = EFAULT;
753 break;
754 }
755 break;
756 }
757
758 case VM_ISA_ASSERT_IRQ: {
759 struct vm_isa_irq isa_irq;
760
761 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
762 error = EFAULT;
763 break;
764 }
765 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
766 if (error == 0 && isa_irq.ioapic_irq != -1) {
767 error = vioapic_assert_irq(sc->vmm_vm,
768 isa_irq.ioapic_irq);
769 }
770 break;
771 }
772 case VM_ISA_DEASSERT_IRQ: {
773 struct vm_isa_irq isa_irq;
774
775 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
776 error = EFAULT;
777 break;
778 }
779 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
780 if (error == 0 && isa_irq.ioapic_irq != -1) {
781 error = vioapic_deassert_irq(sc->vmm_vm,
782 isa_irq.ioapic_irq);
783 }
784 break;
785 }
786 case VM_ISA_PULSE_IRQ: {
787 struct vm_isa_irq isa_irq;
788
789 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
790 error = EFAULT;
791 break;
792 }
793 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
794 if (error == 0 && isa_irq.ioapic_irq != -1) {
795 error = vioapic_pulse_irq(sc->vmm_vm,
796 isa_irq.ioapic_irq);
797 }
798 break;
799 }
800 case VM_ISA_SET_IRQ_TRIGGER: {
801 struct vm_isa_irq_trigger isa_irq_trigger;
802
803 if (ddi_copyin(datap, &isa_irq_trigger,
804 sizeof (isa_irq_trigger), md)) {
805 error = EFAULT;
806 break;
807 }
808 error = vatpic_set_irq_trigger(sc->vmm_vm,
809 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
810 break;
811 }
812
813 case VM_MMAP_GETNEXT: {
814 struct vm_memmap mm;
815
816 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
817 error = EFAULT;
818 break;
819 }
820 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
821 &mm.segoff, &mm.len, &mm.prot, &mm.flags);
822 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
823 error = EFAULT;
824 break;
825 }
826 break;
827 }
828 case VM_MMAP_MEMSEG: {
829 struct vm_memmap mm;
830
831 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
832 error = EFAULT;
833 break;
834 }
835 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
836 mm.len, mm.prot, mm.flags);
837 break;
838 }
839 case VM_MUNMAP_MEMSEG: {
840 struct vm_munmap mu;
841
842 if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
843 error = EFAULT;
844 break;
845 }
846 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
847 break;
848 }
849 case VM_ALLOC_MEMSEG: {
850 struct vm_memseg vmseg;
851
852 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
853 error = EFAULT;
854 break;
855 }
856 error = vmmdev_alloc_memseg(sc, &vmseg);
857 break;
858 }
859 case VM_GET_MEMSEG: {
860 struct vm_memseg vmseg;
861
862 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
863 error = EFAULT;
864 break;
865 }
866 error = vmmdev_get_memseg(sc, &vmseg);
867 if (error == 0 &&
868 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
869 error = EFAULT;
870 break;
871 }
872 break;
873 }
874 case VM_GET_REGISTER: {
875 struct vm_register vmreg;
876
877 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
878 error = EFAULT;
879 break;
880 }
881 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
882 &vmreg.regval);
883 if (error == 0 &&
884 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
885 error = EFAULT;
886 break;
887 }
888 break;
889 }
890 case VM_SET_REGISTER: {
891 struct vm_register vmreg;
892
893 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
894 error = EFAULT;
895 break;
896 }
897 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
898 vmreg.regval);
899 break;
900 }
901 case VM_SET_SEGMENT_DESCRIPTOR: {
902 struct vm_seg_desc vmsegd;
903
904 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
905 error = EFAULT;
906 break;
907 }
908 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
909 &vmsegd.desc);
910 break;
911 }
912 case VM_GET_SEGMENT_DESCRIPTOR: {
913 struct vm_seg_desc vmsegd;
914
915 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
916 error = EFAULT;
917 break;
918 }
919 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
920 &vmsegd.desc);
921 if (error == 0 &&
922 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
923 error = EFAULT;
924 break;
925 }
926 break;
927 }
928 case VM_GET_REGISTER_SET: {
929 struct vm_register_set vrs;
930 int regnums[VM_REG_LAST];
931 uint64_t regvals[VM_REG_LAST];
932
933 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
934 error = EFAULT;
935 break;
936 }
937 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
938 error = EINVAL;
939 break;
940 }
941 if (ddi_copyin(vrs.regnums, regnums,
942 sizeof (int) * vrs.count, md)) {
943 error = EFAULT;
944 break;
945 }
946
947 error = 0;
948 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
949 if (regnums[i] < 0) {
950 error = EINVAL;
951 break;
952 }
953 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
954 &regvals[i]);
955 }
956 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
957 sizeof (uint64_t) * vrs.count, md)) {
958 error = EFAULT;
959 }
960 break;
961 }
962 case VM_SET_REGISTER_SET: {
963 struct vm_register_set vrs;
964 int regnums[VM_REG_LAST];
965 uint64_t regvals[VM_REG_LAST];
966
967 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
968 error = EFAULT;
969 break;
970 }
971 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
972 error = EINVAL;
973 break;
974 }
975 if (ddi_copyin(vrs.regnums, regnums,
976 sizeof (int) * vrs.count, md)) {
977 error = EFAULT;
978 break;
979 }
980 if (ddi_copyin(vrs.regvals, regvals,
981 sizeof (uint64_t) * vrs.count, md)) {
982 error = EFAULT;
983 break;
984 }
985
986 error = 0;
987 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
988 /*
989 * Setting registers in a set is not atomic, since a
990 * failure in the middle of the set will cause a
991 * bail-out and inconsistent register state. Callers
992 * should be wary of this.
993 */
994 if (regnums[i] < 0) {
995 error = EINVAL;
996 break;
997 }
998 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
999 regvals[i]);
1000 }
1001 break;
1002 }
1003 case VM_RESET_CPU: {
1004 struct vm_vcpu_reset vvr;
1005
1006 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1007 error = EFAULT;
1008 break;
1009 }
1010 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1011 error = EINVAL;
break;
1012 }
1013
1014 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1015 break;
1016 }
1017 case VM_GET_RUN_STATE: {
1018 struct vm_run_state vrs;
1019
1020 bzero(&vrs, sizeof (vrs));
1021 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1022 &vrs.sipi_vector);
1023 if (error == 0) {
1024 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1025 error = EFAULT;
1026 break;
1027 }
1028 }
1029 break;
1030 }
1031 case VM_SET_RUN_STATE: {
1032 struct vm_run_state vrs;
1033
1034 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1035 error = EFAULT;
1036 break;
1037 }
1038 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1039 vrs.sipi_vector);
1040 break;
1041 }
1042
1043 case VM_SET_KERNEMU_DEV:
1044 case VM_GET_KERNEMU_DEV: {
1045 struct vm_readwrite_kernemu_device kemu;
1046 size_t size = 0;
1047
1048 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1049 error = EFAULT;
1050 break;
1051 }
1052
1053 if (kemu.access_width > 3) {
1054 error = EINVAL;
1055 break;
1056 }
1057 size = (1 << kemu.access_width);
1058 ASSERT(size >= 1 && size <= 8);
1059
1060 if (cmd == VM_SET_KERNEMU_DEV) {
1061 error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1062 kemu.gpa, kemu.value, size);
1063 } else {
1064 error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1065 kemu.gpa, &kemu.value, size);
1066 }
1067
1068 if (error == 0) {
1069 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1070 error = EFAULT;
1071 break;
1072 }
1073 }
1074 break;
1075 }
1076
1077 case VM_GET_CAPABILITY: {
1078 struct vm_capability vmcap;
1079
1080 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1081 error = EFAULT;
1082 break;
1083 }
1084 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1085 &vmcap.capval);
1086 if (error == 0 &&
1087 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1088 error = EFAULT;
1089 break;
1090 }
1091 break;
1092 }
1093 case VM_SET_CAPABILITY: {
1094 struct vm_capability vmcap;
1095
1096 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1097 error = EFAULT;
1098 break;
1099 }
1100 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1101 vmcap.capval);
1102 break;
1103 }
1104 case VM_SET_X2APIC_STATE: {
1105 struct vm_x2apic x2apic;
1106
1107 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1108 error = EFAULT;
1109 break;
1110 }
1111 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1112 break;
1113 }
1114 case VM_GET_X2APIC_STATE: {
1115 struct vm_x2apic x2apic;
1116
1117 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1118 error = EFAULT;
1119 break;
1120 }
1121 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1122 &x2apic.state);
1123 if (error == 0 &&
1124 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1125 error = EFAULT;
1126 break;
1127 }
1128 break;
1129 }
1130 case VM_GET_GPA_PMAP: {
1131 /*
1132 * Until there is a necessity to leak EPT/RVI PTE values to
1133 * userspace, this will remain unimplemented
1134 */
1135 error = EINVAL;
1136 break;
1137 }
1138 case VM_GET_HPET_CAPABILITIES: {
1139 struct vm_hpet_cap hpetcap;
1140
1141 error = vhpet_getcap(&hpetcap);
1142 if (error == 0 &&
1143 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1144 error = EFAULT;
1145 break;
1146 }
1147 break;
1148 }
1149 case VM_GLA2GPA: {
1150 struct vm_gla2gpa gg;
1151
1152 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1153 error = EFAULT;
1154 break;
1155 }
1156 gg.vcpuid = vcpu;
1157 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1158 gg.prot, &gg.gpa, &gg.fault);
1159 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1160 error = EFAULT;
1161 break;
1162 }
1163 break;
1164 }
1165 case VM_GLA2GPA_NOFAULT: {
1166 struct vm_gla2gpa gg;
1167
1168 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1169 error = EFAULT;
1170 break;
1171 }
1172 gg.vcpuid = vcpu;
1173 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1174 gg.gla, gg.prot, &gg.gpa, &gg.fault);
1175 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1176 error = EFAULT;
1177 break;
1178 }
1179 break;
1180 }
1181
1182 case VM_ACTIVATE_CPU:
1183 error = vm_activate_cpu(sc->vmm_vm, vcpu);
1184 break;
1185
1186 case VM_SUSPEND_CPU:
1187 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1188 error = EFAULT;
1189 } else {
1190 error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1191 }
1192 break;
1193
1194 case VM_RESUME_CPU:
1195 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1196 error = EFAULT;
1197 } else {
1198 error = vm_resume_cpu(sc->vmm_vm, vcpu);
1199 }
1200 break;
1201
1202 case VM_GET_CPUS: {
1203 struct vm_cpuset vm_cpuset;
1204 cpuset_t tempset;
1205 void *srcp = &tempset;
1206 int size;
1207
1208 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1209 error = EFAULT;
1210 break;
1211 }
1212
1213 /* Be more generous about sizing since our cpuset_t is large. */
1214 size = vm_cpuset.cpusetsize;
1215 if (size <= 0 || size > sizeof (cpuset_t)) {
1216 error = ERANGE;
1217 }
1218 /*
1219 * If they want a ulong_t or less, make sure they receive the
1220 * low bits with all the useful information.
1221 */
1222 if (size <= sizeof (tempset.cpub[0])) {
1223 srcp = &tempset.cpub[0];
1224 }
1225
1226 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1227 tempset = vm_active_cpus(sc->vmm_vm);
1228 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1229 tempset = vm_suspended_cpus(sc->vmm_vm);
1230 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1231 tempset = vm_debug_cpus(sc->vmm_vm);
1232 } else {
1233 error = EINVAL;
1234 }
1235
1236 ASSERT(size > 0 && size <= sizeof (tempset));
1237 if (error == 0 &&
1238 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1239 error = EFAULT;
1240 break;
1241 }
1242 break;
1243 }
1244 case VM_SET_INTINFO: {
1245 struct vm_intinfo vmii;
1246
1247 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1248 error = EFAULT;
1249 break;
1250 }
1251 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1252 break;
1253 }
1254 case VM_GET_INTINFO: {
1255 struct vm_intinfo vmii;
1256
1257 vmii.vcpuid = vcpu;
1258 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1259 &vmii.info2);
1260 if (error == 0 &&
1261 ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1262 error = EFAULT;
1263 break;
1264 }
1265 break;
1266 }
1267 case VM_RTC_WRITE: {
1268 struct vm_rtc_data rtcdata;
1269
1270 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1271 error = EFAULT;
1272 break;
1273 }
1274 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1275 rtcdata.value);
1276 break;
1277 }
1278 case VM_RTC_READ: {
1279 struct vm_rtc_data rtcdata;
1280
1281 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1282 error = EFAULT;
1283 break;
1284 }
1285 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1286 &rtcdata.value);
1287 if (error == 0 &&
1288 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1289 error = EFAULT;
1290 break;
1291 }
1292 break;
1293 }
1294 case VM_RTC_SETTIME: {
1295 struct vm_rtc_time rtctime;
1296
1297 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1298 error = EFAULT;
1299 break;
1300 }
1301 error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1302 break;
1303 }
1304 case VM_RTC_GETTIME: {
1305 struct vm_rtc_time rtctime;
1306
1307 rtctime.secs = vrtc_get_time(sc->vmm_vm);
1308 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1309 error = EFAULT;
1310 break;
1311 }
1312 break;
1313 }
1314
1315 case VM_PMTMR_LOCATE: {
1316 uint16_t port = arg;
1317 error = vpmtmr_set_location(sc->vmm_vm, port);
1318 break;
1319 }
1320
1321 case VM_RESTART_INSTRUCTION:
1322 error = vm_restart_instruction(sc->vmm_vm, vcpu);
1323 break;
1324
1325 case VM_SET_TOPOLOGY: {
1326 struct vm_cpu_topology topo;
1327
1328 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1329 error = EFAULT;
1330 break;
1331 }
1332 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1333 topo.threads, topo.maxcpus);
1334 break;
1335 }
1336 case VM_GET_TOPOLOGY: {
1337 struct vm_cpu_topology topo;
1338
1339 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1340 &topo.threads, &topo.maxcpus);
1341 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1342 error = EFAULT;
1343 break;
1344 }
1345 break;
1346 }
1347
1348 case VM_DEVMEM_GETOFFSET: {
1349 struct vm_devmem_offset vdo;
1350 list_t *dl = &sc->vmm_devmem_list;
1351 vmm_devmem_entry_t *de = NULL;
1352
1353 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1354 error = EFAULT;
1355 break;
1356 }
1357
1358 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
1359 if (de->vde_segid == vdo.segid) {
1360 break;
1361 }
1362 }
1363 if (de != NULL) {
1364 vdo.offset = de->vde_off;
1365 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1366 error = EFAULT;
1367 }
1368 } else {
1369 error = ENOENT;
1370 }
1371 break;
1372 }
1373 case VM_WRLOCK_CYCLE: {
1374 /*
1375 * Present a test mechanism to acquire/release the write lock
1376 * on the VM without any other effects.
1377 */
1378 break;
1379 }
1380
1381 default:
1382 error = ENOTTY;
1383 break;
1384 }
1385
1386 /* Release exclusion resources */
1387 switch (lock_type) {
1388 case LOCK_NONE:
1389 break;
1390 case LOCK_VCPU:
1391 vcpu_unlock_one(sc, vcpu);
1392 break;
1393 case LOCK_READ_HOLD:
1394 vmm_read_unlock(sc);
1395 break;
1396 case LOCK_WRITE_HOLD:
1397 vmm_write_unlock(sc);
1398 break;
1399 default:
1400 panic("unexpected lock type");
1401 break;
1402 }
1403
1404 return (error);
1405 }
1406
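/* Look up a VM instance by name.  Caller must hold vmm_mtx. */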
1407 static vmm_softc_t *
1408 vmm_lookup(const char *name)
1409 {
1410 list_t *vml = &vmm_list;
1411 vmm_softc_t *sc;
1412
1413 ASSERT(MUTEX_HELD(&vmm_mtx));
1414
1415 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1416 if (strcmp(sc->vmm_name, name) == 0) {
1417 break;
1418 }
1419 }
1420
1421 return (sc);
1422 }
1423
1424 /*
1425 * Acquire an HMA registration if not already held.
1426 */
1427 static boolean_t
1428 vmm_hma_acquire(void)
1429 {
1430 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1431
1432 mutex_enter(&vmmdev_mtx);
1433
1434 if (vmmdev_hma_reg == NULL) {
1435 VERIFY3U(vmmdev_hma_ref, ==, 0);
1436 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1437 if (vmmdev_hma_reg == NULL) {
1438 cmn_err(CE_WARN, "%s HMA registration failed.",
1439 vmmdev_hvm_name);
1440 mutex_exit(&vmmdev_mtx);
1441 return (B_FALSE);
1442 }
1443 }
1444
1445 vmmdev_hma_ref++;
1446
1447 mutex_exit(&vmmdev_mtx);
1448
1449 return (B_TRUE);
1450 }
1451
1452 /*
1453 * Release the HMA registration if held and there are no remaining VMs.
1454 */
1455 static void
1456 vmm_hma_release(void)
1457 {
1458 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1459
1460 mutex_enter(&vmmdev_mtx);
1461
1462 VERIFY3U(vmmdev_hma_ref, !=, 0);
1463
1464 vmmdev_hma_ref--;
1465
1466 if (vmmdev_hma_ref == 0) {
1467 VERIFY(vmmdev_hma_reg != NULL);
1468 hma_unregister(vmmdev_hma_reg);
1469 vmmdev_hma_reg = NULL;
1470 }
1471 mutex_exit(&vmmdev_mtx);
1472 }
1473
1474 static int
1475 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1476 {
1477 vmm_softc_t *sc = NULL;
1478 minor_t minor;
1479 int error = ENOMEM;
1480 size_t len;
1481 const char *name = req->name;
1482
1483 len = strnlen(name, VM_MAX_NAMELEN);
1484 if (len == 0) {
1485 return (EINVAL);
1486 }
1487 if (len >= VM_MAX_NAMELEN) {
1488 return (ENAMETOOLONG);
1489 }
1490 if (strchr(name, '/') != NULL) {
1491 return (EINVAL);
1492 }
1493
1494 if (!vmm_hma_acquire())
1495 return (ENXIO);
1496
1497 mutex_enter(&vmm_mtx);
1498
1499 /* Look for duplicate names */
1500 if (vmm_lookup(name) != NULL) {
1501 mutex_exit(&vmm_mtx);
1502 vmm_hma_release();
1503 return (EEXIST);
1504 }
1505
1506 /* Allow only one instance per non-global zone. */
1507 if (!INGLOBALZONE(curproc)) {
1508 for (sc = list_head(&vmm_list); sc != NULL;
1509 sc = list_next(&vmm_list, sc)) {
1510 if (sc->vmm_zone == curzone) {
1511 mutex_exit(&vmm_mtx);
1512 vmm_hma_release();
1513 return (EINVAL);
1514 }
1515 }
1516 }
1517
1518 minor = id_alloc(vmm_minors);
1519 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1520 goto fail;
1521 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1522 ddi_soft_state_free(vmm_statep, minor);
1523 goto fail;
1524 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1525 DDI_PSEUDO, 0) != DDI_SUCCESS) {
1526 goto fail;
1527 }
1528
1529 if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1530 goto fail;
1531 }
1532
1533 error = vm_create(req->name, req->flags, &sc->vmm_vm);
1534 if (error == 0) {
1535 /* Complete VM initialization and report success. */
1536 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1537 sc->vmm_minor = minor;
1538 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1539 offsetof(vmm_devmem_entry_t, vde_node));
1540
1541 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1542 offsetof(vmm_hold_t, vmh_node));
1543 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1544
1545 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1546 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1547 offsetof(vmm_lease_t, vml_node));
1548 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1549 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1550
1551 sc->vmm_zone = crgetzone(cr);
1552 zone_hold(sc->vmm_zone);
1553 vmm_zsd_add_vm(sc);
1554 vmm_kstat_init(sc);
1555
1556 list_insert_tail(&vmm_list, sc);
1557 mutex_exit(&vmm_mtx);
1558 return (0);
1559 }
1560
1561 vmm_kstat_fini(sc);
1562 ddi_remove_minor_node(vmmdev_dip, name);
1563 fail:
1564 id_free(vmm_minors, minor);
1565 if (sc != NULL) {
1566 ddi_soft_state_free(vmm_statep, minor);
1567 }
1568 mutex_exit(&vmm_mtx);
1569 vmm_hma_release();
1570
1571 return (error);
1572 }
1573
1574 /*
1575 * Bhyve 'Driver' Interface
1576 *
1577 * While many devices are emulated in the bhyve userspace process, there are
1578 * others with performance constraints which require that they run mostly or
1579 * entirely in-kernel. For those not integrated directly into bhyve, an API is
1580 * needed so they can query/manipulate the portions of VM state needed to
1581 * fulfill their purpose.
1582 *
1583 * This includes:
1584 * - Translating guest-physical addresses to host-virtual pointers
1585 * - Injecting MSIs
1586 * - Hooking IO port addresses
1587 *
1588 * The vmm_drv interface exists to provide that functionality to its consumers.
1589 * (At this time, 'viona' is the only user)
1590 */
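/*
 * As a rough sketch (not a prescription), a consumer such as viona would:
 * obtain a hold via vmm_drv_hold() on an open instance fd, sign a lease with
 * vmm_drv_lease_sign() to gain read-locked access, use helpers such as
 * vmm_drv_gpa2kva() or vmm_drv_msi() while the lease remains valid, and then
 * call vmm_drv_lease_break() and vmm_drv_rele() when done or when its expire
 * callback requests that the lease be relinquished.
 */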
1591 int
1592 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
1593 {
1594 vnode_t *vp = fp->f_vnode;
1595 const dev_t dev = vp->v_rdev;
1596 vmm_softc_t *sc;
1597 vmm_hold_t *hold;
1598 int err = 0;
1599
1600 if (vp->v_type != VCHR) {
1601 return (ENXIO);
1602 }
1603 const major_t major = getmajor(dev);
1604 const minor_t minor = getminor(dev);
1605
1606 mutex_enter(&vmmdev_mtx);
1607 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
1608 mutex_exit(&vmmdev_mtx);
1609 return (ENOENT);
1610 }
1611 mutex_enter(&vmm_mtx);
1612 mutex_exit(&vmmdev_mtx);
1613
1614 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1615 err = ENOENT;
1616 goto out;
1617 }
1618 /* XXXJOY: check cred permissions against instance */
1619
1620 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
1621 err = EBUSY;
1622 goto out;
1623 }
1624
1625 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
1626 hold->vmh_sc = sc;
1627 hold->vmh_release_req = B_FALSE;
1628
1629 list_insert_tail(&sc->vmm_holds, hold);
1630 sc->vmm_flags |= VMM_HELD;
1631 *holdp = hold;
1632
1633 out:
1634 mutex_exit(&vmm_mtx);
1635 return (err);
1636 }
1637
1638 void
1639 vmm_drv_rele(vmm_hold_t *hold)
1640 {
1641 vmm_softc_t *sc;
1642
1643 ASSERT(hold != NULL);
1644 ASSERT(hold->vmh_sc != NULL);
1645 VERIFY(hold->vmh_ioport_hook_cnt == 0);
1646
1647 mutex_enter(&vmm_mtx);
1648 sc = hold->vmh_sc;
1649 list_remove(&sc->vmm_holds, hold);
1650 if (list_is_empty(&sc->vmm_holds)) {
1651 sc->vmm_flags &= ~VMM_HELD;
1652 cv_broadcast(&sc->vmm_cv);
1653 }
1654 mutex_exit(&vmm_mtx);
1655 kmem_free(hold, sizeof (*hold));
1656 }
1657
1658 boolean_t
1659 vmm_drv_release_reqd(vmm_hold_t *hold)
1660 {
1661 ASSERT(hold != NULL);
1662
1663 return (hold->vmh_release_req);
1664 }
1665
1666 vmm_lease_t *
1667 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
1668 {
1669 vmm_softc_t *sc = hold->vmh_sc;
1670 vmm_lease_t *lease;
1671
1672 ASSERT3P(expiref, !=, NULL);
1673
1674 if (hold->vmh_release_req) {
1675 return (NULL);
1676 }
1677
1678 lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
1679 list_link_init(&lease->vml_node);
1680 lease->vml_expire_func = expiref;
1681 lease->vml_expire_arg = arg;
1682 lease->vml_expired = B_FALSE;
1683 lease->vml_break_deferred = B_FALSE;
1684 lease->vml_hold = hold;
1685 /* cache the VM pointer for one less pointer chase */
1686 lease->vml_vm = sc->vmm_vm;
1687 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
1688
1689 mutex_enter(&sc->vmm_lease_lock);
1690 while (sc->vmm_lease_blocker != 0) {
1691 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1692 }
1693 list_insert_tail(&sc->vmm_lease_list, lease);
1694 vmm_read_lock(sc);
1695 mutex_exit(&sc->vmm_lease_lock);
1696
1697 return (lease);
1698 }
1699
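/*
 * Tear down a lease while holding vmm_lease_lock: remove it from the lease
 * list, drop the VM read lock it holds, and free its resources.
 */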
1700 static void
1701 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
1702 {
1703 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
1704
1705 list_remove(&sc->vmm_lease_list, lease);
1706 vmm_read_unlock(sc);
1707 vmc_destroy(lease->vml_vmclient);
1708 kmem_free(lease, sizeof (*lease));
1709 }
1710
1711 static void
1712 vmm_lease_block(vmm_softc_t *sc)
1713 {
1714 mutex_enter(&sc->vmm_lease_lock);
1715 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
1716 sc->vmm_lease_blocker++;
1717 if (sc->vmm_lease_blocker == 1) {
1718 list_t *list = &sc->vmm_lease_list;
1719 vmm_lease_t *lease = list_head(list);
1720
1721 while (lease != NULL) {
1722 void *arg = lease->vml_expire_arg;
1723 boolean_t (*expiref)(void *) = lease->vml_expire_func;
1724 boolean_t sync_break = B_FALSE;
1725
1726 /*
1727 * Since the lease expiration notification may
1728 * need to take locks which would deadlock with
1729 * vmm_lease_lock, drop it across the call.
1730 *
1731 * We are the only one allowed to manipulate
1732 * vmm_lease_list right now, so it is safe to
1733 * continue iterating through it after
1734 * reacquiring the lock.
1735 */
1736 lease->vml_expired = B_TRUE;
1737 mutex_exit(&sc->vmm_lease_lock);
1738 sync_break = expiref(arg);
1739 mutex_enter(&sc->vmm_lease_lock);
1740
1741 if (sync_break) {
1742 vmm_lease_t *next;
1743
1744 /*
1745 * Leases which are broken synchronously
1746 * result in vmm_read_unlock() calls from a
1747 * different thread than the corresponding
1748 * vmm_read_lock(). This is acceptable, given
1749 * that the rwlock underpinning the whole
1750 * mechanism tolerates the behavior. This
1751 * flexibility is _only_ afforded to VM read
1752 * lock (RW_READER) holders.
1753 */
1754 next = list_next(list, lease);
1755 vmm_lease_break_locked(sc, lease);
1756 lease = next;
1757 } else {
1758 lease = list_next(list, lease);
1759 }
1760 }
1761
1762 /* Process leases which were not broken synchronously. */
1763 while (!list_is_empty(list)) {
1764 /*
1765 * Although the nested loops are quadratic, the number
1766 * of leases is small.
1767 */
1768 lease = list_head(list);
1769 while (lease != NULL) {
1770 vmm_lease_t *next = list_next(list, lease);
1771 if (lease->vml_break_deferred) {
1772 vmm_lease_break_locked(sc, lease);
1773 }
1774 lease = next;
1775 }
1776 if (list_is_empty(list)) {
1777 break;
1778 }
1779 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1780 }
1781 /* Wake anyone else waiting for the lease list to be empty */
1782 cv_broadcast(&sc->vmm_lease_cv);
1783 } else {
1784 list_t *list = &sc->vmm_lease_list;
1785
1786 /*
1787 * Some other thread beat us to the duty of lease cleanup.
1788 * Wait until that is complete.
1789 */
1790 while (!list_is_empty(list)) {
1791 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
1792 }
1793 }
1794 mutex_exit(&sc->vmm_lease_lock);
1795 }
1796
1797 static void
1798 vmm_lease_unblock(vmm_softc_t *sc)
1799 {
1800 mutex_enter(&sc->vmm_lease_lock);
1801 VERIFY3U(sc->vmm_lease_blocker, !=, 0);
1802 sc->vmm_lease_blocker--;
1803 if (sc->vmm_lease_blocker == 0) {
1804 cv_broadcast(&sc->vmm_lease_cv);
1805 }
1806 mutex_exit(&sc->vmm_lease_lock);
1807 }
1808
1809 void
1810 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
1811 {
1812 vmm_softc_t *sc = hold->vmh_sc;
1813
1814 VERIFY3P(hold, ==, lease->vml_hold);
1815 VERIFY(!lease->vml_break_deferred);
1816
1817 mutex_enter(&sc->vmm_lease_lock);
1818 if (sc->vmm_lease_blocker == 0) {
1819 vmm_lease_break_locked(sc, lease);
1820 } else {
1821 /*
1822 * Defer the lease-breaking to whichever thread is currently
1823 * cleaning up all leases as part of a vmm_lease_block() call.
1824 */
1825 lease->vml_break_deferred = B_TRUE;
1826 cv_broadcast(&sc->vmm_lease_cv);
1827 }
1828 mutex_exit(&sc->vmm_lease_lock);
1829 }
1830
1831 boolean_t
1832 vmm_drv_lease_expired(vmm_lease_t *lease)
1833 {
1834 return (lease->vml_expired);
1835 }
1836
1837 void *
1838 vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
1839 {
1840 vm_page_t *vmp;
1841 void *res = NULL;
1842
1843 ASSERT(lease != NULL);
1844 ASSERT3U(sz, ==, PAGESIZE);
1845 ASSERT0(gpa & PAGEOFFSET);
1846
1847 vmp = vmc_hold(lease->vml_vmclient, gpa, PROT_READ | PROT_WRITE);
1848 /*
1849 * Break the rules for now and just extract the pointer. This is
1850 * nominally safe, since holding a driver lease on the VM read-locks it.
1851 *
1852 * A pointer which would otherwise be at risk of being a use-after-free
1853 * vector is made safe since actions such as vmspace_unmap() require
1854 * acquisition of the VM write-lock, (causing all driver leases to be
1855 * broken) allowing the consumers to cease their access prior to
1856 * modification of the vmspace.
1857 */
1858 if (vmp != NULL) {
1859 res = vmp_get_writable(vmp);
1860 vmp_release(vmp);
1861 }
1862
1863 return (res);
1864 }
1865
1866 int
1867 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
1868 {
1869 ASSERT(lease != NULL);
1870
1871 return (lapic_intr_msi(lease->vml_vm, addr, msg));
1872 }
1873
1874 int
1875 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
1876 void *arg, void **cookie)
1877 {
1878 vmm_softc_t *sc;
1879 int err;
1880
1881 ASSERT(hold != NULL);
1882 ASSERT(cookie != NULL);
1883
1884 sc = hold->vmh_sc;
1885 mutex_enter(&vmm_mtx);
1886 /* Confirm that hook installation is not blocked */
1887 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
1888 mutex_exit(&vmm_mtx);
1889 return (EBUSY);
1890 }
1891 /*
1892 * Optimistically record an installed hook which will prevent a block
1893 * from being asserted while the mutex is dropped.
1894 */
1895 hold->vmh_ioport_hook_cnt++;
1896 mutex_exit(&vmm_mtx);
1897
1898 vmm_write_lock(sc);
1899 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
1900 arg, cookie);
1901 vmm_write_unlock(sc);
1902
1903 if (err != 0) {
1904 mutex_enter(&vmm_mtx);
1905 /* Walk back optimism about the hook installation */
1906 hold->vmh_ioport_hook_cnt--;
1907 mutex_exit(&vmm_mtx);
1908 }
1909 return (err);
1910 }
1911
1912 void
1913 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
1914 {
1915 vmm_softc_t *sc;
1916
1917 ASSERT(hold != NULL);
1918 ASSERT(cookie != NULL);
1919 ASSERT(hold->vmh_ioport_hook_cnt != 0);
1920
1921 sc = hold->vmh_sc;
1922 vmm_write_lock(sc);
1923 vm_ioport_unhook(sc->vmm_vm, cookie);
1924 vmm_write_unlock(sc);
1925
1926 mutex_enter(&vmm_mtx);
1927 hold->vmh_ioport_hook_cnt--;
1928 mutex_exit(&vmm_mtx);
1929 }
1930
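/*
 * Request that all driver holds on the instance be released, waiting
 * (interruptibly) for them to disappear before marking the instance purged.
 */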
1931 static int
1932 vmm_drv_purge(vmm_softc_t *sc)
1933 {
1934 ASSERT(MUTEX_HELD(&vmm_mtx));
1935
1936 if ((sc->vmm_flags & VMM_HELD) != 0) {
1937 vmm_hold_t *hold;
1938
1939 sc->vmm_flags |= VMM_CLEANUP;
1940 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1941 hold = list_next(&sc->vmm_holds, hold)) {
1942 hold->vmh_release_req = B_TRUE;
1943 }
1944 while ((sc->vmm_flags & VMM_HELD) != 0) {
1945 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
1946 return (EINTR);
1947 }
1948 }
1949 sc->vmm_flags &= ~VMM_CLEANUP;
1950 }
1951
1952 VERIFY(list_is_empty(&sc->vmm_holds));
1953 sc->vmm_flags |= VMM_PURGED;
1954 return (0);
1955 }
1956
1957 static int
1958 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
1959 {
1960 int err = 0;
1961
1962 mutex_enter(&vmm_mtx);
1963 if (!enable_block) {
1964 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
1965
1966 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
1967 goto done;
1968 }
1969
1970 /* If any holds have hooks installed, the block is a failure */
1971 if (!list_is_empty(&sc->vmm_holds)) {
1972 vmm_hold_t *hold;
1973
1974 for (hold = list_head(&sc->vmm_holds); hold != NULL;
1975 hold = list_next(&sc->vmm_holds, hold)) {
1976 if (hold->vmh_ioport_hook_cnt != 0) {
1977 err = EBUSY;
1978 goto done;
1979 }
1980 }
1981 }
1982 sc->vmm_flags |= VMM_BLOCK_HOOK;
1983
1984 done:
1985 mutex_exit(&vmm_mtx);
1986 return (err);
1987 }
1988
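/*
 * Destroy a VM instance with vmm_mtx held.  If the device is still open, the
 * final teardown is deferred by placing the instance on vmm_destroy_list;
 * otherwise it is freed immediately and *hma_release is set so the caller can
 * drop the HMA registration reference.
 */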
1989 static int
1990 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
1991 boolean_t *hma_release)
1992 {
1993 dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
1994 minor_t minor;
1995
1996 ASSERT(MUTEX_HELD(&vmm_mtx));
1997
1998 *hma_release = B_FALSE;
1999
2000 if (vmm_drv_purge(sc) != 0) {
2001 return (EINTR);
2002 }
2003
2004 if (clean_zsd) {
2005 vmm_zsd_rem_vm(sc);
2006 }
2007
2008 /* Clean up devmem entries */
2009 vmmdev_devmem_purge(sc);
2010
2011 list_remove(&vmm_list, sc);
2012 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2013 minor = sc->vmm_minor;
2014 zone_rele(sc->vmm_zone);
2015 if (sc->vmm_is_open) {
2016 list_insert_tail(&vmm_destroy_list, sc);
2017 sc->vmm_flags |= VMM_DESTROY;
2018 } else {
2019 vmm_kstat_fini(sc);
2020 vm_destroy(sc->vmm_vm);
2021 ddi_soft_state_free(vmm_statep, minor);
2022 id_free(vmm_minors, minor);
2023 *hma_release = B_TRUE;
2024 }
2025 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
2026
2027 return (0);
2028 }
2029
2030 int
2031 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
2032 {
2033 boolean_t hma_release = B_FALSE;
2034 int err;
2035
2036 mutex_enter(&vmm_mtx);
2037 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
2038 mutex_exit(&vmm_mtx);
2039
2040 if (hma_release)
2041 vmm_hma_release();
2042
2043 return (err);
2044 }
2045
2046 /* ARGSUSED */
2047 static int
2048 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2049 {
2050 boolean_t hma_release = B_FALSE;
2051 vmm_softc_t *sc;
2052 int err;
2053
2054 if (crgetuid(cr) != 0)
2055 return (EPERM);
2056
2057 mutex_enter(&vmm_mtx);
2058
2059 if ((sc = vmm_lookup(req->name)) == NULL) {
2060 mutex_exit(&vmm_mtx);
2061 return (ENOENT);
2062 }
2063 /*
2064 * We don't check this in vmm_lookup() since that function is also used
2065 * for validation during create and currently vmm names must be unique.
2066 */
2067 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2068 mutex_exit(&vmm_mtx);
2069 return (EPERM);
2070 }
2071 err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);
2072
2073 mutex_exit(&vmm_mtx);
2074
2075 if (hma_release)
2076 vmm_hma_release();
2077
2078 return (err);
2079 }
2080
2081 #define VCPU_NAME_BUFLEN 32
2082
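/*
 * Allocate the per-VM and per-vCPU kstats for an instance, making them
 * visible from the global zone when the VM belongs to a non-global zone.
 */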
2083 static int
2084 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2085 {
2086 zoneid_t zid = crgetzoneid(cr);
2087 int instance = minor;
2088 kstat_t *ksp;
2089
2090 ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2091
2092 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2093 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2094 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2095
2096 if (ksp == NULL) {
2097 return (-1);
2098 }
2099 sc->vmm_kstat_vm = ksp;
2100
2101 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2102 char namebuf[VCPU_NAME_BUFLEN];
2103
2104 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2105
2106 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2107 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2108 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2109 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2110 0, zid);
2111 if (ksp == NULL) {
2112 goto fail;
2113 }
2114
2115 sc->vmm_kstat_vcpu[i] = ksp;
2116 }
2117
2118 /*
2119 * If this instance is associated with a non-global zone, make its
2120 * kstats visible from the GZ.
2121 */
2122 if (zid != GLOBAL_ZONEID) {
2123 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2124 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2125 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2126 }
2127 }
2128
2129 return (0);
2130
2131 fail:
2132 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2133 if (sc->vmm_kstat_vcpu[i] != NULL) {
2134 kstat_delete(sc->vmm_kstat_vcpu[i]);
2135 sc->vmm_kstat_vcpu[i] = NULL;
2136 } else {
2137 break;
2138 }
2139 }
2140 kstat_delete(sc->vmm_kstat_vm);
2141 sc->vmm_kstat_vm = NULL;
2142 return (-1);
2143 }
2144
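/*
 * Fill in and install the kstats allocated by vmm_kstat_alloc() once the VM
 * exists, pointing each vCPU kstat at vmm_kstat_update_vcpu() for updates.
 */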
2145 static void
2146 vmm_kstat_init(vmm_softc_t *sc)
2147 {
2148 kstat_t *ksp;
2149
2150 ASSERT3P(sc->vmm_vm, !=, NULL);
2151 ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2152
2153 ksp = sc->vmm_kstat_vm;
2154 vmm_kstats_t *vk = ksp->ks_data;
2155 ksp->ks_private = sc->vmm_vm;
2156 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2157 kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2158
2159 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2160 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2161
2162 ksp = sc->vmm_kstat_vcpu[i];
2163 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2164
2165 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2166 vvk->vvk_vcpu.value.ui32 = i;
2167 kstat_named_init(&vvk->vvk_time_init, "time_init",
2168 KSTAT_DATA_UINT64);
2169 kstat_named_init(&vvk->vvk_time_run, "time_run",
2170 KSTAT_DATA_UINT64);
2171 kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2172 KSTAT_DATA_UINT64);
2173 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2174 KSTAT_DATA_UINT64);
2175 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2176 KSTAT_DATA_UINT64);
2177 kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2178 KSTAT_DATA_UINT64);
2179 ksp->ks_private = sc->vmm_vm;
2180 ksp->ks_update = vmm_kstat_update_vcpu;
2181 }
2182
2183 kstat_install(sc->vmm_kstat_vm);
2184 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2185 kstat_install(sc->vmm_kstat_vcpu[i]);
2186 }
2187 }
2188
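/* Delete the per-VM and per-vCPU kstats created by vmm_kstat_alloc(). */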
2189 static void
2190 vmm_kstat_fini(vmm_softc_t *sc)
2191 {
2192 ASSERT(sc->vmm_kstat_vm != NULL);
2193
2194 kstat_delete(sc->vmm_kstat_vm);
2195 sc->vmm_kstat_vm = NULL;
2196
2197 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2198 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2199
2200 kstat_delete(sc->vmm_kstat_vcpu[i]);
2201 sc->vmm_kstat_vcpu[i] = NULL;
2202 }
2203 }
2204
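/*
 * open(9E) entry point: the control minor must be opened exclusively, while a
 * VM minor is marked open so that any destroy issued while the device is held
 * can be deferred until vmm_close().
 */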
2205 static int
2206 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2207 {
2208 minor_t minor;
2209 vmm_softc_t *sc;
2210
2211 /*
2212 * Forbid running bhyve in a 32-bit process until it has been tested and
2213 * verified to be safe.
2214 */
2215 if (curproc->p_model != DATAMODEL_LP64) {
2216 return (EFBIG);
2217 }
2218
2219 minor = getminor(*devp);
2220 if (minor == VMM_CTL_MINOR) {
2221 /*
2222 * Master control device must be opened exclusively.
2223 */
2224 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2225 return (EINVAL);
2226 }
2227
2228 return (0);
2229 }
2230
2231 mutex_enter(&vmm_mtx);
2232 sc = ddi_get_soft_state(vmm_statep, minor);
2233 if (sc == NULL) {
2234 mutex_exit(&vmm_mtx);
2235 return (ENXIO);
2236 }
2237
2238 sc->vmm_is_open = B_TRUE;
2239 mutex_exit(&vmm_mtx);
2240
2241 return (0);
2242 }
2243
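/*
 * close(9E) entry point: clear the open state and, if the VM was destroyed
 * while its device was held (VMM_DESTROY), finish the teardown and release
 * the HMA reference.
 */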
2244 static int
2245 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2246 {
2247 minor_t minor;
2248 vmm_softc_t *sc;
2249 boolean_t hma_release = B_FALSE;
2250
2251 minor = getminor(dev);
2252 if (minor == VMM_CTL_MINOR)
2253 return (0);
2254
2255 mutex_enter(&vmm_mtx);
2256 sc = ddi_get_soft_state(vmm_statep, minor);
2257 if (sc == NULL) {
2258 mutex_exit(&vmm_mtx);
2259 return (ENXIO);
2260 }
2261
2262 VERIFY(sc->vmm_is_open);
2263 sc->vmm_is_open = B_FALSE;
2264
2265 /*
2266 * If this VM was destroyed while the vmm device was open, then
2267 * clean it up now that it is closed.
2268 */
2269 if (sc->vmm_flags & VMM_DESTROY) {
2270 list_remove(&vmm_destroy_list, sc);
2271 vmm_kstat_fini(sc);
2272 vm_destroy(sc->vmm_vm);
2273 ddi_soft_state_free(vmm_statep, minor);
2274 id_free(vmm_minors, minor);
2275 hma_release = B_TRUE;
2276 }
2277 mutex_exit(&vmm_mtx);
2278
2279 if (hma_release)
2280 vmm_hma_release();
2281
2282 return (0);
2283 }
2284
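/*
 * Service the VMM_VM_SUPPORTED ioctl: report whether this CPU can host bhyve
 * guests and, on failure, optionally copy an explanatory message out to the
 * caller-supplied buffer.
 */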
2285 static int
2286 vmm_is_supported(intptr_t arg)
2287 {
2288 int r;
2289 const char *msg;
2290
2291 if (vmm_is_intel()) {
2292 r = vmx_x86_supported(&msg);
2293 } else if (vmm_is_svm()) {
2294 /*
2295 * HMA already ensured that the features necessary for SVM
2296 * operation were present and online during vmm_attach().
2297 */
2298 r = 0;
2299 } else {
2300 r = ENXIO;
2301 msg = "Unsupported CPU vendor";
2302 }
2303
2304 if (r != 0 && arg != (intptr_t)NULL) {
2305 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2306 return (EFAULT);
2307 }
2308 return (r);
2309 }
2310
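/*
 * Handle ioctls issued against the control minor: VM create/destroy (both
 * requiring FWRITE), the capability query, and the memory reservoir commands,
 * which are passed through to vmmr_ioctl().
 */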
2311 static int
2312 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2313 {
2314 void *argp = (void *)arg;
2315
2316 switch (cmd) {
2317 case VMM_CREATE_VM: {
2318 struct vm_create_req req;
2319
2320 if ((md & FWRITE) == 0) {
2321 return (EPERM);
2322 }
2323 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2324 return (EFAULT);
2325 }
2326 return (vmmdev_do_vm_create(&req, cr));
2327 }
2328 case VMM_DESTROY_VM: {
2329 struct vm_destroy_req req;
2330
2331 if ((md & FWRITE) == 0) {
2332 return (EPERM);
2333 }
2334 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2335 return (EFAULT);
2336 }
2337 return (vmmdev_do_vm_destroy(&req, cr));
2338 }
2339 case VMM_VM_SUPPORTED:
2340 return (vmm_is_supported(arg));
2341 case VMM_RESV_QUERY:
2342 case VMM_RESV_ADD:
2343 case VMM_RESV_REMOVE:
2344 return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2345 default:
2346 break;
2347 }
2348 /* No other actions are legal on ctl device */
2349 return (ENOTTY);
2350 }
2351
2352 static int
2353 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2354 int *rvalp)
2355 {
2356 vmm_softc_t *sc;
2357 minor_t minor;
2358
2359 /*
2360 * Forbid running bhyve in a 32-bit process until it has been tested and
2361 * verified to be safe.
2362 */
2363 if (curproc->p_model != DATAMODEL_LP64) {
2364 return (EFBIG);
2365 }
2366
2367 /* The structs in bhyve ioctls assume a 64-bit datamodel */
2368 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2369 return (ENOTSUP);
2370 }
2371
2372 minor = getminor(dev);
2373
2374 if (minor == VMM_CTL_MINOR) {
2375 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
2376 }
2377
2378 sc = ddi_get_soft_state(vmm_statep, minor);
2379 ASSERT(sc);
2380
2381 if (sc->vmm_flags & VMM_DESTROY)
2382 return (ENXIO);
2383
2384 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2385 }
2386
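/*
 * segmap(9E) entry point.  Offsets at or above VM_DEVMEM_START map a devmem
 * segment, while lower offsets map guest-physical address space; the VM read
 * lock is held across the mapping so the memory map cannot change underneath.
 */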
2387 static int
2388 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2389 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2390 {
2391 vmm_softc_t *sc;
2392 const minor_t minor = getminor(dev);
2393 int err;
2394
2395 if (minor == VMM_CTL_MINOR) {
2396 return (ENODEV);
2397 }
2398 if (off < 0 || (off + len) <= 0) {
2399 return (EINVAL);
2400 }
2401 if ((prot & PROT_USER) == 0) {
2402 return (EACCES);
2403 }
2404
2405 sc = ddi_get_soft_state(vmm_statep, minor);
2406 ASSERT(sc);
2407
2408 if (sc->vmm_flags & VMM_DESTROY)
2409 return (ENXIO);
2410
2411 /* Grab read lock on the VM to prevent any changes to the memory map */
2412 vmm_read_lock(sc);
2413
2414 if (off >= VM_DEVMEM_START) {
2415 int segid;
2416 off_t segoff;
2417
2418 /* Mapping a devmem "device" */
2419 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
2420 err = ENODEV;
2421 } else {
2422 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
2423 addrp, prot, maxprot, flags);
2424 }
2425 } else {
2426 /* Mapping a part of the guest physical space */
2427 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
2428 maxprot, flags);
2429 }
2430
2431 vmm_read_unlock(sc);
2432 return (err);
2433 }
2434
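/*
 * sdev plugin callback: verify that an existing /dev/vmm node still refers to
 * a live VM instance with a matching minor number.
 */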
2435 static sdev_plugin_validate_t
2436 vmm_sdev_validate(sdev_ctx_t ctx)
2437 {
2438 const char *name = sdev_ctx_name(ctx);
2439 vmm_softc_t *sc;
2440 sdev_plugin_validate_t ret;
2441 minor_t minor;
2442
2443 if (sdev_ctx_vtype(ctx) != VCHR)
2444 return (SDEV_VTOR_INVALID);
2445
2446 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2447
2448 mutex_enter(&vmm_mtx);
2449 if ((sc = vmm_lookup(name)) == NULL)
2450 ret = SDEV_VTOR_INVALID;
2451 else if (sc->vmm_minor != minor)
2452 ret = SDEV_VTOR_STALE;
2453 else
2454 ret = SDEV_VTOR_VALID;
2455 mutex_exit(&vmm_mtx);
2456
2457 return (ret);
2458 }
2459
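/*
 * sdev plugin callback: populate /dev/vmm with a node for every VM instance
 * visible from the caller's zone.
 */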
2460 static int
2461 vmm_sdev_filldir(sdev_ctx_t ctx)
2462 {
2463 vmm_softc_t *sc;
2464 int ret;
2465
2466 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2467 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2468 sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2469 return (EINVAL);
2470 }
2471
2472 mutex_enter(&vmm_mtx);
2473 ASSERT(vmmdev_dip != NULL);
2474 for (sc = list_head(&vmm_list); sc != NULL;
2475 sc = list_next(&vmm_list, sc)) {
2476 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2477 ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2478 S_IFCHR | 0600,
2479 makedevice(ddi_driver_major(vmmdev_dip),
2480 sc->vmm_minor));
2481 } else {
2482 continue;
2483 }
2484 if (ret != 0 && ret != EEXIST)
2485 goto out;
2486 }
2487
2488 ret = 0;
2489
2490 out:
2491 mutex_exit(&vmm_mtx);
2492 return (ret);
2493 }
2494
2495 /* ARGSUSED */
2496 static void
2497 vmm_sdev_inactive(sdev_ctx_t ctx)
2498 {
2499 }
2500
2501 static sdev_plugin_ops_t vmm_sdev_ops = {
2502 .spo_version = SDEV_PLUGIN_VERSION,
2503 .spo_flags = SDEV_PLUGIN_SUBDIR,
2504 .spo_validate = vmm_sdev_validate,
2505 .spo_filldir = vmm_sdev_filldir,
2506 .spo_inactive = vmm_sdev_inactive
2507 };
2508
2509 /* ARGSUSED */
2510 static int
2511 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
2512 {
2513 int error;
2514
2515 switch (cmd) {
2516 case DDI_INFO_DEVT2DEVINFO:
2517 *result = (void *)vmmdev_dip;
2518 error = DDI_SUCCESS;
2519 break;
2520 case DDI_INFO_DEVT2INSTANCE:
2521 *result = (void *)0;
2522 error = DDI_SUCCESS;
2523 break;
2524 default:
2525 error = DDI_FAILURE;
2526 break;
2527 }
2528 return (error);
2529 }
2530
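/*
 * attach(9E): single-instance attach which probes virtualization support via
 * a temporary HMA registration, creates the "ctl" minor node, and registers
 * the sdev plugin serving /dev/vmm.
 */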
2531 static int
2532 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2533 {
2534 sdev_plugin_hdl_t sph;
2535 hma_reg_t *reg = NULL;
2536 boolean_t vmm_loaded = B_FALSE;
2537
2538 if (cmd != DDI_ATTACH) {
2539 return (DDI_FAILURE);
2540 }
2541
2542 mutex_enter(&vmmdev_mtx);
2543 /* Ensure we are not already attached. */
2544 if (vmmdev_dip != NULL) {
2545 mutex_exit(&vmmdev_mtx);
2546 return (DDI_FAILURE);
2547 }
2548
2549 vmm_sol_glue_init();
2550
2551 /*
2552 * Perform temporary HMA registration to determine if the system
2553 * is capable of hosting bhyve guests.
2554 */
2555 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
2556 goto fail;
2557 } else if (vmm_mod_load() != 0) {
2558 goto fail;
2559 }
2560 vmm_loaded = B_TRUE;
2561 hma_unregister(reg);
2562 reg = NULL;
2563
2564 /* Create control node. Other nodes will be created on demand. */
2565 if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
2566 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
2567 goto fail;
2568 }
2569
2570 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
2571 if (sph == (sdev_plugin_hdl_t)NULL) {
2572 ddi_remove_minor_node(dip, NULL);
2573 goto fail;
2574 }
2575
2576 ddi_report_dev(dip);
2577 vmmdev_sdev_hdl = sph;
2578 vmmdev_dip = dip;
2579 mutex_exit(&vmmdev_mtx);
2580 return (DDI_SUCCESS);
2581
2582 fail:
2583 if (vmm_loaded) {
2584 VERIFY0(vmm_mod_unload());
2585 }
2586 if (reg != NULL) {
2587 hma_unregister(reg);
2588 }
2589 vmm_sol_glue_cleanup();
2590 mutex_exit(&vmmdev_mtx);
2591 return (DDI_FAILURE);
2592 }
2593
2594 static int
2595 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2596 {
2597 if (cmd != DDI_DETACH) {
2598 return (DDI_FAILURE);
2599 }
2600
2601 /*
2602 * Ensure that all resources have been cleaned up.
2603 *
2604 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
2605 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
2606 * devinfo locked as iommu_cleanup() tries to recursively lock each
2607 * devinfo, including our own, while holding vmmdev_mtx.
2608 */
2609 if (mutex_tryenter(&vmmdev_mtx) == 0)
2610 return (DDI_FAILURE);
2611
2612 mutex_enter(&vmm_mtx);
2613 if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
2614 mutex_exit(&vmm_mtx);
2615 mutex_exit(&vmmdev_mtx);
2616 return (DDI_FAILURE);
2617 }
2618 mutex_exit(&vmm_mtx);
2619
2620 if (!vmmr_is_empty()) {
2621 mutex_exit(&vmmdev_mtx);
2622 return (DDI_FAILURE);
2623 }
2624
2625 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
2626 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
2627 mutex_exit(&vmmdev_mtx);
2628 return (DDI_FAILURE);
2629 }
2630 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
2631
2632 /* Remove the control node. */
2633 ddi_remove_minor_node(dip, "ctl");
2634 vmmdev_dip = NULL;
2635
2636 VERIFY0(vmm_mod_unload());
2637 VERIFY3U(vmmdev_hma_reg, ==, NULL);
2638 vmm_sol_glue_cleanup();
2639
2640 mutex_exit(&vmmdev_mtx);
2641
2642 return (DDI_SUCCESS);
2643 }
2644
2645 static struct cb_ops vmm_cb_ops = {
2646 vmm_open,
2647 vmm_close,
2648 nodev, /* strategy */
2649 nodev, /* print */
2650 nodev, /* dump */
2651 nodev, /* read */
2652 nodev, /* write */
2653 vmm_ioctl,
2654 nodev, /* devmap */
2655 nodev, /* mmap */
2656 vmm_segmap,
2657 nochpoll, /* poll */
2658 ddi_prop_op,
2659 NULL,
2660 D_NEW | D_MP | D_DEVMAP
2661 };
2662
2663 static struct dev_ops vmm_ops = {
2664 DEVO_REV,
2665 0,
2666 vmm_info,
2667 nulldev, /* identify */
2668 nulldev, /* probe */
2669 vmm_attach,
2670 vmm_detach,
2671 nodev, /* reset */
2672 &vmm_cb_ops,
2673 (struct bus_ops *)NULL
2674 };
2675
2676 static struct modldrv modldrv = {
2677 &mod_driverops,
2678 "bhyve vmm",
2679 &vmm_ops
2680 };
2681
2682 static struct modlinkage modlinkage = {
2683 MODREV_1,
2684 &modldrv,
2685 NULL
2686 };
2687
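/*
 * Module linkage: set up driver-wide locks, VM lists, the minor-number space,
 * soft state, zone-specific data, and the memory reservoir before installing
 * the module.
 */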
2688 int
2689 _init(void)
2690 {
2691 int error;
2692
2693 sysinit();
2694
2695 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
2696 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
2697 list_create(&vmm_list, sizeof (vmm_softc_t),
2698 offsetof(vmm_softc_t, vmm_node));
2699 list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
2700 offsetof(vmm_softc_t, vmm_node));
2701 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
2702
2703 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
2704 if (error) {
2705 return (error);
2706 }
2707
2708 vmm_zsd_init();
2709 vmmr_init();
2710
2711 error = mod_install(&modlinkage);
2712 if (error) {
2713 ddi_soft_state_fini(&vmm_statep);
2714 vmm_zsd_fini();
2715 vmmr_fini();
2716 }
2717
2718 return (error);
2719 }
2720
2721 int
2722 _fini(void)
2723 {
2724 int error;
2725
2726 error = mod_remove(&modlinkage);
2727 if (error) {
2728 return (error);
2729 }
2730
2731 vmm_zsd_fini();
2732 vmmr_fini();
2733
2734 ddi_soft_state_fini(&vmm_statep);
2735
2736 return (0);
2737 }
2738
2739 int
2740 _info(struct modinfo *modinfop)
2741 {
2742 return (mod_info(&modlinkage, modinfop));
2743 }
2744