1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ 12 13 /* 14 * Copyright 2015 Pluribus Networks Inc. 15 * Copyright 2019 Joyent, Inc. 16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 17 * Copyright 2022 Oxide Computer Company 18 */ 19 20 #include <sys/types.h> 21 #include <sys/conf.h> 22 #include <sys/cpuvar.h> 23 #include <sys/ioccom.h> 24 #include <sys/stat.h> 25 #include <sys/vmsystm.h> 26 #include <sys/ddi.h> 27 #include <sys/mkdev.h> 28 #include <sys/sunddi.h> 29 #include <sys/fs/dv_node.h> 30 #include <sys/cpuset.h> 31 #include <sys/id_space.h> 32 #include <sys/fs/sdev_plugin.h> 33 #include <sys/smt.h> 34 #include <sys/kstat.h> 35 36 #include <sys/kernel.h> 37 #include <sys/hma.h> 38 #include <sys/x86_archext.h> 39 #include <x86/apicreg.h> 40 41 #include <sys/vmm.h> 42 #include <sys/vmm_kernel.h> 43 #include <sys/vmm_instruction_emul.h> 44 #include <sys/vmm_dev.h> 45 #include <sys/vmm_impl.h> 46 #include <sys/vmm_drv.h> 47 #include <sys/vmm_vm.h> 48 #include <sys/vmm_reservoir.h> 49 50 #include <vm/seg_dev.h> 51 52 #include "io/ppt.h" 53 #include "io/vatpic.h" 54 #include "io/vioapic.h" 55 #include "io/vrtc.h" 56 #include "io/vhpet.h" 57 #include "io/vpmtmr.h" 58 #include "vmm_lapic.h" 59 #include "vmm_stat.h" 60 #include "vmm_util.h" 61 62 /* 63 * Locking details: 64 * 65 * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is 66 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data 67 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire 68 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to 69 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration. 
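 *
 * A minimal sketch of a path needing both locks (an illustrative, hypothetical
 * helper; no such function exists in this file) would therefore be:
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	... operate on driver-wide and per-instance state ...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 *
 * vmm_drv_hold() below follows this order when it moves from the driver-wide
 * lock to the instance-list lock.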
70 */ 71 72 static kmutex_t vmmdev_mtx; 73 static dev_info_t *vmmdev_dip; 74 static hma_reg_t *vmmdev_hma_reg; 75 static uint_t vmmdev_hma_ref; 76 static sdev_plugin_hdl_t vmmdev_sdev_hdl; 77 78 static kmutex_t vmm_mtx; 79 static list_t vmm_list; 80 static list_t vmm_destroy_list; 81 static id_space_t *vmm_minors; 82 static void *vmm_statep; 83 84 /* temporary safety switch */ 85 int vmm_allow_state_writes; 86 87 static const char *vmmdev_hvm_name = "bhyve"; 88 89 /* For sdev plugin (/dev) */ 90 #define VMM_SDEV_ROOT "/dev/vmm" 91 92 /* From uts/intel/io/vmm/intel/vmx.c */ 93 extern int vmx_x86_supported(const char **); 94 95 /* Holds and hooks from drivers external to vmm */ 96 struct vmm_hold { 97 list_node_t vmh_node; 98 vmm_softc_t *vmh_sc; 99 boolean_t vmh_release_req; 100 uint_t vmh_ioport_hook_cnt; 101 }; 102 103 struct vmm_lease { 104 list_node_t vml_node; 105 struct vm *vml_vm; 106 vm_client_t *vml_vmclient; 107 boolean_t vml_expired; 108 boolean_t vml_break_deferred; 109 boolean_t (*vml_expire_func)(void *); 110 void *vml_expire_arg; 111 struct vmm_hold *vml_hold; 112 }; 113 114 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); 115 static void vmm_lease_block(vmm_softc_t *); 116 static void vmm_lease_unblock(vmm_softc_t *); 117 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *); 118 static void vmm_kstat_init(vmm_softc_t *); 119 static void vmm_kstat_fini(vmm_softc_t *); 120 121 /* 122 * The 'devmem' hack: 123 * 124 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments 125 * in the vm which appear with their own name related to the vm under /dev. 126 * Since this would be a hassle from an sdev perspective and would require a 127 * new cdev interface (or complicate the existing one), we choose to implement 128 * this in a different manner. Direct access to the underlying vm memory 129 * segments is exposed by placing them in a range of offsets beyond the normal 130 * guest memory space. Userspace can query the appropriate offset to mmap() 131 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl. 132 */ 133 134 static vmm_devmem_entry_t * 135 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 136 { 137 vmm_devmem_entry_t *ent = NULL; 138 list_t *dl = &sc->vmm_devmem_list; 139 140 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 141 if (ent->vde_segid == segid) { 142 return (ent); 143 } 144 } 145 return (NULL); 146 } 147 148 static int 149 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 150 { 151 int error; 152 bool sysmem; 153 154 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 155 NULL); 156 if (error || mseg->len == 0) 157 return (error); 158 159 if (!sysmem) { 160 vmm_devmem_entry_t *de; 161 162 de = vmmdev_devmem_find(sc, mseg->segid); 163 if (de != NULL) { 164 (void) strlcpy(mseg->name, de->vde_name, 165 sizeof (mseg->name)); 166 } 167 } else { 168 bzero(mseg->name, sizeof (mseg->name)); 169 } 170 171 return (error); 172 } 173 174 static int 175 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 176 { 177 off_t map_offset; 178 vmm_devmem_entry_t *entry; 179 180 if (list_is_empty(&sc->vmm_devmem_list)) { 181 map_offset = VM_DEVMEM_START; 182 } else { 183 entry = list_tail(&sc->vmm_devmem_list); 184 map_offset = entry->vde_off + entry->vde_len; 185 if (map_offset < entry->vde_off) { 186 /* Do not tolerate overflow */ 187 return (ERANGE); 188 } 189 /* 190 * XXXJOY: We could choose to search the list for duplicate 191 * names and toss an error. 
Since we're using the offset 192 * method for now, it does not make much of a difference. 193 */ 194 } 195 196 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 197 entry->vde_segid = mseg->segid; 198 entry->vde_len = mseg->len; 199 entry->vde_off = map_offset; 200 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 201 list_insert_tail(&sc->vmm_devmem_list, entry); 202 203 return (0); 204 } 205 206 static boolean_t 207 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 208 off_t *map_offp) 209 { 210 list_t *dl = &sc->vmm_devmem_list; 211 vmm_devmem_entry_t *de = NULL; 212 const off_t map_end = off + len; 213 214 VERIFY(off >= VM_DEVMEM_START); 215 216 if (map_end < off) { 217 /* No match on overflow */ 218 return (B_FALSE); 219 } 220 221 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 222 const off_t item_end = de->vde_off + de->vde_len; 223 224 if (de->vde_off <= off && item_end >= map_end) { 225 *segidp = de->vde_segid; 226 *map_offp = off - de->vde_off; 227 return (B_TRUE); 228 } 229 } 230 return (B_FALSE); 231 } 232 233 static void 234 vmmdev_devmem_purge(vmm_softc_t *sc) 235 { 236 vmm_devmem_entry_t *entry; 237 238 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 239 kmem_free(entry, sizeof (*entry)); 240 } 241 } 242 243 static int 244 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 245 { 246 int error; 247 bool sysmem = true; 248 249 if (VM_MEMSEG_NAME(mseg)) { 250 sysmem = false; 251 } 252 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 253 254 if (error == 0) { 255 /* 256 * Rather than create a whole fresh device from which userspace 257 * can mmap this segment, instead make it available at an 258 * offset above where the main guest memory resides. 259 */ 260 error = vmmdev_devmem_create(sc, mseg, mseg->name); 261 if (error != 0) { 262 vm_free_memseg(sc->vmm_vm, mseg->segid); 263 } 264 } 265 return (error); 266 } 267 268 /* 269 * Resource Locking and Exclusion 270 * 271 * Much of bhyve depends on key portions of VM state, such as the guest memory 272 * map, to remain unchanged while the guest is running. As ported from 273 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 274 * access to the instance vCPUs. Threads acting on a single vCPU, like those 275 * performing the work of actually running the guest in VMX/SVM, would lock 276 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 277 * state, all of the vCPUs would be first locked, ensuring that the 278 * operation(s) could complete without any other threads stumbling into 279 * intermediate states. 280 * 281 * This approach is largely effective for bhyve. Common operations, such as 282 * running the vCPUs, steer clear of lock contention. The model begins to 283 * break down for operations which do not occur in the context of a specific 284 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 285 * thread in the bhyve process. In order to properly protect those vCPU-less 286 * operations from encountering invalid states, additional locking is required. 287 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 288 * It does mean that class of operations will be serialized on locking the 289 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 290 * undue contention on the VM_MAXCPU-1 vCPU. 291 * 292 * In order to address the shortcomings of this model, the concept of a 293 * read/write lock has been added to bhyve. 
Operations which change 294 * fundamental aspects of a VM (such as the memory map) must acquire the write 295 * lock, which also implies locking all of the vCPUs and waiting for all read 296 * lock holders to release. While it increases the cost and waiting time for 297 * those few operations, it allows most hot-path operations on the VM (which 298 * depend on its configuration remaining stable) to occur with minimal locking. 299 * 300 * Consumers of the Driver API (see below) are a special case when it comes to 301 * this locking, since they may hold a read lock via the drv_lease mechanism 302 * for an extended period of time. Rather than forcing those consumers to 303 * continuously poll for a write lock attempt, the lease system forces them to 304 * provide a release callback to trigger their clean-up (and potential later 305 * reacquisition) of the read lock. 306 */ 307 308 static void 309 vcpu_lock_one(vmm_softc_t *sc, int vcpu) 310 { 311 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 312 313 /* 314 * Since this state transition is utilizing from_idle=true, it should 315 * not fail, but rather block until it can be successful. 316 */ 317 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 318 } 319 320 static void 321 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 322 { 323 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 324 325 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 326 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 327 } 328 329 static void 330 vmm_read_lock(vmm_softc_t *sc) 331 { 332 rw_enter(&sc->vmm_rwlock, RW_READER); 333 } 334 335 static void 336 vmm_read_unlock(vmm_softc_t *sc) 337 { 338 rw_exit(&sc->vmm_rwlock); 339 } 340 341 static void 342 vmm_write_lock(vmm_softc_t *sc) 343 { 344 int maxcpus; 345 346 /* First lock all the vCPUs */ 347 maxcpus = vm_get_maxcpus(sc->vmm_vm); 348 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 349 vcpu_lock_one(sc, vcpu); 350 } 351 352 /* 353 * Block vmm_drv leases from being acquired or held while the VM write 354 * lock is held. 355 */ 356 vmm_lease_block(sc); 357 358 rw_enter(&sc->vmm_rwlock, RW_WRITER); 359 /* 360 * For now, the 'maxcpus' value for an instance is fixed at the 361 * compile-time constant of VM_MAXCPU at creation. If this changes in 362 * the future, allowing for dynamic vCPU resource sizing, acquisition 363 * of the write lock will need to be wary of such changes. 364 */ 365 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 366 } 367 368 static void 369 vmm_write_unlock(vmm_softc_t *sc) 370 { 371 int maxcpus; 372 373 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 374 vmm_lease_unblock(sc); 375 376 /* 377 * The VM write lock _must_ be released from the same thread it was 378 * acquired in, unlike the read lock. 379 */ 380 VERIFY(rw_write_held(&sc->vmm_rwlock)); 381 rw_exit(&sc->vmm_rwlock); 382 383 /* Unlock all the vCPUs */ 384 maxcpus = vm_get_maxcpus(sc->vmm_vm); 385 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 386 vcpu_unlock_one(sc, vcpu); 387 } 388 } 389 390 static int 391 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 392 cred_t *credp, int *rvalp) 393 { 394 int error = 0, vcpu = -1; 395 void *datap = (void *)arg; 396 enum vm_lock_type { 397 LOCK_NONE = 0, 398 LOCK_VCPU, 399 LOCK_READ_HOLD, 400 LOCK_WRITE_HOLD 401 } lock_type = LOCK_NONE; 402 403 /* Acquire any exclusion resources needed for the operation. 
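 * As the cases below show, per-vCPU operations (VM_RUN, VM_GET_REGISTER,
 * etc.) freeze only the targeted vCPU (LOCK_VCPU), operations which read
 * VM-wide state (VM_GET_MEMSEG, VM_LAPIC_MSI, etc.) take the read lock
 * (LOCK_READ_HOLD), and operations which alter the VM configuration
 * (VM_ALLOC_MEMSEG, VM_MMAP_MEMSEG, etc.) take the write lock
 * (LOCK_WRITE_HOLD).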
*/ 404 switch (cmd) { 405 case VM_RUN: 406 case VM_GET_REGISTER: 407 case VM_SET_REGISTER: 408 case VM_GET_SEGMENT_DESCRIPTOR: 409 case VM_SET_SEGMENT_DESCRIPTOR: 410 case VM_GET_REGISTER_SET: 411 case VM_SET_REGISTER_SET: 412 case VM_INJECT_EXCEPTION: 413 case VM_GET_CAPABILITY: 414 case VM_SET_CAPABILITY: 415 case VM_PPTDEV_MSI: 416 case VM_PPTDEV_MSIX: 417 case VM_SET_X2APIC_STATE: 418 case VM_GLA2GPA: 419 case VM_GLA2GPA_NOFAULT: 420 case VM_ACTIVATE_CPU: 421 case VM_SET_INTINFO: 422 case VM_GET_INTINFO: 423 case VM_RESTART_INSTRUCTION: 424 case VM_SET_KERNEMU_DEV: 425 case VM_GET_KERNEMU_DEV: 426 case VM_RESET_CPU: 427 case VM_GET_RUN_STATE: 428 case VM_SET_RUN_STATE: 429 case VM_GET_FPU: 430 case VM_SET_FPU: 431 /* 432 * Copy in the ID of the vCPU chosen for this operation. 433 * Since a nefarious caller could update their struct between 434 * this locking and when the rest of the ioctl data is copied 435 * in, it is _critical_ that this local 'vcpu' variable be used 436 * rather than the in-struct one when performing the ioctl. 437 */ 438 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 439 return (EFAULT); 440 } 441 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { 442 return (EINVAL); 443 } 444 vcpu_lock_one(sc, vcpu); 445 lock_type = LOCK_VCPU; 446 break; 447 448 case VM_REINIT: 449 case VM_BIND_PPTDEV: 450 case VM_UNBIND_PPTDEV: 451 case VM_MAP_PPTDEV_MMIO: 452 case VM_UNMAP_PPTDEV_MMIO: 453 case VM_ALLOC_MEMSEG: 454 case VM_MMAP_MEMSEG: 455 case VM_MUNMAP_MEMSEG: 456 case VM_WRLOCK_CYCLE: 457 case VM_PMTMR_LOCATE: 458 vmm_write_lock(sc); 459 lock_type = LOCK_WRITE_HOLD; 460 break; 461 462 case VM_GET_MEMSEG: 463 case VM_MMAP_GETNEXT: 464 case VM_LAPIC_IRQ: 465 case VM_INJECT_NMI: 466 case VM_IOAPIC_ASSERT_IRQ: 467 case VM_IOAPIC_DEASSERT_IRQ: 468 case VM_IOAPIC_PULSE_IRQ: 469 case VM_LAPIC_MSI: 470 case VM_LAPIC_LOCAL_IRQ: 471 case VM_GET_X2APIC_STATE: 472 case VM_RTC_READ: 473 case VM_RTC_WRITE: 474 case VM_RTC_SETTIME: 475 case VM_RTC_GETTIME: 476 case VM_PPTDEV_DISABLE_MSIX: 477 case VM_DEVMEM_GETOFFSET: 478 case VM_TRACK_DIRTY_PAGES: 479 vmm_read_lock(sc); 480 lock_type = LOCK_READ_HOLD; 481 break; 482 483 case VM_DATA_READ: 484 case VM_DATA_WRITE: 485 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 486 return (EFAULT); 487 } 488 if (vcpu == -1) { 489 /* Access data for VM-wide devices */ 490 vmm_write_lock(sc); 491 lock_type = LOCK_WRITE_HOLD; 492 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) { 493 /* Access data associated with a specific vCPU */ 494 vcpu_lock_one(sc, vcpu); 495 lock_type = LOCK_VCPU; 496 } else { 497 return (EINVAL); 498 } 499 break; 500 501 case VM_GET_GPA_PMAP: 502 case VM_IOAPIC_PINCOUNT: 503 case VM_SUSPEND: 504 case VM_DESC_FPU_AREA: 505 default: 506 break; 507 } 508 509 /* Execute the primary logic for the ioctl. */ 510 switch (cmd) { 511 case VM_RUN: { 512 struct vm_entry entry; 513 514 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 515 error = EFAULT; 516 break; 517 } 518 519 if (!(curthread->t_schedflag & TS_VCPU)) 520 smt_mark_as_vcpu(); 521 522 error = vm_run(sc->vmm_vm, vcpu, &entry); 523 524 /* 525 * Unexpected states in vm_run() are expressed through positive 526 * errno-oriented return values. VM states which expect further 527 * processing in userspace (necessary context via exitinfo) are 528 * expressed through negative return values. For the time being 529 * a return value of 0 is not expected from vm_run(). 
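 * For example, an exit which requires completion in userspace (such as
 * device emulation) is signalled by a negative value; the struct vm_exit
 * obtained from vm_exitinfo() is then copied out to entry.exit_data and
 * the ioctl itself returns 0.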
530 */ 531 ASSERT(error != 0); 532 if (error < 0) { 533 const struct vm_exit *vme; 534 void *outp = entry.exit_data; 535 536 error = 0; 537 vme = vm_exitinfo(sc->vmm_vm, vcpu); 538 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 539 error = EFAULT; 540 } 541 } 542 break; 543 } 544 case VM_SUSPEND: { 545 struct vm_suspend vmsuspend; 546 547 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 548 error = EFAULT; 549 break; 550 } 551 error = vm_suspend(sc->vmm_vm, vmsuspend.how); 552 break; 553 } 554 case VM_REINIT: { 555 struct vm_reinit reinit; 556 557 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 558 error = EFAULT; 559 break; 560 } 561 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 562 /* 563 * The VM instance should be free of driver-attached 564 * hooks during the reinitialization process. 565 */ 566 break; 567 } 568 error = vm_reinit(sc->vmm_vm, reinit.flags); 569 (void) vmm_drv_block_hook(sc, B_FALSE); 570 break; 571 } 572 case VM_STAT_DESC: { 573 struct vm_stat_desc statdesc; 574 575 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 576 error = EFAULT; 577 break; 578 } 579 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 580 sizeof (statdesc.desc)); 581 if (error == 0 && 582 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 583 error = EFAULT; 584 break; 585 } 586 break; 587 } 588 case VM_STATS_IOC: { 589 struct vm_stats vmstats; 590 591 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 592 error = EFAULT; 593 break; 594 } 595 hrt2tv(gethrtime(), &vmstats.tv); 596 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 597 nitems(vmstats.statbuf), 598 &vmstats.num_entries, vmstats.statbuf); 599 if (error == 0 && 600 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 601 error = EFAULT; 602 break; 603 } 604 break; 605 } 606 607 case VM_PPTDEV_MSI: { 608 struct vm_pptdev_msi pptmsi; 609 610 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 611 error = EFAULT; 612 break; 613 } 614 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 615 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 616 break; 617 } 618 case VM_PPTDEV_MSIX: { 619 struct vm_pptdev_msix pptmsix; 620 621 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 622 error = EFAULT; 623 break; 624 } 625 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 626 pptmsix.idx, pptmsix.addr, pptmsix.msg, 627 pptmsix.vector_control); 628 break; 629 } 630 case VM_PPTDEV_DISABLE_MSIX: { 631 struct vm_pptdev pptdev; 632 633 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 634 error = EFAULT; 635 break; 636 } 637 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 638 break; 639 } 640 case VM_MAP_PPTDEV_MMIO: { 641 struct vm_pptdev_mmio pptmmio; 642 643 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 644 error = EFAULT; 645 break; 646 } 647 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 648 pptmmio.len, pptmmio.hpa); 649 break; 650 } 651 case VM_UNMAP_PPTDEV_MMIO: { 652 struct vm_pptdev_mmio pptmmio; 653 654 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 655 error = EFAULT; 656 break; 657 } 658 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 659 pptmmio.len); 660 break; 661 } 662 case VM_BIND_PPTDEV: { 663 struct vm_pptdev pptdev; 664 665 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 666 error = EFAULT; 667 break; 668 } 669 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 670 break; 671 } 672 case VM_UNBIND_PPTDEV: { 673 struct vm_pptdev pptdev; 674 675 if (ddi_copyin(datap, &pptdev, 
sizeof (pptdev), md)) { 676 error = EFAULT; 677 break; 678 } 679 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 680 break; 681 } 682 case VM_GET_PPTDEV_LIMITS: { 683 struct vm_pptdev_limits pptlimits; 684 685 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 686 error = EFAULT; 687 break; 688 } 689 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 690 &pptlimits.msi_limit, &pptlimits.msix_limit); 691 if (error == 0 && 692 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 693 error = EFAULT; 694 break; 695 } 696 break; 697 } 698 case VM_INJECT_EXCEPTION: { 699 struct vm_exception vmexc; 700 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 701 error = EFAULT; 702 break; 703 } 704 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 705 vmexc.error_code_valid != 0, vmexc.error_code, 706 vmexc.restart_instruction != 0); 707 break; 708 } 709 case VM_INJECT_NMI: { 710 struct vm_nmi vmnmi; 711 712 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 713 error = EFAULT; 714 break; 715 } 716 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 717 break; 718 } 719 case VM_LAPIC_IRQ: { 720 struct vm_lapic_irq vmirq; 721 722 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 723 error = EFAULT; 724 break; 725 } 726 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 727 break; 728 } 729 case VM_LAPIC_LOCAL_IRQ: { 730 struct vm_lapic_irq vmirq; 731 732 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 733 error = EFAULT; 734 break; 735 } 736 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 737 vmirq.vector); 738 break; 739 } 740 case VM_LAPIC_MSI: { 741 struct vm_lapic_msi vmmsi; 742 743 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 744 error = EFAULT; 745 break; 746 } 747 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 748 break; 749 } 750 751 case VM_IOAPIC_ASSERT_IRQ: { 752 struct vm_ioapic_irq ioapic_irq; 753 754 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 755 error = EFAULT; 756 break; 757 } 758 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 759 break; 760 } 761 case VM_IOAPIC_DEASSERT_IRQ: { 762 struct vm_ioapic_irq ioapic_irq; 763 764 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 765 error = EFAULT; 766 break; 767 } 768 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 769 break; 770 } 771 case VM_IOAPIC_PULSE_IRQ: { 772 struct vm_ioapic_irq ioapic_irq; 773 774 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 775 error = EFAULT; 776 break; 777 } 778 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 779 break; 780 } 781 case VM_IOAPIC_PINCOUNT: { 782 int pincount; 783 784 pincount = vioapic_pincount(sc->vmm_vm); 785 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 786 error = EFAULT; 787 break; 788 } 789 break; 790 } 791 case VM_DESC_FPU_AREA: { 792 struct vm_fpu_desc desc; 793 void *buf = NULL; 794 795 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 796 error = EFAULT; 797 break; 798 } 799 if (desc.vfd_num_entries > 64) { 800 error = EINVAL; 801 break; 802 } 803 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 804 desc.vfd_num_entries; 805 if (buf_sz != 0) { 806 buf = kmem_zalloc(buf_sz, KM_SLEEP); 807 } 808 809 /* 810 * For now, we are depending on vm_fpu_desc_entry and 811 * hma_xsave_state_desc_t having the same format. 
812 */ 813 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 814 sizeof (hma_xsave_state_desc_t)); 815 816 size_t req_size; 817 const uint_t max_entries = hma_fpu_describe_xsave_state( 818 (hma_xsave_state_desc_t *)buf, 819 desc.vfd_num_entries, 820 &req_size); 821 822 desc.vfd_req_size = req_size; 823 desc.vfd_num_entries = max_entries; 824 if (buf_sz != 0) { 825 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 826 error = EFAULT; 827 } 828 kmem_free(buf, buf_sz); 829 } 830 831 if (error == 0) { 832 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 833 error = EFAULT; 834 } 835 } 836 break; 837 } 838 839 case VM_ISA_ASSERT_IRQ: { 840 struct vm_isa_irq isa_irq; 841 842 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 843 error = EFAULT; 844 break; 845 } 846 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 847 if (error == 0 && isa_irq.ioapic_irq != -1) { 848 error = vioapic_assert_irq(sc->vmm_vm, 849 isa_irq.ioapic_irq); 850 } 851 break; 852 } 853 case VM_ISA_DEASSERT_IRQ: { 854 struct vm_isa_irq isa_irq; 855 856 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 857 error = EFAULT; 858 break; 859 } 860 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 861 if (error == 0 && isa_irq.ioapic_irq != -1) { 862 error = vioapic_deassert_irq(sc->vmm_vm, 863 isa_irq.ioapic_irq); 864 } 865 break; 866 } 867 case VM_ISA_PULSE_IRQ: { 868 struct vm_isa_irq isa_irq; 869 870 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 871 error = EFAULT; 872 break; 873 } 874 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 875 if (error == 0 && isa_irq.ioapic_irq != -1) { 876 error = vioapic_pulse_irq(sc->vmm_vm, 877 isa_irq.ioapic_irq); 878 } 879 break; 880 } 881 case VM_ISA_SET_IRQ_TRIGGER: { 882 struct vm_isa_irq_trigger isa_irq_trigger; 883 884 if (ddi_copyin(datap, &isa_irq_trigger, 885 sizeof (isa_irq_trigger), md)) { 886 error = EFAULT; 887 break; 888 } 889 error = vatpic_set_irq_trigger(sc->vmm_vm, 890 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 891 break; 892 } 893 894 case VM_MMAP_GETNEXT: { 895 struct vm_memmap mm; 896 897 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 898 error = EFAULT; 899 break; 900 } 901 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 902 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 903 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 904 error = EFAULT; 905 break; 906 } 907 break; 908 } 909 case VM_MMAP_MEMSEG: { 910 struct vm_memmap mm; 911 912 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 913 error = EFAULT; 914 break; 915 } 916 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 917 mm.len, mm.prot, mm.flags); 918 break; 919 } 920 case VM_MUNMAP_MEMSEG: { 921 struct vm_munmap mu; 922 923 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 924 error = EFAULT; 925 break; 926 } 927 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 928 break; 929 } 930 case VM_ALLOC_MEMSEG: { 931 struct vm_memseg vmseg; 932 933 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 934 error = EFAULT; 935 break; 936 } 937 error = vmmdev_alloc_memseg(sc, &vmseg); 938 break; 939 } 940 case VM_GET_MEMSEG: { 941 struct vm_memseg vmseg; 942 943 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 944 error = EFAULT; 945 break; 946 } 947 error = vmmdev_get_memseg(sc, &vmseg); 948 if (error == 0 && 949 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { 950 error = EFAULT; 951 break; 952 } 953 break; 954 } 955 case VM_GET_REGISTER: { 956 struct vm_register vmreg; 957 958 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), 
md)) { 959 error = EFAULT; 960 break; 961 } 962 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, 963 &vmreg.regval); 964 if (error == 0 && 965 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { 966 error = EFAULT; 967 break; 968 } 969 break; 970 } 971 case VM_SET_REGISTER: { 972 struct vm_register vmreg; 973 974 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 975 error = EFAULT; 976 break; 977 } 978 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, 979 vmreg.regval); 980 break; 981 } 982 case VM_SET_SEGMENT_DESCRIPTOR: { 983 struct vm_seg_desc vmsegd; 984 985 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 986 error = EFAULT; 987 break; 988 } 989 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 990 &vmsegd.desc); 991 break; 992 } 993 case VM_GET_SEGMENT_DESCRIPTOR: { 994 struct vm_seg_desc vmsegd; 995 996 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 997 error = EFAULT; 998 break; 999 } 1000 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 1001 &vmsegd.desc); 1002 if (error == 0 && 1003 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { 1004 error = EFAULT; 1005 break; 1006 } 1007 break; 1008 } 1009 case VM_GET_REGISTER_SET: { 1010 struct vm_register_set vrs; 1011 int regnums[VM_REG_LAST]; 1012 uint64_t regvals[VM_REG_LAST]; 1013 1014 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1015 error = EFAULT; 1016 break; 1017 } 1018 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1019 error = EINVAL; 1020 break; 1021 } 1022 if (ddi_copyin(vrs.regnums, regnums, 1023 sizeof (int) * vrs.count, md)) { 1024 error = EFAULT; 1025 break; 1026 } 1027 1028 error = 0; 1029 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1030 if (regnums[i] < 0) { 1031 error = EINVAL; 1032 break; 1033 } 1034 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], 1035 &regvals[i]); 1036 } 1037 if (error == 0 && ddi_copyout(regvals, vrs.regvals, 1038 sizeof (uint64_t) * vrs.count, md)) { 1039 error = EFAULT; 1040 } 1041 break; 1042 } 1043 case VM_SET_REGISTER_SET: { 1044 struct vm_register_set vrs; 1045 int regnums[VM_REG_LAST]; 1046 uint64_t regvals[VM_REG_LAST]; 1047 1048 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1049 error = EFAULT; 1050 break; 1051 } 1052 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1053 error = EINVAL; 1054 break; 1055 } 1056 if (ddi_copyin(vrs.regnums, regnums, 1057 sizeof (int) * vrs.count, md)) { 1058 error = EFAULT; 1059 break; 1060 } 1061 if (ddi_copyin(vrs.regvals, regvals, 1062 sizeof (uint64_t) * vrs.count, md)) { 1063 error = EFAULT; 1064 break; 1065 } 1066 1067 error = 0; 1068 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1069 /* 1070 * Setting registers in a set is not atomic, since a 1071 * failure in the middle of the set will cause a 1072 * bail-out and inconsistent register state. Callers 1073 * should be wary of this.
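 * For example, if the third entry of a five-entry set is rejected by
 * vm_set_register(), the first two registers keep their newly written
 * values while the remaining ones are left untouched.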
1074 */ 1075 if (regnums[i] < 0) { 1076 error = EINVAL; 1077 break; 1078 } 1079 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1080 regvals[i]); 1081 } 1082 break; 1083 } 1084 case VM_RESET_CPU: { 1085 struct vm_vcpu_reset vvr; 1086 1087 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1088 error = EFAULT; 1089 break; 1090 } 1091 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1092 error = EINVAL; 1093 } 1094 1095 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1096 break; 1097 } 1098 case VM_GET_RUN_STATE: { 1099 struct vm_run_state vrs; 1100 1101 bzero(&vrs, sizeof (vrs)); 1102 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1103 &vrs.sipi_vector); 1104 if (error == 0) { 1105 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1106 error = EFAULT; 1107 break; 1108 } 1109 } 1110 break; 1111 } 1112 case VM_SET_RUN_STATE: { 1113 struct vm_run_state vrs; 1114 1115 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1116 error = EFAULT; 1117 break; 1118 } 1119 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1120 vrs.sipi_vector); 1121 break; 1122 } 1123 case VM_GET_FPU: { 1124 struct vm_fpu_state req; 1125 const size_t max_len = (PAGESIZE * 2); 1126 void *kbuf; 1127 1128 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1129 error = EFAULT; 1130 break; 1131 } 1132 if (req.len > max_len || req.len == 0) { 1133 error = EINVAL; 1134 break; 1135 } 1136 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1137 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1138 if (error == 0) { 1139 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1140 error = EFAULT; 1141 } 1142 } 1143 kmem_free(kbuf, req.len); 1144 break; 1145 } 1146 case VM_SET_FPU: { 1147 struct vm_fpu_state req; 1148 const size_t max_len = (PAGESIZE * 2); 1149 void *kbuf; 1150 1151 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1152 error = EFAULT; 1153 break; 1154 } 1155 if (req.len > max_len || req.len == 0) { 1156 error = EINVAL; 1157 break; 1158 } 1159 kbuf = kmem_alloc(req.len, KM_SLEEP); 1160 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1161 error = EFAULT; 1162 } else { 1163 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1164 } 1165 kmem_free(kbuf, req.len); 1166 break; 1167 } 1168 1169 case VM_SET_KERNEMU_DEV: 1170 case VM_GET_KERNEMU_DEV: { 1171 struct vm_readwrite_kernemu_device kemu; 1172 size_t size = 0; 1173 1174 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1175 error = EFAULT; 1176 break; 1177 } 1178 1179 if (kemu.access_width > 3) { 1180 error = EINVAL; 1181 break; 1182 } 1183 size = (1 << kemu.access_width); 1184 ASSERT(size >= 1 && size <= 8); 1185 1186 if (cmd == VM_SET_KERNEMU_DEV) { 1187 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1188 kemu.gpa, kemu.value, size); 1189 } else { 1190 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1191 kemu.gpa, &kemu.value, size); 1192 } 1193 1194 if (error == 0) { 1195 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1196 error = EFAULT; 1197 break; 1198 } 1199 } 1200 break; 1201 } 1202 1203 case VM_GET_CAPABILITY: { 1204 struct vm_capability vmcap; 1205 1206 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1207 error = EFAULT; 1208 break; 1209 } 1210 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1211 &vmcap.capval); 1212 if (error == 0 && 1213 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1214 error = EFAULT; 1215 break; 1216 } 1217 break; 1218 } 1219 case VM_SET_CAPABILITY: { 1220 struct vm_capability vmcap; 1221 1222 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1223 error = EFAULT; 1224 break; 1225 } 1226 
error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1227 vmcap.capval); 1228 break; 1229 } 1230 case VM_SET_X2APIC_STATE: { 1231 struct vm_x2apic x2apic; 1232 1233 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1234 error = EFAULT; 1235 break; 1236 } 1237 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1238 break; 1239 } 1240 case VM_GET_X2APIC_STATE: { 1241 struct vm_x2apic x2apic; 1242 1243 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1244 error = EFAULT; 1245 break; 1246 } 1247 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1248 &x2apic.state); 1249 if (error == 0 && 1250 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1251 error = EFAULT; 1252 break; 1253 } 1254 break; 1255 } 1256 case VM_GET_GPA_PMAP: { 1257 /* 1258 * Until there is a necessity to leak EPT/RVI PTE values to 1259 * userspace, this will remain unimplemented 1260 */ 1261 error = EINVAL; 1262 break; 1263 } 1264 case VM_GET_HPET_CAPABILITIES: { 1265 struct vm_hpet_cap hpetcap; 1266 1267 error = vhpet_getcap(&hpetcap); 1268 if (error == 0 && 1269 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1270 error = EFAULT; 1271 break; 1272 } 1273 break; 1274 } 1275 case VM_GLA2GPA: { 1276 struct vm_gla2gpa gg; 1277 1278 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1279 error = EFAULT; 1280 break; 1281 } 1282 gg.vcpuid = vcpu; 1283 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1284 gg.prot, &gg.gpa, &gg.fault); 1285 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1286 error = EFAULT; 1287 break; 1288 } 1289 break; 1290 } 1291 case VM_GLA2GPA_NOFAULT: { 1292 struct vm_gla2gpa gg; 1293 1294 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1295 error = EFAULT; 1296 break; 1297 } 1298 gg.vcpuid = vcpu; 1299 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1300 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1301 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1302 error = EFAULT; 1303 break; 1304 } 1305 break; 1306 } 1307 1308 case VM_ACTIVATE_CPU: 1309 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1310 break; 1311 1312 case VM_SUSPEND_CPU: 1313 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1314 error = EFAULT; 1315 } else { 1316 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1317 } 1318 break; 1319 1320 case VM_RESUME_CPU: 1321 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1322 error = EFAULT; 1323 } else { 1324 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1325 } 1326 break; 1327 1328 case VM_GET_CPUS: { 1329 struct vm_cpuset vm_cpuset; 1330 cpuset_t tempset; 1331 void *srcp = &tempset; 1332 int size; 1333 1334 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1335 error = EFAULT; 1336 break; 1337 } 1338 1339 /* Be more generous about sizing since our cpuset_t is large. */ 1340 size = vm_cpuset.cpusetsize; 1341 if (size <= 0 || size > sizeof (cpuset_t)) { 1342 error = ERANGE; 1343 } 1344 /* 1345 * If they want a ulong_t or less, make sure they receive the 1346 * low bits with all the useful information. 
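 * For example, a caller passing cpusetsize equal to sizeof (ulong_t) is
 * copied tempset.cpub[0], which on a 64-bit kernel carries the bits for
 * CPU IDs 0 through 63 and is therefore sufficient for the vCPU IDs of
 * an instance.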
1347 */ 1348 if (size <= sizeof (tempset.cpub[0])) { 1349 srcp = &tempset.cpub[0]; 1350 } 1351 1352 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1353 tempset = vm_active_cpus(sc->vmm_vm); 1354 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { 1355 tempset = vm_suspended_cpus(sc->vmm_vm); 1356 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1357 tempset = vm_debug_cpus(sc->vmm_vm); 1358 } else { 1359 error = EINVAL; 1360 } 1361 1362 ASSERT(size > 0 && size <= sizeof (tempset)); 1363 if (error == 0 && 1364 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1365 error = EFAULT; 1366 break; 1367 } 1368 break; 1369 } 1370 case VM_SET_INTINFO: { 1371 struct vm_intinfo vmii; 1372 1373 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1374 error = EFAULT; 1375 break; 1376 } 1377 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1378 break; 1379 } 1380 case VM_GET_INTINFO: { 1381 struct vm_intinfo vmii; 1382 1383 vmii.vcpuid = vcpu; 1384 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1385 &vmii.info2); 1386 if (error == 0 && 1387 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1388 error = EFAULT; 1389 break; 1390 } 1391 break; 1392 } 1393 case VM_RTC_WRITE: { 1394 struct vm_rtc_data rtcdata; 1395 1396 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1397 error = EFAULT; 1398 break; 1399 } 1400 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1401 rtcdata.value); 1402 break; 1403 } 1404 case VM_RTC_READ: { 1405 struct vm_rtc_data rtcdata; 1406 1407 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1408 error = EFAULT; 1409 break; 1410 } 1411 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1412 &rtcdata.value); 1413 if (error == 0 && 1414 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1415 error = EFAULT; 1416 break; 1417 } 1418 break; 1419 } 1420 case VM_RTC_SETTIME: { 1421 struct vm_rtc_time rtctime; 1422 1423 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { 1424 error = EFAULT; 1425 break; 1426 } 1427 error = vrtc_set_time(sc->vmm_vm, rtctime.secs); 1428 break; 1429 } 1430 case VM_RTC_GETTIME: { 1431 struct vm_rtc_time rtctime; 1432 1433 rtctime.secs = vrtc_get_time(sc->vmm_vm); 1434 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { 1435 error = EFAULT; 1436 break; 1437 } 1438 break; 1439 } 1440 1441 case VM_PMTMR_LOCATE: { 1442 uint16_t port = arg; 1443 error = vpmtmr_set_location(sc->vmm_vm, port); 1444 break; 1445 } 1446 1447 case VM_RESTART_INSTRUCTION: 1448 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1449 break; 1450 1451 case VM_SET_TOPOLOGY: { 1452 struct vm_cpu_topology topo; 1453 1454 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1455 error = EFAULT; 1456 break; 1457 } 1458 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1459 topo.threads, topo.maxcpus); 1460 break; 1461 } 1462 case VM_GET_TOPOLOGY: { 1463 struct vm_cpu_topology topo; 1464 1465 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1466 &topo.threads, &topo.maxcpus); 1467 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1468 error = EFAULT; 1469 break; 1470 } 1471 break; 1472 } 1473 case VM_DEVMEM_GETOFFSET: { 1474 struct vm_devmem_offset vdo; 1475 vmm_devmem_entry_t *de; 1476 1477 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1478 error = EFAULT; 1479 break; 1480 } 1481 1482 de = vmmdev_devmem_find(sc, vdo.segid); 1483 if (de != NULL) { 1484 vdo.offset = de->vde_off; 1485 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1486 error = EFAULT; 1487 } 1488 } else { 1489 error = ENOENT; 1490 } 1491 break; 1492 } 1493 case 
VM_TRACK_DIRTY_PAGES: { 1494 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; 1495 struct vmm_dirty_tracker tracker; 1496 uint8_t *bitmap; 1497 size_t len; 1498 1499 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1500 error = EFAULT; 1501 break; 1502 } 1503 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1504 error = EINVAL; 1505 break; 1506 } 1507 if (tracker.vdt_len == 0) { 1508 break; 1509 } 1510 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1511 error = EINVAL; 1512 break; 1513 } 1514 if (tracker.vdt_len > max_track_region_len) { 1515 error = EINVAL; 1516 break; 1517 } 1518 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1519 bitmap = kmem_zalloc(len, KM_SLEEP); 1520 vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1521 tracker.vdt_len, bitmap); 1522 if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1523 error = EFAULT; 1524 } 1525 kmem_free(bitmap, len); 1526 1527 break; 1528 } 1529 case VM_WRLOCK_CYCLE: { 1530 /* 1531 * Present a test mechanism to acquire/release the write lock 1532 * on the VM without any other effects. 1533 */ 1534 break; 1535 } 1536 case VM_DATA_READ: { 1537 struct vm_data_xfer vdx; 1538 1539 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1540 error = EFAULT; 1541 break; 1542 } 1543 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1544 error = EINVAL; 1545 break; 1546 } 1547 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1548 error = EFBIG; 1549 break; 1550 } 1551 1552 const size_t len = vdx.vdx_len; 1553 void *buf = kmem_alloc(len, KM_SLEEP); 1554 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0) { 1555 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1556 kmem_free(buf, len); 1557 error = EFAULT; 1558 break; 1559 } 1560 } else { 1561 bzero(buf, len); 1562 } 1563 1564 vmm_data_req_t req = { 1565 .vdr_class = vdx.vdx_class, 1566 .vdr_version = vdx.vdx_version, 1567 .vdr_flags = vdx.vdx_flags, 1568 .vdr_len = vdx.vdx_len, 1569 .vdr_data = buf, 1570 }; 1571 error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); 1572 1573 if (error == 0) { 1574 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1575 error = EFAULT; 1576 } 1577 } 1578 kmem_free(buf, len); 1579 break; 1580 } 1581 case VM_DATA_WRITE: { 1582 struct vm_data_xfer vdx; 1583 1584 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1585 error = EFAULT; 1586 break; 1587 } 1588 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1589 error = EINVAL; 1590 break; 1591 } 1592 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1593 error = EFBIG; 1594 break; 1595 } 1596 1597 const size_t len = vdx.vdx_len; 1598 void *buf = kmem_alloc(len, KM_SLEEP); 1599 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1600 kmem_free(buf, len); 1601 error = EFAULT; 1602 break; 1603 } 1604 1605 vmm_data_req_t req = { 1606 .vdr_class = vdx.vdx_class, 1607 .vdr_version = vdx.vdx_version, 1608 .vdr_flags = vdx.vdx_flags, 1609 .vdr_len = vdx.vdx_len, 1610 .vdr_data = buf, 1611 }; 1612 if (vmm_allow_state_writes == 0) { 1613 /* XXX: Play it safe for now */ 1614 error = EPERM; 1615 } else { 1616 error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid, 1617 &req); 1618 } 1619 1620 if (error == 0 && 1621 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) { 1622 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1623 error = EFAULT; 1624 } 1625 } 1626 kmem_free(buf, len); 1627 break; 1628 } 1629 1630 default: 1631 error = ENOTTY; 1632 break; 1633 } 1634 1635 /* Release exclusion resources */ 1636 switch (lock_type) { 1637 case LOCK_NONE: 1638 break; 1639 case LOCK_VCPU: 1640 vcpu_unlock_one(sc, vcpu); 1641 break; 
1642 case LOCK_READ_HOLD: 1643 vmm_read_unlock(sc); 1644 break; 1645 case LOCK_WRITE_HOLD: 1646 vmm_write_unlock(sc); 1647 break; 1648 default: 1649 panic("unexpected lock type"); 1650 break; 1651 } 1652 1653 return (error); 1654 } 1655 1656 static vmm_softc_t * 1657 vmm_lookup(const char *name) 1658 { 1659 list_t *vml = &vmm_list; 1660 vmm_softc_t *sc; 1661 1662 ASSERT(MUTEX_HELD(&vmm_mtx)); 1663 1664 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1665 if (strcmp(sc->vmm_name, name) == 0) { 1666 break; 1667 } 1668 } 1669 1670 return (sc); 1671 } 1672 1673 /* 1674 * Acquire an HMA registration if not already held. 1675 */ 1676 static boolean_t 1677 vmm_hma_acquire(void) 1678 { 1679 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1680 1681 mutex_enter(&vmmdev_mtx); 1682 1683 if (vmmdev_hma_reg == NULL) { 1684 VERIFY3U(vmmdev_hma_ref, ==, 0); 1685 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 1686 if (vmmdev_hma_reg == NULL) { 1687 cmn_err(CE_WARN, "%s HMA registration failed.", 1688 vmmdev_hvm_name); 1689 mutex_exit(&vmmdev_mtx); 1690 return (B_FALSE); 1691 } 1692 } 1693 1694 vmmdev_hma_ref++; 1695 1696 mutex_exit(&vmmdev_mtx); 1697 1698 return (B_TRUE); 1699 } 1700 1701 /* 1702 * Release the HMA registration if held and there are no remaining VMs. 1703 */ 1704 static void 1705 vmm_hma_release(void) 1706 { 1707 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1708 1709 mutex_enter(&vmmdev_mtx); 1710 1711 VERIFY3U(vmmdev_hma_ref, !=, 0); 1712 1713 vmmdev_hma_ref--; 1714 1715 if (vmmdev_hma_ref == 0) { 1716 VERIFY(vmmdev_hma_reg != NULL); 1717 hma_unregister(vmmdev_hma_reg); 1718 vmmdev_hma_reg = NULL; 1719 } 1720 mutex_exit(&vmmdev_mtx); 1721 } 1722 1723 static int 1724 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr) 1725 { 1726 vmm_softc_t *sc = NULL; 1727 minor_t minor; 1728 int error = ENOMEM; 1729 size_t len; 1730 const char *name = req->name; 1731 1732 len = strnlen(name, VM_MAX_NAMELEN); 1733 if (len == 0) { 1734 return (EINVAL); 1735 } 1736 if (len >= VM_MAX_NAMELEN) { 1737 return (ENAMETOOLONG); 1738 } 1739 if (strchr(name, '/') != NULL) { 1740 return (EINVAL); 1741 } 1742 1743 if (!vmm_hma_acquire()) 1744 return (ENXIO); 1745 1746 mutex_enter(&vmm_mtx); 1747 1748 /* Look for duplicate names */ 1749 if (vmm_lookup(name) != NULL) { 1750 mutex_exit(&vmm_mtx); 1751 vmm_hma_release(); 1752 return (EEXIST); 1753 } 1754 1755 /* Allow only one instance per non-global zone. */ 1756 if (!INGLOBALZONE(curproc)) { 1757 for (sc = list_head(&vmm_list); sc != NULL; 1758 sc = list_next(&vmm_list, sc)) { 1759 if (sc->vmm_zone == curzone) { 1760 mutex_exit(&vmm_mtx); 1761 vmm_hma_release(); 1762 return (EINVAL); 1763 } 1764 } 1765 } 1766 1767 minor = id_alloc(vmm_minors); 1768 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) { 1769 goto fail; 1770 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 1771 ddi_soft_state_free(vmm_statep, minor); 1772 goto fail; 1773 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor, 1774 DDI_PSEUDO, 0) != DDI_SUCCESS) { 1775 goto fail; 1776 } 1777 1778 if (vmm_kstat_alloc(sc, minor, cr) != 0) { 1779 goto fail; 1780 } 1781 1782 error = vm_create(req->flags, &sc->vmm_vm); 1783 if (error == 0) { 1784 /* Complete VM initialization and report success.
*/ 1785 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); 1786 sc->vmm_minor = minor; 1787 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), 1788 offsetof(vmm_devmem_entry_t, vde_node)); 1789 1790 list_create(&sc->vmm_holds, sizeof (vmm_hold_t), 1791 offsetof(vmm_hold_t, vmh_node)); 1792 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); 1793 1794 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); 1795 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), 1796 offsetof(vmm_lease_t, vml_node)); 1797 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); 1798 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); 1799 1800 sc->vmm_zone = crgetzone(cr); 1801 zone_hold(sc->vmm_zone); 1802 vmm_zsd_add_vm(sc); 1803 vmm_kstat_init(sc); 1804 1805 list_insert_tail(&vmm_list, sc); 1806 mutex_exit(&vmm_mtx); 1807 return (0); 1808 } 1809 1810 vmm_kstat_fini(sc); 1811 ddi_remove_minor_node(vmmdev_dip, name); 1812 fail: 1813 id_free(vmm_minors, minor); 1814 if (sc != NULL) { 1815 ddi_soft_state_free(vmm_statep, minor); 1816 } 1817 mutex_exit(&vmm_mtx); 1818 vmm_hma_release(); 1819 1820 return (error); 1821 } 1822 1823 /* 1824 * Bhyve 'Driver' Interface 1825 * 1826 * While many devices are emulated in the bhyve userspace process, there are 1827 * others with performance constraints which require that they run mostly or 1828 * entirely in-kernel. For those not integrated directly into bhyve, an API is 1829 * needed so they can query/manipulate the portions of VM state needed to 1830 * fulfill their purpose. 1831 * 1832 * This includes: 1833 * - Translating guest-physical addresses to host-virtual pointers 1834 * - Injecting MSIs 1835 * - Hooking IO port addresses 1836 * 1837 * The vmm_drv interface exists to provide that functionality to its consumers. 
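 * A consumer typically pins an instance with vmm_drv_hold(), signs a
 * lease via vmm_drv_lease_sign() before touching guest memory or
 * injecting MSIs, and releases both once finished or once a release has
 * been requested.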
1838 * (At this time, 'viona' is the only user) 1839 */ 1840 int 1841 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 1842 { 1843 vnode_t *vp = fp->f_vnode; 1844 const dev_t dev = vp->v_rdev; 1845 vmm_softc_t *sc; 1846 vmm_hold_t *hold; 1847 int err = 0; 1848 1849 if (vp->v_type != VCHR) { 1850 return (ENXIO); 1851 } 1852 const major_t major = getmajor(dev); 1853 const minor_t minor = getminor(dev); 1854 1855 mutex_enter(&vmmdev_mtx); 1856 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 1857 mutex_exit(&vmmdev_mtx); 1858 return (ENOENT); 1859 } 1860 mutex_enter(&vmm_mtx); 1861 mutex_exit(&vmmdev_mtx); 1862 1863 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 1864 err = ENOENT; 1865 goto out; 1866 } 1867 /* XXXJOY: check cred permissions against instance */ 1868 1869 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) { 1870 err = EBUSY; 1871 goto out; 1872 } 1873 1874 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 1875 hold->vmh_sc = sc; 1876 hold->vmh_release_req = B_FALSE; 1877 1878 list_insert_tail(&sc->vmm_holds, hold); 1879 sc->vmm_flags |= VMM_HELD; 1880 *holdp = hold; 1881 1882 out: 1883 mutex_exit(&vmm_mtx); 1884 return (err); 1885 } 1886 1887 void 1888 vmm_drv_rele(vmm_hold_t *hold) 1889 { 1890 vmm_softc_t *sc; 1891 1892 ASSERT(hold != NULL); 1893 ASSERT(hold->vmh_sc != NULL); 1894 VERIFY(hold->vmh_ioport_hook_cnt == 0); 1895 1896 mutex_enter(&vmm_mtx); 1897 sc = hold->vmh_sc; 1898 list_remove(&sc->vmm_holds, hold); 1899 if (list_is_empty(&sc->vmm_holds)) { 1900 sc->vmm_flags &= ~VMM_HELD; 1901 cv_broadcast(&sc->vmm_cv); 1902 } 1903 mutex_exit(&vmm_mtx); 1904 kmem_free(hold, sizeof (*hold)); 1905 } 1906 1907 boolean_t 1908 vmm_drv_release_reqd(vmm_hold_t *hold) 1909 { 1910 ASSERT(hold != NULL); 1911 1912 return (hold->vmh_release_req); 1913 } 1914 1915 vmm_lease_t * 1916 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 1917 { 1918 vmm_softc_t *sc = hold->vmh_sc; 1919 vmm_lease_t *lease; 1920 1921 ASSERT3P(expiref, !=, NULL); 1922 1923 if (hold->vmh_release_req) { 1924 return (NULL); 1925 } 1926 1927 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 1928 list_link_init(&lease->vml_node); 1929 lease->vml_expire_func = expiref; 1930 lease->vml_expire_arg = arg; 1931 lease->vml_expired = B_FALSE; 1932 lease->vml_break_deferred = B_FALSE; 1933 lease->vml_hold = hold; 1934 /* cache the VM pointer for one less pointer chase */ 1935 lease->vml_vm = sc->vmm_vm; 1936 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 1937 1938 mutex_enter(&sc->vmm_lease_lock); 1939 while (sc->vmm_lease_blocker != 0) { 1940 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 1941 } 1942 list_insert_tail(&sc->vmm_lease_list, lease); 1943 vmm_read_lock(sc); 1944 mutex_exit(&sc->vmm_lease_lock); 1945 1946 return (lease); 1947 } 1948 1949 static void 1950 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 1951 { 1952 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 1953 1954 list_remove(&sc->vmm_lease_list, lease); 1955 vmm_read_unlock(sc); 1956 vmc_destroy(lease->vml_vmclient); 1957 kmem_free(lease, sizeof (*lease)); 1958 } 1959 1960 static void 1961 vmm_lease_block(vmm_softc_t *sc) 1962 { 1963 mutex_enter(&sc->vmm_lease_lock); 1964 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); 1965 sc->vmm_lease_blocker++; 1966 if (sc->vmm_lease_blocker == 1) { 1967 list_t *list = &sc->vmm_lease_list; 1968 vmm_lease_t *lease = list_head(list); 1969 1970 while (lease != NULL) { 1971 void *arg = lease->vml_expire_arg; 1972 boolean_t 
(*expiref)(void *) = lease->vml_expire_func; 1973 boolean_t sync_break = B_FALSE; 1974 1975 /* 1976 * Since the lease expiration notification may 1977 * need to take locks which would deadlock with 1978 * vmm_lease_lock, drop it across the call. 1979 * 1980 * We are the only one allowed to manipulate 1981 * vmm_lease_list right now, so it is safe to 1982 * continue iterating through it after 1983 * reacquiring the lock. 1984 */ 1985 lease->vml_expired = B_TRUE; 1986 mutex_exit(&sc->vmm_lease_lock); 1987 sync_break = expiref(arg); 1988 mutex_enter(&sc->vmm_lease_lock); 1989 1990 if (sync_break) { 1991 vmm_lease_t *next; 1992 1993 /* 1994 * These leases which are synchronously broken 1995 * result in vmm_read_unlock() calls from a 1996 * different thread than the corresponding 1997 * vmm_read_lock(). This is acceptable, given 1998 * that the rwlock underpinning the whole 1999 * mechanism tolerates the behavior. This 2000 * flexibility is _only_ afforded to VM read 2001 * lock (RW_READER) holders. 2002 */ 2003 next = list_next(list, lease); 2004 vmm_lease_break_locked(sc, lease); 2005 lease = next; 2006 } else { 2007 lease = list_next(list, lease); 2008 } 2009 } 2010 2011 /* Process leases which were not broken synchronously. */ 2012 while (!list_is_empty(list)) { 2013 /* 2014 * Although the nested loops are quadratic, the number 2015 * of leases is small. 2016 */ 2017 lease = list_head(list); 2018 while (lease != NULL) { 2019 vmm_lease_t *next = list_next(list, lease); 2020 if (lease->vml_break_deferred) { 2021 vmm_lease_break_locked(sc, lease); 2022 } 2023 lease = next; 2024 } 2025 if (list_is_empty(list)) { 2026 break; 2027 } 2028 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2029 } 2030 /* Wake anyone else waiting for the lease list to be empty */ 2031 cv_broadcast(&sc->vmm_lease_cv); 2032 } else { 2033 list_t *list = &sc->vmm_lease_list; 2034 2035 /* 2036 * Some other thread beat us to the duty of lease cleanup. 2037 * Wait until that is complete. 2038 */ 2039 while (!list_is_empty(list)) { 2040 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2041 } 2042 } 2043 mutex_exit(&sc->vmm_lease_lock); 2044 } 2045 2046 static void 2047 vmm_lease_unblock(vmm_softc_t *sc) 2048 { 2049 mutex_enter(&sc->vmm_lease_lock); 2050 VERIFY3U(sc->vmm_lease_blocker, !=, 0); 2051 sc->vmm_lease_blocker--; 2052 if (sc->vmm_lease_blocker == 0) { 2053 cv_broadcast(&sc->vmm_lease_cv); 2054 } 2055 mutex_exit(&sc->vmm_lease_lock); 2056 } 2057 2058 void 2059 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) 2060 { 2061 vmm_softc_t *sc = hold->vmh_sc; 2062 2063 VERIFY3P(hold, ==, lease->vml_hold); 2064 VERIFY(!lease->vml_break_deferred); 2065 2066 mutex_enter(&sc->vmm_lease_lock); 2067 if (sc->vmm_lease_blocker == 0) { 2068 vmm_lease_break_locked(sc, lease); 2069 } else { 2070 /* 2071 * Defer the lease-breaking to whichever thread is currently 2072 * cleaning up all leases as part of a vmm_lease_block() call. 
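 * Setting vml_break_deferred and signalling vmm_lease_cv below hands the
 * actual tear-down to that thread, which frees the lease once it observes
 * the flag.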
2073 */ 2074 lease->vml_break_deferred = B_TRUE; 2075 cv_broadcast(&sc->vmm_lease_cv); 2076 } 2077 mutex_exit(&sc->vmm_lease_lock); 2078 } 2079 2080 boolean_t 2081 vmm_drv_lease_expired(vmm_lease_t *lease) 2082 { 2083 return (lease->vml_expired); 2084 } 2085 2086 vmm_page_t * 2087 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) 2088 { 2089 ASSERT(lease != NULL); 2090 ASSERT0(gpa & PAGEOFFSET); 2091 2092 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); 2093 } 2094 2095 void 2096 vmm_drv_page_release(vmm_page_t *vmmp) 2097 { 2098 (void) vmp_release((vm_page_t *)vmmp); 2099 } 2100 2101 void 2102 vmm_drv_page_release_chain(vmm_page_t *vmmp) 2103 { 2104 (void) vmp_release_chain((vm_page_t *)vmmp); 2105 } 2106 2107 const void * 2108 vmm_drv_page_readable(const vmm_page_t *vmmp) 2109 { 2110 return (vmp_get_readable((const vm_page_t *)vmmp)); 2111 } 2112 2113 void * 2114 vmm_drv_page_writable(const vmm_page_t *vmmp) 2115 { 2116 return (vmp_get_writable((const vm_page_t *)vmmp)); 2117 } 2118 2119 void 2120 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) 2121 { 2122 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); 2123 } 2124 2125 vmm_page_t * 2126 vmm_drv_page_next(const vmm_page_t *vmmp) 2127 { 2128 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); 2129 } 2130 2131 int 2132 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) 2133 { 2134 ASSERT(lease != NULL); 2135 2136 return (lapic_intr_msi(lease->vml_vm, addr, msg)); 2137 } 2138 2139 int 2140 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, 2141 void *arg, void **cookie) 2142 { 2143 vmm_softc_t *sc; 2144 int err; 2145 2146 ASSERT(hold != NULL); 2147 ASSERT(cookie != NULL); 2148 2149 sc = hold->vmh_sc; 2150 mutex_enter(&vmm_mtx); 2151 /* Confirm that hook installation is not blocked */ 2152 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { 2153 mutex_exit(&vmm_mtx); 2154 return (EBUSY); 2155 } 2156 /* 2157 * Optimistically record an installed hook which will prevent a block 2158 * from being asserted while the mutex is dropped. 
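 * vmm_drv_block_hook() refuses to assert VMM_BLOCK_HOOK while any hold
 * has a non-zero vmh_ioport_hook_cnt, so bumping the count here (and
 * walking it back below on failure) keeps hook installation and hook
 * blocking mutually exclusive.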
2159 */ 2160 hold->vmh_ioport_hook_cnt++; 2161 mutex_exit(&vmm_mtx); 2162 2163 vmm_write_lock(sc); 2164 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func, 2165 arg, cookie); 2166 vmm_write_unlock(sc); 2167 2168 if (err != 0) { 2169 mutex_enter(&vmm_mtx); 2170 /* Walk back optimism about the hook installation */ 2171 hold->vmh_ioport_hook_cnt--; 2172 mutex_exit(&vmm_mtx); 2173 } 2174 return (err); 2175 } 2176 2177 void 2178 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) 2179 { 2180 vmm_softc_t *sc; 2181 2182 ASSERT(hold != NULL); 2183 ASSERT(cookie != NULL); 2184 ASSERT(hold->vmh_ioport_hook_cnt != 0); 2185 2186 sc = hold->vmh_sc; 2187 vmm_write_lock(sc); 2188 vm_ioport_unhook(sc->vmm_vm, cookie); 2189 vmm_write_unlock(sc); 2190 2191 mutex_enter(&vmm_mtx); 2192 hold->vmh_ioport_hook_cnt--; 2193 mutex_exit(&vmm_mtx); 2194 } 2195 2196 static int 2197 vmm_drv_purge(vmm_softc_t *sc) 2198 { 2199 ASSERT(MUTEX_HELD(&vmm_mtx)); 2200 2201 if ((sc->vmm_flags & VMM_HELD) != 0) { 2202 vmm_hold_t *hold; 2203 2204 sc->vmm_flags |= VMM_CLEANUP; 2205 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2206 hold = list_next(&sc->vmm_holds, hold)) { 2207 hold->vmh_release_req = B_TRUE; 2208 } 2209 while ((sc->vmm_flags & VMM_HELD) != 0) { 2210 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { 2211 return (EINTR); 2212 } 2213 } 2214 sc->vmm_flags &= ~VMM_CLEANUP; 2215 } 2216 2217 VERIFY(list_is_empty(&sc->vmm_holds)); 2218 sc->vmm_flags |= VMM_PURGED; 2219 return (0); 2220 } 2221 2222 static int 2223 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) 2224 { 2225 int err = 0; 2226 2227 mutex_enter(&vmm_mtx); 2228 if (!enable_block) { 2229 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); 2230 2231 sc->vmm_flags &= ~VMM_BLOCK_HOOK; 2232 goto done; 2233 } 2234 2235 /* If any holds have hooks installed, the block is a failure */ 2236 if (!list_is_empty(&sc->vmm_holds)) { 2237 vmm_hold_t *hold; 2238 2239 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2240 hold = list_next(&sc->vmm_holds, hold)) { 2241 if (hold->vmh_ioport_hook_cnt != 0) { 2242 err = EBUSY; 2243 goto done; 2244 } 2245 } 2246 } 2247 sc->vmm_flags |= VMM_BLOCK_HOOK; 2248 2249 done: 2250 mutex_exit(&vmm_mtx); 2251 return (err); 2252 } 2253 2254 static int 2255 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd, 2256 boolean_t *hma_release) 2257 { 2258 dev_info_t *pdip = ddi_get_parent(vmmdev_dip); 2259 minor_t minor; 2260 2261 ASSERT(MUTEX_HELD(&vmm_mtx)); 2262 2263 *hma_release = B_FALSE; 2264 2265 if (vmm_drv_purge(sc) != 0) { 2266 return (EINTR); 2267 } 2268 2269 if (clean_zsd) { 2270 vmm_zsd_rem_vm(sc); 2271 } 2272 2273 /* Clean up devmem entries */ 2274 vmmdev_devmem_purge(sc); 2275 2276 list_remove(&vmm_list, sc); 2277 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); 2278 minor = sc->vmm_minor; 2279 zone_rele(sc->vmm_zone); 2280 if (sc->vmm_is_open) { 2281 list_insert_tail(&vmm_destroy_list, sc); 2282 sc->vmm_flags |= VMM_DESTROY; 2283 } else { 2284 vmm_kstat_fini(sc); 2285 vm_destroy(sc->vmm_vm); 2286 ddi_soft_state_free(vmm_statep, minor); 2287 id_free(vmm_minors, minor); 2288 *hma_release = B_TRUE; 2289 } 2290 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); 2291 2292 return (0); 2293 } 2294 2295 int 2296 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd) 2297 { 2298 boolean_t hma_release = B_FALSE; 2299 int err; 2300 2301 mutex_enter(&vmm_mtx); 2302 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release); 2303 mutex_exit(&vmm_mtx); 2304 2305 if (hma_release) 2306 vmm_hma_release(); 2307 2308 
/* ARGSUSED */
static int
vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
{
	boolean_t hma_release = B_FALSE;
	vmm_softc_t *sc;
	int err;

	if (crgetuid(cr) != 0)
		return (EPERM);

	mutex_enter(&vmm_mtx);

	if ((sc = vmm_lookup(req->name)) == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENOENT);
	}
	/*
	 * We don't check this in vmm_lookup() since that function is also used
	 * for validation during create and currently vmm names must be unique.
	 */
	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
		mutex_exit(&vmm_mtx);
		return (EPERM);
	}
	err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);

	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

#define	VCPU_NAME_BUFLEN	32

static int
vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
{
	zoneid_t zid = crgetzoneid(cr);
	int instance = minor;
	kstat_t *ksp;

	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);

	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);

	if (ksp == NULL) {
		return (-1);
	}
	sc->vmm_kstat_vm = ksp;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		char namebuf[VCPU_NAME_BUFLEN];

		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);

		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
		    0, zid);
		if (ksp == NULL) {
			goto fail;
		}

		sc->vmm_kstat_vcpu[i] = ksp;
	}

	/*
	 * If this instance is associated with a non-global zone, make its
	 * kstats visible from the GZ.
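	 * They remain visible within the owning zone as well.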
	 */
	if (zid != GLOBAL_ZONEID) {
		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
		for (uint_t i = 0; i < VM_MAXCPU; i++) {
			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
		}
	}

	return (0);

fail:
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		if (sc->vmm_kstat_vcpu[i] != NULL) {
			kstat_delete(sc->vmm_kstat_vcpu[i]);
			sc->vmm_kstat_vcpu[i] = NULL;
		} else {
			break;
		}
	}
	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;
	return (-1);
}

static void
vmm_kstat_init(vmm_softc_t *sc)
{
	kstat_t *ksp;

	ASSERT3P(sc->vmm_vm, !=, NULL);
	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);

	ksp = sc->vmm_kstat_vm;
	vmm_kstats_t *vk = ksp->ks_data;
	ksp->ks_private = sc->vmm_vm;
	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
	kstat_named_setstr(&vk->vk_name, sc->vmm_name);

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		ksp = sc->vmm_kstat_vcpu[i];
		vmm_vcpu_kstats_t *vvk = ksp->ks_data;

		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
		vvk->vvk_vcpu.value.ui32 = i;
		kstat_named_init(&vvk->vvk_time_init, "time_init",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_run, "time_run",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
		    KSTAT_DATA_UINT64);
		ksp->ks_private = sc->vmm_vm;
		ksp->ks_update = vmm_kstat_update_vcpu;
	}

	kstat_install(sc->vmm_kstat_vm);
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		kstat_install(sc->vmm_kstat_vcpu[i]);
	}
}

static void
vmm_kstat_fini(vmm_softc_t *sc)
{
	ASSERT(sc->vmm_kstat_vm != NULL);

	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		kstat_delete(sc->vmm_kstat_vcpu[i]);
		sc->vmm_kstat_vcpu[i] = NULL;
	}
}

static int
vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	minor_t minor;
	vmm_softc_t *sc;

	/*
	 * Forbid running bhyve in a 32-bit process until it has been tested and
	 * verified to be safe.
	 */
	if (curproc->p_model != DATAMODEL_LP64) {
		return (EFBIG);
	}

	minor = getminor(*devp);
	if (minor == VMM_CTL_MINOR) {
		/*
		 * Master control device must be opened exclusively.
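		 * It services only the global ioctls handled by
		 * vmm_ctl_ioctl().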
		 */
		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
			return (EINVAL);
		}

		return (0);
	}

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	sc->vmm_is_open = B_TRUE;
	mutex_exit(&vmm_mtx);

	return (0);
}

static int
vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	minor_t minor;
	vmm_softc_t *sc;
	boolean_t hma_release = B_FALSE;

	minor = getminor(dev);
	if (minor == VMM_CTL_MINOR)
		return (0);

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	VERIFY(sc->vmm_is_open);
	sc->vmm_is_open = B_FALSE;

	/*
	 * If this VM was destroyed while the vmm device was open, then
	 * clean it up now that it is closed.
	 */
	if (sc->vmm_flags & VMM_DESTROY) {
		list_remove(&vmm_destroy_list, sc);
		vmm_kstat_fini(sc);
		vm_destroy(sc->vmm_vm);
		ddi_soft_state_free(vmm_statep, minor);
		id_free(vmm_minors, minor);
		hma_release = B_TRUE;
	}
	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (0);
}

static int
vmm_is_supported(intptr_t arg)
{
	int r;
	const char *msg;

	if (vmm_is_intel()) {
		r = vmx_x86_supported(&msg);
	} else if (vmm_is_svm()) {
		/*
		 * HMA already ensured that the features necessary for SVM
		 * operation were present and online during vmm_attach().
		 */
		r = 0;
	} else {
		r = ENXIO;
		msg = "Unsupported CPU vendor";
	}

	if (r != 0 && arg != (intptr_t)NULL) {
		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
			return (EFAULT);
	}
	return (r);
}

static int
vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
	void *argp = (void *)arg;

	switch (cmd) {
	case VMM_CREATE_VM: {
		struct vm_create_req req;

		if ((md & FWRITE) == 0) {
			return (EPERM);
		}
		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
			return (EFAULT);
		}
		return (vmmdev_do_vm_create(&req, cr));
	}
	case VMM_DESTROY_VM: {
		struct vm_destroy_req req;

		if ((md & FWRITE) == 0) {
			return (EPERM);
		}
		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
			return (EFAULT);
		}
		return (vmmdev_do_vm_destroy(&req, cr));
	}
	case VMM_VM_SUPPORTED:
		return (vmm_is_supported(arg));
	case VMM_INTERFACE_VERSION:
		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
		return (0);
	case VMM_CHECK_IOMMU:
		if (!vmm_check_iommu()) {
			return (ENXIO);
		}
		return (0);
	case VMM_RESV_QUERY:
	case VMM_RESV_ADD:
	case VMM_RESV_REMOVE:
		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
	default:
		break;
	}
	/* No other actions are legal on ctl device */
	return (ENOTTY);
}

static int
vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	vmm_softc_t *sc;
	minor_t minor;

	/*
	 * Forbid running bhyve in a 32-bit process until it has been tested and
	 * verified to be safe.
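	 * The same restriction is enforced when the device is opened.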
	 */
	if (curproc->p_model != DATAMODEL_LP64) {
		return (EFBIG);
	}

	/* The structs in bhyve ioctls assume a 64-bit datamodel */
	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
		return (ENOTSUP);
	}

	minor = getminor(dev);

	if (minor == VMM_CTL_MINOR) {
		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
}

static int
vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
{
	vmm_softc_t *sc;
	const minor_t minor = getminor(dev);
	int err;

	if (minor == VMM_CTL_MINOR) {
		return (ENODEV);
	}
	if (off < 0 || (off + len) <= 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (EACCES);
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	/* Grab read lock on the VM to prevent any changes to the memory map */
	vmm_read_lock(sc);

	if (off >= VM_DEVMEM_START) {
		int segid;
		off_t segoff;

		/* Mapping a devmem "device" */
		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
			err = ENODEV;
		} else {
			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
			    addrp, prot, maxprot, flags);
		}
	} else {
		/* Mapping a part of the guest physical space */
		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
		    maxprot, flags);
	}

	vmm_read_unlock(sc);
	return (err);
}

static sdev_plugin_validate_t
vmm_sdev_validate(sdev_ctx_t ctx)
{
	const char *name = sdev_ctx_name(ctx);
	vmm_softc_t *sc;
	sdev_plugin_validate_t ret;
	minor_t minor;

	if (sdev_ctx_vtype(ctx) != VCHR)
		return (SDEV_VTOR_INVALID);

	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);

	mutex_enter(&vmm_mtx);
	if ((sc = vmm_lookup(name)) == NULL)
		ret = SDEV_VTOR_INVALID;
	else if (sc->vmm_minor != minor)
		ret = SDEV_VTOR_STALE;
	else
		ret = SDEV_VTOR_VALID;
	mutex_exit(&vmm_mtx);

	return (ret);
}

static int
vmm_sdev_filldir(sdev_ctx_t ctx)
{
	vmm_softc_t *sc;
	int ret;

	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
		return (EINVAL);
	}

	mutex_enter(&vmm_mtx);
	ASSERT(vmmdev_dip != NULL);
	for (sc = list_head(&vmm_list); sc != NULL;
	    sc = list_next(&vmm_list, sc)) {
		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
			    S_IFCHR | 0600,
			    makedevice(ddi_driver_major(vmmdev_dip),
			    sc->vmm_minor));
		} else {
			continue;
		}
		if (ret != 0 && ret != EEXIST)
			goto out;
	}

	ret = 0;

out:
	mutex_exit(&vmm_mtx);
	return (ret);
}

/* ARGSUSED */
static void
vmm_sdev_inactive(sdev_ctx_t ctx)
{
}

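/*
 * sdev plugin callbacks used to populate /dev/vmm with a node per VM
 * instance.
 */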
static sdev_plugin_ops_t vmm_sdev_ops = {
	.spo_version = SDEV_PLUGIN_VERSION,
	.spo_flags = SDEV_PLUGIN_SUBDIR,
	.spo_validate = vmm_sdev_validate,
	.spo_filldir = vmm_sdev_filldir,
	.spo_inactive = vmm_sdev_inactive
};

/* ARGSUSED */
static int
vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)vmmdev_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	sdev_plugin_hdl_t sph;
	hma_reg_t *reg = NULL;
	boolean_t vmm_loaded = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vmmdev_mtx);
	/* Ensure we are not already attached. */
	if (vmmdev_dip != NULL) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	vmm_sol_glue_init();

	/*
	 * Perform temporary HMA registration to determine if the system
	 * is capable.
	 */
	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
		goto fail;
	} else if (vmm_mod_load() != 0) {
		goto fail;
	}
	vmm_loaded = B_TRUE;
	hma_unregister(reg);
	reg = NULL;

	/* Create control node. Other nodes will be created on demand. */
	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
		goto fail;
	}

	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
	if (sph == (sdev_plugin_hdl_t)NULL) {
		ddi_remove_minor_node(dip, NULL);
		goto fail;
	}

	ddi_report_dev(dip);
	vmmdev_sdev_hdl = sph;
	vmmdev_dip = dip;
	mutex_exit(&vmmdev_mtx);
	return (DDI_SUCCESS);

fail:
	if (vmm_loaded) {
		VERIFY0(vmm_mod_unload());
	}
	if (reg != NULL) {
		hma_unregister(reg);
	}
	vmm_sol_glue_cleanup();
	mutex_exit(&vmmdev_mtx);
	return (DDI_FAILURE);
}

static int
vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	/*
	 * Ensure that all resources have been cleaned up.
	 *
	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
	 * devinfo locked as iommu_cleanup() tries to recursively lock each
	 * devinfo, including our own, while holding vmmdev_mtx.
	 */
	if (mutex_tryenter(&vmmdev_mtx) == 0)
		return (DDI_FAILURE);

	mutex_enter(&vmm_mtx);
	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
		mutex_exit(&vmm_mtx);
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	mutex_exit(&vmm_mtx);

	if (!vmmr_is_empty()) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;

	/*
	 * Remove the control node.
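	 * Per-VM minor nodes were already removed as each instance was
	 * destroyed.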
	 */
	ddi_remove_minor_node(dip, "ctl");
	vmmdev_dip = NULL;

	VERIFY0(vmm_mod_unload());
	VERIFY3U(vmmdev_hma_reg, ==, NULL);
	vmm_sol_glue_cleanup();

	mutex_exit(&vmmdev_mtx);

	return (DDI_SUCCESS);
}

static struct cb_ops vmm_cb_ops = {
	vmm_open,
	vmm_close,
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	vmm_ioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	vmm_segmap,
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,
	D_NEW | D_MP | D_DEVMAP
};

static struct dev_ops vmm_ops = {
	DEVO_REV,
	0,
	vmm_info,
	nulldev,	/* identify */
	nulldev,	/* probe */
	vmm_attach,
	vmm_detach,
	nodev,		/* reset */
	&vmm_cb_ops,
	(struct bus_ops *)NULL
};

static struct modldrv modldrv = {
	&mod_driverops,
	"bhyve vmm",
	&vmm_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	int error;

	sysinit();

	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
	list_create(&vmm_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);

	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
	if (error) {
		return (error);
	}

	vmm_zsd_init();
	vmmr_init();

	error = mod_install(&modlinkage);
	if (error) {
		ddi_soft_state_fini(&vmm_statep);
		vmm_zsd_fini();
		vmmr_fini();
	}

	return (error);
}

int
_fini(void)
{
	int error;

	error = mod_remove(&modlinkage);
	if (error) {
		return (error);
	}

	vmm_zsd_fini();
	vmmr_fini();

	ddi_soft_state_fini(&vmm_statep);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}