/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/ioccom.h>
#include <sys/stat.h>
#include <sys/vmsystm.h>
#include <sys/ddi.h>
#include <sys/mkdev.h>
#include <sys/sunddi.h>
#include <sys/fs/dv_node.h>
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/smt.h>

#include <sys/kernel.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <x86/apicreg.h>

#include <sys/vmm.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>

#include <vm/vm.h>
#include <vm/seg_dev.h>

#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vrtc.h"
#include "io/vhpet.h"
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_util.h"
#include "vm/vm_glue.h"

/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related
 * data (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must
 * acquire vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not
 * attempt to acquire vmmdev_mtx, as they could deadlock with plugin
 * unregistration.
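 *
 * As an illustrative sketch (not a literal excerpt), a path needing both
 * locks would follow the documented ordering:
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);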
 */

static kmutex_t		vmmdev_mtx;
static dev_info_t	*vmmdev_dip;
static hma_reg_t	*vmmdev_hma_reg;
static uint_t		vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

static kmutex_t		vmm_mtx;
static list_t		vmm_list;
static list_t		vmm_destroy_list;
static id_space_t	*vmm_minors;
static void		*vmm_statep;

static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define	VMM_SDEV_ROOT	"/dev/vmm"

/* From uts/i86pc/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/* Holds and hooks from drivers external to vmm */
struct vmm_hold {
	list_node_t	vmh_node;
	vmm_softc_t	*vmh_sc;
	boolean_t	vmh_release_req;
	uint_t		vmh_ioport_hook_cnt;
};

struct vmm_lease {
	list_node_t	vml_node;
	struct vm	*vml_vm;
	boolean_t	vml_expired;
	boolean_t	(*vml_expire_func)(void *);
	void		*vml_expire_arg;
	list_node_t	vml_expire_node;
	struct vmm_hold	*vml_hold;
};

static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *);

static int
vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
	    NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		vmm_devmem_entry_t *de;
		list_t *dl = &sc->vmm_devmem_list;

		for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
			if (de->vde_segid == mseg->segid) {
				break;
			}
		}
		if (de != NULL) {
			(void) strlcpy(mseg->name, de->vde_name,
			    sizeof (mseg->name));
		}
	} else {
		bzero(mseg->name, sizeof (mseg->name));
	}

	return (error);
}

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
 * Since this would be a hassle from an sdev perspective and would require a
 * new cdev interface (or complicate the existing one), we choose to implement
 * this in a different manner.  When 'devmem' mappings are created, an
 * identifying off_t is communicated back out to userspace.  That off_t,
 * residing above the normal guest memory space, can be used to mmap the
 * 'devmem' mapping from the already-open vm device.
 */
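
/*
 * For illustration, a hypothetical userspace consumer of an existing devmem
 * segment would look up its offset and then map it from the open vm device
 * (sketch only; error handling omitted):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *	(void) ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo);
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    vmfd, vdo.offset);
 */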

static int
vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
{
	off_t map_offset;
	vmm_devmem_entry_t *entry;

	if (list_is_empty(&sc->vmm_devmem_list)) {
		map_offset = VM_DEVMEM_START;
	} else {
		entry = list_tail(&sc->vmm_devmem_list);
		map_offset = entry->vde_off + entry->vde_len;
		if (map_offset < entry->vde_off) {
			/* Do not tolerate overflow */
			return (ERANGE);
		}
		/*
		 * XXXJOY: We could choose to search the list for duplicate
		 * names and toss an error.  Since we're using the offset
		 * method for now, it does not make much of a difference.
		 */
	}

	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
	entry->vde_segid = mseg->segid;
	entry->vde_len = mseg->len;
	entry->vde_off = map_offset;
	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
	list_insert_tail(&sc->vmm_devmem_list, entry);

	return (0);
}

static boolean_t
vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp)
{
	list_t *dl = &sc->vmm_devmem_list;
	vmm_devmem_entry_t *de = NULL;

	VERIFY(off >= VM_DEVMEM_START);

	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
		/* XXX: Only hit on direct offset/length matches for now */
		if (de->vde_off == off && de->vde_len == len) {
			break;
		}
	}
	if (de == NULL) {
		return (B_FALSE);
	}

	*segidp = de->vde_segid;
	return (B_TRUE);
}

static void
vmmdev_devmem_purge(vmm_softc_t *sc)
{
	vmm_devmem_entry_t *entry;

	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
		kmem_free(entry, sizeof (*entry));
	}
}

static int
vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
	int error;
	bool sysmem = true;

	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
	}
	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);

	if (error == 0 && VM_MEMSEG_NAME(mseg)) {
		/*
		 * Rather than create a whole fresh device from which
		 * userspace can mmap this segment, instead make it available
		 * at an offset above where the main guest memory resides.
		 */
		error = vmmdev_devmem_create(sc, mseg, mseg->name);
		if (error != 0) {
			vm_free_memseg(sc->vmm_vm, mseg->segid);
		}
	}
	return (error);
}

/*
 * Resource Locking and Exclusion
 *
 * Much of bhyve depends on key portions of VM state, such as the guest memory
 * map, to remain unchanged while the guest is running.  As ported from
 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
 * access to the instance vCPUs.  Threads acting on a single vCPU, like those
 * performing the work of actually running the guest in VMX/SVM, would lock
 * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
 * state, all of the vCPUs would be first locked, ensuring that the
 * operation(s) could complete without any other threads stumbling into
 * intermediate states.
 *
 * This approach is largely effective for bhyve.  Common operations, such as
 * running the vCPUs, steer clear of lock contention.  The model begins to
 * break down for operations which do not occur in the context of a specific
 * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
 * thread in the bhyve process.  In order to properly protect those vCPU-less
 * operations from encountering invalid states, additional locking is
 * required.  This was solved by forcing those operations to lock the
 * VM_MAXCPU-1 vCPU.  It does mean that class of operations will be serialized
 * on locking the specific vCPU and that instances sized at VM_MAXCPU will
 * potentially see undue contention on the VM_MAXCPU-1 vCPU.
 *
 * In order to address the shortcomings of this model, the concept of a
 * read/write lock has been added to bhyve.  Operations which change
 * fundamental aspects of a VM (such as the memory map) must acquire the
 * write lock, which also implies locking all of the vCPUs and waiting for
 * all read lock holders to release.  While it increases the cost and waiting
 * time for those few operations, it allows most hot-path operations on the
 * VM (which depend on its configuration remaining stable) to occur with
 * minimal locking.
 *
 * Consumers of the Driver API (see below) are a special case when it comes
 * to this locking, since they may hold a read lock via the drv_lease
 * mechanism for an extended period of time.  Rather than forcing those
 * consumers to continuously poll for a write lock attempt, the lease system
 * forces them to provide a release callback to trigger their clean-up (and
 * potential later reacquisition) of the read lock.
 */
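
/*
 * As a rough illustration (a sketch of the ioctl handler below, not a
 * literal excerpt), the exclusion primitives pair with operations like so:
 *
 *	VM_RUN, VM_GET_REGISTER, ...		vcpu_lock_one()   (one vCPU)
 *	VM_ALLOC_MEMSEG, VM_REINIT, ...		vmm_write_lock()  (VM-wide)
 *	VM_GET_MEMSEG, VM_LAPIC_MSI, ...	vmm_read_lock()   (stable cfg)
 */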

static void
vcpu_lock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	/*
	 * Since this state transition uses from_idle=true, it should not
	 * fail, but rather block until it can succeed.
	 */
	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
	vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false);
}

static void
vmm_read_lock(vmm_softc_t *sc)
{
	rw_enter(&sc->vmm_rwlock, RW_READER);
}

static void
vmm_read_unlock(vmm_softc_t *sc)
{
	rw_exit(&sc->vmm_rwlock);
}

static void
vmm_write_lock(vmm_softc_t *sc)
{
	int maxcpus;

	/* First lock all the vCPUs */
	maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_lock_one(sc, vcpu);
	}

	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
	sc->vmm_lease_blocker++;
	if (sc->vmm_lease_blocker == 1) {
		list_t *list = &sc->vmm_lease_list;
		vmm_lease_t *lease = list_head(list);

		while (lease != NULL) {
			boolean_t sync_break = B_FALSE;

			if (!lease->vml_expired) {
				void *arg = lease->vml_expire_arg;
				lease->vml_expired = B_TRUE;
				sync_break = lease->vml_expire_func(arg);
			}

			if (sync_break) {
				vmm_lease_t *next;

				/*
				 * These leases which are synchronously broken
				 * result in vmm_read_unlock() calls from a
				 * different thread than the corresponding
				 * vmm_read_lock().  This is acceptable, given
				 * that the rwlock underpinning the whole
				 * mechanism tolerates the behavior.  This
				 * flexibility is _only_ afforded to VM read
				 * lock (RW_READER) holders.
				 */
				next = list_next(list, lease);
				vmm_lease_break_locked(sc, lease);
				lease = next;
			} else {
				lease = list_next(list, lease);
			}
		}
	}
	mutex_exit(&sc->vmm_lease_lock);

	rw_enter(&sc->vmm_rwlock, RW_WRITER);
	/*
	 * For now, the 'maxcpus' value for an instance is fixed at the
	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
	 * the future, allowing for dynamic vCPU resource sizing, acquisition
	 * of the write lock will need to be wary of such changes.
	 */
	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
}

static void
vmm_write_unlock(vmm_softc_t *sc)
{
	int maxcpus;

	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
	sc->vmm_lease_blocker--;
	if (sc->vmm_lease_blocker == 0) {
		cv_broadcast(&sc->vmm_lease_cv);
	}
	mutex_exit(&sc->vmm_lease_lock);

	/*
	 * The VM write lock _must_ be released from the same thread it was
	 * acquired in, unlike the read lock.
	 */
	VERIFY(rw_write_held(&sc->vmm_rwlock));
	rw_exit(&sc->vmm_rwlock);

	/* Unlock all the vCPUs */
	maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_unlock_one(sc, vcpu);
	}
}

static int
vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
    cred_t *credp, int *rvalp)
{
	int error = 0, vcpu = -1;
	void *datap = (void *)arg;
	enum vm_lock_type {
		LOCK_NONE = 0,
		LOCK_VCPU,
		LOCK_READ_HOLD,
		LOCK_WRITE_HOLD
	} lock_type = LOCK_NONE;

	/* Acquire any exclusion resources needed for the operation. */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV:
		/*
		 * Copy in the ID of the vCPU chosen for this operation.
		 * Since a nefarious caller could update their struct between
		 * this locking and when the rest of the ioctl data is copied
		 * in, it is _critical_ that this local 'vcpu' variable be
		 * used rather than the in-struct one when performing the
		 * ioctl.
		 */
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			return (EFAULT);
		}
		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
			return (EINVAL);
		}
		vcpu_lock_one(sc, vcpu);
		lock_type = LOCK_VCPU;
		break;

	case VM_REINIT:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_PPTDEV_MMIO:
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_WRLOCK_CYCLE:
		vmm_write_lock(sc);
		lock_type = LOCK_WRITE_HOLD;
		break;

	case VM_GET_GPA_PMAP:
	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
	case VM_LAPIC_IRQ:
	case VM_INJECT_NMI:
	case VM_IOAPIC_ASSERT_IRQ:
	case VM_IOAPIC_DEASSERT_IRQ:
	case VM_IOAPIC_PULSE_IRQ:
	case VM_LAPIC_MSI:
	case VM_LAPIC_LOCAL_IRQ:
	case VM_GET_X2APIC_STATE:
	case VM_RTC_READ:
	case VM_RTC_WRITE:
	case VM_RTC_SETTIME:
	case VM_RTC_GETTIME:
#ifndef __FreeBSD__
	case VM_DEVMEM_GETOFFSET:
#endif
		vmm_read_lock(sc);
		lock_type = LOCK_READ_HOLD;
		break;

	case VM_IOAPIC_PINCOUNT:
	default:
		break;
	}

	/* Execute the primary logic for the ioctl. */
	switch (cmd) {
	case VM_RUN: {
		struct vm_run vmrun;

		if (ddi_copyin(datap, &vmrun, sizeof (vmrun), md)) {
			error = EFAULT;
			break;
		}
		vmrun.cpuid = vcpu;

		if (!(curthread->t_schedflag & TS_VCPU))
			smt_mark_as_vcpu();

		error = vm_run(sc->vmm_vm, &vmrun);
		/*
		 * XXXJOY: I think it's necessary to do copyout, even in the
		 * face of errors, since the exit state is communicated out.
		 */
		if (ddi_copyout(&vmrun, datap, sizeof (vmrun), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SUSPEND: {
		struct vm_suspend vmsuspend;

		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
			error = EFAULT;
			break;
		}
		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
		break;
	}
	case VM_REINIT:
		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
			/*
			 * The VM instance should be free of driver-attached
			 * hooks during the reinitialization process.
			 */
			break;
		}
		error = vm_reinit(sc->vmm_vm);
		(void) vmm_drv_block_hook(sc, B_FALSE);
		break;
	case VM_STAT_DESC: {
		struct vm_stat_desc statdesc;

		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
			error = EFAULT;
			break;
		}
		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
		    sizeof (statdesc.desc));
		if (error == 0 &&
		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_STATS_IOC: {
		struct vm_stats vmstats;

		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
			error = EFAULT;
			break;
		}
		hrt2tv(gethrtime(), &vmstats.tv);
		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
		    &vmstats.num_entries, vmstats.statbuf);
		if (error == 0 &&
		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_PPTDEV_MSI: {
		struct vm_pptdev_msi pptmsi;

		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
		break;
	}
	case VM_PPTDEV_MSIX: {
		struct vm_pptdev_msix pptmsix;

		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
		    pptmsix.vector_control);
		break;
	}
	case VM_MAP_PPTDEV_MMIO: {
		struct vm_pptdev_mmio pptmmio;

		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
		    pptmmio.len, pptmmio.hpa);
		break;
	}
	case VM_BIND_PPTDEV: {
		struct vm_pptdev pptdev;

		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
			error = EFAULT;
			break;
		}
		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
		break;
	}
	case VM_UNBIND_PPTDEV: {
		struct vm_pptdev pptdev;

		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
			error = EFAULT;
			break;
		}
		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
		break;
	}
	case VM_GET_PPTDEV_LIMITS: {
		struct vm_pptdev_limits pptlimits;

		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
		    &pptlimits.msi_limit, &pptlimits.msix_limit);
		if (error == 0 &&
		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_INJECT_EXCEPTION: {
		struct vm_exception vmexc;

		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
			error = EFAULT;
			break;
		}
		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
		    vmexc.error_code_valid, vmexc.error_code,
		    vmexc.restart_instruction);
		break;
	}
	case VM_INJECT_NMI: {
		struct vm_nmi vmnmi;

		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
			error = EFAULT;
			break;
		}
		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
		break;
	}
	case VM_LAPIC_IRQ: {
		struct vm_lapic_irq vmirq;

		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
		break;
	}
	case VM_LAPIC_LOCAL_IRQ: {
		struct vm_lapic_irq vmirq;

		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
		    vmirq.vector);
		break;
	}
	case VM_LAPIC_MSI: {
		struct vm_lapic_msi vmmsi;

		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
		break;
	}

	case VM_IOAPIC_ASSERT_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_DEASSERT_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_PULSE_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_PINCOUNT: {
		int pincount;

		pincount = vioapic_pincount(sc->vmm_vm);
		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_ISA_ASSERT_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_assert_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_DEASSERT_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_deassert_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_PULSE_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_pulse_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_SET_IRQ_TRIGGER: {
		struct vm_isa_irq_trigger isa_irq_trigger;

		if (ddi_copyin(datap, &isa_irq_trigger,
		    sizeof (isa_irq_trigger), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_set_irq_trigger(sc->vmm_vm,
		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
		break;
	}

	case VM_MMAP_GETNEXT: {
		struct vm_memmap mm;

		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_MMAP_MEMSEG: {
		struct vm_memmap mm;

		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
		    mm.len, mm.prot, mm.flags);
		break;
	}
	case VM_ALLOC_MEMSEG: {
		struct vm_memseg vmseg;

		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		error = vmmdev_alloc_memseg(sc, &vmseg);
		break;
	}
	case VM_GET_MEMSEG: {
		struct vm_memseg vmseg;

		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		error = vmmdev_get_memseg(sc, &vmseg);
		if (error == 0 &&
		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    &vmreg.regval);
		if (error == 0 &&
		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    vmreg.regval);
		break;
	}
	case VM_SET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		break;
	}
	case VM_GET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		if (error == 0 &&
		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
			    &regvals[i]);
		}
		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
		}
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}
		if (ddi_copyin(vrs.regvals, regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			/*
			 * Setting registers in a set is not atomic, since a
			 * failure in the middle of the set will cause a
			 * bail-out and inconsistent register state.  Callers
			 * should be wary of this.
			 */
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
			    regvals[i]);
		}
		break;
	}

	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV: {
		struct vm_readwrite_kernemu_device kemu;
		size_t size = 0;
		mem_region_write_t mwrite = NULL;
		mem_region_read_t mread = NULL;
		uint64_t ignored = 0;

		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
			error = EFAULT;
			break;
		}

		if (kemu.access_width > 3) {
			error = EINVAL;
			break;
		}
		size = (1 << kemu.access_width);
		ASSERT(size >= 1 && size <= 8);

		if (kemu.gpa >= DEFAULT_APIC_BASE &&
		    kemu.gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
			mread = lapic_mmio_read;
			mwrite = lapic_mmio_write;
		} else if (kemu.gpa >= VIOAPIC_BASE &&
		    kemu.gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
			mread = vioapic_mmio_read;
			mwrite = vioapic_mmio_write;
		} else if (kemu.gpa >= VHPET_BASE &&
		    kemu.gpa < VHPET_BASE + VHPET_SIZE) {
			mread = vhpet_mmio_read;
			mwrite = vhpet_mmio_write;
		} else {
			error = EINVAL;
			break;
		}

		if (cmd == VM_SET_KERNEMU_DEV) {
			VERIFY(mwrite != NULL);
			error = mwrite(sc->vmm_vm, vcpu, kemu.gpa, kemu.value,
			    size, &ignored);
		} else {
			VERIFY(mread != NULL);
			error = mread(sc->vmm_vm, vcpu, kemu.gpa, &kemu.value,
			    size, &ignored);
		}

		if (error == 0) {
			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
				error = EFAULT;
				break;
			}
		}
		break;
	}

	case VM_GET_CAPABILITY: {
		struct vm_capability vmcap;

		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
		    &vmcap.capval);
		if (error == 0 &&
		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_CAPABILITY: {
		struct vm_capability vmcap;

		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
		    vmcap.capval);
		break;
	}
	case VM_SET_X2APIC_STATE: {
		struct vm_x2apic x2apic;

		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
		break;
	}
	case VM_GET_X2APIC_STATE: {
		struct vm_x2apic x2apic;

		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
		    &x2apic.state);
		if (error == 0 &&
		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_GPA_PMAP: {
		struct vm_gpa_pte gpapte;

		if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) {
			error = EFAULT;
			break;
		}
#ifdef __FreeBSD__
		/* XXXJOY: add function? */
		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)),
		    gpapte.gpa, gpapte.pte, &gpapte.ptenum);
#endif
		error = 0;
		break;
	}
	case VM_GET_HPET_CAPABILITIES: {
		struct vm_hpet_cap hpetcap;

		error = vhpet_getcap(&hpetcap);
		if (error == 0 &&
		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GLA2GPA: {
		struct vm_gla2gpa gg;

		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);

		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		gg.vcpuid = vcpu;
		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
		    gg.prot, &gg.gpa, &gg.fault);
		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GLA2GPA_NOFAULT: {
		struct vm_gla2gpa gg;

		CTASSERT(PROT_READ == VM_PROT_READ);
		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);

		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		gg.vcpuid = vcpu;
		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(sc->vmm_vm, vcpu);
		break;

	case VM_SUSPEND_CPU:
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			error = EFAULT;
		} else {
			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
		}
		break;

	case VM_RESUME_CPU:
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			error = EFAULT;
		} else {
			error = vm_resume_cpu(sc->vmm_vm, vcpu);
		}
		break;

	case VM_GET_CPUS: {
		struct vm_cpuset vm_cpuset;
		cpuset_t tempset;
		void *srcp = &tempset;
		int size;

		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
			error = EFAULT;
			break;
		}

		/* Be more generous about sizing since our cpuset_t is large. */
		size = vm_cpuset.cpusetsize;
		if (size <= 0 || size > sizeof (cpuset_t)) {
			error = ERANGE;
			break;
		}
		/*
		 * If they want a ulong_t or less, make sure they receive the
		 * low bits with all the useful information.
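		 * For example, a caller passing cpusetsize equal to
		 * sizeof (ulong_t) is copied only tempset.cpub[0], the word
		 * covering the lowest-numbered CPUs.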
		 */
		if (size <= sizeof (tempset.cpub[0])) {
			srcp = &tempset.cpub[0];
		}

		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
			tempset = vm_active_cpus(sc->vmm_vm);
		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
			tempset = vm_suspended_cpus(sc->vmm_vm);
		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
			tempset = vm_debug_cpus(sc->vmm_vm);
		} else {
			error = EINVAL;
		}

		ASSERT(size > 0 && size <= sizeof (tempset));
		if (error == 0 &&
		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_INTINFO: {
		struct vm_intinfo vmii;

		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
			error = EFAULT;
			break;
		}
		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
		break;
	}
	case VM_GET_INTINFO: {
		struct vm_intinfo vmii;

		vmii.vcpuid = vcpu;
		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
		    &vmii.info2);
		if (error == 0 &&
		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_RTC_WRITE: {
		struct vm_rtc_data rtcdata;

		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
		    rtcdata.value);
		break;
	}
	case VM_RTC_READ: {
		struct vm_rtc_data rtcdata;

		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
		    &rtcdata.value);
		if (error == 0 &&
		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_RTC_SETTIME: {
		struct vm_rtc_time rtctime;

		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
		break;
	}
	case VM_RTC_GETTIME: {
		struct vm_rtc_time rtctime;

		rtctime.secs = vrtc_get_time(sc->vmm_vm);
		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vmm_vm, vcpu);
		break;

	case VM_SET_TOPOLOGY: {
		struct vm_cpu_topology topo;

		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
			error = EFAULT;
			break;
		}
		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
		    topo.threads, topo.maxcpus);
		break;
	}
	case VM_GET_TOPOLOGY: {
		struct vm_cpu_topology topo;

		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
		    &topo.threads, &topo.maxcpus);
		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
			error = EFAULT;
			break;
		}
		break;
	}

#ifndef __FreeBSD__
	case VM_DEVMEM_GETOFFSET: {
		struct vm_devmem_offset vdo;
		list_t *dl = &sc->vmm_devmem_list;
		vmm_devmem_entry_t *de = NULL;

		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
			error = EFAULT;
			break;
		}

		for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
			if (de->vde_segid == vdo.segid) {
				break;
			}
		}
		if (de != NULL) {
			vdo.offset = de->vde_off;
			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
				error = EFAULT;
			}
		} else {
			error = ENOENT;
		}
		break;
	}
	case VM_WRLOCK_CYCLE: {
		/*
		 * Present a test mechanism to acquire/release the write lock
		 * on the VM without any other effects.
		 */
		break;
	}
#endif
	default:
		error = ENOTTY;
		break;
	}

	/* Release exclusion resources */
	switch (lock_type) {
	case LOCK_NONE:
		break;
	case LOCK_VCPU:
		vcpu_unlock_one(sc, vcpu);
		break;
	case LOCK_READ_HOLD:
		vmm_read_unlock(sc);
		break;
	case LOCK_WRITE_HOLD:
		vmm_write_unlock(sc);
		break;
	default:
		panic("unexpected lock type");
		break;
	}

	return (error);
}

static vmm_softc_t *
vmm_lookup(const char *name)
{
	list_t *vml = &vmm_list;
	vmm_softc_t *sc;

	ASSERT(MUTEX_HELD(&vmm_mtx));

	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
		if (strcmp(sc->vmm_name, name) == 0) {
			break;
		}
	}

	return (sc);
}

/*
 * Acquire an HMA registration if not already held.
 */
static boolean_t
vmm_hma_acquire(void)
{
	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

	mutex_enter(&vmmdev_mtx);

	if (vmmdev_hma_reg == NULL) {
		VERIFY3U(vmmdev_hma_ref, ==, 0);
		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
		if (vmmdev_hma_reg == NULL) {
			cmn_err(CE_WARN, "%s HMA registration failed.",
			    vmmdev_hvm_name);
			mutex_exit(&vmmdev_mtx);
			return (B_FALSE);
		}
	}

	vmmdev_hma_ref++;

	mutex_exit(&vmmdev_mtx);

	return (B_TRUE);
}

/*
 * Release the HMA registration if held and there are no remaining VMs.
 */
static void
vmm_hma_release(void)
{
	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

	mutex_enter(&vmmdev_mtx);

	VERIFY3U(vmmdev_hma_ref, !=, 0);

	vmmdev_hma_ref--;

	if (vmmdev_hma_ref == 0) {
		VERIFY(vmmdev_hma_reg != NULL);
		hma_unregister(vmmdev_hma_reg);
		vmmdev_hma_reg = NULL;
	}
	mutex_exit(&vmmdev_mtx);
}

static int
vmmdev_do_vm_create(char *name, cred_t *cr)
{
	vmm_softc_t *sc = NULL;
	minor_t minor;
	int error = ENOMEM;

	if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
		return (EINVAL);
	}

	if (!vmm_hma_acquire())
		return (ENXIO);

	mutex_enter(&vmm_mtx);

	/* Look for duplicate names */
	if (vmm_lookup(name) != NULL) {
		mutex_exit(&vmm_mtx);
		vmm_hma_release();
		return (EEXIST);
	}

	/* Allow only one instance per non-global zone. */
	if (!INGLOBALZONE(curproc)) {
		for (sc = list_head(&vmm_list); sc != NULL;
		    sc = list_next(&vmm_list, sc)) {
			if (sc->vmm_zone == curzone) {
				mutex_exit(&vmm_mtx);
				vmm_hma_release();
				return (EINVAL);
			}
		}
	}

	minor = id_alloc(vmm_minors);
	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
		goto fail;
	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		ddi_soft_state_free(vmm_statep, minor);
		goto fail;
	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		goto fail;
	}

	error = vm_create(name, &sc->vmm_vm);
	if (error == 0) {
		/* Complete VM initialization and report success. */
		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
		sc->vmm_minor = minor;
		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
		    offsetof(vmm_devmem_entry_t, vde_node));

		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
		    offsetof(vmm_hold_t, vmh_node));
		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);

		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
		    offsetof(vmm_lease_t, vml_node));
		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);

		sc->vmm_zone = crgetzone(cr);
		zone_hold(sc->vmm_zone);
		vmm_zsd_add_vm(sc);

		list_insert_tail(&vmm_list, sc);
		mutex_exit(&vmm_mtx);
		return (0);
	}

	ddi_remove_minor_node(vmmdev_dip, name);
fail:
	id_free(vmm_minors, minor);
	if (sc != NULL) {
		ddi_soft_state_free(vmm_statep, minor);
	}
	mutex_exit(&vmm_mtx);
	vmm_hma_release();

	return (error);
}

/*
 * Bhyve 'Driver' Interface
 *
 * While many devices are emulated in the bhyve userspace process, there are
 * others with performance constraints which require that they run mostly or
 * entirely in-kernel.  For those not integrated directly into bhyve, an API
 * is needed so they can query/manipulate the portions of VM state needed to
 * fulfill their purpose.
 *
 * This includes:
 * - Translating guest-physical addresses to host-virtual pointers
 * - Injecting MSIs
 * - Hooking IO port addresses
 *
 * The vmm_drv interface exists to provide that functionality to its
 * consumers.  (At this time, 'viona' is the only user.)
 */
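
/*
 * As an illustrative sketch (not a literal consumer), the expected lifecycle
 * for a vmm_drv consumer looks roughly like:
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	(void) vmm_drv_hold(fp, cr, &hold);
 *	lease = vmm_drv_lease_sign(hold, my_expire_cb, arg);
 *	... vmm_drv_gpa2kva(lease, gpa, sz) / vmm_drv_msi(lease, addr, msg) ...
 *	vmm_drv_lease_break(hold, lease);
 *	vmm_drv_rele(hold);
 *
 * where my_expire_cb is the consumer-supplied release callback which must
 * arrange for the lease to be broken (and possibly re-signed) when the write
 * lock is wanted.
 */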
int
vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
{
	vnode_t *vp = fp->f_vnode;
	const dev_t dev = vp->v_rdev;
	vmm_softc_t *sc;
	vmm_hold_t *hold;
	int err = 0;

	if (vp->v_type != VCHR) {
		return (ENXIO);
	}
	const major_t major = getmajor(dev);
	const minor_t minor = getminor(dev);

	mutex_enter(&vmmdev_mtx);
	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
		mutex_exit(&vmmdev_mtx);
		return (ENOENT);
	}
	mutex_enter(&vmm_mtx);
	mutex_exit(&vmmdev_mtx);

	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		err = ENOENT;
		goto out;
	}
	/* XXXJOY: check cred permissions against instance */

	if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
		err = EBUSY;
		goto out;
	}

	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
	hold->vmh_sc = sc;
	hold->vmh_release_req = B_FALSE;

	list_insert_tail(&sc->vmm_holds, hold);
	sc->vmm_flags |= VMM_HELD;
	*holdp = hold;

out:
	mutex_exit(&vmm_mtx);
	return (err);
}

void
vmm_drv_rele(vmm_hold_t *hold)
{
	vmm_softc_t *sc;

	ASSERT(hold != NULL);
	ASSERT(hold->vmh_sc != NULL);
	VERIFY(hold->vmh_ioport_hook_cnt == 0);

	mutex_enter(&vmm_mtx);
	sc = hold->vmh_sc;
	list_remove(&sc->vmm_holds, hold);
	if (list_is_empty(&sc->vmm_holds)) {
		sc->vmm_flags &= ~VMM_HELD;
		cv_broadcast(&sc->vmm_cv);
	}
	mutex_exit(&vmm_mtx);
	kmem_free(hold, sizeof (*hold));
}

boolean_t
vmm_drv_release_reqd(vmm_hold_t *hold)
{
	ASSERT(hold != NULL);

	return (hold->vmh_release_req);
}

vmm_lease_t *
vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
{
	vmm_softc_t *sc = hold->vmh_sc;
	vmm_lease_t *lease;

	ASSERT3P(expiref, !=, NULL);

	if (hold->vmh_release_req) {
		return (NULL);
	}

	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
	list_link_init(&lease->vml_node);
	lease->vml_expire_func = expiref;
	lease->vml_expire_arg = arg;
	lease->vml_expired = B_FALSE;
	lease->vml_hold = hold;
	/* cache the VM pointer for one less pointer chase */
	lease->vml_vm = sc->vmm_vm;

	mutex_enter(&sc->vmm_lease_lock);
	while (sc->vmm_lease_blocker != 0) {
		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
	}
	list_insert_tail(&sc->vmm_lease_list, lease);
	vmm_read_lock(sc);
	mutex_exit(&sc->vmm_lease_lock);

	return (lease);
}

static void
vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
{
	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));

	list_remove(&sc->vmm_lease_list, lease);
	vmm_read_unlock(sc);
	kmem_free(lease, sizeof (*lease));
}

void
vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
{
	vmm_softc_t *sc = hold->vmh_sc;

	VERIFY3P(hold, ==, lease->vml_hold);

	mutex_enter(&sc->vmm_lease_lock);
	vmm_lease_break_locked(sc, lease);
	mutex_exit(&sc->vmm_lease_lock);
}

boolean_t
vmm_drv_lease_expired(vmm_lease_t *lease)
{
	return (lease->vml_expired);
}

void *
vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz)
{
	ASSERT(lease != NULL);

	return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz));
}

int
vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
{
	ASSERT(lease != NULL);

	return (lapic_intr_msi(lease->vml_vm, addr, msg));
}

int
vmm_drv_ioport_hook(vmm_hold_t *hold, uint_t ioport, vmm_drv_rmem_cb_t rfunc,
    vmm_drv_wmem_cb_t wfunc, void *arg, void **cookie)
{
	vmm_softc_t *sc;
	int err;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);

	sc = hold->vmh_sc;
	mutex_enter(&vmm_mtx);
	/* Confirm that hook installation is not blocked */
	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
		mutex_exit(&vmm_mtx);
		return (EBUSY);
	}
	/*
	 * Optimistically record an installed hook which will prevent a block
	 * from being asserted while the mutex is dropped.
	 */
	hold->vmh_ioport_hook_cnt++;
	mutex_exit(&vmm_mtx);

	vmm_write_lock(sc);
	err = vm_ioport_hook(sc->vmm_vm, ioport, (vmm_rmem_cb_t)rfunc,
	    (vmm_wmem_cb_t)wfunc, arg, cookie);
	vmm_write_unlock(sc);

	if (err != 0) {
		mutex_enter(&vmm_mtx);
		/* Walk back optimism about the hook installation */
		hold->vmh_ioport_hook_cnt--;
		mutex_exit(&vmm_mtx);
	}
	return (err);
}

void
vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
{
	vmm_softc_t *sc;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);
	ASSERT(hold->vmh_ioport_hook_cnt != 0);

	sc = hold->vmh_sc;
	vmm_write_lock(sc);
	vm_ioport_unhook(sc->vmm_vm, cookie);
	vmm_write_unlock(sc);

	mutex_enter(&vmm_mtx);
	hold->vmh_ioport_hook_cnt--;
	mutex_exit(&vmm_mtx);
}

static int
vmm_drv_purge(vmm_softc_t *sc)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));

	if ((sc->vmm_flags & VMM_HELD) != 0) {
		vmm_hold_t *hold;

		sc->vmm_flags |= VMM_CLEANUP;
		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			hold->vmh_release_req = B_TRUE;
		}
		while ((sc->vmm_flags & VMM_HELD) != 0) {
			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
				return (EINTR);
			}
		}
		sc->vmm_flags &= ~VMM_CLEANUP;
	}

	VERIFY(list_is_empty(&sc->vmm_holds));
	sc->vmm_flags |= VMM_PURGED;
	return (0);
}

static int
vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
{
	int err = 0;

	mutex_enter(&vmm_mtx);
	if (!enable_block) {
		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);

		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
		goto done;
	}

	/* If any holds have hooks installed, the block is a failure */
	if (!list_is_empty(&sc->vmm_holds)) {
		vmm_hold_t *hold;

		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			if (hold->vmh_ioport_hook_cnt != 0) {
				err = EBUSY;
				goto done;
			}
		}
	}
	sc->vmm_flags |= VMM_BLOCK_HOOK;

done:
	mutex_exit(&vmm_mtx);
	return (err);
}

static int
vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
    boolean_t *hma_release)
{
	dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
	minor_t minor;

	ASSERT(MUTEX_HELD(&vmm_mtx));

	*hma_release = B_FALSE;

	if (clean_zsd) {
		vmm_zsd_rem_vm(sc);
	}

	if (vmm_drv_purge(sc) != 0) {
		return (EINTR);
	}

	/* Clean up devmem entries */
	vmmdev_devmem_purge(sc);

	list_remove(&vmm_list, sc);
	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
	minor = sc->vmm_minor;
	zone_rele(sc->vmm_zone);
	if (sc->vmm_is_open) {
		list_insert_tail(&vmm_destroy_list, sc);
		sc->vmm_flags |= VMM_DESTROY;
	} else {
		vm_destroy(sc->vmm_vm);
		ddi_soft_state_free(vmm_statep, minor);
		id_free(vmm_minors, minor);
		*hma_release = B_TRUE;
	}
	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);

	return (0);
}

int
vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
{
	boolean_t hma_release = B_FALSE;
	int err;

	mutex_enter(&vmm_mtx);
	err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

/* ARGSUSED */
static int
vmmdev_do_vm_destroy(const char *name, cred_t *cr)
{
	boolean_t hma_release = B_FALSE;
	vmm_softc_t *sc;
	int err;

	if (crgetuid(cr) != 0)
		return (EPERM);

	mutex_enter(&vmm_mtx);

	if ((sc = vmm_lookup(name)) == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENOENT);
	}
	/*
	 * We don't check this in vmm_lookup() since that function is also
	 * used for validation during create and currently vmm names must be
	 * unique.
	 */
	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
		mutex_exit(&vmm_mtx);
		return (EPERM);
	}
	err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);

	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

static int
vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	minor_t minor;
	vmm_softc_t *sc;

	minor = getminor(*devp);
	if (minor == VMM_CTL_MINOR) {
		/*
		 * Master control device must be opened exclusively.
		 */
		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
			return (EINVAL);
		}

		return (0);
	}

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	sc->vmm_is_open = B_TRUE;
	mutex_exit(&vmm_mtx);

	return (0);
}

static int
vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	minor_t minor;
	vmm_softc_t *sc;
	boolean_t hma_release = B_FALSE;

	minor = getminor(dev);
	if (minor == VMM_CTL_MINOR)
		return (0);

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	VERIFY(sc->vmm_is_open);
	sc->vmm_is_open = B_FALSE;

	/*
	 * If this VM was destroyed while the vmm device was open, then
	 * clean it up now that it is closed.
	 */
	if (sc->vmm_flags & VMM_DESTROY) {
		list_remove(&vmm_destroy_list, sc);
		vm_destroy(sc->vmm_vm);
		ddi_soft_state_free(vmm_statep, minor);
		id_free(vmm_minors, minor);
		hma_release = B_TRUE;
	}
	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (0);
}

static int
vmm_is_supported(intptr_t arg)
{
	int r;
	const char *msg;

	if (vmm_is_intel()) {
		r = vmx_x86_supported(&msg);
	} else if (vmm_is_svm()) {
		/*
		 * HMA already ensured that the features necessary for SVM
		 * operation were present and online during vmm_attach().
		 */
		r = 0;
	} else {
		r = ENXIO;
		msg = "Unsupported CPU vendor";
	}

	if (r != 0 && arg != (intptr_t)NULL) {
		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
			return (EFAULT);
	}
	return (r);
}

static int
vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	vmm_softc_t *sc;
	minor_t minor;

	minor = getminor(dev);

	if (minor == VMM_CTL_MINOR) {
		void *argp = (void *)arg;
		char name[VM_MAX_NAMELEN] = { 0 };
		size_t len = 0;

		if ((mode & FKIOCTL) != 0) {
			len = strlcpy(name, argp, sizeof (name));
		} else {
			if (copyinstr(argp, name, sizeof (name), &len) != 0) {
				return (EFAULT);
			}
		}
		if (len >= VM_MAX_NAMELEN) {
			return (ENAMETOOLONG);
		}

		switch (cmd) {
		case VMM_CREATE_VM:
			if ((mode & FWRITE) == 0)
				return (EPERM);
			return (vmmdev_do_vm_create(name, credp));
		case VMM_DESTROY_VM:
			if ((mode & FWRITE) == 0)
				return (EPERM);
			return (vmmdev_do_vm_destroy(name, credp));
		case VMM_VM_SUPPORTED:
			return (vmm_is_supported(arg));
		default:
			/* No other actions are legal on ctl device */
			return (ENOTTY);
		}
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
}
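
/*
 * For illustration, a hypothetical userspace caller would create and then
 * open a new instance roughly as follows (sketch only; ctl_path names the
 * "ctl" minor created in vmm_attach(), and error handling is omitted).
 * Note that vmm_open() requires FEXCL (O_EXCL) on the control node:
 *
 *	int ctlfd = open(ctl_path, O_RDWR | O_EXCL);
 *	(void) ioctl(ctlfd, VMM_CREATE_VM, "myvm");
 *	int vmfd = open("/dev/vmm/myvm", O_RDWR);
 */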

static int
vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
{
	vmm_softc_t *sc;
	const minor_t minor = getminor(dev);
	struct vm *vm;
	int err;
	vm_object_t vmo = NULL;
	struct vmspace *vms;

	if (minor == VMM_CTL_MINOR) {
		return (ENODEV);
	}
	if (off < 0 || (off + len) <= 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (EACCES);
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	/* Grab read lock on the VM to prevent any changes to the memory map */
	vmm_read_lock(sc);

	vm = sc->vmm_vm;
	vms = vm_get_vmspace(vm);
	if (off >= VM_DEVMEM_START) {
		int segid;

		/* Mapping a devmem "device" */
		if (!vmmdev_devmem_segid(sc, off, len, &segid)) {
			err = ENODEV;
			goto out;
		}
		err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
		if (err != 0) {
			goto out;
		}
		err = vm_segmap_obj(vms, vmo, as, addrp, prot, maxprot, flags);
	} else {
		/* Mapping a part of the guest physical space */
		err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot,
		    flags);
	}

out:
	vmm_read_unlock(sc);
	return (err);
}

static sdev_plugin_validate_t
vmm_sdev_validate(sdev_ctx_t ctx)
{
	const char *name = sdev_ctx_name(ctx);
	vmm_softc_t *sc;
	sdev_plugin_validate_t ret;
	minor_t minor;

	if (sdev_ctx_vtype(ctx) != VCHR)
		return (SDEV_VTOR_INVALID);

	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);

	mutex_enter(&vmm_mtx);
	if ((sc = vmm_lookup(name)) == NULL)
		ret = SDEV_VTOR_INVALID;
	else if (sc->vmm_minor != minor)
		ret = SDEV_VTOR_STALE;
	else
		ret = SDEV_VTOR_VALID;
	mutex_exit(&vmm_mtx);

	return (ret);
}

static int
vmm_sdev_filldir(sdev_ctx_t ctx)
{
	vmm_softc_t *sc;
	int ret;

	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
		return (EINVAL);
	}

	mutex_enter(&vmm_mtx);
	ASSERT(vmmdev_dip != NULL);
	for (sc = list_head(&vmm_list); sc != NULL;
	    sc = list_next(&vmm_list, sc)) {
		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
			    S_IFCHR | 0600,
			    makedevice(ddi_driver_major(vmmdev_dip),
			    sc->vmm_minor));
		} else {
			continue;
		}
		if (ret != 0 && ret != EEXIST)
			goto out;
	}

	ret = 0;

out:
	mutex_exit(&vmm_mtx);
	return (ret);
}

/* ARGSUSED */
static void
vmm_sdev_inactive(sdev_ctx_t ctx)
{
}

static sdev_plugin_ops_t vmm_sdev_ops = {
	.spo_version = SDEV_PLUGIN_VERSION,
	.spo_flags = SDEV_PLUGIN_SUBDIR,
	.spo_validate = vmm_sdev_validate,
	.spo_filldir = vmm_sdev_filldir,
	.spo_inactive = vmm_sdev_inactive
};

/* ARGSUSED */
static int
vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)vmmdev_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	sdev_plugin_hdl_t sph;
	hma_reg_t *reg = NULL;
	boolean_t vmm_loaded = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vmmdev_mtx);
	/* Ensure we are not already attached. */
	if (vmmdev_dip != NULL) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	vmm_sol_glue_init();
	vmm_arena_init();

	/*
	 * Perform temporary HMA registration to determine if the system
	 * is capable.
	 */
	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
		goto fail;
	} else if (vmm_mod_load() != 0) {
		goto fail;
	}
	vmm_loaded = B_TRUE;
	hma_unregister(reg);
	reg = NULL;

	/* Create control node.  Other nodes will be created on demand. */
	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
		goto fail;
	}

	if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) ==
	    (sdev_plugin_hdl_t)NULL) {
		ddi_remove_minor_node(dip, NULL);
		goto fail;
	}

	ddi_report_dev(dip);
	vmmdev_sdev_hdl = sph;
	vmmdev_dip = dip;
	mutex_exit(&vmmdev_mtx);
	return (DDI_SUCCESS);

fail:
	if (vmm_loaded) {
		VERIFY0(vmm_mod_unload());
	}
	if (reg != NULL) {
		hma_unregister(reg);
	}
	vmm_arena_fini();
	vmm_sol_glue_cleanup();
	mutex_exit(&vmmdev_mtx);
	return (DDI_FAILURE);
}

static int
vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	/*
	 * Ensure that all resources have been cleaned up.
	 *
	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
	 * vmmdev_mtx is already held.  We can't wait for vmmdev_mtx with our
	 * devinfo locked as iommu_cleanup() tries to recursively lock each
	 * devinfo, including our own, while holding vmmdev_mtx.
	 */
	if (mutex_tryenter(&vmmdev_mtx) == 0)
		return (DDI_FAILURE);

	mutex_enter(&vmm_mtx);
	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
		mutex_exit(&vmm_mtx);
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	mutex_exit(&vmm_mtx);

	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;

	/* Remove the control node. */
	ddi_remove_minor_node(dip, "ctl");
	vmmdev_dip = NULL;

	VERIFY0(vmm_mod_unload());
	VERIFY3U(vmmdev_hma_reg, ==, NULL);
	vmm_arena_fini();
	vmm_sol_glue_cleanup();

	mutex_exit(&vmmdev_mtx);

	return (DDI_SUCCESS);
}

static struct cb_ops vmm_cb_ops = {
	vmm_open,
	vmm_close,
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	vmm_ioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	vmm_segmap,
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,
	D_NEW | D_MP | D_DEVMAP
};

static struct dev_ops vmm_ops = {
	DEVO_REV,
	0,
	vmm_info,
	nulldev,	/* identify */
	nulldev,	/* probe */
	vmm_attach,
	vmm_detach,
	nodev,		/* reset */
	&vmm_cb_ops,
	(struct bus_ops *)NULL
};

static struct modldrv modldrv = {
	&mod_driverops,
	"bhyve vmm",
	&vmm_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	int error;

	sysinit();

	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
	list_create(&vmm_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);

	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
	if (error) {
		return (error);
	}

	vmm_zsd_init();

	error = mod_install(&modlinkage);
	if (error) {
		ddi_soft_state_fini(&vmm_statep);
		vmm_zsd_fini();
	}

	return (error);
}

int
_fini(void)
{
	int error;

	error = mod_remove(&modlinkage);
	if (error) {
		return (error);
	}

	vmm_zsd_fini();

	ddi_soft_state_fini(&vmm_statep);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}