xref: /illumos-gate/usr/src/uts/intel/io/vmm/intel/vmx.c (revision 3c5f2a9d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  * Copyright (c) 2018 Joyent, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 /*
32  * This file and its contents are supplied under the terms of the
33  * Common Development and Distribution License ("CDDL"), version 1.0.
34  * You may only use this file in accordance with the terms of version
35  * 1.0 of the CDDL.
36  *
37  * A full copy of the text of the CDDL should have accompanied this
38  * source.  A copy of the CDDL is also available via the Internet at
39  * http://www.illumos.org/license/CDDL.
40  *
41  * Copyright 2015 Pluribus Networks Inc.
42  * Copyright 2018 Joyent, Inc.
43  * Copyright 2020 Oxide Computer Company
44  */
45 
46 #include <sys/cdefs.h>
47 __FBSDID("$FreeBSD$");
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/smp.h>
52 #include <sys/kernel.h>
53 #include <sys/malloc.h>
54 #include <sys/pcpu.h>
55 #include <sys/proc.h>
56 #include <sys/sysctl.h>
57 
58 #ifndef __FreeBSD__
59 #include <sys/x86_archext.h>
60 #include <sys/smp_impldefs.h>
61 #include <sys/smt.h>
62 #include <sys/hma.h>
63 #include <sys/trap.h>
64 #endif
65 
66 #include <vm/vm.h>
67 #include <vm/pmap.h>
68 
69 #include <machine/psl.h>
70 #include <machine/cpufunc.h>
71 #include <machine/md_var.h>
72 #include <machine/reg.h>
73 #include <machine/segments.h>
74 #include <machine/smp.h>
75 #include <machine/specialreg.h>
76 #include <machine/vmparam.h>
77 
78 #include <machine/vmm.h>
79 #include <machine/vmm_dev.h>
80 #include <machine/vmm_instruction_emul.h>
81 #include "vmm_lapic.h"
82 #include "vmm_host.h"
83 #include "vmm_ioport.h"
84 #include "vmm_ktr.h"
85 #include "vmm_stat.h"
86 #include "vatpic.h"
87 #include "vlapic.h"
88 #include "vlapic_priv.h"
89 
90 #include "ept.h"
91 #include "vmx_cpufunc.h"
92 #include "vmcs.h"
93 #include "vmx.h"
94 #include "vmx_msr.h"
95 #include "x86.h"
96 #include "vmx_controls.h"
97 
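/*
 * Each *_CTLS_ONE_SETTING macro names the VMX control bits this
 * implementation requires to be set, and the matching *_ZERO_SETTING macro
 * names bits that must remain clear.  vmx_init() feeds each pair to
 * vmx_set_ctlreg() to validate them against the VMX capability MSRs.
 */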
98 #define	PINBASED_CTLS_ONE_SETTING					\
99 	(PINBASED_EXTINT_EXITING	|				\
100 	 PINBASED_NMI_EXITING		|				\
101 	 PINBASED_VIRTUAL_NMI)
102 #define	PINBASED_CTLS_ZERO_SETTING	0
103 
104 #define PROCBASED_CTLS_WINDOW_SETTING					\
105 	(PROCBASED_INT_WINDOW_EXITING	|				\
106 	 PROCBASED_NMI_WINDOW_EXITING)
107 
108 #ifdef __FreeBSD__
109 #define	PROCBASED_CTLS_ONE_SETTING					\
110 	(PROCBASED_SECONDARY_CONTROLS	|				\
111 	 PROCBASED_MWAIT_EXITING	|				\
112 	 PROCBASED_MONITOR_EXITING	|				\
113 	 PROCBASED_IO_EXITING		|				\
114 	 PROCBASED_MSR_BITMAPS		|				\
115 	 PROCBASED_CTLS_WINDOW_SETTING	|				\
116 	 PROCBASED_CR8_LOAD_EXITING	|				\
117 	 PROCBASED_CR8_STORE_EXITING)
118 #else
119 /* We consider TSC offset a necessity for unsynchronized TSC handling */
120 #define	PROCBASED_CTLS_ONE_SETTING 					\
121 	(PROCBASED_SECONDARY_CONTROLS	|				\
122 	 PROCBASED_TSC_OFFSET		|				\
123 	 PROCBASED_MWAIT_EXITING	|				\
124 	 PROCBASED_MONITOR_EXITING	|				\
125 	 PROCBASED_IO_EXITING		|				\
126 	 PROCBASED_MSR_BITMAPS		|				\
127 	 PROCBASED_CTLS_WINDOW_SETTING	|				\
128 	 PROCBASED_CR8_LOAD_EXITING	|				\
129 	 PROCBASED_CR8_STORE_EXITING)
130 #endif /* __FreeBSD__ */
131 
132 #define	PROCBASED_CTLS_ZERO_SETTING	\
133 	(PROCBASED_CR3_LOAD_EXITING |	\
134 	PROCBASED_CR3_STORE_EXITING |	\
135 	PROCBASED_IO_BITMAPS)
136 
137 #define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
138 #define	PROCBASED_CTLS2_ZERO_SETTING	0
139 
140 #define	VM_EXIT_CTLS_ONE_SETTING					\
141 	(VM_EXIT_SAVE_DEBUG_CONTROLS		|			\
142 	VM_EXIT_HOST_LMA			|			\
143 	VM_EXIT_LOAD_PAT			|			\
144 	VM_EXIT_SAVE_EFER			|			\
145 	VM_EXIT_LOAD_EFER			|			\
146 	VM_EXIT_ACKNOWLEDGE_INTERRUPT)
147 
148 #define	VM_EXIT_CTLS_ZERO_SETTING	0
149 
150 #define	VM_ENTRY_CTLS_ONE_SETTING					\
151 	(VM_ENTRY_LOAD_DEBUG_CONTROLS		|			\
152 	VM_ENTRY_LOAD_EFER)
153 
154 #define	VM_ENTRY_CTLS_ZERO_SETTING					\
155 	(VM_ENTRY_INTO_SMM			|			\
156 	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
157 
158 #define	HANDLED		1
159 #define	UNHANDLED	0
160 
161 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
162 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
163 
164 SYSCTL_DECL(_hw_vmm);
165 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
166 
167 #ifdef __FreeBSD__
168 int vmxon_enabled[MAXCPU];
169 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
170 #endif /*__FreeBSD__ */
171 
172 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
173 static uint32_t exit_ctls, entry_ctls;
174 
175 static uint64_t cr0_ones_mask, cr0_zeros_mask;
176 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
177 	     &cr0_ones_mask, 0, NULL);
178 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
179 	     &cr0_zeros_mask, 0, NULL);
180 
181 static uint64_t cr4_ones_mask, cr4_zeros_mask;
182 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
183 	     &cr4_ones_mask, 0, NULL);
184 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
185 	     &cr4_zeros_mask, 0, NULL);
186 
187 static int vmx_initialized;
188 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
189 	   &vmx_initialized, 0, "Intel VMX initialized");
190 
191 /*
192  * Optional capabilities
193  */
194 SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
195 
196 static int cap_halt_exit;
197 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
198     "HLT triggers a VM-exit");
199 
200 static int cap_pause_exit;
201 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
202     0, "PAUSE triggers a VM-exit");
203 
204 static int cap_unrestricted_guest;
205 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
206     &cap_unrestricted_guest, 0, "Unrestricted guests");
207 
208 static int cap_monitor_trap;
209 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
210     &cap_monitor_trap, 0, "Monitor trap flag");
211 
212 static int cap_invpcid;
213 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
214     0, "Guests are allowed to use INVPCID");
215 
216 static int virtual_interrupt_delivery;
217 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
218     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
219 
220 static int posted_interrupts;
221 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
222     &posted_interrupts, 0, "APICv posted interrupt support");
223 
224 static int pirvec = -1;
225 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
226     &pirvec, 0, "APICv posted interrupt vector");
227 
228 #ifdef __FreeBSD__
229 static struct unrhdr *vpid_unr;
230 #endif /* __FreeBSD__ */
231 static u_int vpid_alloc_failed;
232 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
233 	    &vpid_alloc_failed, 0, NULL);
234 
235 static int guest_l1d_flush;
236 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
237     &guest_l1d_flush, 0, NULL);
238 static int guest_l1d_flush_sw;
239 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD,
240     &guest_l1d_flush_sw, 0, NULL);
241 
242 static struct msr_entry msr_load_list[1] __aligned(16);
243 
244 /*
245  * The definitions of SDT probes for VMX.
246  */
247 
248 SDT_PROBE_DEFINE3(vmm, vmx, exit, entry,
249     "struct vmx *", "int", "struct vm_exit *");
250 
251 SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch,
252     "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *");
253 
254 SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess,
255     "struct vmx *", "int", "struct vm_exit *", "uint64_t");
256 
257 SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr,
258     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
259 
260 SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr,
261     "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t");
262 
263 SDT_PROBE_DEFINE3(vmm, vmx, exit, halt,
264     "struct vmx *", "int", "struct vm_exit *");
265 
266 SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap,
267     "struct vmx *", "int", "struct vm_exit *");
268 
269 SDT_PROBE_DEFINE3(vmm, vmx, exit, pause,
270     "struct vmx *", "int", "struct vm_exit *");
271 
272 SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow,
273     "struct vmx *", "int", "struct vm_exit *");
274 
275 SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt,
276     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
277 
278 SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow,
279     "struct vmx *", "int", "struct vm_exit *");
280 
281 SDT_PROBE_DEFINE3(vmm, vmx, exit, inout,
282     "struct vmx *", "int", "struct vm_exit *");
283 
284 SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid,
285     "struct vmx *", "int", "struct vm_exit *");
286 
287 SDT_PROBE_DEFINE5(vmm, vmx, exit, exception,
288     "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int");
289 
290 SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault,
291     "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t");
292 
293 SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault,
294     "struct vmx *", "int", "struct vm_exit *", "uint64_t");
295 
296 SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi,
297     "struct vmx *", "int", "struct vm_exit *");
298 
299 SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess,
300     "struct vmx *", "int", "struct vm_exit *");
301 
302 SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite,
303     "struct vmx *", "int", "struct vm_exit *", "struct vlapic *");
304 
305 SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv,
306     "struct vmx *", "int", "struct vm_exit *");
307 
308 SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor,
309     "struct vmx *", "int", "struct vm_exit *");
310 
311 SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait,
312     "struct vmx *", "int", "struct vm_exit *");
313 
314 SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn,
315     "struct vmx *", "int", "struct vm_exit *");
316 
317 SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown,
318     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
319 
320 SDT_PROBE_DEFINE4(vmm, vmx, exit, return,
321     "struct vmx *", "int", "struct vm_exit *", "int");
322 
323 /*
324  * Use the last page below 4GB as the APIC access address. This address is
325  * occupied by the boot firmware so it is guaranteed that it will not conflict
326  * with a page in system memory.
327  */
328 #define	APIC_ACCESS_ADDRESS	0xFFFFF000
329 
330 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
331 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
332 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
333 static void vmx_inject_pir(struct vlapic *vlapic);
334 #ifndef __FreeBSD__
335 static int vmx_apply_tsc_adjust(struct vmx *, int);
336 #endif /* __FreeBSD__ */
337 
338 #ifdef KTR
339 static const char *
340 exit_reason_to_str(int reason)
341 {
342 	static char reasonbuf[32];
343 
344 	switch (reason) {
345 	case EXIT_REASON_EXCEPTION:
346 		return "exception";
347 	case EXIT_REASON_EXT_INTR:
348 		return "extint";
349 	case EXIT_REASON_TRIPLE_FAULT:
350 		return "triplefault";
351 	case EXIT_REASON_INIT:
352 		return "init";
353 	case EXIT_REASON_SIPI:
354 		return "sipi";
355 	case EXIT_REASON_IO_SMI:
356 		return "iosmi";
357 	case EXIT_REASON_SMI:
358 		return "smi";
359 	case EXIT_REASON_INTR_WINDOW:
360 		return "intrwindow";
361 	case EXIT_REASON_NMI_WINDOW:
362 		return "nmiwindow";
363 	case EXIT_REASON_TASK_SWITCH:
364 		return "taskswitch";
365 	case EXIT_REASON_CPUID:
366 		return "cpuid";
367 	case EXIT_REASON_GETSEC:
368 		return "getsec";
369 	case EXIT_REASON_HLT:
370 		return "hlt";
371 	case EXIT_REASON_INVD:
372 		return "invd";
373 	case EXIT_REASON_INVLPG:
374 		return "invlpg";
375 	case EXIT_REASON_RDPMC:
376 		return "rdpmc";
377 	case EXIT_REASON_RDTSC:
378 		return "rdtsc";
379 	case EXIT_REASON_RSM:
380 		return "rsm";
381 	case EXIT_REASON_VMCALL:
382 		return "vmcall";
383 	case EXIT_REASON_VMCLEAR:
384 		return "vmclear";
385 	case EXIT_REASON_VMLAUNCH:
386 		return "vmlaunch";
387 	case EXIT_REASON_VMPTRLD:
388 		return "vmptrld";
389 	case EXIT_REASON_VMPTRST:
390 		return "vmptrst";
391 	case EXIT_REASON_VMREAD:
392 		return "vmread";
393 	case EXIT_REASON_VMRESUME:
394 		return "vmresume";
395 	case EXIT_REASON_VMWRITE:
396 		return "vmwrite";
397 	case EXIT_REASON_VMXOFF:
398 		return "vmxoff";
399 	case EXIT_REASON_VMXON:
400 		return "vmxon";
401 	case EXIT_REASON_CR_ACCESS:
402 		return "craccess";
403 	case EXIT_REASON_DR_ACCESS:
404 		return "draccess";
405 	case EXIT_REASON_INOUT:
406 		return "inout";
407 	case EXIT_REASON_RDMSR:
408 		return "rdmsr";
409 	case EXIT_REASON_WRMSR:
410 		return "wrmsr";
411 	case EXIT_REASON_INVAL_VMCS:
412 		return "invalvmcs";
413 	case EXIT_REASON_INVAL_MSR:
414 		return "invalmsr";
415 	case EXIT_REASON_MWAIT:
416 		return "mwait";
417 	case EXIT_REASON_MTF:
418 		return "mtf";
419 	case EXIT_REASON_MONITOR:
420 		return "monitor";
421 	case EXIT_REASON_PAUSE:
422 		return "pause";
423 	case EXIT_REASON_MCE_DURING_ENTRY:
424 		return "mce-during-entry";
425 	case EXIT_REASON_TPR:
426 		return "tpr";
427 	case EXIT_REASON_APIC_ACCESS:
428 		return "apic-access";
429 	case EXIT_REASON_GDTR_IDTR:
430 		return "gdtridtr";
431 	case EXIT_REASON_LDTR_TR:
432 		return "ldtrtr";
433 	case EXIT_REASON_EPT_FAULT:
434 		return "eptfault";
435 	case EXIT_REASON_EPT_MISCONFIG:
436 		return "eptmisconfig";
437 	case EXIT_REASON_INVEPT:
438 		return "invept";
439 	case EXIT_REASON_RDTSCP:
440 		return "rdtscp";
441 	case EXIT_REASON_VMX_PREEMPT:
442 		return "vmxpreempt";
443 	case EXIT_REASON_INVVPID:
444 		return "invvpid";
445 	case EXIT_REASON_WBINVD:
446 		return "wbinvd";
447 	case EXIT_REASON_XSETBV:
448 		return "xsetbv";
449 	case EXIT_REASON_APIC_WRITE:
450 		return "apic-write";
451 	default:
452 		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
453 		return (reasonbuf);
454 	}
455 }
456 #endif	/* KTR */
457 
458 static int
459 vmx_allow_x2apic_msrs(struct vmx *vmx)
460 {
461 	int i, error;
462 
463 	error = 0;
464 
465 	/*
466 	 * Allow read-only access to the following x2APIC MSRs from the guest.
467 	 */
468 	error += guest_msr_ro(vmx, MSR_APIC_ID);
469 	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
470 	error += guest_msr_ro(vmx, MSR_APIC_LDR);
471 	error += guest_msr_ro(vmx, MSR_APIC_SVR);
472 
473 	for (i = 0; i < 8; i++)
474 		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
475 
476 	for (i = 0; i < 8; i++)
477 		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
478 
479 	for (i = 0; i < 8; i++)
480 		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
481 
482 	error += guest_msr_ro(vmx, MSR_APIC_ESR);
483 	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
484 	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
485 	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
486 	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
487 	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
488 	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
489 	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
490 	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
491 	error += guest_msr_ro(vmx, MSR_APIC_ICR);
492 
493 	/*
494 	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
495 	 *
496 	 * These registers get special treatment described in the section
497 	 * "Virtualizing MSR-Based APIC Accesses".
498 	 */
499 	error += guest_msr_rw(vmx, MSR_APIC_TPR);
500 	error += guest_msr_rw(vmx, MSR_APIC_EOI);
501 	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
502 
503 	return (error);
504 }
505 
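/*
 * Apply the CR0 fixed-bit constraints required for VMX operation: bits in
 * cr0_ones_mask must be set and bits in cr0_zeros_mask must be clear.  The
 * masks are derived from MSR_VMX_CR0_FIXED0/FIXED1 (and likewise for CR4)
 * in vmx_init() below.  For example, since CR0_NW and CR0_CD are added to
 * cr0_zeros_mask there, vmx_fix_cr0() strips them from any value passed in.
 */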
506 u_long
507 vmx_fix_cr0(u_long cr0)
508 {
509 
510 	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
511 }
512 
513 u_long
514 vmx_fix_cr4(u_long cr4)
515 {
516 
517 	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
518 }
519 
520 static void
521 vpid_free(int vpid)
522 {
523 	if (vpid < 0 || vpid > 0xffff)
524 		panic("vpid_free: invalid vpid %d", vpid);
525 
526 	/*
527 	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
528 	 * the unit number allocator.
529 	 */
530 
531 	if (vpid > VM_MAXCPU)
532 #ifdef __FreeBSD__
533 		free_unr(vpid_unr, vpid);
534 #else
535 		hma_vmx_vpid_free((uint16_t)vpid);
536 #endif
537 }
538 
539 static void
540 vpid_alloc(uint16_t *vpid, int num)
541 {
542 	int i, x;
543 
544 	if (num <= 0 || num > VM_MAXCPU)
545 		panic("invalid number of vpids requested: %d", num);
546 
547 	/*
548 	 * If the "enable vpid" execution control is not enabled then the
549 	 * VPID is required to be 0 for all vcpus.
550 	 */
551 	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
552 		for (i = 0; i < num; i++)
553 			vpid[i] = 0;
554 		return;
555 	}
556 
557 	/*
558 	 * Allocate a unique VPID for each vcpu from the unit number allocator.
559 	 */
560 	for (i = 0; i < num; i++) {
561 #ifdef __FreeBSD__
562 		x = alloc_unr(vpid_unr);
563 #else
564 		uint16_t tmp;
565 
566 		tmp = hma_vmx_vpid_alloc();
567 		x = (tmp == 0) ? -1 : tmp;
568 #endif
569 		if (x == -1)
570 			break;
571 		else
572 			vpid[i] = x;
573 	}
574 
575 	if (i < num) {
576 		atomic_add_int(&vpid_alloc_failed, 1);
577 
578 		/*
579 		 * If the unit number allocator does not have enough unique
580 		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
581 		 *
582 		 * These VPIDs are not unique across VMs, but this does not
583 		 * affect correctness because the combined mappings are also
584 		 * tagged with the EP4TA which is unique for each VM.
585 		 *
586 		 * It is still sub-optimal because the invvpid will invalidate
587 		 * combined mappings for a particular VPID across all EP4TAs.
588 		 */
589 		while (i-- > 0)
590 			vpid_free(vpid[i]);
591 
592 		for (i = 0; i < num; i++)
593 			vpid[i] = i + 1;
594 	}
595 }
596 
597 #ifdef __FreeBSD__
598 static void
599 vpid_init(void)
600 {
601 	/*
602 	 * VPID 0 is required when the "enable VPID" execution control is
603 	 * disabled.
604 	 *
605 	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
606 	 * unit number allocator does not have sufficient unique VPIDs to
607 	 * satisfy the allocation.
608 	 *
609 	 * The remaining VPIDs are managed by the unit number allocator.
610 	 */
611 	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
612 }
613 
614 static void
615 vmx_disable(void *arg __unused)
616 {
617 	struct invvpid_desc invvpid_desc = { 0 };
618 	struct invept_desc invept_desc = { 0 };
619 
620 	if (vmxon_enabled[curcpu]) {
621 		/*
622 		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
623 		 *
624 		 * VMXON and VMXOFF do not invalidate any TLB caching
625 		 * structures, so invalidate them explicitly here to avoid
626 		 * retaining cached information across distinct VMX episodes.
627 		 */
628 		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
629 		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
630 		vmxoff();
631 	}
632 	load_cr4(rcr4() & ~CR4_VMXE);
633 }
634 
635 static int
636 vmx_cleanup(void)
637 {
638 
639 	if (pirvec >= 0)
640 		lapic_ipi_free(pirvec);
641 
642 	if (vpid_unr != NULL) {
643 		delete_unrhdr(vpid_unr);
644 		vpid_unr = NULL;
645 	}
646 
647 	if (nmi_flush_l1d_sw == 1)
648 		nmi_flush_l1d_sw = 0;
649 
650 	smp_rendezvous(NULL, vmx_disable, NULL, NULL);
651 
652 	return (0);
653 }
654 
655 static void
656 vmx_enable(void *arg __unused)
657 {
658 	int error;
659 	uint64_t feature_control;
660 
661 	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
662 	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
663 	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
664 		wrmsr(MSR_IA32_FEATURE_CONTROL,
665 		    feature_control | IA32_FEATURE_CONTROL_VMX_EN |
666 		    IA32_FEATURE_CONTROL_LOCK);
667 	}
668 
669 	load_cr4(rcr4() | CR4_VMXE);
670 
671 	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
672 	error = vmxon(vmxon_region[curcpu]);
673 	if (error == 0)
674 		vmxon_enabled[curcpu] = 1;
675 }
676 
677 static void
678 vmx_restore(void)
679 {
680 
681 	if (vmxon_enabled[curcpu])
682 		vmxon(vmxon_region[curcpu]);
683 }
684 #else /* __FreeBSD__ */
685 static int
686 vmx_cleanup(void)
687 {
688 	/* This is taken care of by the hma registration */
689 	return (0);
690 }
691 
692 static void
693 vmx_restore(void)
694 {
695 	/* No-op on illumos */
696 }
697 #endif /* __FreeBSD__ */
698 
699 static int
700 vmx_init(int ipinum)
701 {
702 	int error, use_tpr_shadow;
703 #ifdef __FreeBSD__
704 	uint64_t basic, fixed0, fixed1, feature_control;
705 #else
706 	uint64_t fixed0, fixed1;
707 #endif
708 	uint32_t tmp, procbased2_vid_bits;
709 
710 #ifdef __FreeBSD__
711 	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
712 	if (!(cpu_feature2 & CPUID2_VMX)) {
713 		printf("vmx_init: processor does not support VMX operation\n");
714 		return (ENXIO);
715 	}
716 
717 	/*
718 	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
719 	 * are set (bits 0 and 2 respectively).
720 	 */
721 	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
722 	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
723 	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
724 		printf("vmx_init: VMX operation disabled by BIOS\n");
725 		return (ENXIO);
726 	}
727 
728 	/*
729 	 * Verify capabilities MSR_VMX_BASIC:
730 	 * - bit 54 indicates support for INS/OUTS decoding
731 	 */
732 	basic = rdmsr(MSR_VMX_BASIC);
733 	if ((basic & (1UL << 54)) == 0) {
734 		printf("vmx_init: processor does not support desired basic "
735 		    "capabilities\n");
736 		return (EINVAL);
737 	}
738 #endif /* __FreeBSD__ */
739 
740 	/* Check support for primary processor-based VM-execution controls */
741 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
742 			       MSR_VMX_TRUE_PROCBASED_CTLS,
743 			       PROCBASED_CTLS_ONE_SETTING,
744 			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
745 	if (error) {
746 		printf("vmx_init: processor does not support desired primary "
747 		       "processor-based controls\n");
748 		return (error);
749 	}
750 
751 	/* Clear the processor-based ctl bits that are set on demand */
752 	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
753 
754 	/* Check support for secondary processor-based VM-execution controls */
755 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
756 			       MSR_VMX_PROCBASED_CTLS2,
757 			       PROCBASED_CTLS2_ONE_SETTING,
758 			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
759 	if (error) {
760 		printf("vmx_init: processor does not support desired secondary "
761 		       "processor-based controls\n");
762 		return (error);
763 	}
764 
765 	/* Check support for VPID */
766 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
767 			       PROCBASED2_ENABLE_VPID, 0, &tmp);
768 	if (error == 0)
769 		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
770 
771 	/* Check support for pin-based VM-execution controls */
772 	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
773 			       MSR_VMX_TRUE_PINBASED_CTLS,
774 			       PINBASED_CTLS_ONE_SETTING,
775 			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
776 	if (error) {
777 		printf("vmx_init: processor does not support desired "
778 		       "pin-based controls\n");
779 		return (error);
780 	}
781 
782 	/* Check support for VM-exit controls */
783 	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
784 			       VM_EXIT_CTLS_ONE_SETTING,
785 			       VM_EXIT_CTLS_ZERO_SETTING,
786 			       &exit_ctls);
787 	if (error) {
788 		printf("vmx_init: processor does not support desired "
789 		    "exit controls\n");
790 		return (error);
791 	}
792 
793 	/* Check support for VM-entry controls */
794 	error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
795 	    VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
796 	    &entry_ctls);
797 	if (error) {
798 		printf("vmx_init: processor does not support desired "
799 		    "entry controls\n");
800 		return (error);
801 	}
802 
803 	/*
804 	 * Check support for optional features by testing them
805 	 * as individual bits
806 	 */
807 	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
808 					MSR_VMX_TRUE_PROCBASED_CTLS,
809 					PROCBASED_HLT_EXITING, 0,
810 					&tmp) == 0);
811 
812 	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
813 					MSR_VMX_PROCBASED_CTLS,
814 					PROCBASED_MTF, 0,
815 					&tmp) == 0);
816 
817 	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
818 					 MSR_VMX_TRUE_PROCBASED_CTLS,
819 					 PROCBASED_PAUSE_EXITING, 0,
820 					 &tmp) == 0);
821 
822 	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
823 					MSR_VMX_PROCBASED_CTLS2,
824 					PROCBASED2_UNRESTRICTED_GUEST, 0,
825 				        &tmp) == 0);
826 
827 	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
828 	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
829 	    &tmp) == 0);
830 
831 	/*
832 	 * Check support for virtual interrupt delivery.
833 	 */
834 	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
835 	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
836 	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
837 	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
838 
839 	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
840 	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
841 	    &tmp) == 0);
842 
843 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
844 	    procbased2_vid_bits, 0, &tmp);
845 	if (error == 0 && use_tpr_shadow) {
846 		virtual_interrupt_delivery = 1;
847 		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
848 		    &virtual_interrupt_delivery);
849 	}
850 
851 	if (virtual_interrupt_delivery) {
852 		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
853 		procbased_ctls2 |= procbased2_vid_bits;
854 		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
855 
856 		/*
857 		 * No need to emulate accesses to %CR8 if virtual
858 		 * interrupt delivery is enabled.
859 		 */
860 		procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
861 		procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
862 
863 		/*
864 		 * Check for Posted Interrupts only if Virtual Interrupt
865 		 * Delivery is enabled.
866 		 */
867 		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
868 		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
869 		    &tmp);
870 		if (error == 0) {
871 #ifdef __FreeBSD__
872 			pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
873 			    &IDTVEC(justreturn));
874 			if (pirvec < 0) {
875 				if (bootverbose) {
876 					printf("vmx_init: unable to allocate "
877 					    "posted interrupt vector\n");
878 				}
879 			} else {
880 				posted_interrupts = 1;
881 				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
882 				    &posted_interrupts);
883 			}
884 #else
885 			/*
886 			 * If the PSM-provided interfaces for requesting and
887 			 * using a PIR IPI vector are present, use them for
888 			 * posted interrupts.
889 			 */
890 			if (psm_get_pir_ipivect != NULL &&
891 			    psm_send_pir_ipi != NULL) {
892 				pirvec = psm_get_pir_ipivect();
893 				posted_interrupts = 1;
894 			}
895 #endif
896 		}
897 	}
898 
899 	if (posted_interrupts)
900 		pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
901 
902 	/* Initialize EPT */
903 	error = ept_init(ipinum);
904 	if (error) {
905 		printf("vmx_init: ept initialization failed (%d)\n", error);
906 		return (error);
907 	}
908 
909 #ifdef __FreeBSD__
910 	guest_l1d_flush = (cpu_ia32_arch_caps &
911 	    IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0;
912 	TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
913 
914 	/*
915 	 * L1D cache flush is enabled.  Use IA32_FLUSH_CMD MSR when
916 	 * available.  Otherwise fall back to the software flush
917 	 * method which loads enough data from the kernel text to
918 	 * flush existing L1D content, both on VMX entry and on NMI
919 	 * return.
920 	 */
921 	if (guest_l1d_flush) {
922 		if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) {
923 			guest_l1d_flush_sw = 1;
924 			TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw",
925 			    &guest_l1d_flush_sw);
926 		}
927 		if (guest_l1d_flush_sw) {
928 			if (nmi_flush_l1d_sw <= 1)
929 				nmi_flush_l1d_sw = 1;
930 		} else {
931 			msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
932 			msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
933 		}
934 	}
935 #else
936 	/* L1D flushing is taken care of by smt_acquire() and friends */
937 	guest_l1d_flush = 0;
938 #endif /* __FreeBSD__ */
939 
940 	/*
941 	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
942 	 */
943 	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
944 	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
945 	cr0_ones_mask = fixed0 & fixed1;
946 	cr0_zeros_mask = ~fixed0 & ~fixed1;
947 
948 	/*
949 	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
950 	 * if unrestricted guest execution is allowed.
951 	 */
952 	if (cap_unrestricted_guest)
953 		cr0_ones_mask &= ~(CR0_PG | CR0_PE);
954 
955 	/*
956 	 * Do not allow the guest to set CR0_NW or CR0_CD.
957 	 */
958 	cr0_zeros_mask |= (CR0_NW | CR0_CD);
959 
960 	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
961 	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
962 	cr4_ones_mask = fixed0 & fixed1;
963 	cr4_zeros_mask = ~fixed0 & ~fixed1;
964 
965 #ifdef __FreeBSD__
966 	vpid_init();
967 #endif
968 
969 	vmx_msr_init();
970 
971 #ifdef __FreeBSD__
972 	/* enable VMX operation */
973 	smp_rendezvous(NULL, vmx_enable, NULL, NULL);
974 #endif
975 
976 	vmx_initialized = 1;
977 
978 	return (0);
979 }
980 
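/*
 * Dispatch a host interrupt vector that became pending while the guest was
 * running.  The FreeBSD path validates the IDT gate for the vector and calls
 * its handler directly; the illumos path hands the vector (less the 32
 * reserved exception vectors) to vmx_call_isr().
 */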
981 static void
982 vmx_trigger_hostintr(int vector)
983 {
984 #ifdef __FreeBSD__
985 	uintptr_t func;
986 	struct gate_descriptor *gd;
987 
988 	gd = &idt[vector];
989 
990 	KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
991 	    "invalid vector %d", vector));
992 	KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
993 	    vector));
994 	KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
995 	    "has invalid type %d", vector, gd->gd_type));
996 	KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
997 	    "has invalid dpl %d", vector, gd->gd_dpl));
998 	KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
999 	    "for vector %d has invalid selector %d", vector, gd->gd_selector));
1000 	KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
1001 	    "IST %d", vector, gd->gd_ist));
1002 
1003 	func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
1004 	vmx_call_isr(func);
1005 #else
1006 	VERIFY(vector >= 32 && vector <= 255);
1007 	vmx_call_isr(vector - 32);
1008 #endif /* __FreeBSD__ */
1009 }
1010 
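/*
 * Program the CR0/CR4 guest/host mask and read shadow for this vmcs.  Guest
 * reads of the masked bits return the shadow value written here, while guest
 * writes to masked bits cause a CR-access VM exit rather than taking effect.
 */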
1011 static int
1012 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
1013 {
1014 	int error, mask_ident, shadow_ident;
1015 	uint64_t mask_value;
1016 
1017 	if (which != 0 && which != 4)
1018 		panic("vmx_setup_cr_shadow: unknown cr%d", which);
1019 
1020 	if (which == 0) {
1021 		mask_ident = VMCS_CR0_MASK;
1022 		mask_value = cr0_ones_mask | cr0_zeros_mask;
1023 		shadow_ident = VMCS_CR0_SHADOW;
1024 	} else {
1025 		mask_ident = VMCS_CR4_MASK;
1026 		mask_value = cr4_ones_mask | cr4_zeros_mask;
1027 		shadow_ident = VMCS_CR4_SHADOW;
1028 	}
1029 
1030 	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
1031 	if (error)
1032 		return (error);
1033 
1034 	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
1035 	if (error)
1036 		return (error);
1037 
1038 	return (0);
1039 }
1040 #define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
1041 #define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))
1042 
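/*
 * Per-VM initialization: allocate the page-aligned 'struct vmx', derive the
 * EPTP from the pmap, configure MSR bitmap permissions and VPIDs, and set up
 * a VMCS for each possible vcpu with the controls, bitmaps, APIC pages, and
 * CR shadows computed during vmx_init().
 */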
1043 static void *
1044 vmx_vminit(struct vm *vm, pmap_t pmap)
1045 {
1046 	uint16_t vpid[VM_MAXCPU];
1047 	int i, error;
1048 	struct vmx *vmx;
1049 	struct vmcs *vmcs;
1050 	uint32_t exc_bitmap;
1051 	uint16_t maxcpus;
1052 
1053 	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
1054 	if ((uintptr_t)vmx & PAGE_MASK) {
1055 		panic("malloc of struct vmx not aligned on %d byte boundary",
1056 		      PAGE_SIZE);
1057 	}
1058 	vmx->vm = vm;
1059 
1060 	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
1061 
1062 	/*
1063 	 * Clean up EPTP-tagged guest physical and combined mappings
1064 	 *
1065 	 * VMX transitions are not required to invalidate any guest physical
1066 	 * mappings. So, it may be possible for stale guest physical mappings
1067 	 * to be present in the processor TLBs.
1068 	 *
1069 	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
1070 	 */
1071 	ept_invalidate_mappings(vmx->eptp);
1072 
1073 	msr_bitmap_initialize(vmx->msr_bitmap);
1074 
1075 	/*
1076 	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
1077 	 * The guest FSBASE and GSBASE are saved and restored during
1078 	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
1079 	 * always restored from the vmcs host state area on vm-exit.
1080 	 *
1081 	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
1082 	 * how they are saved/restored so can be directly accessed by the
1083 	 * guest.
1084 	 *
1085 	 * MSR_EFER is saved and restored in the guest VMCS area on a
1086 	 * VM exit and entry respectively. It is also restored from the
1087 	 * host VMCS area on a VM exit.
1088 	 *
1089 	 * The TSC MSR is exposed read-only. Writes are disallowed as
1090 	 * they would impact the host TSC.  If the guest does a write,
1091 	 * the "use TSC offsetting" execution control is enabled and the
1092 	 * difference between the host TSC and the guest TSC is written
1093 	 * into the TSC offset in the VMCS.
1094 	 */
1095 	if (guest_msr_rw(vmx, MSR_GSBASE) ||
1096 	    guest_msr_rw(vmx, MSR_FSBASE) ||
1097 	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
1098 	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
1099 	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
1100 	    guest_msr_rw(vmx, MSR_EFER) ||
1101 	    guest_msr_ro(vmx, MSR_TSC))
1102 		panic("vmx_vminit: error setting guest msr access");
1103 
1104 	vpid_alloc(vpid, VM_MAXCPU);
1105 
1106 	if (virtual_interrupt_delivery) {
1107 		error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
1108 		    APIC_ACCESS_ADDRESS);
1109 		/* XXX this should really return an error to the caller */
1110 		KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
1111 	}
1112 
1113 	maxcpus = vm_get_maxcpus(vm);
1114 	for (i = 0; i < maxcpus; i++) {
1115 #ifndef __FreeBSD__
1116 		/*
1117 		 * Cache physical address lookups for various components which
1118 		 * may be required inside the critical_enter() section implied
1119 		 * by VMPTRLD() below.
1120 		 */
1121 		vm_paddr_t msr_bitmap_pa = vtophys(vmx->msr_bitmap);
1122 		vm_paddr_t apic_page_pa = vtophys(&vmx->apic_page[i]);
1123 		vm_paddr_t pir_desc_pa = vtophys(&vmx->pir_desc[i]);
1124 #endif /* __FreeBSD__ */
1125 
1126 		vmcs = &vmx->vmcs[i];
1127 		vmcs->identifier = vmx_revision();
1128 #ifndef __FreeBSD__
1129 		vmcs->vmcs_pa = (uint64_t)vtophys(vmcs);
1130 #endif
1131 		error = vmclear(vmcs);
1132 		if (error != 0) {
1133 			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
1134 			      error, i);
1135 		}
1136 
1137 		vmx_msr_guest_init(vmx, i);
1138 
1139 		error = vmcs_init(vmcs);
1140 		KASSERT(error == 0, ("vmcs_init error %d", error));
1141 
1142 		VMPTRLD(vmcs);
1143 		error = 0;
1144 #ifdef __FreeBSD__
1145 		/*
1146 		 * The illumos vmx_enter_guest implementation avoids some of
1147 		 * the %rsp-manipulation games which are present in the stock
1148 		 * one from FreeBSD.
1149 		 */
1150 		error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
1151 #endif
1152 		error += vmwrite(VMCS_EPTP, vmx->eptp);
1153 		error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
1154 		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
1155 		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
1156 		error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
1157 		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
1158 #ifdef __FreeBSD__
1159 		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
1160 #else
1161 		error += vmwrite(VMCS_MSR_BITMAP, msr_bitmap_pa);
1162 #endif
1163 		error += vmwrite(VMCS_VPID, vpid[i]);
1164 
1165 		if (guest_l1d_flush && !guest_l1d_flush_sw) {
1166 			vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract(
1167 			    (vm_offset_t)&msr_load_list[0]));
1168 			vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT,
1169 			    nitems(msr_load_list));
1170 			vmcs_write(VMCS_EXIT_MSR_STORE, 0);
1171 			vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0);
1172 		}
1173 
1174 		/* exception bitmap */
1175 		if (vcpu_trace_exceptions(vm, i))
1176 			exc_bitmap = 0xffffffff;
1177 		else
1178 			exc_bitmap = 1 << IDT_MC;
1179 		error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);
1180 
1181 		vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1;
1182 		error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1);
1183 
1184 		if (virtual_interrupt_delivery) {
1185 			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
1186 #ifdef __FreeBSD__
1187 			error += vmwrite(VMCS_VIRTUAL_APIC,
1188 			    vtophys(&vmx->apic_page[i]));
1189 #else
1190 			error += vmwrite(VMCS_VIRTUAL_APIC, apic_page_pa);
1191 #endif
1192 			error += vmwrite(VMCS_EOI_EXIT0, 0);
1193 			error += vmwrite(VMCS_EOI_EXIT1, 0);
1194 			error += vmwrite(VMCS_EOI_EXIT2, 0);
1195 			error += vmwrite(VMCS_EOI_EXIT3, 0);
1196 		}
1197 		if (posted_interrupts) {
1198 			error += vmwrite(VMCS_PIR_VECTOR, pirvec);
1199 #ifdef __FreeBSD__
1200 			error += vmwrite(VMCS_PIR_DESC,
1201 			    vtophys(&vmx->pir_desc[i]));
1202 #else
1203 			error += vmwrite(VMCS_PIR_DESC, pir_desc_pa);
1204 #endif
1205 		}
1206 		VMCLEAR(vmcs);
1207 		KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
1208 
1209 		vmx->cap[i].set = 0;
1210 		vmx->cap[i].proc_ctls = procbased_ctls;
1211 		vmx->cap[i].proc_ctls2 = procbased_ctls2;
1212 
1213 		vmx->state[i].nextrip = ~0;
1214 		vmx->state[i].lastcpu = NOCPU;
1215 		vmx->state[i].vpid = vpid[i];
1216 
1217 		/*
1218 		 * Set up the CR0/4 shadows, and init the read shadow
1219 		 * to the power-on register value from the Intel Sys Arch.
1220 		 *  CR0 - 0x60000010
1221 		 *  CR4 - 0
1222 		 */
1223 		error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
1224 		if (error != 0)
1225 			panic("vmx_setup_cr0_shadow %d", error);
1226 
1227 		error = vmx_setup_cr4_shadow(vmcs, 0);
1228 		if (error != 0)
1229 			panic("vmx_setup_cr4_shadow %d", error);
1230 
1231 		vmx->ctx[i].pmap = pmap;
1232 	}
1233 
1234 	return (vmx);
1235 }
1236 
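/*
 * Emulate CPUID using the guest %rax/%rbx/%rcx/%rdx values captured in the
 * vmxctx, returning HANDLED or UNHANDLED as determined by
 * x86_emulate_cpuid().
 */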
1237 static int
1238 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
1239 {
1240 #ifdef __FreeBSD__
1241 	int handled, func;
1242 
1243 	func = vmxctx->guest_rax;
1244 #else
1245 	int handled;
1246 #endif
1247 
1248 	handled = x86_emulate_cpuid(vm, vcpu, (uint64_t *)&vmxctx->guest_rax,
1249 	    (uint64_t *)&vmxctx->guest_rbx, (uint64_t *)&vmxctx->guest_rcx,
1250 	    (uint64_t *)&vmxctx->guest_rdx);
1251 	return (handled);
1252 }
1253 
1254 static __inline void
1255 vmx_run_trace(struct vmx *vmx, int vcpu)
1256 {
1257 #ifdef KTR
1258 	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
1259 #endif
1260 }
1261 
1262 static __inline void
1263 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
1264 	       int handled)
1265 {
1266 #ifdef KTR
1267 	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
1268 		 handled ? "handled" : "unhandled",
1269 		 exit_reason_to_str(exit_reason), rip);
1270 #endif
1271 	DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, rip,
1272 	    uint32_t, exit_reason);
1273 }
1274 
1275 static __inline void
1276 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
1277 {
1278 #ifdef KTR
1279 	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
1280 #endif
1281 }
1282 
1283 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
1284 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
1285 
1286 /*
1287  * Invalidate guest mappings identified by its vpid from the TLB.
1288  */
1289 static __inline void
1290 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
1291 {
1292 	struct vmxstate *vmxstate;
1293 	struct invvpid_desc invvpid_desc;
1294 
1295 	vmxstate = &vmx->state[vcpu];
1296 	if (vmxstate->vpid == 0)
1297 		return;
1298 
1299 	if (!running) {
1300 		/*
1301 		 * Set the 'lastcpu' to an invalid host cpu.
1302 		 *
1303 		 * This will invalidate TLB entries tagged with the vcpu's
1304 		 * vpid the next time it runs via vmx_set_pcpu_defaults().
1305 		 */
1306 		vmxstate->lastcpu = NOCPU;
1307 		return;
1308 	}
1309 
1310 #ifdef __FreeBSD__
1311 	KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
1312 	    "critical section", __func__, vcpu));
1313 #endif
1314 
1315 	/*
1316 	 * Invalidate all mappings tagged with 'vpid'
1317 	 *
1318 	 * We do this because this vcpu was executing on a different host
1319 	 * cpu when it last ran. We do not track whether it invalidated
1320 	 * mappings associated with its 'vpid' during that run. So we must
1321 	 * assume that the mappings associated with 'vpid' on 'curcpu' are
1322 	 * stale and invalidate them.
1323 	 *
1324 	 * Note that we incur this penalty only when the scheduler chooses to
1325 	 * move the thread associated with this vcpu between host cpus.
1326 	 *
1327 	 * Note also that this will invalidate mappings tagged with 'vpid'
1328 	 * for "all" EP4TAs.
1329 	 */
1330 	if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
1331 		invvpid_desc._res1 = 0;
1332 		invvpid_desc._res2 = 0;
1333 		invvpid_desc.vpid = vmxstate->vpid;
1334 		invvpid_desc.linear_addr = 0;
1335 		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
1336 		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
1337 	} else {
1338 		/*
1339 		 * The invvpid can be skipped if an invept is going to
1340 		 * be performed before entering the guest. The invept
1341 		 * will invalidate combined mappings tagged with
1342 		 * 'vmx->eptp' for all vpids.
1343 		 */
1344 		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
1345 	}
1346 }
1347 
1348 static void
1349 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
1350 {
1351 	struct vmxstate *vmxstate;
1352 
1353 #ifndef __FreeBSD__
1354 	/*
1355 	 * Regardless of whether the VM appears to have migrated between CPUs,
1356 	 * save the host sysenter stack pointer.  As it points to the kernel
1357 	 * stack of each thread, the correct value must be maintained for every
1358 	 * trip into the critical section.
1359 	 */
1360 	vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR));
1361 
1362 	/*
1363 	 * Perform any needed TSC_OFFSET adjustment based on TSC_MSR writes or
1364 	 * migration between host CPUs with differing TSC values.
1365 	 */
1366 	VERIFY0(vmx_apply_tsc_adjust(vmx, vcpu));
1367 #endif
1368 
1369 	vmxstate = &vmx->state[vcpu];
1370 	if (vmxstate->lastcpu == curcpu)
1371 		return;
1372 
1373 	vmxstate->lastcpu = curcpu;
1374 
1375 	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
1376 
1377 #ifndef __FreeBSD__
1378 	/* Load the per-CPU IDT address */
1379 	vmcs_write(VMCS_HOST_IDTR_BASE, vmm_get_host_idtrbase());
1380 #endif
1381 	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
1382 	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
1383 	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
1384 	vmx_invvpid(vmx, vcpu, pmap, 1);
1385 }
1386 
1387 /*
1388  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1389  */
1390 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1391 
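/*
 * The helpers below toggle the "interrupt window exiting" and "NMI window
 * exiting" bits in the primary processor-based controls so that a pending
 * event can be injected as soon as the guest is able to accept it.
 */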
1392 static __inline void
1393 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1394 {
1395 
1396 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1397 		vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1398 		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1399 		VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1400 	}
1401 }
1402 
1403 static __inline void
1404 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1405 {
1406 
1407 	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1408 	    ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
1409 	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1410 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1411 	VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1412 }
1413 
1414 static __inline void
1415 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1416 {
1417 
1418 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1419 		vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1420 		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1421 		VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1422 	}
1423 }
1424 
1425 static __inline void
1426 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1427 {
1428 
1429 	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1430 	    ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
1431 	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1432 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1433 	VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1434 }
1435 
1436 #ifdef __FreeBSD__
1437 int
1438 vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
1439 {
1440 	int error;
1441 
1442 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
1443 		vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
1444 		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1445 		VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
1446 	}
1447 
1448 	error = vmwrite(VMCS_TSC_OFFSET, offset);
1449 
1450 	return (error);
1451 }
1452 #else /* __FreeBSD__ */
1453 /*
1454  * Set the TSC adjustment, taking into account the offsets measured between
1455  * host physical CPUs.  This is required even if the guest has not set a TSC
1456  * offset since vCPUs inherit the TSC offset of whatever physical CPU it has
1457  * offset, since a vCPU inherits the TSC offset of whatever physical CPU it
1458  * has migrated onto.  Without this mitigation, unsynchronized host TSCs will
1459  * convey the appearance of TSC time-travel to the guest as its vCPUs migrate.
1460 static int
1461 vmx_apply_tsc_adjust(struct vmx *vmx, int vcpu)
1462 {
1463 	extern hrtime_t tsc_gethrtime_tick_delta(void);
1464 	const uint64_t target_offset = (vcpu_tsc_offset(vmx->vm, vcpu) +
1465 	    (uint64_t)tsc_gethrtime_tick_delta());
1466 	int error = 0;
1467 
1468 	ASSERT(vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET);
1469 
1470 	if (vmx->tsc_offset_active[vcpu] != target_offset) {
1471 		error = vmwrite(VMCS_TSC_OFFSET, target_offset);
1472 		vmx->tsc_offset_active[vcpu] = target_offset;
1473 	}
1474 
1475 	return (error);
1476 }
1477 #endif /* __FreeBSD__ */
1478 
1479 #define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
1480 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1481 #define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
1482 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1483 
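/*
 * Inject a virtual NMI via the VM-entry interruption-information field and
 * clear the pending-NMI request.  The illumos variant returns the value it
 * wrote so the caller can avoid re-reading VMCS_ENTRY_INTR_INFO.
 */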
1484 #ifndef __FreeBSD__
1485 static uint32_t
1486 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1487 #else
1488 static void
1489 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1490 #endif
1491 {
1492 	uint32_t gi, info;
1493 
1494 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1495 	KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1496 	    "interruptibility-state %#x", gi));
1497 
1498 	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1499 	KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1500 	    "VM-entry interruption information %#x", info));
1501 
1502 	/*
1503 	 * Inject the virtual NMI. The vector must be the NMI IDT entry
1504 	 * or the VMCS entry check will fail.
1505 	 */
1506 	info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1507 	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1508 
1509 	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1510 
1511 	/* Clear the request */
1512 	vm_nmi_clear(vmx->vm, vcpu);
1513 
1514 #ifndef __FreeBSD__
1515 	return (info);
1516 #endif
1517 }
1518 
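/*
 * Gather any event pending for this vcpu (previously recorded entry intinfo,
 * NMI, ExtINT from the vatpic, or vlapic interrupt) and program it into the
 * VM-entry interruption-information field.  When injection is currently
 * blocked, the appropriate interrupt/NMI window exiting control is armed
 * instead so the event can be delivered on a later entry.
 */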
1519 #ifndef __FreeBSD__
1520 static void
1521 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
1522     uint64_t guestrip)
1523 {
1524 	uint64_t entryinfo, rflags;
1525 	uint32_t gi, info;
1526 	int vector;
1527 	boolean_t extint_pending = B_FALSE;
1528 
1529 	vlapic_tmr_update(vlapic);
1530 
1531 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1532 	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1533 
1534 	if (vmx->state[vcpu].nextrip != guestrip &&
1535 	    (gi & HWINTR_BLOCKING) != 0) {
1536 		VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
1537 		    "cleared due to rip change: %#lx/%#lx",
1538 		    vmx->state[vcpu].nextrip, guestrip);
1539 		gi &= ~HWINTR_BLOCKING;
1540 		vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1541 	}
1542 
1543 	/*
1544 	 * It could be that an interrupt is already pending for injection from
1545 	 * the VMCS.  This would be the case if the vCPU exited for conditions
1546 	 * such as an AST before a vm-entry delivered the injection.
1547 	 */
1548 	if ((info & VMCS_INTR_VALID) != 0) {
1549 		goto cantinject;
1550 	}
1551 
1552 	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
1553 		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
1554 		    "intinfo is not valid: %#lx", __func__, entryinfo));
1555 
1556 		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
1557 		     "pending exception: %#lx/%#x", __func__, entryinfo, info));
1558 
1559 		info = entryinfo;
1560 		vector = info & 0xff;
1561 		if (vector == IDT_BP || vector == IDT_OF) {
1562 			/*
1563 			 * VT-x requires #BP and #OF to be injected as software
1564 			 * exceptions.
1565 			 */
1566 			info &= ~VMCS_INTR_T_MASK;
1567 			info |= VMCS_INTR_T_SWEXCEPTION;
1568 		}
1569 
1570 		if (info & VMCS_INTR_DEL_ERRCODE)
1571 			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
1572 
1573 		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1574 	}
1575 
1576 	if (vm_nmi_pending(vmx->vm, vcpu)) {
1577 		int need_nmi_exiting = 1;
1578 
1579 		/*
1580 		 * If there are no conditions blocking NMI injection then
1581 		 * inject it directly here otherwise enable "NMI window
1582 		 * inject it directly here; otherwise enable "NMI window
1583 		 *
1584 		 * We also check for STI_BLOCKING because some implementations
1585 		 * don't allow NMI injection in this case. If we are running
1586 		 * on a processor that doesn't have this restriction it will
1587 		 * immediately exit and the NMI will be injected in the
1588 		 * "NMI window exiting" handler.
1589 		 */
1590 		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1591 			if ((info & VMCS_INTR_VALID) == 0) {
1592 				info = vmx_inject_nmi(vmx, vcpu);
1593 				need_nmi_exiting = 0;
1594 			} else {
1595 				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1596 				    "due to VM-entry intr info %#x", info);
1597 			}
1598 		} else {
1599 			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1600 			    "Guest Interruptibility-state %#x", gi);
1601 		}
1602 
1603 		if (need_nmi_exiting) {
1604 			vmx_set_nmi_window_exiting(vmx, vcpu);
1605 			return;
1606 		}
1607 	}
1608 
1609 	/* Check the AT-PIC and APIC for interrupts. */
1610 	if (vm_extint_pending(vmx->vm, vcpu)) {
1611 		/* Ask the legacy pic for a vector to inject */
1612 		vatpic_pending_intr(vmx->vm, &vector);
1613 		extint_pending = B_TRUE;
1614 
1615 		/*
1616 		 * From the Intel SDM, Volume 3, Section "Maskable
1617 		 * Hardware Interrupts":
1618 		 * - maskable interrupt vectors [0,255] can be delivered
1619 		 *   through the INTR pin.
1620 		 */
1621 		KASSERT(vector >= 0 && vector <= 255,
1622 		    ("invalid vector %d from INTR", vector));
1623 	} else if (!virtual_interrupt_delivery) {
1624 		/* Ask the local apic for a vector to inject */
1625 		if (!vlapic_pending_intr(vlapic, &vector))
1626 			return;
1627 
1628 		/*
1629 		 * From the Intel SDM, Volume 3, Section "Maskable
1630 		 * Hardware Interrupts":
1631 		 * - maskable interrupt vectors [16,255] can be delivered
1632 		 *   through the local APIC.
1633 		 */
1634 		KASSERT(vector >= 16 && vector <= 255,
1635 		    ("invalid vector %d from local APIC", vector));
1636 	} else {
1637 		/* No further injection needed */
1638 		return;
1639 	}
1640 
1641 	/*
1642 	 * Verify that the guest is interruptible and the above logic has not
1643 	 * already queued an event for injection.
1644 	 */
1645 	if ((gi & HWINTR_BLOCKING) != 0) {
1646 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1647 		    "Guest Interruptibility-state %#x", vector, gi);
1648 		goto cantinject;
1649 	}
1650 	if ((info & VMCS_INTR_VALID) != 0) {
1651 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1652 		    "VM-entry intr info %#x", vector, info);
1653 		goto cantinject;
1654 	}
1655 	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1656 	if ((rflags & PSL_I) == 0) {
1657 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1658 		    "rflags %#lx", vector, rflags);
1659 		goto cantinject;
1660 	}
1661 
1662 	/* Inject the interrupt */
1663 	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1664 	info |= vector;
1665 	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1666 
1667 	if (extint_pending) {
1668 		vm_extint_clear(vmx->vm, vcpu);
1669 		vatpic_intr_accepted(vmx->vm, vector);
1670 
1671 		/*
1672 		 * After we accepted the current ExtINT the PIC may
1673 		 * have posted another one.  If that is the case, set
1674 		 * the Interrupt Window Exiting execution control so
1675 		 * we can inject that one too.
1676 		 *
1677 		 * Also, interrupt window exiting allows us to inject any
1678 		 * pending APIC vector that was preempted by the ExtINT
1679 		 * as soon as possible. This applies both for the software
1680 		 * emulated vlapic and the hardware assisted virtual APIC.
1681 		 */
1682 		vmx_set_int_window_exiting(vmx, vcpu);
1683 	} else {
1684 		/* Update the Local APIC ISR */
1685 		vlapic_intr_accepted(vlapic, vector);
1686 	}
1687 
1688 	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1689 	return;
1690 
1691 cantinject:
1692 	/*
1693 	 * Set the Interrupt Window Exiting execution control so we can inject
1694 	 * the interrupt as soon as the blocking condition goes away.
1695 	 */
1696 	vmx_set_int_window_exiting(vmx, vcpu);
1697 }
1698 #else
1699 static void
1700 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
1701     uint64_t guestrip)
1702 {
1703 	int vector, need_nmi_exiting, extint_pending;
1704 	uint64_t rflags, entryinfo;
1705 	uint32_t gi, info;
1706 
1707 	vlapic_tmr_update(vlapic);
1708 
1709 	if (vmx->state[vcpu].nextrip != guestrip) {
1710 		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1711 		if (gi & HWINTR_BLOCKING) {
1712 			VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
1713 			    "cleared due to rip change: %#lx/%#lx",
1714 			    vmx->state[vcpu].nextrip, guestrip);
1715 			gi &= ~HWINTR_BLOCKING;
1716 			vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1717 		}
1718 	}
1719 
1720 	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
1721 		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
1722 		    "intinfo is not valid: %#lx", __func__, entryinfo));
1723 
1724 		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1725 		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
1726 		     "pending exception: %#lx/%#x", __func__, entryinfo, info));
1727 
1728 		info = entryinfo;
1729 		vector = info & 0xff;
1730 		if (vector == IDT_BP || vector == IDT_OF) {
1731 			/*
1732 			 * VT-x requires #BP and #OF to be injected as software
1733 			 * exceptions.
1734 			 */
1735 			info &= ~VMCS_INTR_T_MASK;
1736 			info |= VMCS_INTR_T_SWEXCEPTION;
1737 		}
1738 
1739 		if (info & VMCS_INTR_DEL_ERRCODE)
1740 			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
1741 
1742 		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1743 	}
1744 
1745 	if (vm_nmi_pending(vmx->vm, vcpu)) {
1746 		/*
1747 		 * If there are no conditions blocking NMI injection then
1748 		 * inject it directly here; otherwise enable "NMI window
1749 		 * exiting" to inject it as soon as we can.
1750 		 *
1751 		 * We also check for STI_BLOCKING because some implementations
1752 		 * don't allow NMI injection in this case. If we are running
1753 		 * on a processor that doesn't have this restriction it will
1754 		 * immediately exit and the NMI will be injected in the
1755 		 * "NMI window exiting" handler.
1756 		 */
1757 		need_nmi_exiting = 1;
1758 		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1759 		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1760 			info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1761 			if ((info & VMCS_INTR_VALID) == 0) {
1762 				vmx_inject_nmi(vmx, vcpu);
1763 				need_nmi_exiting = 0;
1764 			} else {
1765 				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1766 				    "due to VM-entry intr info %#x", info);
1767 			}
1768 		} else {
1769 			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1770 			    "Guest Interruptibility-state %#x", gi);
1771 		}
1772 
1773 		if (need_nmi_exiting)
1774 			vmx_set_nmi_window_exiting(vmx, vcpu);
1775 	}
1776 
1777 	extint_pending = vm_extint_pending(vmx->vm, vcpu);
1778 
1779 	if (!extint_pending && virtual_interrupt_delivery) {
1780 		vmx_inject_pir(vlapic);
1781 		return;
1782 	}
1783 
1784 	/*
1785 	 * If interrupt-window exiting is already in effect then don't bother
1786 	 * checking for pending interrupts. This is just an optimization and
1787 	 * not needed for correctness.
1788 	 */
1789 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1790 		VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
1791 		    "pending int_window_exiting");
1792 		return;
1793 	}
1794 
1795 	if (!extint_pending) {
1796 		/* Ask the local apic for a vector to inject */
1797 		if (!vlapic_pending_intr(vlapic, &vector))
1798 			return;
1799 
1800 		/*
1801 		 * From the Intel SDM, Volume 3, Section "Maskable
1802 		 * Hardware Interrupts":
1803 		 * - maskable interrupt vectors [16,255] can be delivered
1804 		 *   through the local APIC.
1805 		 */
1806 		KASSERT(vector >= 16 && vector <= 255,
1807 		    ("invalid vector %d from local APIC", vector));
1808 	} else {
1809 		/* Ask the legacy pic for a vector to inject */
1810 		vatpic_pending_intr(vmx->vm, &vector);
1811 
1812 		/*
1813 		 * From the Intel SDM, Volume 3, Section "Maskable
1814 		 * Hardware Interrupts":
1815 		 * - maskable interrupt vectors [0,255] can be delivered
1816 		 *   through the INTR pin.
1817 		 */
1818 		KASSERT(vector >= 0 && vector <= 255,
1819 		    ("invalid vector %d from INTR", vector));
1820 	}
1821 
1822 	/* Check RFLAGS.IF and the interruptibility state of the guest */
1823 	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1824 	if ((rflags & PSL_I) == 0) {
1825 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1826 		    "rflags %#lx", vector, rflags);
1827 		goto cantinject;
1828 	}
1829 
1830 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1831 	if (gi & HWINTR_BLOCKING) {
1832 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1833 		    "Guest Interruptibility-state %#x", vector, gi);
1834 		goto cantinject;
1835 	}
1836 
1837 	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1838 	if (info & VMCS_INTR_VALID) {
1839 		/*
1840 		 * This is expected and could happen for multiple reasons:
1841 		 * - A vectoring VM-entry was aborted due to astpending
1842 		 * - A VM-exit happened during event injection.
1843 		 * - An exception was injected above.
1844 		 * - An NMI was injected above or after "NMI window exiting"
1845 		 */
1846 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1847 		    "VM-entry intr info %#x", vector, info);
1848 		goto cantinject;
1849 	}
1850 
1851 	/* Inject the interrupt */
1852 	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1853 	info |= vector;
1854 	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1855 
1856 	if (!extint_pending) {
1857 		/* Update the Local APIC ISR */
1858 		vlapic_intr_accepted(vlapic, vector);
1859 	} else {
1860 		vm_extint_clear(vmx->vm, vcpu);
1861 		vatpic_intr_accepted(vmx->vm, vector);
1862 
1863 		/*
1864 		 * After we accepted the current ExtINT the PIC may
1865 		 * have posted another one.  If that is the case, set
1866 		 * the Interrupt Window Exiting execution control so
1867 		 * we can inject that one too.
1868 		 *
1869 		 * Also, interrupt window exiting allows us to inject any
1870 		 * pending APIC vector that was preempted by the ExtINT
1871 		 * as soon as possible. This applies both for the software
1872 		 * emulated vlapic and the hardware assisted virtual APIC.
1873 		 */
1874 		vmx_set_int_window_exiting(vmx, vcpu);
1875 	}
1876 
1877 	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1878 
1879 	return;
1880 
1881 cantinject:
1882 	/*
1883 	 * Set the Interrupt Window Exiting execution control so we can inject
1884 	 * the interrupt as soon as blocking condition goes away.
1885 	 */
1886 	vmx_set_int_window_exiting(vmx, vcpu);
1887 }
1888 #endif /* __FreeBSD__ */
1889 
1890 /*
1891  * If the Virtual NMIs execution control is '1' then the logical processor
1892  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1893  * the VMCS. An IRET instruction in VMX non-root operation will remove any
1894  * virtual-NMI blocking.
1895  *
1896  * This unblocking occurs even if the IRET causes a fault. In this case the
1897  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1898  */
1899 static void
1900 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1901 {
1902 	uint32_t gi;
1903 
1904 	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
1905 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1906 	gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1907 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1908 }
1909 
1910 static void
1911 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1912 {
1913 	uint32_t gi;
1914 
1915 	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
1916 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1917 	gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1918 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1919 }
1920 
1921 static void
1922 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
1923 {
1924 	uint32_t gi;
1925 
1926 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1927 	KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
1928 	    ("NMI blocking is not in effect %#x", gi));
1929 }
1930 
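/*
 * Emulate a guest 'xsetbv': validate the requested xcr0 value against the
 * host XSAVE limits and the architectural consistency rules checked below,
 * injecting #GP or #UD into the guest when a check fails.
 */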
1931 static int
1932 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1933 {
1934 	struct vmxctx *vmxctx;
1935 	uint64_t xcrval;
1936 	const struct xsave_limits *limits;
1937 
1938 	vmxctx = &vmx->ctx[vcpu];
1939 	limits = vmm_get_xsave_limits();
1940 
1941 	/*
1942 	 * Note that the processor raises a GP# fault on its own if
1943 	 * xsetbv is executed for CPL != 0, so we do not have to
1944 	 * emulate that fault here.
1945 	 */
1946 
1947 	/* Only xcr0 is supported. */
1948 	if (vmxctx->guest_rcx != 0) {
1949 		vm_inject_gp(vmx->vm, vcpu);
1950 		return (HANDLED);
1951 	}
1952 
1953 	/* We only handle xcr0 if both the host and guest have XSAVE enabled. */
1954 	if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
1955 		vm_inject_ud(vmx->vm, vcpu);
1956 		return (HANDLED);
1957 	}
1958 
1959 	xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
1960 	if ((xcrval & ~limits->xcr0_allowed) != 0) {
1961 		vm_inject_gp(vmx->vm, vcpu);
1962 		return (HANDLED);
1963 	}
1964 
1965 	if (!(xcrval & XFEATURE_ENABLED_X87)) {
1966 		vm_inject_gp(vmx->vm, vcpu);
1967 		return (HANDLED);
1968 	}
1969 
1970 	/* AVX (YMM_Hi128) requires SSE. */
1971 	if (xcrval & XFEATURE_ENABLED_AVX &&
1972 	    (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
1973 		vm_inject_gp(vmx->vm, vcpu);
1974 		return (HANDLED);
1975 	}
1976 
1977 	/*
1978 	 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
1979 	 * ZMM_Hi256, and Hi16_ZMM.
1980 	 */
1981 	if (xcrval & XFEATURE_AVX512 &&
1982 	    (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
1983 	    (XFEATURE_AVX512 | XFEATURE_AVX)) {
1984 		vm_inject_gp(vmx->vm, vcpu);
1985 		return (HANDLED);
1986 	}
1987 
1988 	/*
1989 	 * Intel MPX requires both bound register state flags to be
1990 	 * set.
1991 	 */
1992 	if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
1993 	    ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
1994 		vm_inject_gp(vmx->vm, vcpu);
1995 		return (HANDLED);
1996 	}
1997 
1998 	/*
1999 	 * This runs "inside" vmrun() with the guest's FPU state, so
2000 	 * modifying xcr0 directly modifies the guest's xcr0, not the
2001 	 * host's.
2002 	 */
2003 	load_xcr(0, xcrval);
2004 	return (HANDLED);
2005 }
2006 
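/*
 * 'ident' is the register encoding taken from the exit qualification or
 * instruction information field (0 = %rax ... 15 = %r15).  %rsp is read
 * from the VMCS since it is not kept in the software register context.
 */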
2007 static uint64_t
2008 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
2009 {
2010 	const struct vmxctx *vmxctx;
2011 
2012 	vmxctx = &vmx->ctx[vcpu];
2013 
2014 	switch (ident) {
2015 	case 0:
2016 		return (vmxctx->guest_rax);
2017 	case 1:
2018 		return (vmxctx->guest_rcx);
2019 	case 2:
2020 		return (vmxctx->guest_rdx);
2021 	case 3:
2022 		return (vmxctx->guest_rbx);
2023 	case 4:
2024 		return (vmcs_read(VMCS_GUEST_RSP));
2025 	case 5:
2026 		return (vmxctx->guest_rbp);
2027 	case 6:
2028 		return (vmxctx->guest_rsi);
2029 	case 7:
2030 		return (vmxctx->guest_rdi);
2031 	case 8:
2032 		return (vmxctx->guest_r8);
2033 	case 9:
2034 		return (vmxctx->guest_r9);
2035 	case 10:
2036 		return (vmxctx->guest_r10);
2037 	case 11:
2038 		return (vmxctx->guest_r11);
2039 	case 12:
2040 		return (vmxctx->guest_r12);
2041 	case 13:
2042 		return (vmxctx->guest_r13);
2043 	case 14:
2044 		return (vmxctx->guest_r14);
2045 	case 15:
2046 		return (vmxctx->guest_r15);
2047 	default:
2048 		panic("invalid vmx register %d", ident);
2049 	}
2050 }
2051 
2052 static void
2053 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
2054 {
2055 	struct vmxctx *vmxctx;
2056 
2057 	vmxctx = &vmx->ctx[vcpu];
2058 
2059 	switch (ident) {
2060 	case 0:
2061 		vmxctx->guest_rax = regval;
2062 		break;
2063 	case 1:
2064 		vmxctx->guest_rcx = regval;
2065 		break;
2066 	case 2:
2067 		vmxctx->guest_rdx = regval;
2068 		break;
2069 	case 3:
2070 		vmxctx->guest_rbx = regval;
2071 		break;
2072 	case 4:
2073 		vmcs_write(VMCS_GUEST_RSP, regval);
2074 		break;
2075 	case 5:
2076 		vmxctx->guest_rbp = regval;
2077 		break;
2078 	case 6:
2079 		vmxctx->guest_rsi = regval;
2080 		break;
2081 	case 7:
2082 		vmxctx->guest_rdi = regval;
2083 		break;
2084 	case 8:
2085 		vmxctx->guest_r8 = regval;
2086 		break;
2087 	case 9:
2088 		vmxctx->guest_r9 = regval;
2089 		break;
2090 	case 10:
2091 		vmxctx->guest_r10 = regval;
2092 		break;
2093 	case 11:
2094 		vmxctx->guest_r11 = regval;
2095 		break;
2096 	case 12:
2097 		vmxctx->guest_r12 = regval;
2098 		break;
2099 	case 13:
2100 		vmxctx->guest_r13 = regval;
2101 		break;
2102 	case 14:
2103 		vmxctx->guest_r14 = regval;
2104 		break;
2105 	case 15:
2106 		vmxctx->guest_r15 = regval;
2107 		break;
2108 	default:
2109 		panic("invalid vmx register %d", ident);
2110 	}
2111 }
2112 
2113 static int
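/*
 * Emulate 'mov to %cr0': the raw guest value is kept in the CR0 read
 * shadow, while the value actually loaded into the guest CR0 has the
 * cr0_ones_mask bits forced on and the cr0_zeros_mask bits forced off.
 */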
2114 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
2115 {
2116 	uint64_t crval, regval;
2117 
2118 	/* We only handle mov to %cr0 at this time */
2119 	if ((exitqual & 0xf0) != 0x00)
2120 		return (UNHANDLED);
2121 
2122 	regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
2123 
2124 	vmcs_write(VMCS_CR0_SHADOW, regval);
2125 
2126 	crval = regval | cr0_ones_mask;
2127 	crval &= ~cr0_zeros_mask;
2128 	vmcs_write(VMCS_GUEST_CR0, crval);
2129 
2130 	if (regval & CR0_PG) {
2131 		uint64_t efer, entry_ctls;
2132 
2133 		/*
2134 		 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
2135 		 * the "IA-32e mode guest" bit in VM-entry control must be
2136 		 * equal.
2137 		 */
2138 		efer = vmcs_read(VMCS_GUEST_IA32_EFER);
2139 		if (efer & EFER_LME) {
2140 			efer |= EFER_LMA;
2141 			vmcs_write(VMCS_GUEST_IA32_EFER, efer);
2142 			entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
2143 			entry_ctls |= VM_ENTRY_GUEST_LMA;
2144 			vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
2145 		}
2146 	}
2147 
2148 	return (HANDLED);
2149 }
2150 
2151 static int
2152 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
2153 {
2154 	uint64_t crval, regval;
2155 
2156 	/* We only handle mov to %cr4 at this time */
2157 	if ((exitqual & 0xf0) != 0x00)
2158 		return (UNHANDLED);
2159 
2160 	regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
2161 
2162 	vmcs_write(VMCS_CR4_SHADOW, regval);
2163 
2164 	crval = regval | cr4_ones_mask;
2165 	crval &= ~cr4_zeros_mask;
2166 	vmcs_write(VMCS_GUEST_CR4, crval);
2167 
2168 	return (HANDLED);
2169 }
2170 
2171 static int
2172 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
2173 {
2174 	struct vlapic *vlapic;
2175 	uint64_t cr8;
2176 	int regnum;
2177 
2178 	/* We only handle mov %cr8 to/from a register at this time. */
2179 	if ((exitqual & 0xe0) != 0x00) {
2180 		return (UNHANDLED);
2181 	}
2182 
2183 	vlapic = vm_lapic(vmx->vm, vcpu);
2184 	regnum = (exitqual >> 8) & 0xf;
2185 	if (exitqual & 0x10) {
2186 		cr8 = vlapic_get_cr8(vlapic);
2187 		vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
2188 	} else {
2189 		cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
2190 		vlapic_set_cr8(vlapic, cr8);
2191 	}
2192 
2193 	return (HANDLED);
2194 }
2195 
2196 /*
2197  * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
2198  */
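/* The DPL occupies bits 6:5 of the segment access-rights field. */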
2199 static int
2200 vmx_cpl(void)
2201 {
2202 	uint32_t ssar;
2203 
2204 	ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
2205 	return ((ssar >> 5) & 0x3);
2206 }
2207 
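/*
 * Derive the guest CPU mode from EFER.LMA, the CS.L bit of the CS
 * access-rights field and CR0.PE.
 */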
2208 static enum vm_cpu_mode
2209 vmx_cpu_mode(void)
2210 {
2211 	uint32_t csar;
2212 
2213 	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
2214 		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
2215 		if (csar & 0x2000)
2216 			return (CPU_MODE_64BIT);	/* CS.L = 1 */
2217 		else
2218 			return (CPU_MODE_COMPATIBILITY);
2219 	} else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
2220 		return (CPU_MODE_PROTECTED);
2221 	} else {
2222 		return (CPU_MODE_REAL);
2223 	}
2224 }
2225 
2226 static enum vm_paging_mode
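/*
 * Derive the guest paging mode: with CR0.PG clear the guest is treated as
 * flat, otherwise CR4.PAE and EFER.LME distinguish 32-bit, PAE and long
 * mode paging.
 */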
2227 vmx_paging_mode(void)
2228 {
2229 
2230 	if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
2231 		return (PAGING_MODE_FLAT);
2232 	if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
2233 		return (PAGING_MODE_32);
2234 	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
2235 		return (PAGING_MODE_64);
2236 	else
2237 		return (PAGING_MODE_PAE);
2238 }
2239 
2240 static uint64_t
2241 inout_str_index(struct vmx *vmx, int vcpuid, int in)
2242 {
2243 	uint64_t val;
2244 	int error;
2245 	enum vm_reg_name reg;
2246 
2247 	reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
2248 	error = vmx_getreg(vmx, vcpuid, reg, &val);
2249 	KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
2250 	return (val);
2251 }
2252 
2253 static uint64_t
2254 inout_str_count(struct vmx *vmx, int vcpuid, int rep)
2255 {
2256 	uint64_t val;
2257 	int error;
2258 
2259 	if (rep) {
2260 		error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
2261 		KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
2262 	} else {
2263 		val = 1;
2264 	}
2265 	return (val);
2266 }
2267 
2268 static int
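/*
 * Convert the address-size encoding in bits 9:7 of the VM-exit instruction
 * information field into a size in bytes.
 */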
2269 inout_str_addrsize(uint32_t inst_info)
2270 {
2271 	uint32_t size;
2272 
2273 	size = (inst_info >> 7) & 0x7;
2274 	switch (size) {
2275 	case 0:
2276 		return (2);	/* 16 bit */
2277 	case 1:
2278 		return (4);	/* 32 bit */
2279 	case 2:
2280 		return (8);	/* 64 bit */
2281 	default:
2282 		panic("%s: invalid size encoding %d", __func__, size);
2283 	}
2284 }
2285 
2286 static void
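/*
 * String input always uses the %es segment; for output the source segment
 * is taken from bits 17:15 of the VM-exit instruction information field.
 */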
2287 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
2288     struct vm_inout_str *vis)
2289 {
2290 	int error, s;
2291 
2292 	if (in) {
2293 		vis->seg_name = VM_REG_GUEST_ES;
2294 	} else {
2295 		s = (inst_info >> 15) & 0x7;
2296 		vis->seg_name = vm_segment_name(s);
2297 	}
2298 
2299 	error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
2300 	KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
2301 }
2302 
2303 static void
2304 vmx_paging_info(struct vm_guest_paging *paging)
2305 {
2306 	paging->cr3 = vmcs_guest_cr3();
2307 	paging->cpl = vmx_cpl();
2308 	paging->cpu_mode = vmx_cpu_mode();
2309 	paging->paging_mode = vmx_paging_mode();
2310 }
2311 
2312 static void
2313 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
2314 {
2315 	struct vm_guest_paging *paging;
2316 	uint32_t csar;
2317 
2318 	paging = &vmexit->u.inst_emul.paging;
2319 
2320 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
2321 	vmexit->inst_length = 0;
2322 	vmexit->u.inst_emul.gpa = gpa;
2323 	vmexit->u.inst_emul.gla = gla;
2324 	vmx_paging_info(paging);
2325 	switch (paging->cpu_mode) {
2326 	case CPU_MODE_REAL:
2327 		vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
2328 		vmexit->u.inst_emul.cs_d = 0;
2329 		break;
2330 	case CPU_MODE_PROTECTED:
2331 	case CPU_MODE_COMPATIBILITY:
2332 		vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
2333 		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
2334 		vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
2335 		break;
2336 	default:
2337 		vmexit->u.inst_emul.cs_base = 0;
2338 		vmexit->u.inst_emul.cs_d = 0;
2339 		break;
2340 	}
2341 	vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
2342 }
2343 
2344 static int
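/*
 * Translate the access bits of the EPT violation exit qualification into a
 * VM_PROT_* fault type.
 */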
2345 ept_fault_type(uint64_t ept_qual)
2346 {
2347 	int fault_type;
2348 
2349 	if (ept_qual & EPT_VIOLATION_DATA_WRITE)
2350 		fault_type = VM_PROT_WRITE;
2351 	else if (ept_qual & EPT_VIOLATION_INST_FETCH)
2352 		fault_type = VM_PROT_EXECUTE;
2353 	else
2354 		fault_type = VM_PROT_READ;
2355 
2356 	return (fault_type);
2357 }
2358 
2359 static boolean_t
2360 ept_emulation_fault(uint64_t ept_qual)
2361 {
2362 	int read, write;
2363 
2364 	/* EPT fault on an instruction fetch doesn't make sense here */
2365 	if (ept_qual & EPT_VIOLATION_INST_FETCH)
2366 		return (FALSE);
2367 
2368 	/* EPT fault must be a read fault or a write fault */
2369 	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
2370 	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
2371 	if ((read | write) == 0)
2372 		return (FALSE);
2373 
2374 	/*
2375 	 * The EPT violation must have been caused by accessing a
2376 	 * guest-physical address that is a translation of a guest-linear
2377 	 * address.
2378 	 */
2379 	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
2380 	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
2381 		return (FALSE);
2382 	}
2383 
2384 	return (TRUE);
2385 }
2386 
2387 static __inline int
2388 apic_access_virtualization(struct vmx *vmx, int vcpuid)
2389 {
2390 	uint32_t proc_ctls2;
2391 
2392 	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
2393 	return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
2394 }
2395 
2396 static __inline int
2397 x2apic_virtualization(struct vmx *vmx, int vcpuid)
2398 {
2399 	uint32_t proc_ctls2;
2400 
2401 	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
2402 	return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
2403 }
2404 
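/*
 * Handle a trap-like APIC-write VM-exit by dispatching to the vlapic
 * handler for the register that was written.  Writes that cannot be
 * completed in the kernel are returned as UNHANDLED.
 */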
2405 static int
2406 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
2407     uint64_t qual)
2408 {
2409 	int error, handled, offset;
2410 	uint32_t *apic_regs, vector;
2411 	bool retu;
2412 
2413 	handled = HANDLED;
2414 	offset = APIC_WRITE_OFFSET(qual);
2415 
2416 	if (!apic_access_virtualization(vmx, vcpuid)) {
2417 		/*
2418 		 * In general there should not be any APIC write VM-exits
2419 		 * unless APIC-access virtualization is enabled.
2420 		 *
2421 		 * However self-IPI virtualization can legitimately trigger
2422 		 * an APIC-write VM-exit so treat it specially.
2423 		 */
2424 		if (x2apic_virtualization(vmx, vcpuid) &&
2425 		    offset == APIC_OFFSET_SELF_IPI) {
2426 			apic_regs = (uint32_t *)(vlapic->apic_page);
2427 			vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
2428 			vlapic_self_ipi_handler(vlapic, vector);
2429 			return (HANDLED);
2430 		} else
2431 			return (UNHANDLED);
2432 	}
2433 
2434 	switch (offset) {
2435 	case APIC_OFFSET_ID:
2436 		vlapic_id_write_handler(vlapic);
2437 		break;
2438 	case APIC_OFFSET_LDR:
2439 		vlapic_ldr_write_handler(vlapic);
2440 		break;
2441 	case APIC_OFFSET_DFR:
2442 		vlapic_dfr_write_handler(vlapic);
2443 		break;
2444 	case APIC_OFFSET_SVR:
2445 		vlapic_svr_write_handler(vlapic);
2446 		break;
2447 	case APIC_OFFSET_ESR:
2448 		vlapic_esr_write_handler(vlapic);
2449 		break;
2450 	case APIC_OFFSET_ICR_LOW:
2451 		retu = false;
2452 		error = vlapic_icrlo_write_handler(vlapic, &retu);
2453 		if (error != 0 || retu)
2454 			handled = UNHANDLED;
2455 		break;
2456 	case APIC_OFFSET_CMCI_LVT:
2457 	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
2458 		vlapic_lvt_write_handler(vlapic, offset);
2459 		break;
2460 	case APIC_OFFSET_TIMER_ICR:
2461 		vlapic_icrtmr_write_handler(vlapic);
2462 		break;
2463 	case APIC_OFFSET_TIMER_DCR:
2464 		vlapic_dcr_write_handler(vlapic);
2465 		break;
2466 	default:
2467 		handled = UNHANDLED;
2468 		break;
2469 	}
2470 	return (handled);
2471 }
2472 
2473 static bool
2474 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
2475 {
2476 
2477 	if (apic_access_virtualization(vmx, vcpuid) &&
2478 	    (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
2479 		return (true);
2480 	else
2481 		return (false);
2482 }
2483 
2484 static int
2485 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
2486 {
2487 	uint64_t qual;
2488 	int access_type, offset, allowed;
2489 
2490 	if (!apic_access_virtualization(vmx, vcpuid))
2491 		return (UNHANDLED);
2492 
2493 	qual = vmexit->u.vmx.exit_qualification;
2494 	access_type = APIC_ACCESS_TYPE(qual);
2495 	offset = APIC_ACCESS_OFFSET(qual);
2496 
2497 	allowed = 0;
2498 	if (access_type == 0) {
2499 		/*
2500 		 * Read data access to the following registers is expected.
2501 		 */
2502 		switch (offset) {
2503 		case APIC_OFFSET_APR:
2504 		case APIC_OFFSET_PPR:
2505 		case APIC_OFFSET_RRR:
2506 		case APIC_OFFSET_CMCI_LVT:
2507 		case APIC_OFFSET_TIMER_CCR:
2508 			allowed = 1;
2509 			break;
2510 		default:
2511 			break;
2512 		}
2513 	} else if (access_type == 1) {
2514 		/*
2515 		 * Write data access to the following registers is expected.
2516 		 */
2517 		switch (offset) {
2518 		case APIC_OFFSET_VER:
2519 		case APIC_OFFSET_APR:
2520 		case APIC_OFFSET_PPR:
2521 		case APIC_OFFSET_RRR:
2522 		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
2523 		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
2524 		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
2525 		case APIC_OFFSET_CMCI_LVT:
2526 		case APIC_OFFSET_TIMER_CCR:
2527 			allowed = 1;
2528 			break;
2529 		default:
2530 			break;
2531 		}
2532 	}
2533 
2534 	if (allowed) {
2535 		vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
2536 		    VIE_INVALID_GLA);
2537 	}
2538 
2539 	/*
2540 	 * Regardless of whether the APIC-access is allowed this handler
2541 	 * always returns UNHANDLED:
2542 	 * - if the access is allowed then it is handled by emulating the
2543 	 *   instruction that caused the VM-exit (outside the critical section)
2544 	 * - if the access is not allowed then it will be converted to an
2545 	 *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
2546 	 */
2547 	return (UNHANDLED);
2548 }
2549 
2550 static enum task_switch_reason
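/* Bits 31:30 of the exit qualification encode the task switch source. */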
2551 vmx_task_switch_reason(uint64_t qual)
2552 {
2553 	int reason;
2554 
2555 	reason = (qual >> 30) & 0x3;
2556 	switch (reason) {
2557 	case 0:
2558 		return (TSR_CALL);
2559 	case 1:
2560 		return (TSR_IRET);
2561 	case 2:
2562 		return (TSR_JMP);
2563 	case 3:
2564 		return (TSR_IDT_GATE);
2565 	default:
2566 		panic("%s: invalid reason %d", __func__, reason);
2567 	}
2568 }
2569 
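/*
 * Route a guest 'wrmsr' either to the local APIC emulation or to the
 * VMX-specific MSR handling.
 */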
2570 static int
2571 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
2572 {
2573 	int error;
2574 
2575 	if (lapic_msr(num))
2576 		error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu);
2577 	else
2578 		error = vmx_wrmsr(vmx, vcpuid, num, val, retu);
2579 
2580 	return (error);
2581 }
2582 
2583 static int
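/*
 * Route a guest 'rdmsr' like emulate_wrmsr() above and, on success, split
 * the 64-bit result into the guest %edx:%eax register pair.
 */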
2584 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
2585 {
2586 	struct vmxctx *vmxctx;
2587 	uint64_t result;
2588 	uint32_t eax, edx;
2589 	int error;
2590 
2591 	if (lapic_msr(num))
2592 		error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu);
2593 	else
2594 		error = vmx_rdmsr(vmx, vcpuid, num, &result, retu);
2595 
2596 	if (error == 0) {
2597 		eax = result;
2598 		vmxctx = &vmx->ctx[vcpuid];
2599 		error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax);
2600 		KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));
2601 
2602 		edx = result >> 32;
2603 		error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx);
2604 		KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
2605 	}
2606 
2607 	return (error);
2608 }
2609 
2610 #ifndef __FreeBSD__
2611 #define	__predict_false(x)	(x)
2612 #endif
2613 
2614 static int
2615 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
2616 {
2617 	int error, errcode, errcode_valid, handled, in;
2618 	struct vmxctx *vmxctx;
2619 	struct vlapic *vlapic;
2620 	struct vm_inout_str *vis;
2621 	struct vm_task_switch *ts;
2622 	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
2623 	uint32_t intr_type, intr_vec, reason;
2624 	uint64_t exitintinfo, qual, gpa;
2625 	bool retu;
2626 
2627 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
2628 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
2629 
2630 	handled = UNHANDLED;
2631 	vmxctx = &vmx->ctx[vcpu];
2632 
2633 	qual = vmexit->u.vmx.exit_qualification;
2634 	reason = vmexit->u.vmx.exit_reason;
2635 	vmexit->exitcode = VM_EXITCODE_BOGUS;
2636 
2637 	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
2638 	SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit);
2639 
2640 	/*
2641 	 * VM-entry failures during or after loading guest state.
2642 	 *
2643 	 * These VM-exits are uncommon but must be handled specially
2644 	 * as most VM-exit fields are not populated as usual.
2645 	 */
2646 	if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
2647 		VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry");
2648 #ifdef __FreeBSD__
2649 		__asm __volatile("int $18");
2650 #else
2651 		vmm_call_trap(T_MCE);
2652 #endif
2653 		return (1);
2654 	}
2655 
2656 	/*
2657 	 * VM exits that can be triggered during event delivery need to
2658 	 * be handled specially by re-injecting the event if the IDT
2659 	 * vectoring information field's valid bit is set.
2660 	 *
2661 	 * See "Information for VM Exits During Event Delivery" in Intel SDM
2662 	 * for details.
2663 	 */
2664 	idtvec_info = vmcs_idt_vectoring_info();
2665 	if (idtvec_info & VMCS_IDT_VEC_VALID) {
2666 		idtvec_info &= ~(1 << 12); /* clear undefined bit */
2667 		exitintinfo = idtvec_info;
2668 		if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
2669 			idtvec_err = vmcs_idt_vectoring_err();
2670 			exitintinfo |= (uint64_t)idtvec_err << 32;
2671 		}
2672 		error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
2673 		KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
2674 		    __func__, error));
2675 
2676 		/*
2677 		 * If 'virtual NMIs' are being used and the VM-exit
2678 		 * happened while injecting an NMI during the previous
2679 		 * VM-entry, then clear "blocking by NMI" in the
2680 		 * Guest Interruptibility-State so the NMI can be
2681 		 * reinjected on the subsequent VM-entry.
2682 		 *
2683 		 * However, if the NMI was being delivered through a task
2684 		 * gate, then the new task must start execution with NMIs
2685 		 * blocked so don't clear NMI blocking in this case.
2686 		 */
2687 		intr_type = idtvec_info & VMCS_INTR_T_MASK;
2688 		if (intr_type == VMCS_INTR_T_NMI) {
2689 			if (reason != EXIT_REASON_TASK_SWITCH)
2690 				vmx_clear_nmi_blocking(vmx, vcpu);
2691 			else
2692 				vmx_assert_nmi_blocking(vmx, vcpu);
2693 		}
2694 
2695 		/*
2696 		 * Update VM-entry instruction length if the event being
2697 		 * delivered was a software interrupt or software exception.
2698 		 */
2699 		if (intr_type == VMCS_INTR_T_SWINTR ||
2700 		    intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
2701 		    intr_type == VMCS_INTR_T_SWEXCEPTION) {
2702 			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
2703 		}
2704 	}
2705 
2706 	switch (reason) {
2707 	case EXIT_REASON_TASK_SWITCH:
2708 		ts = &vmexit->u.task_switch;
2709 		ts->tsssel = qual & 0xffff;
2710 		ts->reason = vmx_task_switch_reason(qual);
2711 		ts->ext = 0;
2712 		ts->errcode_valid = 0;
2713 		vmx_paging_info(&ts->paging);
2714 		/*
2715 		 * If the task switch was due to a CALL, JMP, IRET, software
2716 		 * interrupt (INT n) or software exception (INT3, INTO),
2717 		 * then the saved %rip references the instruction that caused
2718 		 * the task switch. The instruction length field in the VMCS
2719 		 * is valid in this case.
2720 		 *
2721 		 * In all other cases (e.g., NMI, hardware exception) the
2722 		 * saved %rip is one that would have been saved in the old TSS
2723 		 * had the task switch completed normally so the instruction
2724 		 * length field is not needed in this case and is explicitly
2725 		 * set to 0.
2726 		 */
2727 		if (ts->reason == TSR_IDT_GATE) {
2728 			KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
2729 			    ("invalid idtvec_info %#x for IDT task switch",
2730 			    idtvec_info));
2731 			intr_type = idtvec_info & VMCS_INTR_T_MASK;
2732 			if (intr_type != VMCS_INTR_T_SWINTR &&
2733 			    intr_type != VMCS_INTR_T_SWEXCEPTION &&
2734 			    intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
2735 				/* Task switch triggered by external event */
2736 				ts->ext = 1;
2737 				vmexit->inst_length = 0;
2738 				if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
2739 					ts->errcode_valid = 1;
2740 					ts->errcode = vmcs_idt_vectoring_err();
2741 				}
2742 			}
2743 		}
2744 		vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
2745 		SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts);
2746 		VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
2747 		    "%s errcode 0x%016lx", ts->reason, ts->tsssel,
2748 		    ts->ext ? "external" : "internal",
2749 		    ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
2750 		break;
2751 	case EXIT_REASON_CR_ACCESS:
2752 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
2753 		SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual);
2754 		switch (qual & 0xf) {
2755 		case 0:
2756 			handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
2757 			break;
2758 		case 4:
2759 			handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
2760 			break;
2761 		case 8:
2762 			handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
2763 			break;
2764 		}
2765 		break;
2766 	case EXIT_REASON_RDMSR:
2767 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
2768 		retu = false;
2769 		ecx = vmxctx->guest_rcx;
2770 		VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
2771 		SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx);
2772 		error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
2773 		if (error) {
2774 			vmexit->exitcode = VM_EXITCODE_RDMSR;
2775 			vmexit->u.msr.code = ecx;
2776 		} else if (!retu) {
2777 			handled = HANDLED;
2778 		} else {
2779 			/* Return to userspace with a valid exitcode */
2780 			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
2781 			    ("emulate_rdmsr retu with bogus exitcode"));
2782 		}
2783 		break;
2784 	case EXIT_REASON_WRMSR:
2785 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
2786 		retu = false;
2787 		eax = vmxctx->guest_rax;
2788 		ecx = vmxctx->guest_rcx;
2789 		edx = vmxctx->guest_rdx;
2790 		VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
2791 		    ecx, (uint64_t)edx << 32 | eax);
2792 		SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx,
2793 		    (uint64_t)edx << 32 | eax);
2794 		error = emulate_wrmsr(vmx, vcpu, ecx,
2795 		    (uint64_t)edx << 32 | eax, &retu);
2796 		if (error) {
2797 			vmexit->exitcode = VM_EXITCODE_WRMSR;
2798 			vmexit->u.msr.code = ecx;
2799 			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
2800 		} else if (!retu) {
2801 			handled = HANDLED;
2802 		} else {
2803 			/* Return to userspace with a valid exitcode */
2804 			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
2805 			    ("emulate_wrmsr retu with bogus exitcode"));
2806 		}
2807 		break;
2808 	case EXIT_REASON_HLT:
2809 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
2810 		SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit);
2811 		vmexit->exitcode = VM_EXITCODE_HLT;
2812 		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
2813 		if (virtual_interrupt_delivery)
2814 			vmexit->u.hlt.intr_status =
2815 			    vmcs_read(VMCS_GUEST_INTR_STATUS);
2816 		else
2817 			vmexit->u.hlt.intr_status = 0;
2818 		break;
2819 	case EXIT_REASON_MTF:
2820 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
2821 		SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit);
2822 		vmexit->exitcode = VM_EXITCODE_MTRAP;
2823 		vmexit->inst_length = 0;
2824 		break;
2825 	case EXIT_REASON_PAUSE:
2826 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
2827 		SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit);
2828 		vmexit->exitcode = VM_EXITCODE_PAUSE;
2829 		break;
2830 	case EXIT_REASON_INTR_WINDOW:
2831 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
2832 		SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit);
2833 		vmx_clear_int_window_exiting(vmx, vcpu);
2834 		return (1);
2835 	case EXIT_REASON_EXT_INTR:
2836 		/*
2837 		 * External interrupts serve only to cause VM exits and allow
2838 		 * the host interrupt handler to run.
2839 		 *
2840 		 * If this external interrupt triggers a virtual interrupt
2841 		 * to a VM, then that state will be recorded by the
2842 		 * host interrupt handler in the VM's softc. We will inject
2843 		 * this virtual interrupt during the subsequent VM enter.
2844 		 */
2845 		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2846 		SDT_PROBE4(vmm, vmx, exit, interrupt,
2847 		    vmx, vcpu, vmexit, intr_info);
2848 
2849 		/*
2850 		 * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
2851 		 * This appears to be a bug in VMware Fusion?
2852 		 */
2853 		if (!(intr_info & VMCS_INTR_VALID))
2854 			return (1);
2855 		KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
2856 		    (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
2857 		    ("VM exit interruption info invalid: %#x", intr_info));
2858 		vmx_trigger_hostintr(intr_info & 0xff);
2859 
2860 		/*
2861 		 * This is special. We want to treat this as a 'handled'
2862 		 * VM-exit but not increment the instruction pointer.
2863 		 */
2864 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
2865 		return (1);
2866 	case EXIT_REASON_NMI_WINDOW:
2867 		SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit);
2868 		/* Exit to allow the pending virtual NMI to be injected */
2869 		if (vm_nmi_pending(vmx->vm, vcpu))
2870 			vmx_inject_nmi(vmx, vcpu);
2871 		vmx_clear_nmi_window_exiting(vmx, vcpu);
2872 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
2873 		return (1);
2874 	case EXIT_REASON_INOUT:
2875 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
2876 		vmexit->exitcode = VM_EXITCODE_INOUT;
2877 		vmexit->u.inout.bytes = (qual & 0x7) + 1;
2878 		vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
2879 		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
2880 		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
2881 		vmexit->u.inout.port = (uint16_t)(qual >> 16);
2882 		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
2883 		if (vmexit->u.inout.string) {
2884 			inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
2885 			vmexit->exitcode = VM_EXITCODE_INOUT_STR;
2886 			vis = &vmexit->u.inout_str;
2887 			vmx_paging_info(&vis->paging);
2888 			vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
2889 			vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
2890 			vis->index = inout_str_index(vmx, vcpu, in);
2891 			vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
2892 			vis->addrsize = inout_str_addrsize(inst_info);
2893 			inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
2894 		}
2895 		SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit);
2896 		break;
2897 	case EXIT_REASON_CPUID:
2898 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
2899 		SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit);
2900 		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
2901 		break;
2902 	case EXIT_REASON_EXCEPTION:
2903 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
2904 		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2905 		KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2906 		    ("VM exit interruption info invalid: %#x", intr_info));
2907 
2908 		intr_vec = intr_info & 0xff;
2909 		intr_type = intr_info & VMCS_INTR_T_MASK;
2910 
2911 		/*
2912 		 * If Virtual NMIs control is 1 and the VM-exit is due to a
2913 		 * fault encountered during the execution of IRET then we must
2914 		 * restore the state of "virtual-NMI blocking" before resuming
2915 		 * the guest.
2916 		 *
2917 		 * See "Resuming Guest Software after Handling an Exception".
2918 		 * See "Information for VM Exits Due to Vectored Events".
2919 		 */
2920 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2921 		    (intr_vec != IDT_DF) &&
2922 		    (intr_info & EXIT_QUAL_NMIUDTI) != 0)
2923 			vmx_restore_nmi_blocking(vmx, vcpu);
2924 
2925 		/*
2926 		 * The NMI has already been handled in vmx_exit_handle_nmi().
2927 		 */
2928 		if (intr_type == VMCS_INTR_T_NMI)
2929 			return (1);
2930 
2931 		/*
2932 		 * Call the machine check handler by hand. Also don't reflect
2933 		 * the machine check back into the guest.
2934 		 */
2935 		if (intr_vec == IDT_MC) {
2936 			VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler");
2937 #ifdef __FreeBSD__
2938 			__asm __volatile("int $18");
2939 #else
2940 			vmm_call_trap(T_MCE);
2941 #endif
2942 			return (1);
2943 		}
2944 
2945 		if (intr_vec == IDT_PF) {
2946 			error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
2947 			KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
2948 			    __func__, error));
2949 		}
2950 
2951 		/*
2952 		 * Software exceptions exhibit trap-like behavior. This in
2953 		 * turn requires populating the VM-entry instruction length
2954 		 * so that the %rip in the trap frame is past the INT3/INTO
2955 		 * instruction.
2956 		 */
2957 		if (intr_type == VMCS_INTR_T_SWEXCEPTION)
2958 			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
2959 
2960 		/* Reflect all other exceptions back into the guest */
2961 		errcode_valid = errcode = 0;
2962 		if (intr_info & VMCS_INTR_DEL_ERRCODE) {
2963 			errcode_valid = 1;
2964 			errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
2965 		}
2966 		VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
2967 		    "the guest", intr_vec, errcode);
2968 		SDT_PROBE5(vmm, vmx, exit, exception,
2969 		    vmx, vcpu, vmexit, intr_vec, errcode);
2970 		error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
2971 		    errcode_valid, errcode, 0);
2972 		KASSERT(error == 0, ("%s: vm_inject_exception error %d",
2973 		    __func__, error));
2974 		return (1);
2975 
2976 	case EXIT_REASON_EPT_FAULT:
2977 		/*
2978 		 * If 'gpa' lies within the address space allocated to
2979 		 * memory then this must be a nested page fault; otherwise
2980 		 * this must be an instruction that accesses MMIO space.
2981 		 */
2982 		gpa = vmcs_gpa();
2983 		if (vm_mem_allocated(vmx->vm, vcpu, gpa) ||
2984 		    apic_access_fault(vmx, vcpu, gpa)) {
2985 			vmexit->exitcode = VM_EXITCODE_PAGING;
2986 			vmexit->inst_length = 0;
2987 			vmexit->u.paging.gpa = gpa;
2988 			vmexit->u.paging.fault_type = ept_fault_type(qual);
2989 			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
2990 			SDT_PROBE5(vmm, vmx, exit, nestedfault,
2991 			    vmx, vcpu, vmexit, gpa, qual);
2992 		} else if (ept_emulation_fault(qual)) {
2993 			vmexit_inst_emul(vmexit, gpa, vmcs_gla());
2994 			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
2995 			SDT_PROBE4(vmm, vmx, exit, mmiofault,
2996 			    vmx, vcpu, vmexit, gpa);
2997 		}
2998 		/*
2999 		 * If Virtual NMIs control is 1 and the VM-exit is due to an
3000 		 * EPT fault during the execution of IRET then we must restore
3001 		 * the state of "virtual-NMI blocking" before resuming.
3002 		 *
3003 		 * See description of "NMI unblocking due to IRET" in
3004 		 * "Exit Qualification for EPT Violations".
3005 		 */
3006 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
3007 		    (qual & EXIT_QUAL_NMIUDTI) != 0)
3008 			vmx_restore_nmi_blocking(vmx, vcpu);
3009 		break;
3010 	case EXIT_REASON_VIRTUALIZED_EOI:
3011 		vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
3012 		vmexit->u.ioapic_eoi.vector = qual & 0xFF;
3013 		SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit);
3014 		vmexit->inst_length = 0;	/* trap-like */
3015 		break;
3016 	case EXIT_REASON_APIC_ACCESS:
3017 		SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit);
3018 		handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
3019 		break;
3020 	case EXIT_REASON_APIC_WRITE:
3021 		/*
3022 		 * APIC-write VM exit is trap-like so the %rip is already
3023 		 * pointing to the next instruction.
3024 		 */
3025 		vmexit->inst_length = 0;
3026 		vlapic = vm_lapic(vmx->vm, vcpu);
3027 		SDT_PROBE4(vmm, vmx, exit, apicwrite,
3028 		    vmx, vcpu, vmexit, vlapic);
3029 		handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
3030 		break;
3031 	case EXIT_REASON_XSETBV:
3032 		SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit);
3033 		handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
3034 		break;
3035 	case EXIT_REASON_MONITOR:
3036 		SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit);
3037 		vmexit->exitcode = VM_EXITCODE_MONITOR;
3038 		break;
3039 	case EXIT_REASON_MWAIT:
3040 		SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit);
3041 		vmexit->exitcode = VM_EXITCODE_MWAIT;
3042 		break;
3043 	case EXIT_REASON_VMCALL:
3044 	case EXIT_REASON_VMCLEAR:
3045 	case EXIT_REASON_VMLAUNCH:
3046 	case EXIT_REASON_VMPTRLD:
3047 	case EXIT_REASON_VMPTRST:
3048 	case EXIT_REASON_VMREAD:
3049 	case EXIT_REASON_VMRESUME:
3050 	case EXIT_REASON_VMWRITE:
3051 	case EXIT_REASON_VMXOFF:
3052 	case EXIT_REASON_VMXON:
3053 		SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit);
3054 		vmexit->exitcode = VM_EXITCODE_VMINSN;
3055 		break;
3056 	default:
3057 		SDT_PROBE4(vmm, vmx, exit, unknown,
3058 		    vmx, vcpu, vmexit, reason);
3059 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
3060 		break;
3061 	}
3062 
3063 	if (handled) {
3064 		/*
3065 		 * It is possible that control is returned to userland
3066 		 * even though we were able to handle the VM exit in the
3067 		 * kernel.
3068 		 *
3069 		 * In such a case we want to make sure that the userland
3070 		 * restarts guest execution at the instruction *after*
3071 		 * the one we just processed. Therefore we update the
3072 		 * guest rip in the VMCS and in 'vmexit'.
3073 		 */
3074 		vmexit->rip += vmexit->inst_length;
3075 		vmexit->inst_length = 0;
3076 		vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
3077 	} else {
3078 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
3079 			/*
3080 			 * If this VM exit was not claimed by anybody then
3081 			 * treat it as a generic VMX exit.
3082 			 */
3083 			vmexit->exitcode = VM_EXITCODE_VMX;
3084 			vmexit->u.vmx.status = VM_SUCCESS;
3085 			vmexit->u.vmx.inst_type = 0;
3086 			vmexit->u.vmx.inst_error = 0;
3087 		} else {
3088 			/*
3089 			 * The exitcode and collateral have been populated.
3090 			 * The VM exit will be processed further in userland.
3091 			 */
3092 		}
3093 	}
3094 
3095 	SDT_PROBE4(vmm, vmx, exit, return,
3096 	    vmx, vcpu, vmexit, handled);
3097 	return (handled);
3098 }
3099 
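/*
 * Report a failed VMX instruction (VMLAUNCH, VMRESUME, INVEPT or, on the
 * non-FreeBSD build, VMWRITE) to userland as a generic VMX exit carrying
 * the VM-instruction error.
 */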
3100 static void
3101 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
3102 {
3103 
3104 	KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
3105 	    ("vmx_exit_inst_error: invalid inst_fail_status %d",
3106 	    vmxctx->inst_fail_status));
3107 
3108 	vmexit->inst_length = 0;
3109 	vmexit->exitcode = VM_EXITCODE_VMX;
3110 	vmexit->u.vmx.status = vmxctx->inst_fail_status;
3111 	vmexit->u.vmx.inst_error = vmcs_instruction_error();
3112 	vmexit->u.vmx.exit_reason = ~0;
3113 	vmexit->u.vmx.exit_qualification = ~0;
3114 
3115 	switch (rc) {
3116 	case VMX_VMRESUME_ERROR:
3117 	case VMX_VMLAUNCH_ERROR:
3118 	case VMX_INVEPT_ERROR:
3119 #ifndef __FreeBSD__
3120 	case VMX_VMWRITE_ERROR:
3121 #endif
3122 		vmexit->u.vmx.inst_type = rc;
3123 		break;
3124 	default:
3125 		panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
3126 	}
3127 }
3128 
3129 /*
3130  * If the NMI-exiting VM execution control is set to '1' then an NMI in
3131  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
3132  * sufficient to simply vector to the NMI handler via a software interrupt.
3133  * However, this must be done before maskable interrupts are enabled
3134  * otherwise the "iret" issued by an interrupt handler will incorrectly
3135  * clear NMI blocking.
3136  */
3137 static __inline void
3138 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
3139 {
3140 	uint32_t intr_info;
3141 
3142 	KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
3143 
3144 	if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
3145 		return;
3146 
3147 	intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
3148 	KASSERT((intr_info & VMCS_INTR_VALID) != 0,
3149 	    ("VM exit interruption info invalid: %#x", intr_info));
3150 
3151 	if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
3152 		KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
3153 		    "to NMI has invalid vector: %#x", intr_info));
3154 		VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
3155 #ifdef __FreeBSD__
3156 		__asm __volatile("int $2");
3157 #else
3158 		vmm_call_trap(T_NMIFLT);
3159 #endif
3160 	}
3161 }
3162 
3163 static __inline void
3164 vmx_dr_enter_guest(struct vmxctx *vmxctx)
3165 {
3166 	register_t rflags;
3167 
3168 	/* Save host control debug registers. */
3169 	vmxctx->host_dr7 = rdr7();
3170 	vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
3171 
3172 	/*
3173 	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
3174 	 * exceptions in the host based on the guest DRx values.  The
3175 	 * guest DR7 and DEBUGCTL are saved/restored in the VMCS.
3176 	 */
3177 	load_dr7(0);
3178 	wrmsr(MSR_DEBUGCTLMSR, 0);
3179 
3180 	/*
3181 	 * Disable single stepping the kernel to avoid corrupting the
3182 	 * guest DR6.  A debugger might still be able to corrupt the
3183 	 * guest DR6 by setting a breakpoint after this point and then
3184 	 * single stepping.
3185 	 */
3186 	rflags = read_rflags();
3187 	vmxctx->host_tf = rflags & PSL_T;
3188 	write_rflags(rflags & ~PSL_T);
3189 
3190 	/* Save host debug registers. */
3191 	vmxctx->host_dr0 = rdr0();
3192 	vmxctx->host_dr1 = rdr1();
3193 	vmxctx->host_dr2 = rdr2();
3194 	vmxctx->host_dr3 = rdr3();
3195 	vmxctx->host_dr6 = rdr6();
3196 
3197 	/* Restore guest debug registers. */
3198 	load_dr0(vmxctx->guest_dr0);
3199 	load_dr1(vmxctx->guest_dr1);
3200 	load_dr2(vmxctx->guest_dr2);
3201 	load_dr3(vmxctx->guest_dr3);
3202 	load_dr6(vmxctx->guest_dr6);
3203 }
3204 
3205 static __inline void
3206 vmx_dr_leave_guest(struct vmxctx *vmxctx)
3207 {
3208 
3209 	/* Save guest debug registers. */
3210 	vmxctx->guest_dr0 = rdr0();
3211 	vmxctx->guest_dr1 = rdr1();
3212 	vmxctx->guest_dr2 = rdr2();
3213 	vmxctx->guest_dr3 = rdr3();
3214 	vmxctx->guest_dr6 = rdr6();
3215 
3216 	/*
3217 	 * Restore host debug registers.  Restore DR7, DEBUGCTL, and
3218 	 * PSL_T last.
3219 	 */
3220 	load_dr0(vmxctx->host_dr0);
3221 	load_dr1(vmxctx->host_dr1);
3222 	load_dr2(vmxctx->host_dr2);
3223 	load_dr3(vmxctx->host_dr3);
3224 	load_dr6(vmxctx->host_dr6);
3225 	wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl);
3226 	load_dr7(vmxctx->host_dr7);
3227 	write_rflags(read_rflags() | vmxctx->host_tf);
3228 }
3229 
3230 static int
3231 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
3232     struct vm_eventinfo *evinfo)
3233 {
3234 	int rc, handled, launched;
3235 	struct vmx *vmx;
3236 	struct vm *vm;
3237 	struct vmxctx *vmxctx;
3238 	struct vmcs *vmcs;
3239 	struct vm_exit *vmexit;
3240 	struct vlapic *vlapic;
3241 	uint32_t exit_reason;
3242 #ifdef __FreeBSD__
3243 	struct region_descriptor gdtr, idtr;
3244 	uint16_t ldt_sel;
3245 #endif
3246 
3247 	vmx = arg;
3248 	vm = vmx->vm;
3249 	vmcs = &vmx->vmcs[vcpu];
3250 	vmxctx = &vmx->ctx[vcpu];
3251 	vlapic = vm_lapic(vm, vcpu);
3252 	vmexit = vm_exitinfo(vm, vcpu);
3253 	launched = 0;
3254 
3255 	KASSERT(vmxctx->pmap == pmap,
3256 	    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
3257 
3258 	vmx_msr_guest_enter(vmx, vcpu);
3259 
3260 	VMPTRLD(vmcs);
3261 
3262 #ifndef __FreeBSD__
3263 	VERIFY(vmx->vmcs_state[vcpu] == VS_NONE && curthread->t_preempt != 0);
3264 	vmx->vmcs_state[vcpu] = VS_LOADED;
3265 #endif
3266 
3267 	/*
3268 	 * XXX
3269 	 * We do this every time because we may setup the virtual machine
3270 	 * from a different process than the one that actually runs it.
3271 	 *
3272 	 * If the life of a virtual machine was spent entirely in the context
3273 	 * of a single process we could do this once in vmx_vminit().
3274 	 */
3275 	vmcs_write(VMCS_HOST_CR3, rcr3());
3276 
3277 	vmcs_write(VMCS_GUEST_RIP, rip);
3278 	vmx_set_pcpu_defaults(vmx, vcpu, pmap);
3279 	do {
3280 		KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
3281 		    "%#lx/%#lx", __func__, vmcs_guest_rip(), rip));
3282 
3283 		handled = UNHANDLED;
3284 		/*
3285 		 * Interrupts are disabled from this point on until the
3286 		 * guest starts executing. This is done for the following
3287 		 * reasons:
3288 		 *
3289 		 * If an AST is asserted on this thread after the check below,
3290 		 * then the IPI_AST notification will not be lost, because it
3291 		 * will cause a VM exit due to external interrupt as soon as
3292 		 * the guest state is loaded.
3293 		 *
3294 		 * A posted interrupt after 'vmx_inject_interrupts()' will
3295 		 * not be "lost" because it will be held pending in the host
3296 		 * APIC because interrupts are disabled. The pending interrupt
3297 		 * will be recognized as soon as the guest state is loaded.
3298 		 *
3299 		 * The same reasoning applies to the IPI generated by
3300 		 * pmap_invalidate_ept().
3301 		 */
3302 #ifdef __FreeBSD__
3303 		disable_intr();
3304 		vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
3305 #else
3306 		/*
3307 		 * The bulk of guest interrupt injection is done without
3308 		 * interrupts disabled on the host CPU.  This is necessary
3309 		 * since contended mutexes might force the thread to sleep.
3310 		 */
3311 		vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
3312 		disable_intr();
3313 		if (virtual_interrupt_delivery) {
3314 			vmx_inject_pir(vlapic);
3315 		}
3316 #endif /* __FreeBSD__ */
3317 
3318 		/*
3319 		 * Check for vcpu suspension after injecting events because
3320 		 * vmx_inject_interrupts() can suspend the vcpu due to a
3321 		 * triple fault.
3322 		 */
3323 		if (vcpu_suspended(evinfo)) {
3324 			enable_intr();
3325 			vm_exit_suspended(vmx->vm, vcpu, rip);
3326 			break;
3327 		}
3328 
3329 		if (vcpu_runblocked(evinfo)) {
3330 			enable_intr();
3331 			vm_exit_runblock(vmx->vm, vcpu, rip);
3332 			break;
3333 		}
3334 
3335 		if (vcpu_reqidle(evinfo)) {
3336 			enable_intr();
3337 			vm_exit_reqidle(vmx->vm, vcpu, rip);
3338 			break;
3339 		}
3340 
3341 		if (vcpu_should_yield(vm, vcpu)) {
3342 			enable_intr();
3343 			vm_exit_astpending(vmx->vm, vcpu, rip);
3344 			vmx_astpending_trace(vmx, vcpu, rip);
3345 			handled = HANDLED;
3346 			break;
3347 		}
3348 
3349 		if (vcpu_debugged(vm, vcpu)) {
3350 			enable_intr();
3351 			vm_exit_debug(vmx->vm, vcpu, rip);
3352 			break;
3353 		}
3354 
3355 #ifndef __FreeBSD__
3356 		if ((rc = smt_acquire()) != 1) {
3357 			enable_intr();
3358 			vmexit->rip = rip;
3359 			vmexit->inst_length = 0;
3360 			if (rc == -1) {
3361 				vmexit->exitcode = VM_EXITCODE_HT;
3362 			} else {
3363 				vmexit->exitcode = VM_EXITCODE_BOGUS;
3364 				handled = HANDLED;
3365 			}
3366 			break;
3367 		}
3368 
3369 		/*
3370 		 * If this thread has gone off-cpu due to mutex operations
3371 		 * during vmx_run, the VMCS will have been unloaded, forcing a
3372 		 * re-VMLAUNCH as opposed to VMRESUME.
3373 		 */
3374 		launched = (vmx->vmcs_state[vcpu] & VS_LAUNCHED) != 0;
3375 		/*
3376 		 * Restoration of the GDT limit is taken care of by
3377 		 * vmx_savectx().  Since the maximum practical index for the
3378 		 * IDT is 255, restoring its limits from the post-VMX-exit
3379 		 * default of 0xffff is not a concern.
3380 		 *
3381 		 * Only 64-bit hypervisor callers are allowed, which forgoes
3382 		 * the need to restore any LDT descriptor.  Toss an error to
3383 		 * anyone attempting to break that rule.
3384 		 */
3385 		if (curproc->p_model != DATAMODEL_LP64) {
3386 			smt_release();
3387 			enable_intr();
3388 			bzero(vmexit, sizeof (*vmexit));
3389 			vmexit->rip = rip;
3390 			vmexit->exitcode = VM_EXITCODE_VMX;
3391 			vmexit->u.vmx.status = VM_FAIL_INVALID;
3392 			handled = UNHANDLED;
3393 			break;
3394 		}
3395 #else
3396 		/*
3397 		 * VM exits restore the base address but not the
3398 		 * limits of GDTR and IDTR.  The VMCS only stores the
3399 		 * base address, so VM exits set the limits to 0xffff.
3400 		 * Save and restore the full GDTR and IDTR to restore
3401 		 * the limits.
3402 		 *
3403 		 * The VMCS does not save the LDTR at all, and VM
3404 		 * exits clear LDTR as if a NULL selector were loaded.
3405 		 * The userspace hypervisor probably doesn't use an
3406 		 * LDT, but save and restore it to be safe.
3407 		 */
3408 		sgdt(&gdtr);
3409 		sidt(&idtr);
3410 		ldt_sel = sldt();
3411 #endif
3412 
3413 		vmx_run_trace(vmx, vcpu);
3414 		vmx_dr_enter_guest(vmxctx);
3415 		rc = vmx_enter_guest(vmxctx, vmx, launched);
3416 		vmx_dr_leave_guest(vmxctx);
3417 
3418 #ifndef	__FreeBSD__
3419 		vmx->vmcs_state[vcpu] |= VS_LAUNCHED;
3420 		smt_release();
3421 #else
3422 		bare_lgdt(&gdtr);
3423 		lidt(&idtr);
3424 		lldt(ldt_sel);
3425 #endif
3426 
3427 		/* Collect some information for VM exit processing */
3428 		vmexit->rip = rip = vmcs_guest_rip();
3429 		vmexit->inst_length = vmexit_instruction_length();
3430 		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
3431 		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
3432 
3433 		/* Update 'nextrip' */
3434 		vmx->state[vcpu].nextrip = rip;
3435 
3436 		if (rc == VMX_GUEST_VMEXIT) {
3437 			vmx_exit_handle_nmi(vmx, vcpu, vmexit);
3438 			enable_intr();
3439 			handled = vmx_exit_process(vmx, vcpu, vmexit);
3440 		} else {
3441 			enable_intr();
3442 			vmx_exit_inst_error(vmxctx, rc, vmexit);
3443 		}
3444 #ifdef	__FreeBSD__
3445 		launched = 1;
3446 #endif
3447 		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
3448 		rip = vmexit->rip;
3449 	} while (handled);
3450 
3451 	/*
3452 	 * If a VM exit has been handled then the exitcode must be BOGUS.
3453 	 * If a VM exit is not handled then the exitcode must not be BOGUS.
3454 	 */
3455 	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
3456 	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
3457 		panic("Mismatch between handled (%d) and exitcode (%d)",
3458 		      handled, vmexit->exitcode);
3459 	}
3460 
3461 	if (!handled)
3462 		vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
3463 
3464 	VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
3465 	    vmexit->exitcode);
3466 
3467 	VMCLEAR(vmcs);
3468 	vmx_msr_guest_exit(vmx, vcpu);
3469 
3470 #ifndef __FreeBSD__
3471 	VERIFY(vmx->vmcs_state[vcpu] != VS_NONE && curthread->t_preempt != 0);
3472 	vmx->vmcs_state[vcpu] = VS_NONE;
3473 #endif
3474 
3475 	return (0);
3476 }
3477 
3478 static void
3479 vmx_vmcleanup(void *arg)
3480 {
3481 	int i;
3482 	struct vmx *vmx = arg;
3483 	uint16_t maxcpus;
3484 
3485 	if (apic_access_virtualization(vmx, 0))
3486 		vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
3487 
3488 	maxcpus = vm_get_maxcpus(vmx->vm);
3489 	for (i = 0; i < maxcpus; i++)
3490 		vpid_free(vmx->state[i].vpid);
3491 
3492 	free(vmx, M_VMX);
3493 
3494 	return;
3495 }
3496 
3497 static register_t *
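/*
 * Translate a VM_REG_GUEST_* identifier into a pointer to its slot in the
 * software-saved register context, or NULL for registers that are not
 * kept there.
 */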
3498 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
3499 {
3500 
3501 	switch (reg) {
3502 	case VM_REG_GUEST_RAX:
3503 		return (&vmxctx->guest_rax);
3504 	case VM_REG_GUEST_RBX:
3505 		return (&vmxctx->guest_rbx);
3506 	case VM_REG_GUEST_RCX:
3507 		return (&vmxctx->guest_rcx);
3508 	case VM_REG_GUEST_RDX:
3509 		return (&vmxctx->guest_rdx);
3510 	case VM_REG_GUEST_RSI:
3511 		return (&vmxctx->guest_rsi);
3512 	case VM_REG_GUEST_RDI:
3513 		return (&vmxctx->guest_rdi);
3514 	case VM_REG_GUEST_RBP:
3515 		return (&vmxctx->guest_rbp);
3516 	case VM_REG_GUEST_R8:
3517 		return (&vmxctx->guest_r8);
3518 	case VM_REG_GUEST_R9:
3519 		return (&vmxctx->guest_r9);
3520 	case VM_REG_GUEST_R10:
3521 		return (&vmxctx->guest_r10);
3522 	case VM_REG_GUEST_R11:
3523 		return (&vmxctx->guest_r11);
3524 	case VM_REG_GUEST_R12:
3525 		return (&vmxctx->guest_r12);
3526 	case VM_REG_GUEST_R13:
3527 		return (&vmxctx->guest_r13);
3528 	case VM_REG_GUEST_R14:
3529 		return (&vmxctx->guest_r14);
3530 	case VM_REG_GUEST_R15:
3531 		return (&vmxctx->guest_r15);
3532 	case VM_REG_GUEST_CR2:
3533 		return (&vmxctx->guest_cr2);
3534 	case VM_REG_GUEST_DR0:
3535 		return (&vmxctx->guest_dr0);
3536 	case VM_REG_GUEST_DR1:
3537 		return (&vmxctx->guest_dr1);
3538 	case VM_REG_GUEST_DR2:
3539 		return (&vmxctx->guest_dr2);
3540 	case VM_REG_GUEST_DR3:
3541 		return (&vmxctx->guest_dr3);
3542 	case VM_REG_GUEST_DR6:
3543 		return (&vmxctx->guest_dr6);
3544 	default:
3545 		break;
3546 	}
3547 	return (NULL);
3548 }
3549 
3550 static int
3551 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
3552 {
3553 	register_t *regp;
3554 
3555 	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
3556 		*retval = *regp;
3557 		return (0);
3558 	} else
3559 		return (EINVAL);
3560 }
3561 
3562 static int
3563 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
3564 {
3565 	register_t *regp;
3566 
3567 	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
3568 		*regp = val;
3569 		return (0);
3570 	} else
3571 		return (EINVAL);
3572 }
3573 
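/*
 * Accessors for the guest interrupt shadow, which is backed by the
 * interruptibility-state field of the VMCS.
 */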
3574 static int
3575 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
3576 {
3577 	uint64_t gi;
3578 	int error;
3579 
3580 	error = vmcs_getreg(&vmx->vmcs[vcpu], running,
3581 	    VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
3582 	*retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
3583 	return (error);
3584 }
3585 
3586 static int
3587 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
3588 {
3589 	struct vmcs *vmcs;
3590 	uint64_t gi;
3591 	int error, ident;
3592 
3593 	/*
3594 	 * Forcing the vcpu into an interrupt shadow is not supported.
3595 	 */
3596 	if (val) {
3597 		error = EINVAL;
3598 		goto done;
3599 	}
3600 
3601 	vmcs = &vmx->vmcs[vcpu];
3602 	ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
3603 	error = vmcs_getreg(vmcs, running, ident, &gi);
3604 	if (error == 0) {
3605 		gi &= ~HWINTR_BLOCKING;
3606 		error = vmcs_setreg(vmcs, running, ident, gi);
3607 	}
3608 done:
3609 	VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
3610 	    error ? "failed" : "succeeded");
3611 	return (error);
3612 }
3613 
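/*
 * Return the VMCS read-shadow field for a guest control register (only
 * %cr0 and %cr4 have shadows), or -1 if the register has none.
 */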
3614 static int
3615 vmx_shadow_reg(int reg)
3616 {
3617 	int shreg;
3618 
3619 	shreg = -1;
3620 
3621 	switch (reg) {
3622 	case VM_REG_GUEST_CR0:
3623 		shreg = VMCS_CR0_SHADOW;
3624 		break;
3625 	case VM_REG_GUEST_CR4:
3626 		shreg = VMCS_CR4_SHADOW;
3627 		break;
3628 	default:
3629 		break;
3630 	}
3631 
3632 	return (shreg);
3633 }
3634 
3635 static int
3636 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
3637 {
3638 	int running, hostcpu;
3639 	struct vmx *vmx = arg;
3640 
3641 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
3642 	if (running && hostcpu != curcpu)
3643 		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
3644 
3645 	if (reg == VM_REG_GUEST_INTR_SHADOW)
3646 		return (vmx_get_intr_shadow(vmx, vcpu, running, retval));
3647 
3648 	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
3649 		return (0);
3650 
3651 	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
3652 }
3653 
3654 static int
3655 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
3656 {
3657 	int error, hostcpu, running, shadow;
3658 	uint64_t ctls;
3659 	pmap_t pmap;
3660 	struct vmx *vmx = arg;
3661 
3662 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
3663 	if (running && hostcpu != curcpu)
3664 		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
3665 
3666 	if (reg == VM_REG_GUEST_INTR_SHADOW)
3667 		return (vmx_modify_intr_shadow(vmx, vcpu, running, val));
3668 
3669 	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
3670 		return (0);
3671 
3672 	error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
3673 
3674 	if (error == 0) {
3675 		/*
3676 		 * If the "load EFER" VM-entry control is 1 then the
3677 		 * value of EFER.LMA must be identical to the "IA-32e mode
3678 		 * guest" bit in the VM-entry controls.
3679 		 */
3680 		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
3681 		    (reg == VM_REG_GUEST_EFER)) {
3682 			vmcs_getreg(&vmx->vmcs[vcpu], running,
3683 				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
3684 			if (val & EFER_LMA)
3685 				ctls |= VM_ENTRY_GUEST_LMA;
3686 			else
3687 				ctls &= ~VM_ENTRY_GUEST_LMA;
3688 			vmcs_setreg(&vmx->vmcs[vcpu], running,
3689 				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
3690 		}
3691 
3692 		shadow = vmx_shadow_reg(reg);
3693 		if (shadow > 0) {
3694 			/*
3695 			 * Store the unmodified value in the shadow register.
3696 			 */
3697 			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
3698 				    VMCS_IDENT(shadow), val);
3699 		}
3700 
3701 		if (reg == VM_REG_GUEST_CR3) {
3702 			/*
3703 			 * Invalidate the guest vcpu's TLB mappings to emulate
3704 			 * the behavior of updating %cr3.
3705 			 *
3706 			 * XXX the processor retains global mappings when %cr3
3707 			 * is updated but vmx_invvpid() does not.
3708 			 */
3709 			pmap = vmx->ctx[vcpu].pmap;
3710 			vmx_invvpid(vmx, vcpu, pmap, running);
3711 		}
3712 	}
3713 
3714 	return (error);
3715 }
3716 
3717 static int
3718 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
3719 {
3720 	int hostcpu, running;
3721 	struct vmx *vmx = arg;
3722 
3723 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
3724 	if (running && hostcpu != curcpu)
3725 		panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
3726 
3727 	return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
3728 }
3729 
3730 static int
3731 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
3732 {
3733 	int hostcpu, running;
3734 	struct vmx *vmx = arg;
3735 
3736 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
3737 	if (running && hostcpu != curcpu)
3738 		panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
3739 
3740 	return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
3741 }
3742 
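/*
 * Query a VM capability: returns 0 with '*retval' indicating whether the
 * capability is currently enabled for the vcpu, or ENOENT if the
 * capability is not supported by the host processor.
 */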
3743 static int
3744 vmx_getcap(void *arg, int vcpu, int type, int *retval)
3745 {
3746 	struct vmx *vmx = arg;
3747 	int vcap;
3748 	int ret;
3749 
3750 	ret = ENOENT;
3751 
3752 	vcap = vmx->cap[vcpu].set;
3753 
3754 	switch (type) {
3755 	case VM_CAP_HALT_EXIT:
3756 		if (cap_halt_exit)
3757 			ret = 0;
3758 		break;
3759 	case VM_CAP_PAUSE_EXIT:
3760 		if (cap_pause_exit)
3761 			ret = 0;
3762 		break;
3763 	case VM_CAP_MTRAP_EXIT:
3764 		if (cap_monitor_trap)
3765 			ret = 0;
3766 		break;
3767 	case VM_CAP_UNRESTRICTED_GUEST:
3768 		if (cap_unrestricted_guest)
3769 			ret = 0;
3770 		break;
3771 	case VM_CAP_ENABLE_INVPCID:
3772 		if (cap_invpcid)
3773 			ret = 0;
3774 		break;
3775 	default:
3776 		break;
3777 	}
3778 
3779 	if (ret == 0)
3780 		*retval = (vcap & (1 << type)) ? 1 : 0;
3781 
3782 	return (ret);
3783 }
3784 
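/*
 * Enable or disable a VM capability by toggling the corresponding bit in
 * the primary or secondary processor-based VM-execution controls.  On
 * success the cached control value and the per-vcpu 'set' bitmap are
 * updated as well.
 */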
3785 static int
3786 vmx_setcap(void *arg, int vcpu, int type, int val)
3787 {
3788 	struct vmx *vmx = arg;
3789 	struct vmcs *vmcs = &vmx->vmcs[vcpu];
3790 	uint32_t baseval;
3791 	uint32_t *pptr;
3792 	int error;
3793 	int flag;
3794 	int reg;
3795 	int retval;
3796 
3797 	retval = ENOENT;
3798 	pptr = NULL;
3799 
3800 	switch (type) {
3801 	case VM_CAP_HALT_EXIT:
3802 		if (cap_halt_exit) {
3803 			retval = 0;
3804 			pptr = &vmx->cap[vcpu].proc_ctls;
3805 			baseval = *pptr;
3806 			flag = PROCBASED_HLT_EXITING;
3807 			reg = VMCS_PRI_PROC_BASED_CTLS;
3808 		}
3809 		break;
3810 	case VM_CAP_MTRAP_EXIT:
3811 		if (cap_monitor_trap) {
3812 			retval = 0;
3813 			pptr = &vmx->cap[vcpu].proc_ctls;
3814 			baseval = *pptr;
3815 			flag = PROCBASED_MTF;
3816 			reg = VMCS_PRI_PROC_BASED_CTLS;
3817 		}
3818 		break;
3819 	case VM_CAP_PAUSE_EXIT:
3820 		if (cap_pause_exit) {
3821 			retval = 0;
3822 			pptr = &vmx->cap[vcpu].proc_ctls;
3823 			baseval = *pptr;
3824 			flag = PROCBASED_PAUSE_EXITING;
3825 			reg = VMCS_PRI_PROC_BASED_CTLS;
3826 		}
3827 		break;
3828 	case VM_CAP_UNRESTRICTED_GUEST:
3829 		if (cap_unrestricted_guest) {
3830 			retval = 0;
3831 			pptr = &vmx->cap[vcpu].proc_ctls2;
3832 			baseval = *pptr;
3833 			flag = PROCBASED2_UNRESTRICTED_GUEST;
3834 			reg = VMCS_SEC_PROC_BASED_CTLS;
3835 		}
3836 		break;
3837 	case VM_CAP_ENABLE_INVPCID:
3838 		if (cap_invpcid) {
3839 			retval = 0;
3840 			pptr = &vmx->cap[vcpu].proc_ctls2;
3841 			baseval = *pptr;
3842 			flag = PROCBASED2_ENABLE_INVPCID;
3843 			reg = VMCS_SEC_PROC_BASED_CTLS;
3844 		}
3845 		break;
3846 	default:
3847 		break;
3848 	}
3849 
3850 	if (retval == 0) {
3851 		if (val) {
3852 			baseval |= flag;
3853 		} else {
3854 			baseval &= ~flag;
3855 		}
3856 		VMPTRLD(vmcs);
3857 		error = vmwrite(reg, baseval);
3858 		VMCLEAR(vmcs);
3859 
3860 		if (error) {
3861 			retval = error;
3862 		} else {
3863 			/*
3864 			 * Update the optionally stored flags, and record
3865 			 * the new setting.
3866 			 */
3867 			if (pptr != NULL) {
3868 				*pptr = baseval;
3869 			}
3870 
3871 			if (val) {
3872 				vmx->cap[vcpu].set |= (1 << type);
3873 			} else {
3874 				vmx->cap[vcpu].set &= ~(1 << type);
3875 			}
3876 		}
3877 	}
3878 
3879 	return (retval);
3880 }
3881 
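/*
 * APICv-specific extension of the vlapic which tracks the posted-interrupt
 * descriptor shared with the processor and the priorities of interrupts
 * posted while the 'pending' bit was already asserted.
 */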
3882 struct vlapic_vtx {
3883 	struct vlapic	vlapic;
3884 	struct pir_desc	*pir_desc;
3885 	struct vmx	*vmx;
3886 	u_int	pending_prio;
3887 };
3888 
3889 #define	VPR_PRIO_BIT(vpr)	(1 << ((vpr) >> 4))
3890 
3891 #define	VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)	\
3892 do {									\
3893 	VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",	\
3894 	    level ? "level" : "edge", vector);				\
3895 	VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);	\
3896 	VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);	\
3897 	VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);	\
3898 	VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);	\
3899 	VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
3900 } while (0)
3901 
3902 /*
3903  * vlapic->ops handlers that utilize the APICv hardware assist described in
3904  * Chapter 29 of the Intel SDM.
3905  */
3906 static int
3907 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
3908 {
3909 	struct vlapic_vtx *vlapic_vtx;
3910 	struct pir_desc *pir_desc;
3911 	uint64_t mask;
3912 	int idx, notify = 0;
3913 
3914 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
3915 	pir_desc = vlapic_vtx->pir_desc;
3916 
3917 	/*
3918 	 * Keep track of interrupt requests in the PIR descriptor. This is
3919 	 * because the virtual APIC page pointed to by the VMCS cannot be
3920 	 * modified if the vcpu is running.
3921 	 */
3922 	idx = vector / 64;
3923 	mask = 1UL << (vector % 64);
3924 	atomic_set_long(&pir_desc->pir[idx], mask);
3925 
3926 	/*
3927 	 * A notification is required whenever the 'pending' bit makes a
3928 	 * transition from 0->1.
3929 	 *
3930 	 * Even if the 'pending' bit is already asserted, notification about
3931 	 * the incoming interrupt may still be necessary.  For example, if a
3932 	 * vCPU is HLTed with a high PPR, a low priority interrupt would cause
3933 	 * the 0->1 'pending' transition with a notification, but the vCPU
3934 	 * would ignore the interrupt for the time being.  The same vCPU would
3935 	 * need to then be notified if a high-priority interrupt arrived which
3936 	 * satisfied the PPR.
3937 	 *
3938 	 * The priorities of interrupts injected while 'pending' is asserted
3939 	 * are tracked in a custom bitfield 'pending_prio'.  Should the
3940 	 * to-be-injected interrupt exceed the priorities already present, the
3941 	 * notification is sent.  The priorities recorded in 'pending_prio' are
3942 	 * cleared whenever the 'pending' bit makes another 0->1 transition.
3943 	 */
3944 	if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) {
3945 		notify = 1;
3946 		vlapic_vtx->pending_prio = 0;
3947 	} else {
3948 		const u_int old_prio = vlapic_vtx->pending_prio;
3949 		const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT);
3950 
3951 		if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) {
3952 			atomic_set_int(&vlapic_vtx->pending_prio, prio_bit);
3953 			notify = 1;
3954 		}
3955 	}
3956 
3957 	VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
3958 	    level, "vmx_set_intr_ready");
3959 	return (notify);
3960 }
3961 
3962 static int
3963 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
3964 {
3965 	struct vlapic_vtx *vlapic_vtx;
3966 	struct pir_desc *pir_desc;
3967 	struct LAPIC *lapic;
3968 	uint64_t pending, pirval;
3969 	uint32_t ppr, vpr;
3970 	int i;
3971 
3972 	/*
3973 	 * This function is only expected to be called from the 'HLT' exit
3974 	 * handler which does not care about the vector that is pending.
3975 	 */
3976 	KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
3977 
3978 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
3979 	pir_desc = vlapic_vtx->pir_desc;
3980 
3981 	pending = atomic_load_acq_long(&pir_desc->pending);
3982 	if (!pending) {
3983 		/*
3984 		 * While a virtual interrupt may have already been
3985 		 * processed, its actual delivery may still be pending on
3986 		 * the interruptibility of the guest.  Recognize a pending
3987 		 * interrupt by reevaluating the virtual interrupts as
3988 		 * described in Section 29.2.1 of the Intel SDM, Volume 3.
3989 		 */
3990 		struct vm_exit *vmexit;
3991 		uint8_t rvi, ppr;
3992 
3993 		vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
3994 		rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT;
3995 		lapic = vlapic->apic_page;
3996 		ppr = lapic->ppr & APIC_TPR_INT;
3997 		if (rvi > ppr) {
3998 			return (1);
3999 		}
4000 
4001 		return (0);
4002 	}
4003 
4004 	/*
4005 	 * If there is an interrupt pending then it will be recognized only
4006 	 * if its priority is greater than the processor priority.
4007 	 *
4008 	 * Special case: if the processor priority is zero then any pending
4009 	 * interrupt will be recognized.
4010 	 */
4011 	lapic = vlapic->apic_page;
4012 	ppr = lapic->ppr & APIC_TPR_INT;
4013 	if (ppr == 0)
4014 		return (1);
4015 
4016 	VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
4017 	    lapic->ppr);
4018 
4019 	vpr = 0;
4020 	for (i = 3; i >= 0; i--) {
4021 		pirval = pir_desc->pir[i];
4022 		if (pirval != 0) {
4023 			vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT;
4024 			break;
4025 		}
4026 	}
4027 
4028 	/*
4029 	 * If the highest-priority pending interrupt falls short of the
4030 	 * processor priority of this vCPU, ensure that 'pending_prio' does not
4031 	 * have any stale bits which would preclude a higher-priority interrupt
4032 	 * from incurring a notification later.
4033 	 */
4034 	if (vpr <= ppr) {
4035 		const u_int prio_bit = VPR_PRIO_BIT(vpr);
4036 		const u_int old = vlapic_vtx->pending_prio;
4037 
4038 		if (old > prio_bit && (old & prio_bit) == 0) {
4039 			vlapic_vtx->pending_prio = prio_bit;
4040 		}
4041 		return (0);
4042 	}
4043 	return (1);
4044 }
4045 
4046 static void
4047 vmx_intr_accepted(struct vlapic *vlapic, int vector)
4048 {
4049 
4050 	panic("vmx_intr_accepted: not expected to be called");
4051 }
4052 
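/*
 * Load the trigger-mode masks into the EOI-exit bitmap fields of the VMCS.
 * Vectors whose bits are set here cause an EOI-induced VM-exit when
 * virtual-interrupt delivery is in use.
 */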
4053 static void
4054 vmx_set_tmr(struct vlapic *vlapic, const uint32_t *masks)
4055 {
4056 	vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)masks[1] << 32) | masks[0]);
4057 	vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)masks[3] << 32) | masks[2]);
4058 	vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)masks[5] << 32) | masks[4]);
4059 	vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)masks[7] << 32) | masks[6]);
4060 }
4061 
4062 static void
4063 vmx_enable_x2apic_mode(struct vlapic *vlapic)
4064 {
4065 	struct vmx *vmx;
4066 	struct vmcs *vmcs;
4067 	uint32_t proc_ctls2;
4068 	int vcpuid, error;
4069 
4070 	vcpuid = vlapic->vcpuid;
4071 	vmx = ((struct vlapic_vtx *)vlapic)->vmx;
4072 	vmcs = &vmx->vmcs[vcpuid];
4073 
4074 	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
4075 	KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
4076 	    ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
4077 
4078 	proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
4079 	proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
4080 	vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
4081 
4082 	VMPTRLD(vmcs);
4083 	vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
4084 	VMCLEAR(vmcs);
4085 
4086 	if (vlapic->vcpuid == 0) {
4087 		/*
4088 		 * The nested page table mappings are shared by all vcpus
4089 		 * so unmap the APIC access page just once.
4090 		 */
4091 		error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
4092 		KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
4093 		    __func__, error));
4094 
4095 		/*
4096 		 * The MSR bitmap is shared by all vcpus so modify it only
4097 		 * once in the context of vcpu 0.
4098 		 */
4099 		error = vmx_allow_x2apic_msrs(vmx);
4100 		KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
4101 		    __func__, error));
4102 	}
4103 }
4104 
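/*
 * Send a posted-interrupt notification IPI to the host cpu on which the
 * target vcpu is currently running.
 */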
4105 static void
4106 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
4107 {
4108 #ifdef __FreeBSD__
4109 	ipi_cpu(hostcpu, pirvec);
4110 #else
4111 	psm_send_pir_ipi(hostcpu);
4112 #endif
4113 }
4114 
4115 /*
4116  * Transfer the pending interrupts in the PIR descriptor to the IRR
4117  * in the virtual APIC page.
4118  */
4119 static void
4120 vmx_inject_pir(struct vlapic *vlapic)
4121 {
4122 	struct vlapic_vtx *vlapic_vtx;
4123 	struct pir_desc *pir_desc;
4124 	struct LAPIC *lapic;
4125 	uint64_t val, pirval;
4126 	int rvi, pirbase = -1;
4127 	uint16_t intr_status_old, intr_status_new;
4128 
4129 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
4130 	pir_desc = vlapic_vtx->pir_desc;
4131 	if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
4132 		VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
4133 		    "no posted interrupt pending");
4134 		return;
4135 	}
4136 
4137 	pirval = 0;
4138 	pirbase = -1;
4139 	lapic = vlapic->apic_page;
4140 
4141 	val = atomic_readandclear_long(&pir_desc->pir[0]);
4142 	if (val != 0) {
4143 		lapic->irr0 |= val;
4144 		lapic->irr1 |= val >> 32;
4145 		pirbase = 0;
4146 		pirval = val;
4147 	}
4148 
4149 	val = atomic_readandclear_long(&pir_desc->pir[1]);
4150 	if (val != 0) {
4151 		lapic->irr2 |= val;
4152 		lapic->irr3 |= val >> 32;
4153 		pirbase = 64;
4154 		pirval = val;
4155 	}
4156 
4157 	val = atomic_readandclear_long(&pir_desc->pir[2]);
4158 	if (val != 0) {
4159 		lapic->irr4 |= val;
4160 		lapic->irr5 |= val >> 32;
4161 		pirbase = 128;
4162 		pirval = val;
4163 	}
4164 
4165 	val = atomic_readandclear_long(&pir_desc->pir[3]);
4166 	if (val != 0) {
4167 		lapic->irr6 |= val;
4168 		lapic->irr7 |= val >> 32;
4169 		pirbase = 192;
4170 		pirval = val;
4171 	}
4172 
4173 	VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
4174 
4175 	/*
4176 	 * Update RVI so the processor can evaluate pending virtual
4177 	 * interrupts on VM-entry.
4178 	 *
4179 	 * It is possible for pirval to be 0 here, even though the
4180 	 * pending bit has been set. The scenario is:
4181 	 * CPU-Y is sending a posted interrupt to CPU-X, which
4182 	 * is running a guest and processing posted interrupts in h/w.
4183 	 * CPU-X will eventually exit and the state seen in s/w is
4184 	 * the pending bit set, but no PIR bits set.
4185 	 *
4186 	 *      CPU-X                      CPU-Y
4187 	 *   (vm running)                (host running)
4188 	 *   rx posted interrupt
4189 	 *   CLEAR pending bit
4190 	 *				 SET PIR bit
4191 	 *   READ/CLEAR PIR bits
4192 	 *				 SET pending bit
4193 	 *   (vm exit)
4194 	 *   pending bit set, PIR 0
4195 	 */
4196 	if (pirval != 0) {
4197 		rvi = pirbase + flsl(pirval) - 1;
4198 		intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
4199 		intr_status_new = (intr_status_old & 0xFF00) | rvi;
4200 		if (intr_status_new > intr_status_old) {
4201 			vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
4202 			VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
4203 			    "guest_intr_status changed from 0x%04x to 0x%04x",
4204 			    intr_status_old, intr_status_new);
4205 		}
4206 	}
4207 }
4208 
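/*
 * Allocate and initialize the virtual APIC state for a vcpu, wiring up the
 * APICv-accelerated vlapic ops when virtual interrupt delivery and posted
 * interrupts are available.
 */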
4209 static struct vlapic *
4210 vmx_vlapic_init(void *arg, int vcpuid)
4211 {
4212 	struct vmx *vmx;
4213 	struct vlapic *vlapic;
4214 	struct vlapic_vtx *vlapic_vtx;
4215 
4216 	vmx = arg;
4217 
4218 	vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
4219 	vlapic->vm = vmx->vm;
4220 	vlapic->vcpuid = vcpuid;
4221 	vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
4222 
4223 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
4224 	vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
4225 	vlapic_vtx->vmx = vmx;
4226 
4227 	if (virtual_interrupt_delivery) {
4228 		vlapic->ops.set_intr_ready = vmx_set_intr_ready;
4229 		vlapic->ops.pending_intr = vmx_pending_intr;
4230 		vlapic->ops.intr_accepted = vmx_intr_accepted;
4231 		vlapic->ops.set_tmr = vmx_set_tmr;
4232 		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
4233 	}
4234 
4235 	if (posted_interrupts)
4236 		vlapic->ops.post_intr = vmx_post_intr;
4237 
4238 	vlapic_init(vlapic);
4239 
4240 	return (vlapic);
4241 }
4242 
4243 static void
4244 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
4245 {
4246 
4247 	vlapic_cleanup(vlapic);
4248 	free(vlapic, M_VLAPIC);
4249 }
4250 
4251 #ifndef __FreeBSD__
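/*
 * Save/restore hooks used when the thread hosting a vcpu is switched off
 * of, or back onto, a host CPU.  The VMCS is VMCLEARed (and the guest MSR
 * state unloaded) on save, then VMPTRLDed again on restore.
 */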
4252 static void
4253 vmx_savectx(void *arg, int vcpu)
4254 {
4255 	struct vmx *vmx = arg;
4256 	struct vmcs *vmcs = &vmx->vmcs[vcpu];
4257 
4258 	if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) {
4259 		VERIFY3U(vmclear(vmcs), ==, 0);
4260 		vmx_msr_guest_exit(vmx, vcpu);
4261 		/*
4262 		 * Now that the VMCS has been VMCLEARed, it can no longer be
4263 		 * re-entered with VMRESUME; it must be VMLAUNCHed again.
4264 		 */
4265 		vmx->vmcs_state[vcpu] &= ~VS_LAUNCHED;
4266 	}
4267 
4268 	reset_gdtr_limit();
4269 }
4270 
4271 static void
4272 vmx_restorectx(void *arg, int vcpu)
4273 {
4274 	struct vmx *vmx = arg;
4275 	struct vmcs *vmcs = &vmx->vmcs[vcpu];
4276 
4277 	ASSERT0(vmx->vmcs_state[vcpu] & VS_LAUNCHED);
4278 
4279 	if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) {
4280 		vmx_msr_guest_enter(vmx, vcpu);
4281 		VERIFY3U(vmptrld(vmcs), ==, 0);
4282 	}
4283 }
4284 #endif /* __FreeBSD__ */
4285 
4286 struct vmm_ops vmm_ops_intel = {
4287 	vmx_init,
4288 	vmx_cleanup,
4289 	vmx_restore,
4290 	vmx_vminit,
4291 	vmx_run,
4292 	vmx_vmcleanup,
4293 	vmx_getreg,
4294 	vmx_setreg,
4295 	vmx_getdesc,
4296 	vmx_setdesc,
4297 	vmx_getcap,
4298 	vmx_setcap,
4299 	ept_vmspace_alloc,
4300 	ept_vmspace_free,
4301 	vmx_vlapic_init,
4302 	vmx_vlapic_cleanup,
4303 
4304 #ifndef __FreeBSD__
4305 	vmx_savectx,
4306 	vmx_restorectx,
4307 #endif
4308 };
4309 
4310 #ifndef __FreeBSD__
4311 /* Side-effect free HW validation derived from checks in vmx_init. */
4312 int
4313 vmx_x86_supported(const char **msg)
4314 {
4315 	int error;
4316 	uint32_t tmp;
4317 
4318 	ASSERT(msg != NULL);
4319 
4320 	/* Check support for primary processor-based VM-execution controls */
4321 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
4322 	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING,
4323 	    PROCBASED_CTLS_ZERO_SETTING, &tmp);
4324 	if (error) {
4325 		*msg = "processor does not support desired primary "
4326 		    "processor-based controls";
4327 		return (error);
4328 	}
4329 
4330 	/* Check support for secondary processor-based VM-execution controls */
4331 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
4332 	    MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING,
4333 	    PROCBASED_CTLS2_ZERO_SETTING, &tmp);
4334 	if (error) {
4335 		*msg = "processor does not support desired secondary "
4336 		    "processor-based controls";
4337 		return (error);
4338 	}
4339 
4340 	/* Check support for pin-based VM-execution controls */
4341 	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
4342 	    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING,
4343 	    PINBASED_CTLS_ZERO_SETTING, &tmp);
4344 	if (error) {
4345 		*msg = "processor does not support desired pin-based controls";
4346 		return (error);
4347 	}
4348 
4349 	/* Check support for VM-exit controls */
4350 	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
4351 	    VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &tmp);
4352 	if (error) {
4353 		*msg = "processor does not support desired exit controls";
4354 		return (error);
4355 	}
4356 
4357 	/* Check support for VM-entry controls */
4358 	error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
4359 	    VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &tmp);
4360 	if (error) {
4361 		*msg = "processor does not support desired entry controls";
4362 		return (error);
4363 	}
4364 
4365 	/* Unrestricted guest is nominally optional, but not for us. */
4366 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
4367 	    PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp);
4368 	if (error) {
4369 		*msg = "processor does not support desired unrestricted guest "
4370 		    "controls";
4371 		return (error);
4372 	}
4373 
4374 	return (0);
4375 }
4376 #endif
4377