xref: /illumos-gate/usr/src/cmd/bhyve/task_switch.c (revision 4c87aefe)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "struct tss32 must be 104 bytes");

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
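/*
 * A segment selector's low 3 bits hold the RPL (bits 1:0) and the table
 * indicator (bit 2); the remaining bits are the descriptor index.
 * SEL_START() therefore yields the byte offset of the first byte of the
 * 8-byte descriptor within its table and SEL_LIMIT() the offset of its
 * last byte. For example, selector 0x1b (index 3, TI = 0, RPL = 3) spans
 * table offsets 0x18 through 0x1f. TSS_BUSY() tests the descriptor type
 * bit that distinguishes a busy TSS (e.g. SDT_SYS386BSY) from an
 * available one (e.g. SDT_SYS386TSS).
 */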

static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}

static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}
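
/*
 * The 'access' word assembled above mirrors the VMX segment access-rights
 * layout: type and S bit in bits 4:0, DPL in bits 6:5, present in bit 7,
 * AVL in bit 12, D/B in bit 14 and granularity in bit 15. With the G bit
 * set the 20-bit descriptor limit is interpreted in 4KB units, e.g. a raw
 * limit of 0xfffff expands to a byte-granular limit of 0xffffffff.
 */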

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}
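
/*
 * For example, a #TS caused by GDT selector 0x28 while delivering an
 * external interrupt ('ext' != 0) is injected with error code 0x29:
 * the selector index with the IDT bit clear and the EXT bit set.
 */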

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read the segment descriptor 'desc' from, or write it to, the GDT/LDT
 * slot referenced by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
	    faultptr);
	if (error || *faultptr)
		return (error);

	if (doread)
		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
	else
		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
	return (0);
}
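
/*
 * vm_copy_setup() translates the guest-linear region into host-mapped
 * chunks; two iovec entries suffice because an 8-byte descriptor can
 * straddle at most one guest page boundary. If the translation faults,
 * the exception has already been injected into the guest and '*faultptr'
 * is set, so callers simply resume the guest.
 */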

static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
	return (error);
}

static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}
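
/*
 * The predicates above decode the 5-bit descriptor type field: bit 4 (S)
 * distinguishes code/data from system descriptors, bit 3 distinguishes
 * code from data, and bit 1 means "writable" for data segments or
 * "readable" for code segments. Thus 0x12 is writable data (a valid
 * stack segment), 0x18 through 0x1f are code segments, and 0x1a marks a
 * readable code segment.
 */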

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* VMX "unusable" bit 16 */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
	if (error || *faultptr)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

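	/*
	 * Privilege checks per "Task Switching", Intel SDM Vol 3: the
	 * stack segment's selector RPL and descriptor DPL must both equal
	 * the CPL of the incoming task. For example, a ring-0 task must
	 * load an SS whose RPL and DPL are both 0.
	 */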
	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

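	/*
	 * Only the dynamic state of the outgoing task is written back:
	 * general purpose registers, segment selectors, EFLAGS and EIP.
	 * The static fields (CR3, the LDT selector, the ring 0-2 stack
	 * pointers and the previous task link) are not modified on a
	 * switch away from a task.
	 */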
	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR (CR3) */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved
				 * (0x1E6 == binary 1 1110 0110).
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					return (1);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
	    faultptr);
	if (error || *faultptr)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	*faultptr = 0;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		*faultptr = 1;
		return (0);
	}

	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		*faultptr = 1;
		return (0);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov), faultptr);
	if (error || *faultptr)
		return (error);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define	CHKERR(error, fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)

int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;
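	/*
	 * The final 'else' arm is unreachable because 'nt_type' was
	 * validated above; the assert below enforces this.
	 */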

	assert(minlimit > 0);
	if (nt.limit < minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/* Get the old TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}
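	/*
	 * Busy bit handling follows "Task Linking", Intel SDM Vol 3: JMP
	 * and IRET clear the outgoing task's busy bit because the old task
	 * is abandoned, while CALL and IDT-gate switches leave it set so
	 * the nested task can later IRET back through the link field.
	 */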

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
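	/*
	 * The processor sets CR0.TS on every hardware task switch so that
	 * the first FPU/SSE instruction executed by the new task raises
	 * #NM, letting the OS switch the FPU state lazily.
	 */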

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol 3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation":
	 * if the "virtual NMIs" control is 1, IRET removes any virtual-NMI
	 * blocking. This unblocking occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(ctx, vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}