1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2014 Neel Natu <neel@freebsd.org> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/_iovec.h> 34 #include <sys/mman.h> 35 36 #include <x86/psl.h> 37 #include <x86/segments.h> 38 #include <x86/specialreg.h> 39 #include <machine/vmm.h> 40 #include <machine/vmm_instruction_emul.h> 41 42 #include <assert.h> 43 #include <errno.h> 44 #include <stdbool.h> 45 #include <stdio.h> 46 #include <stdlib.h> 47 48 #include <vmmapi.h> 49 50 #include "bhyverun.h" 51 #include "debug.h" 52 53 /* 54 * Using 'struct i386tss' is tempting but causes myriad sign extension 55 * issues because all of its fields are defined as signed integers. 56 */ 57 struct tss32 { 58 uint16_t tss_link; 59 uint16_t rsvd1; 60 uint32_t tss_esp0; 61 uint16_t tss_ss0; 62 uint16_t rsvd2; 63 uint32_t tss_esp1; 64 uint16_t tss_ss1; 65 uint16_t rsvd3; 66 uint32_t tss_esp2; 67 uint16_t tss_ss2; 68 uint16_t rsvd4; 69 uint32_t tss_cr3; 70 uint32_t tss_eip; 71 uint32_t tss_eflags; 72 uint32_t tss_eax; 73 uint32_t tss_ecx; 74 uint32_t tss_edx; 75 uint32_t tss_ebx; 76 uint32_t tss_esp; 77 uint32_t tss_ebp; 78 uint32_t tss_esi; 79 uint32_t tss_edi; 80 uint16_t tss_es; 81 uint16_t rsvd5; 82 uint16_t tss_cs; 83 uint16_t rsvd6; 84 uint16_t tss_ss; 85 uint16_t rsvd7; 86 uint16_t tss_ds; 87 uint16_t rsvd8; 88 uint16_t tss_fs; 89 uint16_t rsvd9; 90 uint16_t tss_gs; 91 uint16_t rsvd10; 92 uint16_t tss_ldt; 93 uint16_t rsvd11; 94 uint16_t tss_trap; 95 uint16_t tss_iomap; 96 }; 97 static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed"); 98 99 #define SEL_START(sel) (((sel) & ~0x7)) 100 #define SEL_LIMIT(sel) (((sel) | 0x7)) 101 #define TSS_BUSY(type) (((type) & 0x2) != 0) 102 103 static uint64_t 104 GETREG(struct vmctx *ctx, int vcpu, int reg) 105 { 106 uint64_t val; 107 int error; 108 109 error = vm_get_register(ctx, vcpu, reg, &val); 110 assert(error == 0); 111 return (val); 112 } 113 114 static void 115 SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) 116 { 117 int error; 118 119 error = vm_set_register(ctx, vcpu, reg, val); 120 assert(error == 0); 121 } 122 123 static struct seg_desc 124 usd_to_seg_desc(struct user_segment_descriptor *usd) 125 { 126 struct seg_desc seg_desc; 127 128 seg_desc.base = (u_int)USD_GETBASE(usd); 129 if (usd->sd_gran) 130 seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; 131 else 132 seg_desc.limit = (u_int)USD_GETLIMIT(usd); 133 seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; 134 seg_desc.access |= usd->sd_xx << 12; 135 seg_desc.access |= usd->sd_def32 << 14; 136 seg_desc.access |= usd->sd_gran << 15; 137 138 return (seg_desc); 139 } 140 141 /* 142 * Inject an exception with an error code that is a segment selector. 143 * The format of the error code is described in section 6.13, "Error Code", 144 * Intel SDM volume 3. 145 * 146 * Bit 0 (EXT) denotes whether the exception occurred during delivery 147 * of an external event like an interrupt. 148 * 149 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor 150 * in the IDT. 151 * 152 * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). 153 */ 154 static void 155 sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) 156 { 157 /* 158 * Bit 2 from the selector is retained as-is in the error code. 159 * 160 * Bit 1 can be safely cleared because none of the selectors 161 * encountered during task switch emulation refer to a task 162 * gate in the IDT. 163 * 164 * Bit 0 is set depending on the value of 'ext'. 165 */ 166 sel &= ~0x3; 167 if (ext) 168 sel |= 0x1; 169 vm_inject_fault(ctx, vcpu, vector, 1, sel); 170 } 171 172 /* 173 * Return 0 if the selector 'sel' in within the limits of the GDT/LDT 174 * and non-zero otherwise. 175 */ 176 static int 177 desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) 178 { 179 uint64_t base; 180 uint32_t limit, access; 181 int error, reg; 182 183 reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; 184 error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); 185 assert(error == 0); 186 187 if (reg == VM_REG_GUEST_LDTR) { 188 if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) 189 return (-1); 190 } 191 192 if (limit < SEL_LIMIT(sel)) 193 return (-1); 194 else 195 return (0); 196 } 197 198 /* 199 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced 200 * by the selector 'sel'. 201 * 202 * Returns 0 on success. 203 * Returns 1 if an exception was injected into the guest. 204 * Returns -1 otherwise. 205 */ 206 static int 207 desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, 208 uint16_t sel, struct user_segment_descriptor *desc, bool doread, 209 int *faultptr) 210 { 211 struct iovec iov[2]; 212 uint64_t base; 213 uint32_t limit, access; 214 int error, reg; 215 216 reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; 217 error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); 218 assert(error == 0); 219 assert(limit >= SEL_LIMIT(sel)); 220 221 error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), 222 sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov), 223 faultptr); 224 if (error || *faultptr) 225 return (error); 226 227 if (doread) 228 vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); 229 else 230 vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); 231 return (0); 232 } 233 234 static int 235 desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, 236 uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) 237 { 238 return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr)); 239 } 240 241 static int 242 desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, 243 uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) 244 { 245 return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr)); 246 } 247 248 /* 249 * Read the TSS descriptor referenced by 'sel' into 'desc'. 250 * 251 * Returns 0 on success. 252 * Returns 1 if an exception was injected into the guest. 253 * Returns -1 otherwise. 254 */ 255 static int 256 read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, 257 uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) 258 { 259 struct vm_guest_paging sup_paging; 260 int error; 261 262 assert(!ISLDT(sel)); 263 assert(IDXSEL(sel) != 0); 264 265 /* Fetch the new TSS descriptor */ 266 if (desc_table_limit_check(ctx, vcpu, sel)) { 267 if (ts->reason == TSR_IRET) 268 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 269 else 270 sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); 271 return (1); 272 } 273 274 sup_paging = ts->paging; 275 sup_paging.cpl = 0; /* implicit supervisor mode */ 276 error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr); 277 return (error); 278 } 279 280 static bool 281 code_desc(int sd_type) 282 { 283 /* code descriptor */ 284 return ((sd_type & 0x18) == 0x18); 285 } 286 287 static bool 288 stack_desc(int sd_type) 289 { 290 /* writable data descriptor */ 291 return ((sd_type & 0x1A) == 0x12); 292 } 293 294 static bool 295 data_desc(int sd_type) 296 { 297 /* data descriptor or a readable code descriptor */ 298 return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); 299 } 300 301 static bool 302 ldt_desc(int sd_type) 303 { 304 305 return (sd_type == SDT_SYSLDT); 306 } 307 308 /* 309 * Validate the descriptor 'seg_desc' associated with 'segment'. 310 */ 311 static int 312 validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, 313 int segment, struct seg_desc *seg_desc, int *faultptr) 314 { 315 struct vm_guest_paging sup_paging; 316 struct user_segment_descriptor usd; 317 int error, idtvec; 318 int cpl, dpl, rpl; 319 uint16_t sel, cs; 320 bool ldtseg, codeseg, stackseg, dataseg, conforming; 321 322 ldtseg = codeseg = stackseg = dataseg = false; 323 switch (segment) { 324 case VM_REG_GUEST_LDTR: 325 ldtseg = true; 326 break; 327 case VM_REG_GUEST_CS: 328 codeseg = true; 329 break; 330 case VM_REG_GUEST_SS: 331 stackseg = true; 332 break; 333 case VM_REG_GUEST_DS: 334 case VM_REG_GUEST_ES: 335 case VM_REG_GUEST_FS: 336 case VM_REG_GUEST_GS: 337 dataseg = true; 338 break; 339 default: 340 assert(0); 341 } 342 343 /* Get the segment selector */ 344 sel = GETREG(ctx, vcpu, segment); 345 346 /* LDT selector must point into the GDT */ 347 if (ldtseg && ISLDT(sel)) { 348 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 349 return (1); 350 } 351 352 /* Descriptor table limit check */ 353 if (desc_table_limit_check(ctx, vcpu, sel)) { 354 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 355 return (1); 356 } 357 358 /* NULL selector */ 359 if (IDXSEL(sel) == 0) { 360 /* Code and stack segment selectors cannot be NULL */ 361 if (codeseg || stackseg) { 362 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 363 return (1); 364 } 365 seg_desc->base = 0; 366 seg_desc->limit = 0; 367 seg_desc->access = 0x10000; /* unusable */ 368 return (0); 369 } 370 371 /* Read the descriptor from the GDT/LDT */ 372 sup_paging = ts->paging; 373 sup_paging.cpl = 0; /* implicit supervisor mode */ 374 error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr); 375 if (error || *faultptr) 376 return (error); 377 378 /* Verify that the descriptor type is compatible with the segment */ 379 if ((ldtseg && !ldt_desc(usd.sd_type)) || 380 (codeseg && !code_desc(usd.sd_type)) || 381 (dataseg && !data_desc(usd.sd_type)) || 382 (stackseg && !stack_desc(usd.sd_type))) { 383 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 384 return (1); 385 } 386 387 /* Segment must be marked present */ 388 if (!usd.sd_p) { 389 if (ldtseg) 390 idtvec = IDT_TS; 391 else if (stackseg) 392 idtvec = IDT_SS; 393 else 394 idtvec = IDT_NP; 395 sel_exception(ctx, vcpu, idtvec, sel, ts->ext); 396 return (1); 397 } 398 399 cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); 400 cpl = cs & SEL_RPL_MASK; 401 rpl = sel & SEL_RPL_MASK; 402 dpl = usd.sd_dpl; 403 404 if (stackseg && (rpl != cpl || dpl != cpl)) { 405 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 406 return (1); 407 } 408 409 if (codeseg) { 410 conforming = (usd.sd_type & 0x4) ? true : false; 411 if ((conforming && (cpl < dpl)) || 412 (!conforming && (cpl != dpl))) { 413 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 414 return (1); 415 } 416 } 417 418 if (dataseg) { 419 /* 420 * A data segment is always non-conforming except when it's 421 * descriptor is a readable, conforming code segment. 422 */ 423 if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) 424 conforming = true; 425 else 426 conforming = false; 427 428 if (!conforming && (rpl > dpl || cpl > dpl)) { 429 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 430 return (1); 431 } 432 } 433 *seg_desc = usd_to_seg_desc(&usd); 434 return (0); 435 } 436 437 static void 438 tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, 439 uint32_t eip, struct tss32 *tss, struct iovec *iov) 440 { 441 442 /* General purpose registers */ 443 tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); 444 tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); 445 tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); 446 tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); 447 tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); 448 tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); 449 tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); 450 tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); 451 452 /* Segment selectors */ 453 tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); 454 tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); 455 tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); 456 tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); 457 tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); 458 tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS); 459 460 /* eflags and eip */ 461 tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); 462 if (task_switch->reason == TSR_IRET) 463 tss->tss_eflags &= ~PSL_NT; 464 tss->tss_eip = eip; 465 466 /* Copy updated old TSS into guest memory */ 467 vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32)); 468 } 469 470 static void 471 update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) 472 { 473 int error; 474 475 error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); 476 assert(error == 0); 477 } 478 479 /* 480 * Update the vcpu registers to reflect the state of the new task. 481 */ 482 static int 483 tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, 484 uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr) 485 { 486 struct seg_desc seg_desc, seg_desc2; 487 uint64_t *pdpte, maxphyaddr, reserved; 488 uint32_t eflags; 489 int error, i; 490 bool nested; 491 492 nested = false; 493 if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { 494 tss->tss_link = ot_sel; 495 nested = true; 496 } 497 498 eflags = tss->tss_eflags; 499 if (nested) 500 eflags |= PSL_NT; 501 502 /* LDTR */ 503 SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); 504 505 /* PBDR */ 506 if (ts->paging.paging_mode != PAGING_MODE_FLAT) { 507 if (ts->paging.paging_mode == PAGING_MODE_PAE) { 508 /* 509 * XXX Assuming 36-bit MAXPHYADDR. 510 */ 511 maxphyaddr = (1UL << 36) - 1; 512 pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); 513 for (i = 0; i < 4; i++) { 514 /* Check reserved bits if the PDPTE is valid */ 515 if (!(pdpte[i] & 0x1)) 516 continue; 517 /* 518 * Bits 2:1, 8:5 and bits above the processor's 519 * maximum physical address are reserved. 520 */ 521 reserved = ~maxphyaddr | 0x1E6; 522 if (pdpte[i] & reserved) { 523 vm_inject_gp(ctx, vcpu); 524 return (1); 525 } 526 } 527 SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); 528 SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); 529 SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); 530 SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); 531 } 532 SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); 533 ts->paging.cr3 = tss->tss_cr3; 534 } 535 536 /* eflags and eip */ 537 SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); 538 SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); 539 540 /* General purpose registers */ 541 SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); 542 SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); 543 SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); 544 SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); 545 SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); 546 SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); 547 SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); 548 SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); 549 550 /* Segment selectors */ 551 SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); 552 SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); 553 SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); 554 SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); 555 SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); 556 SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); 557 558 /* 559 * If this is a nested task then write out the new TSS to update 560 * the previous link field. 561 */ 562 if (nested) 563 vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); 564 565 /* Validate segment descriptors */ 566 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, 567 faultptr); 568 if (error || *faultptr) 569 return (error); 570 update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); 571 572 /* 573 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. 574 * 575 * The SS and CS attribute checks on VM-entry are inter-dependent so 576 * we need to make sure that both segments are valid before updating 577 * either of them. This ensures that the VMCS state can pass the 578 * VM-entry checks so the guest can handle any exception injected 579 * during task switch emulation. 580 */ 581 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc, 582 faultptr); 583 if (error || *faultptr) 584 return (error); 585 586 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, 587 faultptr); 588 if (error || *faultptr) 589 return (error); 590 update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); 591 update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); 592 ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; 593 594 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc, 595 faultptr); 596 if (error || *faultptr) 597 return (error); 598 update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); 599 600 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc, 601 faultptr); 602 if (error || *faultptr) 603 return (error); 604 update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); 605 606 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc, 607 faultptr); 608 if (error || *faultptr) 609 return (error); 610 update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); 611 612 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc, 613 faultptr); 614 if (error || *faultptr) 615 return (error); 616 update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); 617 618 return (0); 619 } 620 621 /* 622 * Push an error code on the stack of the new task. This is needed if the 623 * task switch was triggered by a hardware exception that causes an error 624 * code to be saved (e.g. #PF). 625 */ 626 static int 627 push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, 628 int task_type, uint32_t errcode, int *faultptr) 629 { 630 struct iovec iov[2]; 631 struct seg_desc seg_desc; 632 int stacksize, bytes, error; 633 uint64_t gla, cr0, rflags; 634 uint32_t esp; 635 uint16_t stacksel; 636 637 *faultptr = 0; 638 639 cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); 640 rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); 641 stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); 642 643 error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, 644 &seg_desc.limit, &seg_desc.access); 645 assert(error == 0); 646 647 /* 648 * Section "Error Code" in the Intel SDM vol 3: the error code is 649 * pushed on the stack as a doubleword or word (depending on the 650 * default interrupt, trap or task gate size). 651 */ 652 if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) 653 bytes = 4; 654 else 655 bytes = 2; 656 657 /* 658 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the 659 * stack-segment descriptor determines the size of the stack 660 * pointer outside of 64-bit mode. 661 */ 662 if (SEG_DESC_DEF32(seg_desc.access)) 663 stacksize = 4; 664 else 665 stacksize = 2; 666 667 esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); 668 esp -= bytes; 669 670 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, 671 &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { 672 sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); 673 *faultptr = 1; 674 return (0); 675 } 676 677 if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { 678 vm_inject_ac(ctx, vcpu, 1); 679 *faultptr = 1; 680 return (0); 681 } 682 683 error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, 684 iov, nitems(iov), faultptr); 685 if (error || *faultptr) 686 return (error); 687 688 vm_copyout(ctx, vcpu, &errcode, iov, bytes); 689 SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); 690 return (0); 691 } 692 693 /* 694 * Evaluate return value from helper functions and potentially return to 695 * the VM run loop. 696 */ 697 #define CHKERR(error,fault) \ 698 do { \ 699 assert((error == 0) || (error == EFAULT)); \ 700 if (error) \ 701 return (VMEXIT_ABORT); \ 702 else if (fault) \ 703 return (VMEXIT_CONTINUE); \ 704 } while (0) 705 706 int 707 vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 708 { 709 struct seg_desc nt; 710 struct tss32 oldtss, newtss; 711 struct vm_task_switch *task_switch; 712 struct vm_guest_paging *paging, sup_paging; 713 struct user_segment_descriptor nt_desc, ot_desc; 714 struct iovec nt_iov[2], ot_iov[2]; 715 uint64_t cr0, ot_base; 716 uint32_t eip, ot_lim, access; 717 int error, ext, fault, minlimit, nt_type, ot_type, vcpu; 718 enum task_switch_reason reason; 719 uint16_t nt_sel, ot_sel; 720 721 task_switch = &vmexit->u.task_switch; 722 nt_sel = task_switch->tsssel; 723 ext = vmexit->u.task_switch.ext; 724 reason = vmexit->u.task_switch.reason; 725 paging = &vmexit->u.task_switch.paging; 726 vcpu = *pvcpu; 727 728 assert(paging->cpu_mode == CPU_MODE_PROTECTED); 729 730 /* 731 * Calculate the instruction pointer to store in the old TSS. 732 */ 733 eip = vmexit->rip + vmexit->inst_length; 734 735 /* 736 * Section 4.6, "Access Rights" in Intel SDM Vol 3. 737 * The following page table accesses are implicitly supervisor mode: 738 * - accesses to GDT or LDT to load segment descriptors 739 * - accesses to the task state segment during task switch 740 */ 741 sup_paging = *paging; 742 sup_paging.cpl = 0; /* implicit supervisor mode */ 743 744 /* Fetch the new TSS descriptor */ 745 error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc, 746 &fault); 747 CHKERR(error, fault); 748 749 nt = usd_to_seg_desc(&nt_desc); 750 751 /* Verify the type of the new TSS */ 752 nt_type = SEG_DESC_TYPE(nt.access); 753 if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && 754 nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { 755 sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); 756 goto done; 757 } 758 759 /* TSS descriptor must have present bit set */ 760 if (!SEG_DESC_PRESENT(nt.access)) { 761 sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); 762 goto done; 763 } 764 765 /* 766 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and 767 * 44 bytes for a 16-bit TSS. 768 */ 769 if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) 770 minlimit = 104 - 1; 771 else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) 772 minlimit = 44 - 1; 773 else 774 minlimit = 0; 775 776 assert(minlimit > 0); 777 if (nt.limit < minlimit) { 778 sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); 779 goto done; 780 } 781 782 /* TSS must be busy if task switch is due to IRET */ 783 if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { 784 sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); 785 goto done; 786 } 787 788 /* 789 * TSS must be available (not busy) if task switch reason is 790 * CALL, JMP, exception or interrupt. 791 */ 792 if (reason != TSR_IRET && TSS_BUSY(nt_type)) { 793 sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); 794 goto done; 795 } 796 797 /* Fetch the new TSS */ 798 error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, 799 PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault); 800 CHKERR(error, fault); 801 vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1); 802 803 /* Get the old TSS selector from the guest's task register */ 804 ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); 805 if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { 806 /* 807 * This might happen if a task switch was attempted without 808 * ever loading the task register with LTR. In this case the 809 * TR would contain the values from power-on: 810 * (sel = 0, base = 0, limit = 0xffff). 811 */ 812 sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); 813 goto done; 814 } 815 816 /* Get the old TSS base and limit from the guest's task register */ 817 error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, 818 &access); 819 assert(error == 0); 820 assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); 821 ot_type = SEG_DESC_TYPE(access); 822 assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); 823 824 /* Fetch the old TSS descriptor */ 825 error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc, 826 &fault); 827 CHKERR(error, fault); 828 829 /* Get the old TSS */ 830 error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, 831 PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault); 832 CHKERR(error, fault); 833 vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1); 834 835 /* 836 * Clear the busy bit in the old TSS descriptor if the task switch 837 * due to an IRET or JMP instruction. 838 */ 839 if (reason == TSR_IRET || reason == TSR_JMP) { 840 ot_desc.sd_type &= ~0x2; 841 error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, 842 &ot_desc, &fault); 843 CHKERR(error, fault); 844 } 845 846 if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { 847 EPRINTLN("Task switch to 16-bit TSS not supported"); 848 return (VMEXIT_ABORT); 849 } 850 851 /* Save processor state in old TSS */ 852 tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); 853 854 /* 855 * If the task switch was triggered for any reason other than IRET 856 * then set the busy bit in the new TSS descriptor. 857 */ 858 if (reason != TSR_IRET) { 859 nt_desc.sd_type |= 0x2; 860 error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, 861 &nt_desc, &fault); 862 CHKERR(error, fault); 863 } 864 865 /* Update task register to point at the new TSS */ 866 SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); 867 868 /* Update the hidden descriptor state of the task register */ 869 nt = usd_to_seg_desc(&nt_desc); 870 update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); 871 872 /* Set CR0.TS */ 873 cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); 874 SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); 875 876 /* 877 * We are now committed to the task switch. Any exceptions encountered 878 * after this point will be handled in the context of the new task and 879 * the saved instruction pointer will belong to the new task. 880 */ 881 error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); 882 assert(error == 0); 883 884 /* Load processor state from new TSS */ 885 error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov, 886 &fault); 887 CHKERR(error, fault); 888 889 /* 890 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception 891 * caused an error code to be generated, this error code is copied 892 * to the stack of the new task. 893 */ 894 if (task_switch->errcode_valid) { 895 assert(task_switch->ext); 896 assert(task_switch->reason == TSR_IDT_GATE); 897 error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, 898 task_switch->errcode, &fault); 899 CHKERR(error, fault); 900 } 901 902 /* 903 * Treatment of virtual-NMI blocking if NMI is delivered through 904 * a task gate. 905 * 906 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: 907 * If the virtual NMIs VM-execution control is 1, VM entry injects 908 * an NMI, and delivery of the NMI causes a task switch that causes 909 * a VM exit, virtual-NMI blocking is in effect before the VM exit 910 * commences. 911 * 912 * Thus, virtual-NMI blocking is in effect at the time of the task 913 * switch VM exit. 914 */ 915 916 /* 917 * Treatment of virtual-NMI unblocking on IRET from NMI handler task. 918 * 919 * Section "Changes to Instruction Behavior in VMX Non-Root Operation" 920 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. 921 * This unblocking of virtual-NMI occurs even if IRET causes a fault. 922 * 923 * Thus, virtual-NMI blocking is cleared at the time of the task switch 924 * VM exit. 925 */ 926 927 /* 928 * If the task switch was triggered by an event delivered through 929 * the IDT then extinguish the pending event from the vcpu's 930 * exitintinfo. 931 */ 932 if (task_switch->reason == TSR_IDT_GATE) { 933 error = vm_set_intinfo(ctx, vcpu, 0); 934 assert(error == 0); 935 } 936 937 /* 938 * XXX should inject debug exception if 'T' bit is 1 939 */ 940 done: 941 return (VMEXIT_CONTINUE); 942 } 943