1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2014 Neel Natu <neel@freebsd.org> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/_iovec.h> 34 #include <sys/mman.h> 35 36 #include <x86/psl.h> 37 #include <x86/segments.h> 38 #include <x86/specialreg.h> 39 #include <machine/vmm.h> 40 #include <machine/vmm_instruction_emul.h> 41 42 #include <assert.h> 43 #include <errno.h> 44 #include <stdbool.h> 45 #include <stdio.h> 46 #include <stdlib.h> 47 48 #include <vmmapi.h> 49 50 #include "bhyverun.h" 51 52 /* 53 * Using 'struct i386tss' is tempting but causes myriad sign extension 54 * issues because all of its fields are defined as signed integers. 55 */ 56 struct tss32 { 57 uint16_t tss_link; 58 uint16_t rsvd1; 59 uint32_t tss_esp0; 60 uint16_t tss_ss0; 61 uint16_t rsvd2; 62 uint32_t tss_esp1; 63 uint16_t tss_ss1; 64 uint16_t rsvd3; 65 uint32_t tss_esp2; 66 uint16_t tss_ss2; 67 uint16_t rsvd4; 68 uint32_t tss_cr3; 69 uint32_t tss_eip; 70 uint32_t tss_eflags; 71 uint32_t tss_eax; 72 uint32_t tss_ecx; 73 uint32_t tss_edx; 74 uint32_t tss_ebx; 75 uint32_t tss_esp; 76 uint32_t tss_ebp; 77 uint32_t tss_esi; 78 uint32_t tss_edi; 79 uint16_t tss_es; 80 uint16_t rsvd5; 81 uint16_t tss_cs; 82 uint16_t rsvd6; 83 uint16_t tss_ss; 84 uint16_t rsvd7; 85 uint16_t tss_ds; 86 uint16_t rsvd8; 87 uint16_t tss_fs; 88 uint16_t rsvd9; 89 uint16_t tss_gs; 90 uint16_t rsvd10; 91 uint16_t tss_ldt; 92 uint16_t rsvd11; 93 uint16_t tss_trap; 94 uint16_t tss_iomap; 95 }; 96 static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed"); 97 98 #define SEL_START(sel) (((sel) & ~0x7)) 99 #define SEL_LIMIT(sel) (((sel) | 0x7)) 100 #define TSS_BUSY(type) (((type) & 0x2) != 0) 101 102 static uint64_t 103 GETREG(struct vmctx *ctx, int vcpu, int reg) 104 { 105 uint64_t val; 106 int error; 107 108 error = vm_get_register(ctx, vcpu, reg, &val); 109 assert(error == 0); 110 return (val); 111 } 112 113 static void 114 SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) 115 { 116 int error; 117 118 error = vm_set_register(ctx, vcpu, reg, val); 119 assert(error == 0); 120 } 121 122 static struct seg_desc 123 usd_to_seg_desc(struct user_segment_descriptor *usd) 124 { 125 struct seg_desc seg_desc; 126 127 seg_desc.base = (u_int)USD_GETBASE(usd); 128 if (usd->sd_gran) 129 seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; 130 else 131 seg_desc.limit = (u_int)USD_GETLIMIT(usd); 132 seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; 133 seg_desc.access |= usd->sd_xx << 12; 134 seg_desc.access |= usd->sd_def32 << 14; 135 seg_desc.access |= usd->sd_gran << 15; 136 137 return (seg_desc); 138 } 139 140 /* 141 * Inject an exception with an error code that is a segment selector. 142 * The format of the error code is described in section 6.13, "Error Code", 143 * Intel SDM volume 3. 144 * 145 * Bit 0 (EXT) denotes whether the exception occurred during delivery 146 * of an external event like an interrupt. 147 * 148 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor 149 * in the IDT. 150 * 151 * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). 152 */ 153 static void 154 sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) 155 { 156 /* 157 * Bit 2 from the selector is retained as-is in the error code. 158 * 159 * Bit 1 can be safely cleared because none of the selectors 160 * encountered during task switch emulation refer to a task 161 * gate in the IDT. 162 * 163 * Bit 0 is set depending on the value of 'ext'. 164 */ 165 sel &= ~0x3; 166 if (ext) 167 sel |= 0x1; 168 vm_inject_fault(ctx, vcpu, vector, 1, sel); 169 } 170 171 /* 172 * Return 0 if the selector 'sel' in within the limits of the GDT/LDT 173 * and non-zero otherwise. 174 */ 175 static int 176 desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) 177 { 178 uint64_t base; 179 uint32_t limit, access; 180 int error, reg; 181 182 reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; 183 error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); 184 assert(error == 0); 185 186 if (reg == VM_REG_GUEST_LDTR) { 187 if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) 188 return (-1); 189 } 190 191 if (limit < SEL_LIMIT(sel)) 192 return (-1); 193 else 194 return (0); 195 } 196 197 /* 198 * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced 199 * by the selector 'sel'. 200 * 201 * Returns 0 on success. 202 * Returns 1 if an exception was injected into the guest. 203 * Returns -1 otherwise. 204 */ 205 static int 206 desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, 207 uint16_t sel, struct user_segment_descriptor *desc, bool doread, 208 int *faultptr) 209 { 210 struct iovec iov[2]; 211 uint64_t base; 212 uint32_t limit, access; 213 int error, reg; 214 215 reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; 216 error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); 217 assert(error == 0); 218 assert(limit >= SEL_LIMIT(sel)); 219 220 error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), 221 sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov), 222 faultptr); 223 if (error || *faultptr) 224 return (error); 225 226 if (doread) 227 vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); 228 else 229 vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); 230 return (0); 231 } 232 233 static int 234 desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, 235 uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) 236 { 237 return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr)); 238 } 239 240 static int 241 desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, 242 uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) 243 { 244 return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr)); 245 } 246 247 /* 248 * Read the TSS descriptor referenced by 'sel' into 'desc'. 249 * 250 * Returns 0 on success. 251 * Returns 1 if an exception was injected into the guest. 252 * Returns -1 otherwise. 253 */ 254 static int 255 read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, 256 uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) 257 { 258 struct vm_guest_paging sup_paging; 259 int error; 260 261 assert(!ISLDT(sel)); 262 assert(IDXSEL(sel) != 0); 263 264 /* Fetch the new TSS descriptor */ 265 if (desc_table_limit_check(ctx, vcpu, sel)) { 266 if (ts->reason == TSR_IRET) 267 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 268 else 269 sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); 270 return (1); 271 } 272 273 sup_paging = ts->paging; 274 sup_paging.cpl = 0; /* implicit supervisor mode */ 275 error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr); 276 return (error); 277 } 278 279 static bool 280 code_desc(int sd_type) 281 { 282 /* code descriptor */ 283 return ((sd_type & 0x18) == 0x18); 284 } 285 286 static bool 287 stack_desc(int sd_type) 288 { 289 /* writable data descriptor */ 290 return ((sd_type & 0x1A) == 0x12); 291 } 292 293 static bool 294 data_desc(int sd_type) 295 { 296 /* data descriptor or a readable code descriptor */ 297 return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); 298 } 299 300 static bool 301 ldt_desc(int sd_type) 302 { 303 304 return (sd_type == SDT_SYSLDT); 305 } 306 307 /* 308 * Validate the descriptor 'seg_desc' associated with 'segment'. 309 */ 310 static int 311 validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, 312 int segment, struct seg_desc *seg_desc, int *faultptr) 313 { 314 struct vm_guest_paging sup_paging; 315 struct user_segment_descriptor usd; 316 int error, idtvec; 317 int cpl, dpl, rpl; 318 uint16_t sel, cs; 319 bool ldtseg, codeseg, stackseg, dataseg, conforming; 320 321 ldtseg = codeseg = stackseg = dataseg = false; 322 switch (segment) { 323 case VM_REG_GUEST_LDTR: 324 ldtseg = true; 325 break; 326 case VM_REG_GUEST_CS: 327 codeseg = true; 328 break; 329 case VM_REG_GUEST_SS: 330 stackseg = true; 331 break; 332 case VM_REG_GUEST_DS: 333 case VM_REG_GUEST_ES: 334 case VM_REG_GUEST_FS: 335 case VM_REG_GUEST_GS: 336 dataseg = true; 337 break; 338 default: 339 assert(0); 340 } 341 342 /* Get the segment selector */ 343 sel = GETREG(ctx, vcpu, segment); 344 345 /* LDT selector must point into the GDT */ 346 if (ldtseg && ISLDT(sel)) { 347 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 348 return (1); 349 } 350 351 /* Descriptor table limit check */ 352 if (desc_table_limit_check(ctx, vcpu, sel)) { 353 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 354 return (1); 355 } 356 357 /* NULL selector */ 358 if (IDXSEL(sel) == 0) { 359 /* Code and stack segment selectors cannot be NULL */ 360 if (codeseg || stackseg) { 361 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 362 return (1); 363 } 364 seg_desc->base = 0; 365 seg_desc->limit = 0; 366 seg_desc->access = 0x10000; /* unusable */ 367 return (0); 368 } 369 370 /* Read the descriptor from the GDT/LDT */ 371 sup_paging = ts->paging; 372 sup_paging.cpl = 0; /* implicit supervisor mode */ 373 error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr); 374 if (error || *faultptr) 375 return (error); 376 377 /* Verify that the descriptor type is compatible with the segment */ 378 if ((ldtseg && !ldt_desc(usd.sd_type)) || 379 (codeseg && !code_desc(usd.sd_type)) || 380 (dataseg && !data_desc(usd.sd_type)) || 381 (stackseg && !stack_desc(usd.sd_type))) { 382 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 383 return (1); 384 } 385 386 /* Segment must be marked present */ 387 if (!usd.sd_p) { 388 if (ldtseg) 389 idtvec = IDT_TS; 390 else if (stackseg) 391 idtvec = IDT_SS; 392 else 393 idtvec = IDT_NP; 394 sel_exception(ctx, vcpu, idtvec, sel, ts->ext); 395 return (1); 396 } 397 398 cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); 399 cpl = cs & SEL_RPL_MASK; 400 rpl = sel & SEL_RPL_MASK; 401 dpl = usd.sd_dpl; 402 403 if (stackseg && (rpl != cpl || dpl != cpl)) { 404 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 405 return (1); 406 } 407 408 if (codeseg) { 409 conforming = (usd.sd_type & 0x4) ? true : false; 410 if ((conforming && (cpl < dpl)) || 411 (!conforming && (cpl != dpl))) { 412 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 413 return (1); 414 } 415 } 416 417 if (dataseg) { 418 /* 419 * A data segment is always non-conforming except when it's 420 * descriptor is a readable, conforming code segment. 421 */ 422 if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) 423 conforming = true; 424 else 425 conforming = false; 426 427 if (!conforming && (rpl > dpl || cpl > dpl)) { 428 sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); 429 return (1); 430 } 431 } 432 *seg_desc = usd_to_seg_desc(&usd); 433 return (0); 434 } 435 436 static void 437 tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, 438 uint32_t eip, struct tss32 *tss, struct iovec *iov) 439 { 440 441 /* General purpose registers */ 442 tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); 443 tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); 444 tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); 445 tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); 446 tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); 447 tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); 448 tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); 449 tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); 450 451 /* Segment selectors */ 452 tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); 453 tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); 454 tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); 455 tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); 456 tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); 457 tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS); 458 459 /* eflags and eip */ 460 tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); 461 if (task_switch->reason == TSR_IRET) 462 tss->tss_eflags &= ~PSL_NT; 463 tss->tss_eip = eip; 464 465 /* Copy updated old TSS into guest memory */ 466 vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32)); 467 } 468 469 static void 470 update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) 471 { 472 int error; 473 474 error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); 475 assert(error == 0); 476 } 477 478 /* 479 * Update the vcpu registers to reflect the state of the new task. 480 */ 481 static int 482 tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, 483 uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr) 484 { 485 struct seg_desc seg_desc, seg_desc2; 486 uint64_t *pdpte, maxphyaddr, reserved; 487 uint32_t eflags; 488 int error, i; 489 bool nested; 490 491 nested = false; 492 if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { 493 tss->tss_link = ot_sel; 494 nested = true; 495 } 496 497 eflags = tss->tss_eflags; 498 if (nested) 499 eflags |= PSL_NT; 500 501 /* LDTR */ 502 SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); 503 504 /* PBDR */ 505 if (ts->paging.paging_mode != PAGING_MODE_FLAT) { 506 if (ts->paging.paging_mode == PAGING_MODE_PAE) { 507 /* 508 * XXX Assuming 36-bit MAXPHYADDR. 509 */ 510 maxphyaddr = (1UL << 36) - 1; 511 pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); 512 for (i = 0; i < 4; i++) { 513 /* Check reserved bits if the PDPTE is valid */ 514 if (!(pdpte[i] & 0x1)) 515 continue; 516 /* 517 * Bits 2:1, 8:5 and bits above the processor's 518 * maximum physical address are reserved. 519 */ 520 reserved = ~maxphyaddr | 0x1E6; 521 if (pdpte[i] & reserved) { 522 vm_inject_gp(ctx, vcpu); 523 return (1); 524 } 525 } 526 SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); 527 SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); 528 SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); 529 SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); 530 } 531 SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); 532 ts->paging.cr3 = tss->tss_cr3; 533 } 534 535 /* eflags and eip */ 536 SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); 537 SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); 538 539 /* General purpose registers */ 540 SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); 541 SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); 542 SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); 543 SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); 544 SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); 545 SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); 546 SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); 547 SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); 548 549 /* Segment selectors */ 550 SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); 551 SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); 552 SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); 553 SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); 554 SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); 555 SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); 556 557 /* 558 * If this is a nested task then write out the new TSS to update 559 * the previous link field. 560 */ 561 if (nested) 562 vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); 563 564 /* Validate segment descriptors */ 565 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, 566 faultptr); 567 if (error || *faultptr) 568 return (error); 569 update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); 570 571 /* 572 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. 573 * 574 * The SS and CS attribute checks on VM-entry are inter-dependent so 575 * we need to make sure that both segments are valid before updating 576 * either of them. This ensures that the VMCS state can pass the 577 * VM-entry checks so the guest can handle any exception injected 578 * during task switch emulation. 579 */ 580 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc, 581 faultptr); 582 if (error || *faultptr) 583 return (error); 584 585 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, 586 faultptr); 587 if (error || *faultptr) 588 return (error); 589 update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); 590 update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); 591 ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; 592 593 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc, 594 faultptr); 595 if (error || *faultptr) 596 return (error); 597 update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); 598 599 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc, 600 faultptr); 601 if (error || *faultptr) 602 return (error); 603 update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); 604 605 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc, 606 faultptr); 607 if (error || *faultptr) 608 return (error); 609 update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); 610 611 error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc, 612 faultptr); 613 if (error || *faultptr) 614 return (error); 615 update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); 616 617 return (0); 618 } 619 620 /* 621 * Push an error code on the stack of the new task. This is needed if the 622 * task switch was triggered by a hardware exception that causes an error 623 * code to be saved (e.g. #PF). 624 */ 625 static int 626 push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, 627 int task_type, uint32_t errcode, int *faultptr) 628 { 629 struct iovec iov[2]; 630 struct seg_desc seg_desc; 631 int stacksize, bytes, error; 632 uint64_t gla, cr0, rflags; 633 uint32_t esp; 634 uint16_t stacksel; 635 636 *faultptr = 0; 637 638 cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); 639 rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); 640 stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); 641 642 error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, 643 &seg_desc.limit, &seg_desc.access); 644 assert(error == 0); 645 646 /* 647 * Section "Error Code" in the Intel SDM vol 3: the error code is 648 * pushed on the stack as a doubleword or word (depending on the 649 * default interrupt, trap or task gate size). 650 */ 651 if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) 652 bytes = 4; 653 else 654 bytes = 2; 655 656 /* 657 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the 658 * stack-segment descriptor determines the size of the stack 659 * pointer outside of 64-bit mode. 660 */ 661 if (SEG_DESC_DEF32(seg_desc.access)) 662 stacksize = 4; 663 else 664 stacksize = 2; 665 666 esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); 667 esp -= bytes; 668 669 if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, 670 &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { 671 sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); 672 *faultptr = 1; 673 return (0); 674 } 675 676 if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { 677 vm_inject_ac(ctx, vcpu, 1); 678 *faultptr = 1; 679 return (0); 680 } 681 682 error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, 683 iov, nitems(iov), faultptr); 684 if (error || *faultptr) 685 return (error); 686 687 vm_copyout(ctx, vcpu, &errcode, iov, bytes); 688 SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); 689 return (0); 690 } 691 692 /* 693 * Evaluate return value from helper functions and potentially return to 694 * the VM run loop. 695 */ 696 #define CHKERR(error,fault) \ 697 do { \ 698 assert((error == 0) || (error == EFAULT)); \ 699 if (error) \ 700 return (VMEXIT_ABORT); \ 701 else if (fault) \ 702 return (VMEXIT_CONTINUE); \ 703 } while (0) 704 705 int 706 vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 707 { 708 struct seg_desc nt; 709 struct tss32 oldtss, newtss; 710 struct vm_task_switch *task_switch; 711 struct vm_guest_paging *paging, sup_paging; 712 struct user_segment_descriptor nt_desc, ot_desc; 713 struct iovec nt_iov[2], ot_iov[2]; 714 uint64_t cr0, ot_base; 715 uint32_t eip, ot_lim, access; 716 int error, ext, fault, minlimit, nt_type, ot_type, vcpu; 717 enum task_switch_reason reason; 718 uint16_t nt_sel, ot_sel; 719 720 task_switch = &vmexit->u.task_switch; 721 nt_sel = task_switch->tsssel; 722 ext = vmexit->u.task_switch.ext; 723 reason = vmexit->u.task_switch.reason; 724 paging = &vmexit->u.task_switch.paging; 725 vcpu = *pvcpu; 726 727 assert(paging->cpu_mode == CPU_MODE_PROTECTED); 728 729 /* 730 * Calculate the instruction pointer to store in the old TSS. 731 */ 732 eip = vmexit->rip + vmexit->inst_length; 733 734 /* 735 * Section 4.6, "Access Rights" in Intel SDM Vol 3. 736 * The following page table accesses are implicitly supervisor mode: 737 * - accesses to GDT or LDT to load segment descriptors 738 * - accesses to the task state segment during task switch 739 */ 740 sup_paging = *paging; 741 sup_paging.cpl = 0; /* implicit supervisor mode */ 742 743 /* Fetch the new TSS descriptor */ 744 error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc, 745 &fault); 746 CHKERR(error, fault); 747 748 nt = usd_to_seg_desc(&nt_desc); 749 750 /* Verify the type of the new TSS */ 751 nt_type = SEG_DESC_TYPE(nt.access); 752 if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && 753 nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { 754 sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); 755 goto done; 756 } 757 758 /* TSS descriptor must have present bit set */ 759 if (!SEG_DESC_PRESENT(nt.access)) { 760 sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); 761 goto done; 762 } 763 764 /* 765 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and 766 * 44 bytes for a 16-bit TSS. 767 */ 768 if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) 769 minlimit = 104 - 1; 770 else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) 771 minlimit = 44 - 1; 772 else 773 minlimit = 0; 774 775 assert(minlimit > 0); 776 if (nt.limit < minlimit) { 777 sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); 778 goto done; 779 } 780 781 /* TSS must be busy if task switch is due to IRET */ 782 if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { 783 sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); 784 goto done; 785 } 786 787 /* 788 * TSS must be available (not busy) if task switch reason is 789 * CALL, JMP, exception or interrupt. 790 */ 791 if (reason != TSR_IRET && TSS_BUSY(nt_type)) { 792 sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); 793 goto done; 794 } 795 796 /* Fetch the new TSS */ 797 error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, 798 PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault); 799 CHKERR(error, fault); 800 vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1); 801 802 /* Get the old TSS selector from the guest's task register */ 803 ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); 804 if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { 805 /* 806 * This might happen if a task switch was attempted without 807 * ever loading the task register with LTR. In this case the 808 * TR would contain the values from power-on: 809 * (sel = 0, base = 0, limit = 0xffff). 810 */ 811 sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); 812 goto done; 813 } 814 815 /* Get the old TSS base and limit from the guest's task register */ 816 error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, 817 &access); 818 assert(error == 0); 819 assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); 820 ot_type = SEG_DESC_TYPE(access); 821 assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); 822 823 /* Fetch the old TSS descriptor */ 824 error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc, 825 &fault); 826 CHKERR(error, fault); 827 828 /* Get the old TSS */ 829 error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, 830 PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault); 831 CHKERR(error, fault); 832 vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1); 833 834 /* 835 * Clear the busy bit in the old TSS descriptor if the task switch 836 * due to an IRET or JMP instruction. 837 */ 838 if (reason == TSR_IRET || reason == TSR_JMP) { 839 ot_desc.sd_type &= ~0x2; 840 error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, 841 &ot_desc, &fault); 842 CHKERR(error, fault); 843 } 844 845 if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { 846 fprintf(stderr, "Task switch to 16-bit TSS not supported\n"); 847 return (VMEXIT_ABORT); 848 } 849 850 /* Save processor state in old TSS */ 851 tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); 852 853 /* 854 * If the task switch was triggered for any reason other than IRET 855 * then set the busy bit in the new TSS descriptor. 856 */ 857 if (reason != TSR_IRET) { 858 nt_desc.sd_type |= 0x2; 859 error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, 860 &nt_desc, &fault); 861 CHKERR(error, fault); 862 } 863 864 /* Update task register to point at the new TSS */ 865 SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); 866 867 /* Update the hidden descriptor state of the task register */ 868 nt = usd_to_seg_desc(&nt_desc); 869 update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); 870 871 /* Set CR0.TS */ 872 cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); 873 SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); 874 875 /* 876 * We are now committed to the task switch. Any exceptions encountered 877 * after this point will be handled in the context of the new task and 878 * the saved instruction pointer will belong to the new task. 879 */ 880 error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); 881 assert(error == 0); 882 883 /* Load processor state from new TSS */ 884 error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov, 885 &fault); 886 CHKERR(error, fault); 887 888 /* 889 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception 890 * caused an error code to be generated, this error code is copied 891 * to the stack of the new task. 892 */ 893 if (task_switch->errcode_valid) { 894 assert(task_switch->ext); 895 assert(task_switch->reason == TSR_IDT_GATE); 896 error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, 897 task_switch->errcode, &fault); 898 CHKERR(error, fault); 899 } 900 901 /* 902 * Treatment of virtual-NMI blocking if NMI is delivered through 903 * a task gate. 904 * 905 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: 906 * If the virtual NMIs VM-execution control is 1, VM entry injects 907 * an NMI, and delivery of the NMI causes a task switch that causes 908 * a VM exit, virtual-NMI blocking is in effect before the VM exit 909 * commences. 910 * 911 * Thus, virtual-NMI blocking is in effect at the time of the task 912 * switch VM exit. 913 */ 914 915 /* 916 * Treatment of virtual-NMI unblocking on IRET from NMI handler task. 917 * 918 * Section "Changes to Instruction Behavior in VMX Non-Root Operation" 919 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. 920 * This unblocking of virtual-NMI occurs even if IRET causes a fault. 921 * 922 * Thus, virtual-NMI blocking is cleared at the time of the task switch 923 * VM exit. 924 */ 925 926 /* 927 * If the task switch was triggered by an event delivered through 928 * the IDT then extinguish the pending event from the vcpu's 929 * exitintinfo. 930 */ 931 if (task_switch->reason == TSR_IDT_GATE) { 932 error = vm_set_intinfo(ctx, vcpu, 0); 933 assert(error == 0); 934 } 935 936 /* 937 * XXX should inject debug exception if 'T' bit is 1 938 */ 939 done: 940 return (VMEXIT_CONTINUE); 941 } 942