1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 /*
30  * This file and its contents are supplied under the terms of the
31  * Common Development and Distribution License ("CDDL"), version 1.0.
32  * You may only use this file in accordance with the terms of version
33  * 1.0 of the CDDL.
34  *
35  * A full copy of the text of the CDDL should have accompanied this
36  * source.  A copy of the CDDL is also available via the Internet at
37  * http://www.illumos.org/license/CDDL.
38  *
39  * Copyright 2015 Pluribus Networks Inc.
40  * Copyright 2018 Joyent, Inc.
41  * Copyright 2021 Oxide Computer Company
42  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
43  */
44 
45 #include <sys/cdefs.h>
46 
47 #include <sys/param.h>
48 #include <sys/pcpu.h>
49 #include <sys/systm.h>
50 #include <sys/proc.h>
51 
52 #include <machine/vmparam.h>
53 #include <machine/vmm.h>
54 #include <sys/vmm_kernel.h>
55 #include <sys/vmm_vm.h>
56 
57 #include <sys/vmm_instruction_emul.h>
58 #include <x86/psl.h>
59 #include <x86/specialreg.h>
60 
61 #include "vmm_ioport.h"
62 
/*
 * Emulation status bits kept in vie.status.  Every value occupies its own
 * bit, so several may be set at the same time: vie_repeat() ORs in
 * VIES_REPEAT and tests VIES_MMIO / VIES_INOUT to decide which cached
 * request state has to be reset.
 */
enum vie_status {
	VIES_INIT		= (1U << 0),
	VIES_MMIO		= (1U << 1),
	VIES_INOUT		= (1U << 2),
	VIES_OTHER		= (1U << 3),
	VIES_INST_FETCH		= (1U << 4),
	VIES_INST_DECODE	= (1U << 5),
	VIES_PENDING_MMIO	= (1U << 6),
	VIES_PENDING_INOUT	= (1U << 7),
	VIES_REPEAT		= (1U << 8),
	VIES_USER_FALLBACK	= (1U << 9),
	VIES_COMPLETE		= (1U << 10),
};
76 
/*
 * State of request to perform emulated access (inout or MMIO).
 *
 * VR_NONE is the value vie_repeat() stores to discard a cached request.
 * NOTE(review): VR_PENDING / VR_DONE presumably mark a request that is
 * awaiting, respectively has received, its result -- confirm at the users.
 */
enum vie_req {
	VR_NONE,
	VR_PENDING,
	VR_DONE,
};
83 
/*
 * One cached MMIO access.  struct vie embeds two of these: mmio_req_read
 * and mmio_req_write.
 */
struct vie_mmio {
	uint64_t		data;	/* value of the access */
	uint64_t		gpa;	/* presumably guest-physical address */
	uint8_t			bytes;	/* presumably access width in bytes */
	enum vie_req		state;	/* VR_* progress of this request */
};
90 
/*
 * Description of one supported opcode; the element type of the opcode
 * lookup tables (one_byte_opcodes, two_byte_opcodes, ...).
 */
struct vie_op {
	uint8_t		op_byte;	/* actual opcode byte */
	uint8_t		op_type;	/* VIE_OP_TYPE_* (e.g. MOV) */
	uint16_t	op_flags;	/* bitwise OR of VIE_OP_F_* */
};
96 
/*
 * Capacity of vie.inst; 15 bytes is the x86 architectural maximum length of
 * a single instruction.
 */
#define	VIE_INST_SIZE	15

/*
 * Instruction emulation context: the raw instruction bytes, the fields
 * decoded from them (prefixes, ModRM, SIB, displacement, immediate), the
 * matched opcode description, and the cached MMIO / in-out request state
 * used while the emulated access is carried out.
 *
 * NOTE(review): several bit-field groups below declare more bits than fit in
 * one uint8_t (e.g. mod/reg/rm total 10 bits), so each such group spans more
 * than one storage unit and its exact layout is compiler-defined.
 */
struct vie {
	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
	uint8_t		num_valid;		/* size of the instruction */
	uint8_t		num_processed;

	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
	uint8_t		rex_w:1,		/* REX prefix */
			rex_r:1,
			rex_x:1,
			rex_b:1,
			rex_present:1,
			repz_present:1,		/* REP/REPE/REPZ prefix */
			repnz_present:1,	/* REPNE/REPNZ prefix */
			opsize_override:1,	/* Operand size override */
			addrsize_override:1,	/* Address size override */
			segment_override:1;	/* Segment override */

	/* reg and rm are 4 bits wide: they index the 16-entry gpr_map[] */
	uint8_t		mod:2,			/* ModRM byte */
			reg:4,
			rm:4;

	uint8_t		ss:2,			/* SIB byte */
			vex_present:1,		/* VEX prefixed */
			vex_l:1,		/* L bit */
			index:4,		/* SIB byte */
			base:4;			/* SIB byte */

	uint8_t		disp_bytes;
	uint8_t		imm_bytes;

	uint8_t		scale;

	uint8_t		vex_reg:4,	/* vvvv: first source reg specifier */
			vex_pp:2,	/* pp */
			_sparebits:2;

	uint8_t		_sparebytes[2];

	int		base_register;		/* VM_REG_GUEST_xyz */
	int		index_register;		/* VM_REG_GUEST_xyz */
	int		segment_register;	/* VM_REG_GUEST_xyz */

	int64_t		displacement;		/* optional addr displacement */
	int64_t		immediate;		/* optional immediate operand */

	struct vie_op	op;			/* opcode description */

	enum vie_status	status;			/* VIES_* bits */

	struct vm_guest_paging paging;		/* guest paging state */

	uint64_t	mmio_gpa;		/* faulting GPA */
	struct vie_mmio	mmio_req_read;		/* cached MMIO read request */
	struct vie_mmio	mmio_req_write;		/* cached MMIO write request */

	struct vm_inout	inout;			/* active in/out op */
	enum vie_req	inout_req_state;	/* VR_* state of in/out op */
	uint32_t	inout_req_val;		/* value from userspace */
};
157 
158 
/*
 * struct vie_op.op_type
 *
 * VIE_OP_TYPE_NONE is 0, so any opcode-table slot without an explicit
 * initializer reads back as "none".  VIE_OP_TYPE_LAST only terminates the
 * list; no table entry in this file uses it.
 */
enum {
	VIE_OP_TYPE_NONE = 0,
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,
	VIE_OP_TYPE_MOV_CR,
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,
	VIE_OP_TYPE_TWO_BYTE,
	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,
	VIE_OP_TYPE_GROUP1,
	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,
	VIE_OP_TYPE_TWOB_GRP15,
	VIE_OP_TYPE_ADD,
	VIE_OP_TYPE_TEST,
	VIE_OP_TYPE_BEXTR,
	VIE_OP_TYPE_CLTS,
	VIE_OP_TYPE_MUL,
	VIE_OP_TYPE_LAST
};
185 
/*
 * struct vie_op.op_flags
 *
 * VIE_OP_F_NO_MODRM is set in the tables on the moffset MOV, MOVS, STOS and
 * CLTS entries.  VIE_OP_F_NO_GLA_VERIFICATION is set on the MOVS, STOS, CLTS
 * and MOV-CR entries.
 * NOTE(review): going by the names, the former marks opcodes that are not
 * followed by a ModRM byte and the latter skips a guest linear address
 * check -- confirm at the points where the flags are tested.
 */
#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	(1 << 3)
#define	VIE_OP_F_NO_GLA_VERIFICATION	(1 << 4)
#define	VIE_OP_F_REG_REG	(1 << 5)  /* special-case for mov-cr */
193 
/*
 * Opcode descriptions indexed by opcode byte.  NOTE(review): the name
 * suggests this covers the byte following a 0F 38 escape -- confirm in the
 * decoder.  Slots without an initializer are zero-filled, i.e. their op_type
 * is VIE_OP_TYPE_NONE.
 */
static const struct vie_op three_byte_opcodes_0f38[256] = {
	[0xF7] = {
		.op_byte = 0xF7,
		.op_type = VIE_OP_TYPE_BEXTR,
	},
};
200 
/*
 * Opcode descriptions indexed by opcode byte.  NOTE(review): presumably the
 * byte following the 0x0F escape, which one_byte_opcodes[] tags as
 * VIE_OP_TYPE_TWO_BYTE -- confirm in the decoder.  Slots without an
 * initializer are zero-filled, i.e. their op_type is VIE_OP_TYPE_NONE.
 */
static const struct vie_op two_byte_opcodes[256] = {
	[0x06] = {
		.op_byte = 0x06,
		.op_type = VIE_OP_TYPE_CLTS,
		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
	},
	/* 0x20 / 0x22: the two directions of MOV to/from a control register */
	[0x20] = {
		.op_byte = 0x20,
		.op_type = VIE_OP_TYPE_MOV_CR,
		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0x22] = {
		.op_byte = 0x22,
		.op_type = VIE_OP_TYPE_MOV_CR,
		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
	},
	[0xAE] = {
		.op_byte = 0xAE,
		.op_type = VIE_OP_TYPE_TWOB_GRP15,
	},
	[0xAF] = {
		.op_byte = 0xAF,
		.op_type = VIE_OP_TYPE_MUL,
	},
	/* 0xB6 / 0xB7: zero-extending move from a byte / word source */
	[0xB6] = {
		.op_byte = 0xB6,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xB7] = {
		.op_byte = 0xB7,
		.op_type = VIE_OP_TYPE_MOVZX,
	},
	[0xBA] = {
		.op_byte = 0xBA,
		.op_type = VIE_OP_TYPE_BITTEST,
		.op_flags = VIE_OP_F_IMM8,
	},
	/* 0xBE: sign-extending move from a byte source */
	[0xBE] = {
		.op_byte = 0xBE,
		.op_type = VIE_OP_TYPE_MOVSX,
	},
};
243 
244 static const struct vie_op one_byte_opcodes[256] = {
245 	[0x03] = {
246 		.op_byte = 0x03,
247 		.op_type = VIE_OP_TYPE_ADD,
248 	},
249 	[0x0F] = {
250 		.op_byte = 0x0F,
251 		.op_type = VIE_OP_TYPE_TWO_BYTE
252 	},
253 	[0x0B] = {
254 		.op_byte = 0x0B,
255 		.op_type = VIE_OP_TYPE_OR,
256 	},
257 	[0x2B] = {
258 		.op_byte = 0x2B,
259 		.op_type = VIE_OP_TYPE_SUB,
260 	},
261 	[0x39] = {
262 		.op_byte = 0x39,
263 		.op_type = VIE_OP_TYPE_CMP,
264 	},
265 	[0x3B] = {
266 		.op_byte = 0x3B,
267 		.op_type = VIE_OP_TYPE_CMP,
268 	},
269 	[0x88] = {
270 		.op_byte = 0x88,
271 		.op_type = VIE_OP_TYPE_MOV,
272 	},
273 	[0x89] = {
274 		.op_byte = 0x89,
275 		.op_type = VIE_OP_TYPE_MOV,
276 	},
277 	[0x8A] = {
278 		.op_byte = 0x8A,
279 		.op_type = VIE_OP_TYPE_MOV,
280 	},
281 	[0x8B] = {
282 		.op_byte = 0x8B,
283 		.op_type = VIE_OP_TYPE_MOV,
284 	},
285 	[0xA1] = {
286 		.op_byte = 0xA1,
287 		.op_type = VIE_OP_TYPE_MOV,
288 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
289 	},
290 	[0xA3] = {
291 		.op_byte = 0xA3,
292 		.op_type = VIE_OP_TYPE_MOV,
293 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
294 	},
295 	[0xA4] = {
296 		.op_byte = 0xA4,
297 		.op_type = VIE_OP_TYPE_MOVS,
298 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
299 	},
300 	[0xA5] = {
301 		.op_byte = 0xA5,
302 		.op_type = VIE_OP_TYPE_MOVS,
303 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
304 	},
305 	[0xAA] = {
306 		.op_byte = 0xAA,
307 		.op_type = VIE_OP_TYPE_STOS,
308 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
309 	},
310 	[0xAB] = {
311 		.op_byte = 0xAB,
312 		.op_type = VIE_OP_TYPE_STOS,
313 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
314 	},
315 	[0xC6] = {
316 		/* XXX Group 11 extended opcode - not just MOV */
317 		.op_byte = 0xC6,
318 		.op_type = VIE_OP_TYPE_MOV,
319 		.op_flags = VIE_OP_F_IMM8,
320 	},
321 	[0xC7] = {
322 		.op_byte = 0xC7,
323 		.op_type = VIE_OP_TYPE_MOV,
324 		.op_flags = VIE_OP_F_IMM,
325 	},
326 	[0x23] = {
327 		.op_byte = 0x23,
328 		.op_type = VIE_OP_TYPE_AND,
329 	},
330 	[0x80] = {
331 		/* Group 1 extended opcode */
332 		.op_byte = 0x80,
333 		.op_type = VIE_OP_TYPE_GROUP1,
334 		.op_flags = VIE_OP_F_IMM8,
335 	},
336 	[0x81] = {
337 		/* Group 1 extended opcode */
338 		.op_byte = 0x81,
339 		.op_type = VIE_OP_TYPE_GROUP1,
340 		.op_flags = VIE_OP_F_IMM,
341 	},
342 	[0x83] = {
343 		/* Group 1 extended opcode */
344 		.op_byte = 0x83,
345 		.op_type = VIE_OP_TYPE_GROUP1,
346 		.op_flags = VIE_OP_F_IMM8,
347 	},
348 	[0x8F] = {
349 		/* XXX Group 1A extended opcode - not just POP */
350 		.op_byte = 0x8F,
351 		.op_type = VIE_OP_TYPE_POP,
352 	},
353 	[0xF6] = {
354 		/* XXX Group 3 extended opcode - not just TEST */
355 		.op_byte = 0xF6,
356 		.op_type = VIE_OP_TYPE_TEST,
357 		.op_flags = VIE_OP_F_IMM8,
358 	},
359 	[0xF7] = {
360 		/* XXX Group 3 extended opcode - not just TEST */
361 		.op_byte = 0xF7,
362 		.op_type = VIE_OP_TYPE_TEST,
363 		.op_flags = VIE_OP_F_IMM,
364 	},
365 	[0xFF] = {
366 		/* XXX Group 5 extended opcode - not just PUSH */
367 		.op_byte = 0xFF,
368 		.op_type = VIE_OP_TYPE_PUSH,
369 	}
370 };
371 
/* struct vie.mod: addressing form selected by the ModRM mod field */
#define	VIE_MOD_INDIRECT		0	/* memory, no displacement */
#define	VIE_MOD_INDIRECT_DISP8		1	/* memory + 8-bit displacement */
#define	VIE_MOD_INDIRECT_DISP32		2	/* memory + 32-bit displacement */
#define	VIE_MOD_DIRECT			3	/* register operand */

/* struct vie.rm: ModRM r/m encodings with a special meaning */
#define	VIE_RM_SIB			4	/* a SIB byte follows */
#define	VIE_RM_DISP32			5	/* displacement-only form */

/* One gibibyte, as a plain int constant expression */
#define	GB				(1024 * 1024 * 1024)
383 
384 
/*
 * Page table entry bits, previously pulled in from machine/pmap.h
 */
#define	PG_V	(1 << 0) /* Present */
#define	PG_RW	(1 << 1) /* Read/Write */
#define	PG_U	(1 << 2) /* User/Supervisor */
#define	PG_A	(1 << 5) /* Accessed */
#define	PG_M	(1 << 6) /* Dirty */
#define	PG_PS	(1 << 7) /* Largepage */

/*
 * Page fault exception error code bits, previously pulled in from
 * machine/pmap.h
 */
#define	PGEX_P		(1 << 0) /* Non-present/Protection */
#define	PGEX_W		(1 << 1) /* Read/Write */
#define	PGEX_U		(1 << 2) /* User/Supervisor */
#define	PGEX_RSV	(1 << 3) /* (Non-)Reserved */
#define	PGEX_I		(1 << 4) /* Instruction */
403 
404 
405 static enum vm_reg_name gpr_map[16] = {
406 	VM_REG_GUEST_RAX,
407 	VM_REG_GUEST_RCX,
408 	VM_REG_GUEST_RDX,
409 	VM_REG_GUEST_RBX,
410 	VM_REG_GUEST_RSP,
411 	VM_REG_GUEST_RBP,
412 	VM_REG_GUEST_RSI,
413 	VM_REG_GUEST_RDI,
414 	VM_REG_GUEST_R8,
415 	VM_REG_GUEST_R9,
416 	VM_REG_GUEST_R10,
417 	VM_REG_GUEST_R11,
418 	VM_REG_GUEST_R12,
419 	VM_REG_GUEST_R13,
420 	VM_REG_GUEST_R14,
421 	VM_REG_GUEST_R15
422 };
423 
/*
 * Printable register names, indexed first by operand size in bytes (only
 * rows 1, 2, 4 and 8 are populated) and then by register number.
 */
static const char *gpr_name_map[9][16] = {
	[1] = {
		"a[hl]", "c[hl]", "d[hl]", "b[hl]",
		"spl", "bpl", "sil", "dil",
		"r8b", "r9b", "r10b", "r11b",
		"r12b", "r13b", "r14b", "r15b",
	},
	[2] = {
		"ax", "cx", "dx", "bx",
		"sp", "bp", "si", "di",
		"r8w", "r9w", "r10w", "r11w",
		"r12w", "r13w", "r14w", "r15w",
	},
	[4] = {
		"eax", "ecx", "edx", "ebx",
		"esp", "ebp", "esi", "edi",
		"r8d", "r9d", "r10d", "r11d",
		"r12d", "r13d", "r14d", "r15d",
	},
	[8] = {
		"rax", "rcx", "rdx", "rbx",
		"rsp", "rbp", "rsi", "rdi",
		"r8", "r9", "r10", "r11",
		"r12", "r13", "r14", "r15",
	},
};
442 
443 static enum vm_reg_name cr_map[16] = {
444 	VM_REG_GUEST_CR0,
445 	VM_REG_LAST,
446 	VM_REG_GUEST_CR2,
447 	VM_REG_GUEST_CR3,
448 	VM_REG_GUEST_CR4,
449 	VM_REG_LAST,
450 	VM_REG_LAST,
451 	VM_REG_LAST,
452 	VM_REG_LAST,
453 	VM_REG_LAST,
454 	VM_REG_LAST,
455 	VM_REG_LAST,
456 	VM_REG_LAST,
457 	VM_REG_LAST,
458 	VM_REG_LAST,
459 	VM_REG_LAST
460 };
461 
/*
 * Value mask for an operand of the given size in bytes.  Only indices 1, 2,
 * 4 and 8 are meaningful; every other slot is zero.
 */
static uint64_t size2mask[9] = {
	[1] = 0xffULL,
	[2] = 0xffffULL,
	[4] = 0xffffffffULL,
	[8] = 0xffffffffffffffffULL,
};
468 
469 
/*
 * Prototypes for file-local helpers that are called by the emulation
 * routines below before their definitions appear.
 */
static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid,
    uint64_t gpa, uint64_t *rval, int bytes);
static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid,
    uint64_t gpa, uint64_t wval, int bytes);
static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla);
static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf,
    uint64_t gla);
static uint64_t vie_size2mask(int size);
481 
482 struct vie *
vie_alloc()483 vie_alloc()
484 {
485 	return (kmem_zalloc(sizeof (struct vie), KM_SLEEP));
486 }
487 
488 void
vie_free(struct vie * vie)489 vie_free(struct vie *vie)
490 {
491 	kmem_free(vie, sizeof (struct vie));
492 }
493 
494 enum vm_reg_name
vie_regnum_map(uint8_t regnum)495 vie_regnum_map(uint8_t regnum)
496 {
497 	VERIFY3U(regnum, <, 16);
498 	return (gpr_map[regnum]);
499 }
500 
501 const char *
vie_regnum_name(uint8_t regnum,uint8_t size)502 vie_regnum_name(uint8_t regnum, uint8_t size)
503 {
504 	VERIFY3U(regnum, <, 16);
505 	VERIFY(size == 1 || size == 2 || size == 4 || size == 8);
506 	return (gpr_name_map[size][regnum]);
507 }
508 
509 static void
vie_calc_bytereg(struct vie * vie,enum vm_reg_name * reg,int * lhbr)510 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
511 {
512 	*lhbr = 0;
513 	*reg = gpr_map[vie->reg];
514 
515 	/*
516 	 * 64-bit mode imposes limitations on accessing legacy high byte
517 	 * registers (lhbr).
518 	 *
519 	 * The legacy high-byte registers cannot be addressed if the REX
520 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
521 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
522 	 *
523 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
524 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
525 	 * %ah, %ch, %dh and %bh respectively.
526 	 */
527 	if (!vie->rex_present) {
528 		if (vie->reg & 0x4) {
529 			*lhbr = 1;
530 			*reg = gpr_map[vie->reg & 0x3];
531 		}
532 	}
533 }
534 
535 static int
vie_read_bytereg(struct vie * vie,struct vm * vm,int vcpuid,uint8_t * rval)536 vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval)
537 {
538 	uint64_t val;
539 	int error, lhbr;
540 	enum vm_reg_name reg;
541 
542 	vie_calc_bytereg(vie, &reg, &lhbr);
543 	error = vm_get_register(vm, vcpuid, reg, &val);
544 
545 	/*
546 	 * To obtain the value of a legacy high byte register shift the
547 	 * base register right by 8 bits (%ah = %rax >> 8).
548 	 */
549 	if (lhbr)
550 		*rval = val >> 8;
551 	else
552 		*rval = val;
553 	return (error);
554 }
555 
556 static int
vie_write_bytereg(struct vie * vie,struct vm * vm,int vcpuid,uint8_t byte)557 vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte)
558 {
559 	uint64_t origval, val, mask;
560 	int error, lhbr;
561 	enum vm_reg_name reg;
562 
563 	vie_calc_bytereg(vie, &reg, &lhbr);
564 	error = vm_get_register(vm, vcpuid, reg, &origval);
565 	if (error == 0) {
566 		val = byte;
567 		mask = 0xff;
568 		if (lhbr) {
569 			/*
570 			 * Shift left by 8 to store 'byte' in a legacy high
571 			 * byte register.
572 			 */
573 			val <<= 8;
574 			mask <<= 8;
575 		}
576 		val |= origval & ~mask;
577 		error = vm_set_register(vm, vcpuid, reg, val);
578 	}
579 	return (error);
580 }
581 
582 static int
vie_update_register(struct vm * vm,int vcpuid,enum vm_reg_name reg,uint64_t val,int size)583 vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg,
584     uint64_t val, int size)
585 {
586 	int error;
587 	uint64_t origval;
588 
589 	switch (size) {
590 	case 1:
591 	case 2:
592 		error = vm_get_register(vm, vcpuid, reg, &origval);
593 		if (error)
594 			return (error);
595 		val &= size2mask[size];
596 		val |= origval & ~size2mask[size];
597 		break;
598 	case 4:
599 		val &= 0xffffffffUL;
600 		break;
601 	case 8:
602 		break;
603 	default:
604 		return (EINVAL);
605 	}
606 
607 	error = vm_set_register(vm, vcpuid, reg, val);
608 	return (error);
609 }
610 
611 static int
vie_repeat(struct vie * vie)612 vie_repeat(struct vie *vie)
613 {
614 	vie->status |= VIES_REPEAT;
615 
616 	/*
617 	 * Clear out any cached operation values so the repeated instruction can
618 	 * begin without using that stale state.  Other state, such as the
619 	 * decoding results, are kept around as it will not vary between
620 	 * iterations of a rep-prefixed instruction.
621 	 */
622 	if ((vie->status & VIES_MMIO) != 0) {
623 		vie->mmio_req_read.state = VR_NONE;
624 		vie->mmio_req_write.state = VR_NONE;
625 	} else if ((vie->status & VIES_INOUT) != 0) {
626 		vie->inout_req_state = VR_NONE;
627 	} else {
628 		panic("unexpected emulation state");
629 	}
630 
631 	return (EAGAIN);
632 }
633 
/*
 * The arithmetic status flags in RFLAGS: carry, parity, auxiliary carry,
 * zero, sign (PSL_N) and overflow (PSL_V).
 */
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
635 
/*
 * Return the status flags that would result from doing (x - y).
 *
 * GETCC(sz) defines getcc<sz>(), which executes a real SUB on operands of
 * type uint<sz>_t and then copies the flags register into the return value
 * with pushfq/popq.  The whole flags image is returned, not only the status
 * bits, and the difference left in the register holding x is discarded.
 *
 * The trailing "struct __hack" turns the end of the expansion into a
 * harmless declaration so that each GETCC(sz) invocation can be followed by
 * a semicolon.
 */
/* BEGIN CSTYLED */
#define	GETCC(sz)							\
static ulong_t								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);
656 
657 static ulong_t
getcc(int opsize,uint64_t x,uint64_t y)658 getcc(int opsize, uint64_t x, uint64_t y)
659 {
660 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
661 	    ("getcc: invalid operand size %d", opsize));
662 
663 	if (opsize == 1)
664 		return (getcc8(x, y));
665 	else if (opsize == 2)
666 		return (getcc16(x, y));
667 	else if (opsize == 4)
668 		return (getcc32(x, y));
669 	else
670 		return (getcc64(x, y));
671 }
672 
/*
 * Macro creation of functions getaddflags{8,16,32,64}
 *
 * Each generated function executes a real ADD of y to x on operands of type
 * uint<sz>_t and returns the flags register as captured by pushfq/popq.  The
 * sum itself is discarded.  The trailing "struct __hack" lets every
 * invocation be followed by a semicolon.
 */
/* BEGIN CSTYLED */
#define	GETADDFLAGS(sz)							\
static ulong_t								\
getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETADDFLAGS(8);
GETADDFLAGS(16);
GETADDFLAGS(32);
GETADDFLAGS(64);
693 
694 static ulong_t
getaddflags(int opsize,uint64_t x,uint64_t y)695 getaddflags(int opsize, uint64_t x, uint64_t y)
696 {
697 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
698 	    ("getaddflags: invalid operand size %d", opsize));
699 
700 	if (opsize == 1)
701 		return (getaddflags8(x, y));
702 	else if (opsize == 2)
703 		return (getaddflags16(x, y));
704 	else if (opsize == 4)
705 		return (getaddflags32(x, y));
706 	else
707 		return (getaddflags64(x, y));
708 }
709 
/*
 * Macro creation of functions getimulflags{16,32,64}
 *
 * Each generated function executes a real two-operand IMUL of x by y on
 * operands of type uint<sz>_t and returns the flags register as captured by
 * pushfq/popq.  The product itself is discarded.  No 8-bit variant is
 * generated; getimulflags() accepts operand sizes 2, 4 and 8 only.  The
 * trailing "struct __hack" lets every invocation be followed by a semicolon.
 */
/* BEGIN CSTYLED */
#define	GETIMULFLAGS(sz)						\
static ulong_t								\
getimulflags##sz(uint##sz##_t x, uint##sz##_t y)			\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("imul %2,%1; pushfq; popq %0" :		\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETIMULFLAGS(16);
GETIMULFLAGS(32);
GETIMULFLAGS(64);
729 
730 static ulong_t
getimulflags(int opsize,uint64_t x,uint64_t y)731 getimulflags(int opsize, uint64_t x, uint64_t y)
732 {
733 	KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
734 	    ("getimulflags: invalid operand size %d", opsize));
735 
736 	if (opsize == 2)
737 		return (getimulflags16(x, y));
738 	else if (opsize == 4)
739 		return (getimulflags32(x, y));
740 	else
741 		return (getimulflags64(x, y));
742 }
743 
/*
 * Return the status flags that would result from doing (x & y).
 *
 * GETANDFLAGS(sz) defines getandflags<sz>(), which executes a real AND on
 * operands of type uint<sz>_t and returns the flags register as captured by
 * pushfq/popq.  The result of the AND itself is discarded.  The trailing
 * "struct __hack" lets every invocation be followed by a semicolon.
 */
/* BEGIN CSTYLED */
#define	GETANDFLAGS(sz)							\
static ulong_t								\
getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	ulong_t rflags;							\
									\
	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack
/* END CSTYLED */

GETANDFLAGS(8);
GETANDFLAGS(16);
GETANDFLAGS(32);
GETANDFLAGS(64);
764 
765 static ulong_t
getandflags(int opsize,uint64_t x,uint64_t y)766 getandflags(int opsize, uint64_t x, uint64_t y)
767 {
768 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
769 	    ("getandflags: invalid operand size %d", opsize));
770 
771 	if (opsize == 1)
772 		return (getandflags8(x, y));
773 	else if (opsize == 2)
774 		return (getandflags16(x, y));
775 	else if (opsize == 4)
776 		return (getandflags32(x, y));
777 	else
778 		return (getandflags64(x, y));
779 }
780 
781 static int
vie_emulate_mov_cr(struct vie * vie,struct vm * vm,int vcpuid)782 vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid)
783 {
784 	uint64_t val;
785 	int err;
786 	enum vm_reg_name gpr = gpr_map[vie->rm];
787 	enum vm_reg_name cr = cr_map[vie->reg];
788 
789 	uint_t size = 4;
790 	if (vie->paging.cpu_mode == CPU_MODE_64BIT) {
791 		size = 8;
792 	}
793 
794 	switch (vie->op.op_byte) {
795 	case 0x20:
796 		/*
797 		 * MOV control register (ModRM:reg) to reg (ModRM:r/m)
798 		 * 20/r:	mov r32, CR0-CR7
799 		 * 20/r:	mov r64, CR0-CR7
800 		 * REX.R + 20/0:	mov r64, CR8
801 		 */
802 		if (vie->paging.cpl != 0) {
803 			vm_inject_gp(vm, vcpuid);
804 			vie->num_processed = 0;
805 			return (0);
806 		}
807 		err = vm_get_register(vm, vcpuid, cr, &val);
808 		if (err != 0) {
809 			/* #UD for access to non-existent CRs */
810 			vm_inject_ud(vm, vcpuid);
811 			vie->num_processed = 0;
812 			return (0);
813 		}
814 		err = vie_update_register(vm, vcpuid, gpr, val, size);
815 		break;
816 	case 0x22: {
817 		/*
818 		 * MOV reg (ModRM:r/m) to control register (ModRM:reg)
819 		 * 22/r:	mov CR0-CR7, r32
820 		 * 22/r:	mov CR0-CR7, r64
821 		 * REX.R + 22/0:	mov CR8, r64
822 		 */
823 		uint64_t old, diff;
824 
825 		if (vie->paging.cpl != 0) {
826 			vm_inject_gp(vm, vcpuid);
827 			vie->num_processed = 0;
828 			return (0);
829 		}
830 		err = vm_get_register(vm, vcpuid, cr, &old);
831 		if (err != 0) {
832 			/* #UD for access to non-existent CRs */
833 			vm_inject_ud(vm, vcpuid);
834 			vie->num_processed = 0;
835 			return (0);
836 		}
837 		err = vm_get_register(vm, vcpuid, gpr, &val);
838 		VERIFY0(err);
839 		val &= size2mask[size];
840 		diff = old ^ val;
841 
842 		switch (cr) {
843 		case VM_REG_GUEST_CR0:
844 			if ((diff & CR0_PG) != 0) {
845 				uint64_t efer;
846 
847 				err = vm_get_register(vm, vcpuid,
848 				    VM_REG_GUEST_EFER, &efer);
849 				VERIFY0(err);
850 
851 				/* Keep the long-mode state in EFER in sync */
852 				if ((val & CR0_PG) != 0 &&
853 				    (efer & EFER_LME) != 0) {
854 					efer |= EFER_LMA;
855 				}
856 				if ((val & CR0_PG) == 0 &&
857 				    (efer & EFER_LME) != 0) {
858 					efer &= ~EFER_LMA;
859 				}
860 
861 				err = vm_set_register(vm, vcpuid,
862 				    VM_REG_GUEST_EFER, efer);
863 				VERIFY0(err);
864 			}
865 			/* TODO: enforce more of the #GP checks */
866 			err = vm_set_register(vm, vcpuid, cr, val);
867 			VERIFY0(err);
868 			break;
869 		case VM_REG_GUEST_CR2:
870 		case VM_REG_GUEST_CR3:
871 		case VM_REG_GUEST_CR4:
872 			/* TODO: enforce more of the #GP checks */
873 			err = vm_set_register(vm, vcpuid, cr, val);
874 			break;
875 		default:
876 			/* The cr_map mapping should prevent this */
877 			panic("invalid cr %d", cr);
878 		}
879 		break;
880 	}
881 	default:
882 		return (EINVAL);
883 	}
884 	return (err);
885 }
886 
887 static int
vie_emulate_mov(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)888 vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
889 {
890 	int error, size;
891 	enum vm_reg_name reg;
892 	uint8_t byte;
893 	uint64_t val;
894 
895 	size = vie->opsize;
896 	error = EINVAL;
897 
898 	switch (vie->op.op_byte) {
899 	case 0x88:
900 		/*
901 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
902 		 * 88/r:	mov r/m8, r8
903 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
904 		 */
905 		size = 1;	/* override for byte operation */
906 		error = vie_read_bytereg(vie, vm, vcpuid, &byte);
907 		if (error == 0) {
908 			error = vie_mmio_write(vie, vm, vcpuid, gpa, byte,
909 			    size);
910 		}
911 		break;
912 	case 0x89:
913 		/*
914 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
915 		 * 89/r:	mov r/m16, r16
916 		 * 89/r:	mov r/m32, r32
917 		 * REX.W + 89/r	mov r/m64, r64
918 		 */
919 		reg = gpr_map[vie->reg];
920 		error = vm_get_register(vm, vcpuid, reg, &val);
921 		if (error == 0) {
922 			val &= size2mask[size];
923 			error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
924 		}
925 		break;
926 	case 0x8A:
927 		/*
928 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
929 		 * 8A/r:	mov r8, r/m8
930 		 * REX + 8A/r:	mov r8, r/m8
931 		 */
932 		size = 1;	/* override for byte operation */
933 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
934 		if (error == 0)
935 			error = vie_write_bytereg(vie, vm, vcpuid, val);
936 		break;
937 	case 0x8B:
938 		/*
939 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
940 		 * 8B/r:	mov r16, r/m16
941 		 * 8B/r:	mov r32, r/m32
942 		 * REX.W 8B/r:	mov r64, r/m64
943 		 */
944 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
945 		if (error == 0) {
946 			reg = gpr_map[vie->reg];
947 			error = vie_update_register(vm, vcpuid, reg, val, size);
948 		}
949 		break;
950 	case 0xA1:
951 		/*
952 		 * MOV from seg:moffset to AX/EAX/RAX
953 		 * A1:		mov AX, moffs16
954 		 * A1:		mov EAX, moffs32
955 		 * REX.W + A1:	mov RAX, moffs64
956 		 */
957 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
958 		if (error == 0) {
959 			reg = VM_REG_GUEST_RAX;
960 			error = vie_update_register(vm, vcpuid, reg, val, size);
961 		}
962 		break;
963 	case 0xA3:
964 		/*
965 		 * MOV from AX/EAX/RAX to seg:moffset
966 		 * A3:		mov moffs16, AX
967 		 * A3:		mov moffs32, EAX
968 		 * REX.W + A3:	mov moffs64, RAX
969 		 */
970 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
971 		if (error == 0) {
972 			val &= size2mask[size];
973 			error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
974 		}
975 		break;
976 	case 0xC6:
977 		/*
978 		 * MOV from imm8 to mem (ModRM:r/m)
979 		 * C6/0		mov r/m8, imm8
980 		 * REX + C6/0	mov r/m8, imm8
981 		 */
982 		size = 1;	/* override for byte operation */
983 		val = vie->immediate;
984 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
985 		break;
986 	case 0xC7:
987 		/*
988 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
989 		 * C7/0		mov r/m16, imm16
990 		 * C7/0		mov r/m32, imm32
991 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
992 		 */
993 		val = vie->immediate & size2mask[size];
994 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
995 		break;
996 	default:
997 		break;
998 	}
999 
1000 	return (error);
1001 }
1002 
1003 static int
vie_emulate_movx(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1004 vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1005 {
1006 	int error, size;
1007 	enum vm_reg_name reg;
1008 	uint64_t val;
1009 
1010 	size = vie->opsize;
1011 	error = EINVAL;
1012 
1013 	switch (vie->op.op_byte) {
1014 	case 0xB6:
1015 		/*
1016 		 * MOV and zero extend byte from mem (ModRM:r/m) to
1017 		 * reg (ModRM:reg).
1018 		 *
1019 		 * 0F B6/r		movzx r16, r/m8
1020 		 * 0F B6/r		movzx r32, r/m8
1021 		 * REX.W + 0F B6/r	movzx r64, r/m8
1022 		 */
1023 
1024 		/* get the first operand */
1025 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
1026 		if (error)
1027 			break;
1028 
1029 		/* get the second operand */
1030 		reg = gpr_map[vie->reg];
1031 
1032 		/* zero-extend byte */
1033 		val = (uint8_t)val;
1034 
1035 		/* write the result */
1036 		error = vie_update_register(vm, vcpuid, reg, val, size);
1037 		break;
1038 	case 0xB7:
1039 		/*
1040 		 * MOV and zero extend word from mem (ModRM:r/m) to
1041 		 * reg (ModRM:reg).
1042 		 *
1043 		 * 0F B7/r		movzx r32, r/m16
1044 		 * REX.W + 0F B7/r	movzx r64, r/m16
1045 		 */
1046 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2);
1047 		if (error)
1048 			return (error);
1049 
1050 		reg = gpr_map[vie->reg];
1051 
1052 		/* zero-extend word */
1053 		val = (uint16_t)val;
1054 
1055 		error = vie_update_register(vm, vcpuid, reg, val, size);
1056 		break;
1057 	case 0xBE:
1058 		/*
1059 		 * MOV and sign extend byte from mem (ModRM:r/m) to
1060 		 * reg (ModRM:reg).
1061 		 *
1062 		 * 0F BE/r		movsx r16, r/m8
1063 		 * 0F BE/r		movsx r32, r/m8
1064 		 * REX.W + 0F BE/r	movsx r64, r/m8
1065 		 */
1066 
1067 		/* get the first operand */
1068 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
1069 		if (error)
1070 			break;
1071 
1072 		/* get the second operand */
1073 		reg = gpr_map[vie->reg];
1074 
1075 		/* sign extend byte */
1076 		val = (int8_t)val;
1077 
1078 		/* write the result */
1079 		error = vie_update_register(vm, vcpuid, reg, val, size);
1080 		break;
1081 	default:
1082 		break;
1083 	}
1084 	return (error);
1085 }
1086 
1087 /*
1088  * Helper function to calculate and validate a linear address.
1089  */
1090 static int
vie_get_gla(struct vie * vie,struct vm * vm,int vcpuid,int opsize,int addrsize,int prot,enum vm_reg_name seg,enum vm_reg_name gpr,uint64_t * gla)1091 vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize,
1092     int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr,
1093     uint64_t *gla)
1094 {
1095 	struct seg_desc desc;
1096 	uint64_t cr0, val, rflags;
1097 	int error;
1098 	struct vm_guest_paging *paging;
1099 
1100 	paging = &vie->paging;
1101 
1102 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1103 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1104 
1105 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1106 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1107 
1108 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
1109 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
1110 	    __func__, error, seg));
1111 
1112 	error = vm_get_register(vm, vcpuid, gpr, &val);
1113 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
1114 	    error, gpr));
1115 
1116 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
1117 	    addrsize, prot, gla)) {
1118 		if (seg == VM_REG_GUEST_SS)
1119 			vm_inject_ss(vm, vcpuid, 0);
1120 		else
1121 			vm_inject_gp(vm, vcpuid);
1122 		return (-1);
1123 	}
1124 
1125 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
1126 		if (seg == VM_REG_GUEST_SS)
1127 			vm_inject_ss(vm, vcpuid, 0);
1128 		else
1129 			vm_inject_gp(vm, vcpuid);
1130 		return (-1);
1131 	}
1132 
1133 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
1134 		vm_inject_ac(vm, vcpuid, 0);
1135 		return (-1);
1136 	}
1137 
1138 	return (0);
1139 }
1140 
1141 static int
vie_emulate_movs(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1142 vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1143 {
1144 	struct vm_copyinfo copyinfo[2];
1145 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
1146 	uint64_t rcx, rdi, rsi, rflags;
1147 	int error, fault, opsize, seg, repeat;
1148 	struct vm_guest_paging *paging;
1149 
1150 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
1151 	val = 0;
1152 	error = 0;
1153 	paging = &vie->paging;
1154 
1155 	/*
1156 	 * XXX although the MOVS instruction is only supposed to be used with
1157 	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
1158 	 *
1159 	 * Empirically the "repnz" prefix has identical behavior to "rep"
1160 	 * and the zero flag does not make a difference.
1161 	 */
1162 	repeat = vie->repz_present | vie->repnz_present;
1163 
1164 	if (repeat) {
1165 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
1166 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
1167 
1168 		/*
1169 		 * The count register is %rcx, %ecx or %cx depending on the
1170 		 * address size of the instruction.
1171 		 */
1172 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
1173 			error = 0;
1174 			goto done;
1175 		}
1176 	}
1177 
1178 	/*
1179 	 *	Source		Destination	Comments
1180 	 *	--------------------------------------------
1181 	 * (1)  memory		memory		n/a
1182 	 * (2)  memory		mmio		emulated
1183 	 * (3)  mmio		memory		emulated
1184 	 * (4)  mmio		mmio		emulated
1185 	 *
1186 	 * At this point we don't have sufficient information to distinguish
1187 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
1188 	 * out because it will succeed only when operating on regular memory.
1189 	 *
1190 	 * XXX the emulation doesn't properly handle the case where 'gpa'
1191 	 * is straddling the boundary between the normal memory and MMIO.
1192 	 */
1193 
1194 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
1195 	if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg,
1196 	    VM_REG_GUEST_RSI, &srcaddr) != 0) {
1197 		goto done;
1198 	}
1199 
1200 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
1201 	    copyinfo, nitems(copyinfo), &fault);
1202 	if (error == 0) {
1203 		if (fault)
1204 			goto done;	/* Resume guest to handle fault */
1205 
1206 		/*
1207 		 * case (2): read from system memory and write to mmio.
1208 		 */
1209 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
1210 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1211 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
1212 		if (error)
1213 			goto done;
1214 	} else {
1215 		/*
1216 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
1217 		 * if 'srcaddr' is in the mmio space.
1218 		 */
1219 
1220 		if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize,
1221 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI,
1222 		    &dstaddr) != 0) {
1223 			goto done;
1224 		}
1225 
1226 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
1227 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
1228 		if (error == 0) {
1229 			if (fault)
1230 				goto done;    /* Resume guest to handle fault */
1231 
1232 			/*
1233 			 * case (3): read from MMIO and write to system memory.
1234 			 *
1235 			 * A MMIO read can have side-effects so we
1236 			 * commit to it only after vm_copy_setup() is
1237 			 * successful. If a page-fault needs to be
1238 			 * injected into the guest then it will happen
1239 			 * before the MMIO read is attempted.
1240 			 */
1241 			error = vie_mmio_read(vie, vm, vcpuid, gpa, &val,
1242 			    opsize);
1243 
1244 			if (error == 0) {
1245 				vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
1246 			}
1247 			/*
1248 			 * Regardless of whether the MMIO read was successful or
1249 			 * not, the copy resources must be cleaned up.
1250 			 */
1251 			vm_copy_teardown(vm, vcpuid, copyinfo,
1252 			    nitems(copyinfo));
1253 			if (error != 0) {
1254 				goto done;
1255 			}
1256 		} else {
1257 			/*
1258 			 * Case (4): read from and write to mmio.
1259 			 *
1260 			 * Commit to the MMIO read/write (with potential
1261 			 * side-effects) only after we are sure that the
1262 			 * instruction is not going to be restarted due
1263 			 * to address translation faults.
1264 			 */
1265 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
1266 			    PROT_READ, &srcgpa, &fault);
1267 			if (error || fault)
1268 				goto done;
1269 
1270 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
1271 			    PROT_WRITE, &dstgpa, &fault);
1272 			if (error || fault)
1273 				goto done;
1274 
1275 			error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val,
1276 			    opsize);
1277 			if (error)
1278 				goto done;
1279 
1280 			error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val,
1281 			    opsize);
1282 			if (error)
1283 				goto done;
1284 		}
1285 	}
1286 
1287 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
1288 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
1289 
1290 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
1291 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
1292 
1293 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1294 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1295 
1296 	if (rflags & PSL_D) {
1297 		rsi -= opsize;
1298 		rdi -= opsize;
1299 	} else {
1300 		rsi += opsize;
1301 		rdi += opsize;
1302 	}
1303 
1304 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
1305 	    vie->addrsize);
1306 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
1307 
1308 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
1309 	    vie->addrsize);
1310 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
1311 
1312 	if (repeat) {
1313 		rcx = rcx - 1;
1314 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
1315 		    rcx, vie->addrsize);
1316 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1317 
1318 		/*
1319 		 * Repeat the instruction if the count register is not zero.
1320 		 */
1321 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1322 			return (vie_repeat(vie));
1323 	}
1324 done:
1325 	return (error);
1326 }
1327 
1328 static int
vie_emulate_stos(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1329 vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1330 {
1331 	int error, opsize, repeat;
1332 	uint64_t val;
1333 	uint64_t rcx, rdi, rflags;
1334 
1335 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
1336 	repeat = vie->repz_present | vie->repnz_present;
1337 
1338 	if (repeat) {
1339 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
1340 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
1341 
1342 		/*
1343 		 * The count register is %rcx, %ecx or %cx depending on the
1344 		 * address size of the instruction.
1345 		 */
1346 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
1347 			return (0);
1348 	}
1349 
1350 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
1351 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
1352 
1353 	error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
1354 	if (error)
1355 		return (error);
1356 
1357 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
1358 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
1359 
1360 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1361 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1362 
1363 	if (rflags & PSL_D)
1364 		rdi -= opsize;
1365 	else
1366 		rdi += opsize;
1367 
1368 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
1369 	    vie->addrsize);
1370 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
1371 
1372 	if (repeat) {
1373 		rcx = rcx - 1;
1374 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
1375 		    rcx, vie->addrsize);
1376 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1377 
1378 		/*
1379 		 * Repeat the instruction if the count register is not zero.
1380 		 */
1381 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1382 			return (vie_repeat(vie));
1383 	}
1384 
1385 	return (0);
1386 }
1387 
1388 static int
vie_emulate_and(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1389 vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1390 {
1391 	int error, size;
1392 	enum vm_reg_name reg;
1393 	uint64_t result, rflags, rflags2, val1, val2;
1394 
1395 	size = vie->opsize;
1396 	error = EINVAL;
1397 
1398 	switch (vie->op.op_byte) {
1399 	case 0x23:
1400 		/*
1401 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1402 		 * result in reg.
1403 		 *
1404 		 * 23/r		and r16, r/m16
1405 		 * 23/r		and r32, r/m32
1406 		 * REX.W + 23/r	and r64, r/m64
1407 		 */
1408 
1409 		/* get the first operand */
1410 		reg = gpr_map[vie->reg];
1411 		error = vm_get_register(vm, vcpuid, reg, &val1);
1412 		if (error)
1413 			break;
1414 
1415 		/* get the second operand */
1416 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1417 		if (error)
1418 			break;
1419 
1420 		/* perform the operation and write the result */
1421 		result = val1 & val2;
1422 		error = vie_update_register(vm, vcpuid, reg, result, size);
1423 		break;
1424 	case 0x81:
1425 	case 0x83:
1426 		/*
1427 		 * AND mem (ModRM:r/m) with immediate and store the
1428 		 * result in mem.
1429 		 *
1430 		 * 81 /4		and r/m16, imm16
1431 		 * 81 /4		and r/m32, imm32
1432 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
1433 		 *
1434 		 * 83 /4		and r/m16, imm8 sign-extended to 16
1435 		 * 83 /4		and r/m32, imm8 sign-extended to 32
1436 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1437 		 */
1438 
1439 		/* get the first operand */
1440 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
1441 		if (error)
1442 			break;
1443 
1444 		/*
1445 		 * perform the operation with the pre-fetched immediate
1446 		 * operand and write the result
1447 		 */
1448 		result = val1 & vie->immediate;
1449 		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
1450 		break;
1451 	default:
1452 		break;
1453 	}
1454 	if (error)
1455 		return (error);
1456 
1457 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1458 	if (error)
1459 		return (error);
1460 
1461 	/*
1462 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1463 	 * to the result; AF is undefined.
1464 	 *
1465 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1466 	 */
1467 	rflags2 = getcc(size, result, 0);
1468 	rflags &= ~RFLAGS_STATUS_BITS;
1469 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1470 
1471 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1472 	return (error);
1473 }
1474 
1475 static int
vie_emulate_or(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1476 vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1477 {
1478 	int error, size;
1479 	enum vm_reg_name reg;
1480 	uint64_t result, rflags, rflags2, val1, val2;
1481 
1482 	size = vie->opsize;
1483 	error = EINVAL;
1484 
1485 	switch (vie->op.op_byte) {
1486 	case 0x0B:
1487 		/*
1488 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1489 		 * result in reg.
1490 		 *
1491 		 * 0b/r		or r16, r/m16
1492 		 * 0b/r		or r32, r/m32
1493 		 * REX.W + 0b/r	or r64, r/m64
1494 		 */
1495 
1496 		/* get the first operand */
1497 		reg = gpr_map[vie->reg];
1498 		error = vm_get_register(vm, vcpuid, reg, &val1);
1499 		if (error)
1500 			break;
1501 
1502 		/* get the second operand */
1503 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1504 		if (error)
1505 			break;
1506 
1507 		/* perform the operation and write the result */
1508 		result = val1 | val2;
1509 		error = vie_update_register(vm, vcpuid, reg, result, size);
1510 		break;
1511 	case 0x81:
1512 	case 0x83:
1513 		/*
1514 		 * OR mem (ModRM:r/m) with immediate and store the
1515 		 * result in mem.
1516 		 *
1517 		 * 81 /1		or r/m16, imm16
1518 		 * 81 /1		or r/m32, imm32
1519 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1520 		 *
1521 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1522 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1523 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1524 		 */
1525 
1526 		/* get the first operand */
1527 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
1528 		if (error)
1529 			break;
1530 
1531 		/*
1532 		 * perform the operation with the pre-fetched immediate
1533 		 * operand and write the result
1534 		 */
1535 		result = val1 | vie->immediate;
1536 		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
1537 		break;
1538 	default:
1539 		break;
1540 	}
1541 	if (error)
1542 		return (error);
1543 
1544 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1545 	if (error)
1546 		return (error);
1547 
1548 	/*
1549 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1550 	 * to the result; AF is undefined.
1551 	 *
1552 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1553 	 */
1554 	rflags2 = getcc(size, result, 0);
1555 	rflags &= ~RFLAGS_STATUS_BITS;
1556 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1557 
1558 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1559 	return (error);
1560 }
1561 
1562 static int
vie_emulate_cmp(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1563 vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1564 {
1565 	int error, size;
1566 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1567 	enum vm_reg_name reg;
1568 
1569 	size = vie->opsize;
1570 	switch (vie->op.op_byte) {
1571 	case 0x39:
1572 	case 0x3B:
1573 		/*
1574 		 * 39/r		CMP r/m16, r16
1575 		 * 39/r		CMP r/m32, r32
1576 		 * REX.W 39/r	CMP r/m64, r64
1577 		 *
1578 		 * 3B/r		CMP r16, r/m16
1579 		 * 3B/r		CMP r32, r/m32
1580 		 * REX.W + 3B/r	CMP r64, r/m64
1581 		 *
1582 		 * Compare the first operand with the second operand and
1583 		 * set status flags in EFLAGS register. The comparison is
1584 		 * performed by subtracting the second operand from the first
1585 		 * operand and then setting the status flags.
1586 		 */
1587 
1588 		/* Get the register operand */
1589 		reg = gpr_map[vie->reg];
1590 		error = vm_get_register(vm, vcpuid, reg, &regop);
1591 		if (error)
1592 			return (error);
1593 
1594 		/* Get the memory operand */
1595 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size);
1596 		if (error)
1597 			return (error);
1598 
1599 		if (vie->op.op_byte == 0x3B) {
1600 			op1 = regop;
1601 			op2 = memop;
1602 		} else {
1603 			op1 = memop;
1604 			op2 = regop;
1605 		}
1606 		rflags2 = getcc(size, op1, op2);
1607 		break;
1608 	case 0x80:
1609 	case 0x81:
1610 	case 0x83:
1611 		/*
1612 		 * 80 /7		cmp r/m8, imm8
1613 		 * REX + 80 /7		cmp r/m8, imm8
1614 		 *
1615 		 * 81 /7		cmp r/m16, imm16
1616 		 * 81 /7		cmp r/m32, imm32
1617 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1618 		 *
1619 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1620 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1621 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1622 		 *
1623 		 * Compare mem (ModRM:r/m) with immediate and set
1624 		 * status flags according to the results.  The
1625 		 * comparison is performed by subtracting the
1626 		 * immediate from the first operand and then setting
1627 		 * the status flags.
1628 		 *
1629 		 */
1630 		if (vie->op.op_byte == 0x80)
1631 			size = 1;
1632 
1633 		/* get the first operand */
1634 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1635 		if (error)
1636 			return (error);
1637 
1638 		rflags2 = getcc(size, op1, vie->immediate);
1639 		break;
1640 	default:
1641 		return (EINVAL);
1642 	}
1643 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1644 	if (error)
1645 		return (error);
1646 	rflags &= ~RFLAGS_STATUS_BITS;
1647 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1648 
1649 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1650 	return (error);
1651 }
1652 
1653 static int
vie_emulate_test(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1654 vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1655 {
1656 	int error, size;
1657 	uint64_t op1, rflags, rflags2;
1658 
1659 	size = vie->opsize;
1660 	error = EINVAL;
1661 
1662 	switch (vie->op.op_byte) {
1663 	case 0xF6:
1664 		/*
1665 		 * F6 /0		test r/m8, imm8
1666 		 *
1667 		 * Test mem (ModRM:r/m) with immediate and set status
1668 		 * flags according to the results.  The comparison is
1669 		 * performed by anding the immediate from the first
1670 		 * operand and then setting the status flags.
1671 		 */
1672 		if ((vie->reg & 7) != 0)
1673 			return (EINVAL);
1674 
1675 		size = 1;	/* override for byte operation */
1676 
1677 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1678 		if (error)
1679 			return (error);
1680 
1681 		rflags2 = getandflags(size, op1, vie->immediate);
1682 		break;
1683 	case 0xF7:
1684 		/*
1685 		 * F7 /0		test r/m16, imm16
1686 		 * F7 /0		test r/m32, imm32
1687 		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
1688 		 *
1689 		 * Test mem (ModRM:r/m) with immediate and set status
1690 		 * flags according to the results.  The comparison is
1691 		 * performed by anding the immediate from the first
1692 		 * operand and then setting the status flags.
1693 		 */
1694 		if ((vie->reg & 7) != 0)
1695 			return (EINVAL);
1696 
1697 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1698 		if (error)
1699 			return (error);
1700 
1701 		rflags2 = getandflags(size, op1, vie->immediate);
1702 		break;
1703 	default:
1704 		return (EINVAL);
1705 	}
1706 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1707 	if (error)
1708 		return (error);
1709 
1710 	/*
1711 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1712 	 * to the result; AF is undefined.
1713 	 */
1714 	rflags &= ~RFLAGS_STATUS_BITS;
1715 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1716 
1717 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1718 	return (error);
1719 }
1720 
1721 static int
vie_emulate_bextr(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1722 vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1723 {
1724 	uint64_t src1, src2, dst, rflags;
1725 	unsigned start, len, size;
1726 	int error;
1727 	struct vm_guest_paging *paging;
1728 
1729 	size = vie->opsize;
1730 	error = EINVAL;
1731 	paging = &vie->paging;
1732 
1733 	/*
1734 	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
1735 	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
1736 	 *
1737 	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m and
1738 	 * Vex.vvvv.
1739 	 *
1740 	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
1741 	 */
1742 	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
1743 		size = 4;
1744 
1745 	/*
1746 	 * Extracts contiguous bits from the first /source/ operand (second
1747 	 * operand) using an index and length specified in the second /source/
1748 	 * operand (third operand).
1749 	 */
1750 	error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size);
1751 	if (error)
1752 		return (error);
1753 	error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2);
1754 	if (error)
1755 		return (error);
1756 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1757 	if (error)
1758 		return (error);
1759 
1760 	start = (src2 & 0xff);
1761 	len = (src2 & 0xff00) >> 8;
1762 
1763 	/* If no bits are extracted, the destination register is cleared. */
1764 	dst = 0;
1765 
1766 	/* If START exceeds the operand size, no bits are extracted. */
1767 	if (start > size * 8)
1768 		goto done;
1769 	/* Length is bounded by both the destination size and start offset. */
1770 	if (start + len > size * 8)
1771 		len = (size * 8) - start;
1772 	if (len == 0)
1773 		goto done;
1774 
1775 	if (start > 0)
1776 		src1 = (src1 >> start);
1777 	if (len < 64)
1778 		src1 = src1 & ((1ull << len) - 1);
1779 	dst = src1;
1780 
1781 done:
1782 	error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size);
1783 	if (error)
1784 		return (error);
1785 
1786 	/*
1787 	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
1788 	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
1789 	 */
1790 	rflags &= ~RFLAGS_STATUS_BITS;
1791 	if (dst == 0)
1792 		rflags |= PSL_Z;
1793 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags,
1794 	    8);
1795 	return (error);
1796 }
1797 
1798 static int
vie_emulate_add(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1799 vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1800 {
1801 	int error, size;
1802 	uint64_t nval, rflags, rflags2, val1, val2;
1803 	enum vm_reg_name reg;
1804 
1805 	size = vie->opsize;
1806 	error = EINVAL;
1807 
1808 	switch (vie->op.op_byte) {
1809 	case 0x03:
1810 		/*
1811 		 * ADD r/m to r and store the result in r
1812 		 *
1813 		 * 03/r			ADD r16, r/m16
1814 		 * 03/r			ADD r32, r/m32
1815 		 * REX.W + 03/r		ADD r64, r/m64
1816 		 */
1817 
1818 		/* get the first operand */
1819 		reg = gpr_map[vie->reg];
1820 		error = vm_get_register(vm, vcpuid, reg, &val1);
1821 		if (error)
1822 			break;
1823 
1824 		/* get the second operand */
1825 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1826 		if (error)
1827 			break;
1828 
1829 		/* perform the operation and write the result */
1830 		nval = val1 + val2;
1831 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1832 		break;
1833 	default:
1834 		break;
1835 	}
1836 
1837 	if (!error) {
1838 		rflags2 = getaddflags(size, val1, val2);
1839 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1840 		    &rflags);
1841 		if (error)
1842 			return (error);
1843 
1844 		rflags &= ~RFLAGS_STATUS_BITS;
1845 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1846 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1847 		    rflags, 8);
1848 	}
1849 
1850 	return (error);
1851 }
1852 
1853 static int
vie_emulate_sub(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1854 vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1855 {
1856 	int error, size;
1857 	uint64_t nval, rflags, rflags2, val1, val2;
1858 	enum vm_reg_name reg;
1859 
1860 	size = vie->opsize;
1861 	error = EINVAL;
1862 
1863 	switch (vie->op.op_byte) {
1864 	case 0x2B:
1865 		/*
1866 		 * SUB r/m from r and store the result in r
1867 		 *
1868 		 * 2B/r		SUB r16, r/m16
1869 		 * 2B/r		SUB r32, r/m32
1870 		 * REX.W + 2B/r	SUB r64, r/m64
1871 		 */
1872 
1873 		/* get the first operand */
1874 		reg = gpr_map[vie->reg];
1875 		error = vm_get_register(vm, vcpuid, reg, &val1);
1876 		if (error)
1877 			break;
1878 
1879 		/* get the second operand */
1880 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1881 		if (error)
1882 			break;
1883 
1884 		/* perform the operation and write the result */
1885 		nval = val1 - val2;
1886 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1887 		break;
1888 	default:
1889 		break;
1890 	}
1891 
1892 	if (!error) {
1893 		rflags2 = getcc(size, val1, val2);
1894 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1895 		    &rflags);
1896 		if (error)
1897 			return (error);
1898 
1899 		rflags &= ~RFLAGS_STATUS_BITS;
1900 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1901 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1902 		    rflags, 8);
1903 	}
1904 
1905 	return (error);
1906 }
1907 
1908 static int
vie_emulate_mul(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1909 vie_emulate_mul(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1910 {
1911 	int error, size;
1912 	uint64_t rflags, rflags2, val1, val2;
1913 	__int128_t nval;
1914 	enum vm_reg_name reg;
1915 	ulong_t (*getflags)(int, uint64_t, uint64_t) = NULL;
1916 
1917 	size = vie->opsize;
1918 	error = EINVAL;
1919 
1920 	switch (vie->op.op_byte) {
1921 	case 0xAF:
1922 		/*
1923 		 * Multiply the contents of a destination register by
1924 		 * the contents of a register or memory operand and
1925 		 * put the signed result in the destination register.
1926 		 *
1927 		 * AF/r		IMUL r16, r/m16
1928 		 * AF/r		IMUL r32, r/m32
1929 		 * REX.W + AF/r	IMUL r64, r/m64
1930 		 */
1931 
1932 		getflags = getimulflags;
1933 
1934 		/* get the first operand */
1935 		reg = gpr_map[vie->reg];
1936 		error = vm_get_register(vm, vcpuid, reg, &val1);
1937 		if (error != 0)
1938 			break;
1939 
1940 		/* get the second operand */
1941 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1942 		if (error != 0)
1943 			break;
1944 
1945 		/* perform the operation and write the result */
1946 		nval = (int64_t)val1 * (int64_t)val2;
1947 
1948 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1949 
1950 		DTRACE_PROBE4(vie__imul,
1951 		    const char *, vie_regnum_name(vie->reg, size),
1952 		    uint64_t, val1, uint64_t, val2, __uint128_t, nval);
1953 
1954 		break;
1955 	default:
1956 		break;
1957 	}
1958 
1959 	if (error == 0) {
1960 		rflags2 = getflags(size, val1, val2);
1961 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1962 		    &rflags);
1963 		if (error)
1964 			return (error);
1965 
1966 		rflags &= ~RFLAGS_STATUS_BITS;
1967 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1968 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1969 		    rflags, 8);
1970 
1971 		DTRACE_PROBE2(vie__imul__rflags,
1972 		    uint64_t, rflags, uint64_t, rflags2);
1973 	}
1974 
1975 	return (error);
1976 }
1977 
1978 static int
vie_emulate_stack_op(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)1979 vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1980 {
1981 	struct vm_copyinfo copyinfo[2];
1982 	struct seg_desc ss_desc;
1983 	uint64_t cr0, rflags, rsp, stack_gla, val;
1984 	int error, fault, size, stackaddrsize, pushop;
1985 	struct vm_guest_paging *paging;
1986 
1987 	val = 0;
1988 	size = vie->opsize;
1989 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1990 	paging = &vie->paging;
1991 
1992 	/*
1993 	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
1994 	 */
1995 	if (paging->cpu_mode == CPU_MODE_REAL) {
1996 		stackaddrsize = 2;
1997 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1998 		/*
1999 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
2000 		 * - Stack pointer size is always 64-bits.
2001 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
2002 		 * - 16-bit PUSH/POP is supported by using the operand size
2003 		 *   override prefix (66H).
2004 		 */
2005 		stackaddrsize = 8;
2006 		size = vie->opsize_override ? 2 : 8;
2007 	} else {
2008 		/*
2009 		 * In protected or compatibility mode the 'B' flag in the
2010 		 * stack-segment descriptor determines the size of the
2011 		 * stack pointer.
2012 		 */
2013 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
2014 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
2015 		    __func__, error));
2016 		if (SEG_DESC_DEF32(ss_desc.access))
2017 			stackaddrsize = 4;
2018 		else
2019 			stackaddrsize = 2;
2020 	}
2021 
2022 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
2023 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
2024 
2025 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
2026 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
2027 
2028 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
2029 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
2030 	if (pushop) {
2031 		rsp -= size;
2032 	}
2033 
2034 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
2035 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
2036 	    &stack_gla)) {
2037 		vm_inject_ss(vm, vcpuid, 0);
2038 		return (0);
2039 	}
2040 
2041 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
2042 		vm_inject_ss(vm, vcpuid, 0);
2043 		return (0);
2044 	}
2045 
2046 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
2047 		vm_inject_ac(vm, vcpuid, 0);
2048 		return (0);
2049 	}
2050 
2051 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
2052 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
2053 	    &fault);
2054 	if (error || fault)
2055 		return (error);
2056 
2057 	if (pushop) {
2058 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
2059 		if (error == 0)
2060 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
2061 	} else {
2062 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
2063 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
2064 		rsp += size;
2065 	}
2066 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2067 
2068 	if (error == 0) {
2069 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
2070 		    stackaddrsize);
2071 		KASSERT(error == 0, ("error %d updating rsp", error));
2072 	}
2073 	return (error);
2074 }
2075 
2076 static int
vie_emulate_push(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)2077 vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2078 {
2079 	int error;
2080 
2081 	/*
2082 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
2083 	 *
2084 	 * PUSH is part of the group 5 extended opcodes and is identified
2085 	 * by ModRM:reg = b110.
2086 	 */
2087 	if ((vie->reg & 7) != 6)
2088 		return (EINVAL);
2089 
2090 	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
2091 	return (error);
2092 }
2093 
2094 static int
vie_emulate_pop(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)2095 vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2096 {
2097 	int error;
2098 
2099 	/*
2100 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
2101 	 *
2102 	 * POP is part of the group 1A extended opcodes and is identified
2103 	 * by ModRM:reg = b000.
2104 	 */
2105 	if ((vie->reg & 7) != 0)
2106 		return (EINVAL);
2107 
2108 	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
2109 	return (error);
2110 }
2111 
2112 static int
vie_emulate_group1(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)2113 vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2114 {
2115 	int error;
2116 
2117 	switch (vie->reg & 7) {
2118 	case 0x1:	/* OR */
2119 		error = vie_emulate_or(vie, vm, vcpuid, gpa);
2120 		break;
2121 	case 0x4:	/* AND */
2122 		error = vie_emulate_and(vie, vm, vcpuid, gpa);
2123 		break;
2124 	case 0x7:	/* CMP */
2125 		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
2126 		break;
2127 	default:
2128 		error = EINVAL;
2129 		break;
2130 	}
2131 
2132 	return (error);
2133 }
2134 
2135 static int
vie_emulate_bittest(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)2136 vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2137 {
2138 	uint64_t val, rflags;
2139 	int error, bitmask, bitoff;
2140 
2141 	/*
2142 	 * 0F BA is a Group 8 extended opcode.
2143 	 *
2144 	 * Currently we only emulate the 'Bit Test' instruction which is
2145 	 * identified by a ModR/M:reg encoding of 100b.
2146 	 */
2147 	if ((vie->reg & 7) != 4)
2148 		return (EINVAL);
2149 
2150 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
2151 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
2152 
2153 	error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize);
2154 	if (error)
2155 		return (error);
2156 
2157 	/*
2158 	 * Intel SDM, Vol 2, Table 3-2:
2159 	 * "Range of Bit Positions Specified by Bit Offset Operands"
2160 	 */
2161 	bitmask = vie->opsize * 8 - 1;
2162 	bitoff = vie->immediate & bitmask;
2163 
2164 	/* Copy the bit into the Carry flag in %rflags */
2165 	if (val & (1UL << bitoff))
2166 		rflags |= PSL_C;
2167 	else
2168 		rflags &= ~PSL_C;
2169 
2170 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
2171 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
2172 
2173 	return (0);
2174 }
2175 
2176 static int
vie_emulate_twob_group15(struct vie * vie,struct vm * vm,int vcpuid,uint64_t gpa)2177 vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid,
2178     uint64_t gpa)
2179 {
2180 	int error;
2181 	uint64_t buf;
2182 
2183 	switch (vie->reg & 7) {
2184 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
2185 		if (vie->mod == 0x3) {
2186 			/*
2187 			 * SFENCE.  Ignore it, VM exit provides enough
2188 			 * barriers on its own.
2189 			 */
2190 			error = 0;
2191 		} else {
2192 			/*
2193 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
2194 			 * rights.
2195 			 */
2196 			error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1);
2197 		}
2198 		break;
2199 	default:
2200 		error = EINVAL;
2201 		break;
2202 	}
2203 
2204 	return (error);
2205 }
2206 
2207 static int
vie_emulate_clts(struct vie * vie,struct vm * vm,int vcpuid)2208 vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid)
2209 {
2210 	uint64_t val;
2211 	int error __maybe_unused;
2212 
2213 	if (vie->paging.cpl != 0) {
2214 		vm_inject_gp(vm, vcpuid);
2215 		vie->num_processed = 0;
2216 		return (0);
2217 	}
2218 
2219 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val);
2220 	ASSERT(error == 0);
2221 
2222 	/* Clear %cr0.TS */
2223 	val &= ~CR0_TS;
2224 
2225 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val);
2226 	ASSERT(error == 0);
2227 
2228 	return (0);
2229 }
2230 
2231 static int
vie_mmio_read(struct vie * vie,struct vm * vm,int cpuid,uint64_t gpa,uint64_t * rval,int bytes)2232 vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
2233     uint64_t *rval, int bytes)
2234 {
2235 	int err;
2236 
2237 	if (vie->mmio_req_read.state == VR_DONE) {
2238 		ASSERT(vie->mmio_req_read.bytes == bytes);
2239 		ASSERT(vie->mmio_req_read.gpa == gpa);
2240 
2241 		*rval = vie->mmio_req_read.data;
2242 		return (0);
2243 	}
2244 
2245 	err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes);
2246 	if (err == 0) {
2247 		/*
2248 		 * A successful read from an in-kernel-emulated device may come
2249 		 * with side effects, so stash the result in case it's used for
2250 		 * an instruction which subsequently needs to issue an MMIO
2251 		 * write to userspace.
2252 		 */
2253 		ASSERT(vie->mmio_req_read.state == VR_NONE);
2254 
2255 		vie->mmio_req_read.bytes = bytes;
2256 		vie->mmio_req_read.gpa = gpa;
2257 		vie->mmio_req_read.data = *rval;
2258 		vie->mmio_req_read.state = VR_DONE;
2259 
2260 	} else if (err == ESRCH) {
2261 		/* Hope that userspace emulation can fulfill this read */
2262 		vie->mmio_req_read.bytes = bytes;
2263 		vie->mmio_req_read.gpa = gpa;
2264 		vie->mmio_req_read.state = VR_PENDING;
2265 		vie->status |= VIES_PENDING_MMIO;
2266 	} else if (err < 0) {
2267 		/*
2268 		 * The MMIO read failed in such a way that fallback to handling
2269 		 * in userspace is required.
2270 		 */
2271 		vie->status |= VIES_USER_FALLBACK;
2272 	}
2273 	return (err);
2274 }
2275 
2276 static int
vie_mmio_write(struct vie * vie,struct vm * vm,int cpuid,uint64_t gpa,uint64_t wval,int bytes)2277 vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
2278     uint64_t wval, int bytes)
2279 {
2280 	int err;
2281 
2282 	if (vie->mmio_req_write.state == VR_DONE) {
2283 		ASSERT(vie->mmio_req_write.bytes == bytes);
2284 		ASSERT(vie->mmio_req_write.gpa == gpa);
2285 
2286 		return (0);
2287 	}
2288 
2289 	err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes);
2290 	if (err == 0) {
2291 		/*
2292 		 * A successful write to an in-kernel-emulated device probably
2293 		 * results in side effects, so stash the fact that such a write
2294 		 * succeeded in case the operation requires other work.
2295 		 */
2296 		vie->mmio_req_write.bytes = bytes;
2297 		vie->mmio_req_write.gpa = gpa;
2298 		vie->mmio_req_write.data = wval;
2299 		vie->mmio_req_write.state = VR_DONE;
2300 	} else if (err == ESRCH) {
2301 		/* Hope that userspace emulation can fulfill this write */
2302 		vie->mmio_req_write.bytes = bytes;
2303 		vie->mmio_req_write.gpa = gpa;
2304 		vie->mmio_req_write.data = wval;
2305 		vie->mmio_req_write.state = VR_PENDING;
2306 		vie->status |= VIES_PENDING_MMIO;
2307 	} else if (err < 0) {
2308 		/*
2309 		 * The MMIO write failed in such a way that fallback to handling
2310 		 * in userspace is required.
2311 		 */
2312 		vie->status |= VIES_USER_FALLBACK;
2313 	}
2314 	return (err);
2315 }
2316 
2317 int
vie_emulate_mmio(struct vie * vie,struct vm * vm,int vcpuid)2318 vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid)
2319 {
2320 	int error;
2321 	uint64_t gpa;
2322 
2323 	if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) !=
2324 	    (VIES_INST_DECODE | VIES_MMIO)) {
2325 		return (EINVAL);
2326 	}
2327 
2328 	gpa = vie->mmio_gpa;
2329 
2330 	switch (vie->op.op_type) {
2331 	case VIE_OP_TYPE_GROUP1:
2332 		error = vie_emulate_group1(vie, vm, vcpuid, gpa);
2333 		break;
2334 	case VIE_OP_TYPE_POP:
2335 		error = vie_emulate_pop(vie, vm, vcpuid, gpa);
2336 		break;
2337 	case VIE_OP_TYPE_PUSH:
2338 		error = vie_emulate_push(vie, vm, vcpuid, gpa);
2339 		break;
2340 	case VIE_OP_TYPE_CMP:
2341 		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
2342 		break;
2343 	case VIE_OP_TYPE_MOV:
2344 		error = vie_emulate_mov(vie, vm, vcpuid, gpa);
2345 		break;
2346 	case VIE_OP_TYPE_MOVSX:
2347 	case VIE_OP_TYPE_MOVZX:
2348 		error = vie_emulate_movx(vie, vm, vcpuid, gpa);
2349 		break;
2350 	case VIE_OP_TYPE_MOVS:
2351 		error = vie_emulate_movs(vie, vm, vcpuid, gpa);
2352 		break;
2353 	case VIE_OP_TYPE_STOS:
2354 		error = vie_emulate_stos(vie, vm, vcpuid, gpa);
2355 		break;
2356 	case VIE_OP_TYPE_AND:
2357 		error = vie_emulate_and(vie, vm, vcpuid, gpa);
2358 		break;
2359 	case VIE_OP_TYPE_OR:
2360 		error = vie_emulate_or(vie, vm, vcpuid, gpa);
2361 		break;
2362 	case VIE_OP_TYPE_SUB:
2363 		error = vie_emulate_sub(vie, vm, vcpuid, gpa);
2364 		break;
2365 	case VIE_OP_TYPE_BITTEST:
2366 		error = vie_emulate_bittest(vie, vm, vcpuid, gpa);
2367 		break;
2368 	case VIE_OP_TYPE_TWOB_GRP15:
2369 		error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa);
2370 		break;
2371 	case VIE_OP_TYPE_ADD:
2372 		error = vie_emulate_add(vie, vm, vcpuid, gpa);
2373 		break;
2374 	case VIE_OP_TYPE_TEST:
2375 		error = vie_emulate_test(vie, vm, vcpuid, gpa);
2376 		break;
2377 	case VIE_OP_TYPE_BEXTR:
2378 		error = vie_emulate_bextr(vie, vm, vcpuid, gpa);
2379 		break;
2380 	case VIE_OP_TYPE_MUL:
2381 		error = vie_emulate_mul(vie, vm, vcpuid, gpa);
2382 		break;
2383 	default:
2384 		error = EINVAL;
2385 		break;
2386 	}
2387 
2388 	if (error == ESRCH) {
2389 		/* Return to userspace with the mmio request */
2390 		return (-1);
2391 	}
2392 
2393 	return (error);
2394 }
2395 
2396 static int
vie_emulate_inout_port(struct vie * vie,struct vm * vm,int vcpuid,uint32_t * eax)2397 vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid,
2398     uint32_t *eax)
2399 {
2400 	uint32_t mask, val;
2401 	bool in;
2402 	int err;
2403 
2404 	mask = vie_size2mask(vie->inout.bytes);
2405 	in = (vie->inout.flags & INOUT_IN) != 0;
2406 
2407 	if (!in) {
2408 		val = *eax & mask;
2409 	}
2410 
2411 	if (vie->inout_req_state != VR_DONE) {
2412 		err = vm_ioport_access(vm, vcpuid, in, vie->inout.port,
2413 		    vie->inout.bytes, &val);
2414 		val &= mask;
2415 	} else {
2416 		/*
2417 		 * This port access was handled in userspace and the result was
2418 		 * injected in to be handled now.
2419 		 */
2420 		val = vie->inout_req_val & mask;
2421 		vie->inout_req_state = VR_NONE;
2422 		err = 0;
2423 	}
2424 
2425 	if (err == ESRCH) {
2426 		vie->status |= VIES_PENDING_INOUT;
2427 		vie->inout_req_state = VR_PENDING;
2428 		return (err);
2429 	} else if (err != 0) {
2430 		return (err);
2431 	}
2432 
2433 	if (in) {
2434 		*eax = (*eax & ~mask) | val;
2435 	}
2436 	return (0);
2437 }
2438 
2439 static enum vm_reg_name
vie_inout_segname(const struct vie * vie)2440 vie_inout_segname(const struct vie *vie)
2441 {
2442 	uint8_t segidx = vie->inout.segment;
2443 	const enum vm_reg_name segmap[] = {
2444 		VM_REG_GUEST_ES,
2445 		VM_REG_GUEST_CS,
2446 		VM_REG_GUEST_SS,
2447 		VM_REG_GUEST_DS,
2448 		VM_REG_GUEST_FS,
2449 		VM_REG_GUEST_GS,
2450 	};
2451 	const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0]));
2452 
2453 	if (segidx >= maxidx) {
2454 		panic("unexpected segment index %u", segidx);
2455 	}
2456 	return (segmap[segidx]);
2457 }
2458 
2459 static int
vie_emulate_inout_str(struct vie * vie,struct vm * vm,int vcpuid)2460 vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid)
2461 {
2462 	uint8_t bytes, addrsize;
2463 	uint64_t index, count = 0, gla, rflags;
2464 	int prot, err, fault;
2465 	bool in, repeat;
2466 	enum vm_reg_name seg_reg, idx_reg;
2467 	struct vm_copyinfo copyinfo[2];
2468 
2469 	in = (vie->inout.flags & INOUT_IN) != 0;
2470 	bytes = vie->inout.bytes;
2471 	addrsize = vie->inout.addrsize;
2472 	prot = in ? PROT_WRITE : PROT_READ;
2473 
2474 	ASSERT(bytes == 1 || bytes == 2 || bytes == 4);
2475 	ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8);
2476 
2477 	idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
2478 	seg_reg = vie_inout_segname(vie);
2479 	err = vm_get_register(vm, vcpuid, idx_reg, &index);
2480 	ASSERT(err == 0);
2481 	index = index & vie_size2mask(addrsize);
2482 
2483 	repeat = (vie->inout.flags & INOUT_REP) != 0;
2484 
2485 	/* Count register */
2486 	if (repeat) {
2487 		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count);
2488 		count &= vie_size2mask(addrsize);
2489 
2490 		if (count == 0) {
2491 			/*
2492 			 * If we were asked to emulate a REP INS/OUTS when the
2493 			 * count register is zero, no further work is required.
2494 			 */
2495 			return (0);
2496 		}
2497 	} else {
2498 		count = 1;
2499 	}
2500 
2501 	gla = 0;
2502 	if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg,
2503 	    idx_reg, &gla) != 0) {
2504 		/* vie_get_gla() already injected the appropriate fault */
2505 		return (0);
2506 	}
2507 
2508 	/*
2509 	 * The INS/OUTS emulate currently assumes that the memory target resides
2510 	 * within the guest system memory, rather than a device MMIO region.  If
2511 	 * such a case becomes a necessity, that additional handling could be
2512 	 * put in place.
2513 	 */
2514 	err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot,
2515 	    copyinfo, nitems(copyinfo), &fault);
2516 
2517 	if (err) {
2518 		/* Unrecoverable error */
2519 		return (err);
2520 	} else if (fault) {
2521 		/* Resume guest to handle fault */
2522 		return (0);
2523 	}
2524 
2525 	if (!in) {
2526 		vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes);
2527 	}
2528 
2529 	err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);
2530 
2531 	if (err == 0 && in) {
2532 		vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes);
2533 	}
2534 
2535 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2536 
2537 	if (err == 0) {
2538 		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
2539 		    &rflags);
2540 		ASSERT(err == 0);
2541 
2542 		/* Update index */
2543 		if (rflags & PSL_D) {
2544 			index -= bytes;
2545 		} else {
2546 			index += bytes;
2547 		}
2548 
2549 		/* Update index register */
2550 		err = vie_update_register(vm, vcpuid, idx_reg, index, addrsize);
2551 		ASSERT(err == 0);
2552 
2553 		/*
2554 		 * Update count register only if the instruction had a repeat
2555 		 * prefix.
2556 		 */
2557 		if ((vie->inout.flags & INOUT_REP) != 0) {
2558 			count--;
2559 			err = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
2560 			    count, addrsize);
2561 			ASSERT(err == 0);
2562 
2563 			if (count != 0) {
2564 				return (vie_repeat(vie));
2565 			}
2566 		}
2567 	}
2568 
2569 	return (err);
2570 }
2571 
2572 int
vie_emulate_inout(struct vie * vie,struct vm * vm,int vcpuid)2573 vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid)
2574 {
2575 	int err = 0;
2576 
2577 	if ((vie->status & VIES_INOUT) == 0) {
2578 		return (EINVAL);
2579 	}
2580 
2581 	if ((vie->inout.flags & INOUT_STR) == 0) {
2582 		/*
2583 		 * For now, using the 'rep' prefixes with plain (non-string)
2584 		 * in/out is not supported.
2585 		 */
2586 		if ((vie->inout.flags & INOUT_REP) != 0) {
2587 			return (EINVAL);
2588 		}
2589 
2590 		err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);
2591 		if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) {
2592 			/*
2593 			 * With the inX access now a success, the result needs
2594 			 * to be stored in the guest %rax.
2595 			 */
2596 			err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
2597 			    vie->inout.eax);
2598 			VERIFY0(err);
2599 		}
2600 	} else {
2601 		vie->status &= ~VIES_REPEAT;
2602 		err = vie_emulate_inout_str(vie, vm, vcpuid);
2603 
2604 	}
2605 	if (err < 0) {
2606 		/*
2607 		 * Access to an I/O port failed in such a way that fallback to
2608 		 * handling in userspace is required.
2609 		 */
2610 		vie->status |= VIES_USER_FALLBACK;
2611 	} else if (err == ESRCH) {
2612 		ASSERT(vie->status & VIES_PENDING_INOUT);
2613 		/* Return to userspace with the in/out request */
2614 		err = -1;
2615 	}
2616 
2617 	return (err);
2618 }
2619 
2620 int
vie_emulate_other(struct vie * vie,struct vm * vm,int vcpuid)2621 vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid)
2622 {
2623 	int error;
2624 
2625 	if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) !=
2626 	    (VIES_INST_DECODE | VIES_OTHER)) {
2627 		return (EINVAL);
2628 	}
2629 
2630 	switch (vie->op.op_type) {
2631 	case VIE_OP_TYPE_CLTS:
2632 		error = vie_emulate_clts(vie, vm, vcpuid);
2633 		break;
2634 	case VIE_OP_TYPE_MOV_CR:
2635 		error = vie_emulate_mov_cr(vie, vm, vcpuid);
2636 		break;
2637 	default:
2638 		error = EINVAL;
2639 		break;
2640 	}
2641 
2642 	return (error);
2643 }
2644 
2645 void
vie_reset(struct vie * vie)2646 vie_reset(struct vie *vie)
2647 {
2648 	vie->status = 0;
2649 	vie->num_processed = vie->num_valid = 0;
2650 }
2651 
2652 void
vie_advance_pc(struct vie * vie,uint64_t * nextrip)2653 vie_advance_pc(struct vie *vie, uint64_t *nextrip)
2654 {
2655 	VERIFY((vie->status & VIES_REPEAT) == 0);
2656 
2657 	*nextrip += vie->num_processed;
2658 	vie_reset(vie);
2659 }
2660 
2661 void
vie_exitinfo(const struct vie * vie,struct vm_exit * vme)2662 vie_exitinfo(const struct vie *vie, struct vm_exit *vme)
2663 {
2664 	if (vie->status & VIES_USER_FALLBACK) {
2665 		/*
2666 		 * Despite the fact that the instruction was successfully
2667 		 * decoded, some aspect of the emulation failed in such a way
2668 		 * that it is left up to userspace to complete the operation.
2669 		 */
2670 		vie_fallback_exitinfo(vie, vme);
2671 	} else if (vie->status & VIES_MMIO) {
2672 		vme->exitcode = VM_EXITCODE_MMIO;
2673 		if (vie->mmio_req_read.state == VR_PENDING) {
2674 			vme->u.mmio.gpa = vie->mmio_req_read.gpa;
2675 			vme->u.mmio.data = 0;
2676 			vme->u.mmio.bytes = vie->mmio_req_read.bytes;
2677 			vme->u.mmio.read = 1;
2678 		} else if (vie->mmio_req_write.state == VR_PENDING) {
2679 			vme->u.mmio.gpa = vie->mmio_req_write.gpa;
2680 			vme->u.mmio.data = vie->mmio_req_write.data &
2681 			    vie_size2mask(vie->mmio_req_write.bytes);
2682 			vme->u.mmio.bytes = vie->mmio_req_write.bytes;
2683 			vme->u.mmio.read = 0;
2684 		} else {
2685 			panic("bad pending MMIO state");
2686 		}
2687 	} else if (vie->status & VIES_INOUT) {
2688 		vme->exitcode = VM_EXITCODE_INOUT;
2689 		vme->u.inout.port = vie->inout.port;
2690 		vme->u.inout.bytes = vie->inout.bytes;
2691 		if ((vie->inout.flags & INOUT_IN) != 0) {
2692 			vme->u.inout.flags = INOUT_IN;
2693 			vme->u.inout.eax = 0;
2694 		} else {
2695 			vme->u.inout.flags = 0;
2696 			vme->u.inout.eax = vie->inout.eax &
2697 			    vie_size2mask(vie->inout.bytes);
2698 		}
2699 	} else {
2700 		panic("no pending operation");
2701 	}
2702 }
2703 
2704 /*
2705  * In the case of a decoding or verification failure, bailing out to userspace
2706  * to do the instruction emulation is our only option for now.
2707  */
2708 void
vie_fallback_exitinfo(const struct vie * vie,struct vm_exit * vme)2709 vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme)
2710 {
2711 	if ((vie->status & VIES_INST_FETCH) == 0) {
2712 		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
2713 	} else {
2714 		ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst));
2715 
2716 		bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst));
2717 		vme->u.inst_emul.num_valid = vie->num_valid;
2718 	}
2719 	vme->exitcode = VM_EXITCODE_INST_EMUL;
2720 }
2721 
2722 void
vie_cs_info(const struct vie * vie,struct vm * vm,int vcpuid,uint64_t * cs_base,int * cs_d)2723 vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base,
2724     int *cs_d)
2725 {
2726 	struct seg_desc cs_desc;
2727 	int error __maybe_unused;
2728 
2729 	error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc);
2730 	ASSERT(error == 0);
2731 
2732 	/* Initialization required for the paging info to be populated */
2733 	VERIFY(vie->status & VIES_INIT);
2734 	switch (vie->paging.cpu_mode) {
2735 	case CPU_MODE_REAL:
2736 		*cs_base = cs_desc.base;
2737 		*cs_d = 0;
2738 		break;
2739 	case CPU_MODE_PROTECTED:
2740 	case CPU_MODE_COMPATIBILITY:
2741 		*cs_base = cs_desc.base;
2742 		*cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0;
2743 		break;
2744 	default:
2745 		*cs_base = 0;
2746 		*cs_d = 0;
2747 		break;
2748 	}
2749 }
2750 
2751 bool
vie_pending(const struct vie * vie)2752 vie_pending(const struct vie *vie)
2753 {
2754 	/*
2755 	 * These VIE status bits indicate conditions which must be addressed
2756 	 * through either device IO fulfillment (with corresponding
2757 	 * vie_fulfill_*()) or complete userspace emulation (followed by a
2758 	 * vie_reset()).
2759 	 */
2760 	const enum vie_status of_interest =
2761 	    VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK;
2762 
2763 	return ((vie->status & of_interest) != 0);
2764 }
2765 
2766 bool
vie_needs_fetch(const struct vie * vie)2767 vie_needs_fetch(const struct vie *vie)
2768 {
2769 	if (vie->status & VIES_INST_FETCH) {
2770 		ASSERT(vie->num_valid != 0);
2771 		return (false);
2772 	}
2773 	return (true);
2774 }
2775 
2776 static int
vie_alignment_check(int cpl,int size,uint64_t cr0,uint64_t rf,uint64_t gla)2777 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
2778 {
2779 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
2780 	    ("%s: invalid size %d", __func__, size));
2781 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
2782 
2783 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
2784 		return (0);
2785 
2786 	return ((gla & (size - 1)) ? 1 : 0);
2787 }
2788 
2789 static int
vie_canonical_check(enum vm_cpu_mode cpu_mode,uint64_t gla)2790 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
2791 {
2792 	uint64_t mask;
2793 
2794 	if (cpu_mode != CPU_MODE_64BIT)
2795 		return (0);
2796 
2797 	/*
2798 	 * The value of the bit 47 in the 'gla' should be replicated in the
2799 	 * most significant 16 bits.
2800 	 */
2801 	mask = ~((1UL << 48) - 1);
2802 	if (gla & (1UL << 47))
2803 		return ((gla & mask) != mask);
2804 	else
2805 		return ((gla & mask) != 0);
2806 }
2807 
2808 static uint64_t
vie_size2mask(int size)2809 vie_size2mask(int size)
2810 {
2811 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
2812 	    ("vie_size2mask: invalid size %d", size));
2813 	return (size2mask[size]);
2814 }
2815 
2816 static int
vie_calculate_gla(enum vm_cpu_mode cpu_mode,enum vm_reg_name seg,struct seg_desc * desc,uint64_t offset,int length,int addrsize,int prot,uint64_t * gla)2817 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
2818     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
2819     int prot, uint64_t *gla)
2820 {
2821 	uint64_t firstoff, low_limit, high_limit, segbase;
2822 	int glasize, type;
2823 
2824 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
2825 	    ("%s: invalid segment %d", __func__, seg));
2826 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
2827 	    ("%s: invalid operand size %d", __func__, length));
2828 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
2829 	    ("%s: invalid prot %x", __func__, prot));
2830 
2831 	firstoff = offset;
2832 	if (cpu_mode == CPU_MODE_64BIT) {
2833 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
2834 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
2835 		glasize = 8;
2836 	} else {
2837 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
2838 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
2839 		glasize = 4;
2840 		/*
2841 		 * If the segment selector is loaded with a NULL selector
2842 		 * then the descriptor is unusable and attempting to use
2843 		 * it results in a #GP(0).
2844 		 */
2845 		if (SEG_DESC_UNUSABLE(desc->access))
2846 			return (-1);
2847 
2848 		/*
2849 		 * The processor generates a #NP exception when a segment
2850 		 * register is loaded with a selector that points to a
2851 		 * descriptor that is not present. If this was the case then
2852 		 * it would have been checked before the VM-exit.
2853 		 */
2854 		KASSERT(SEG_DESC_PRESENT(desc->access),
2855 		    ("segment %d not present: %x", seg, desc->access));
2856 
2857 		/*
2858 		 * The descriptor type must indicate a code/data segment.
2859 		 */
2860 		type = SEG_DESC_TYPE(desc->access);
2861 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
2862 		    "descriptor type %x", seg, type));
2863 
2864 		if (prot & PROT_READ) {
2865 			/* #GP on a read access to a exec-only code segment */
2866 			if ((type & 0xA) == 0x8)
2867 				return (-1);
2868 		}
2869 
2870 		if (prot & PROT_WRITE) {
2871 			/*
2872 			 * #GP on a write access to a code segment or a
2873 			 * read-only data segment.
2874 			 */
2875 			if (type & 0x8)			/* code segment */
2876 				return (-1);
2877 
2878 			if ((type & 0xA) == 0)		/* read-only data seg */
2879 				return (-1);
2880 		}
2881 
2882 		/*
2883 		 * 'desc->limit' is fully expanded taking granularity into
2884 		 * account.
2885 		 */
2886 		if ((type & 0xC) == 0x4) {
2887 			/* expand-down data segment */
2888 			low_limit = desc->limit + 1;
2889 			high_limit = SEG_DESC_DEF32(desc->access) ?
2890 			    0xffffffff : 0xffff;
2891 		} else {
2892 			/* code segment or expand-up data segment */
2893 			low_limit = 0;
2894 			high_limit = desc->limit;
2895 		}
2896 
2897 		while (length > 0) {
2898 			offset &= vie_size2mask(addrsize);
2899 			if (offset < low_limit || offset > high_limit)
2900 				return (-1);
2901 			offset++;
2902 			length--;
2903 		}
2904 	}
2905 
2906 	/*
2907 	 * In 64-bit mode all segments except %fs and %gs have a segment
2908 	 * base address of 0.
2909 	 */
2910 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2911 	    seg != VM_REG_GUEST_GS) {
2912 		segbase = 0;
2913 	} else {
2914 		segbase = desc->base;
2915 	}
2916 
2917 	/*
2918 	 * Truncate 'firstoff' to the effective address size before adding
2919 	 * it to the segment base.
2920 	 */
2921 	firstoff &= vie_size2mask(addrsize);
2922 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
2923 	return (0);
2924 }
2925 
2926 void
vie_init_mmio(struct vie * vie,const char * inst_bytes,uint8_t inst_length,const struct vm_guest_paging * paging,uint64_t gpa)2927 vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length,
2928     const struct vm_guest_paging *paging, uint64_t gpa)
2929 {
2930 	KASSERT(inst_length <= VIE_INST_SIZE,
2931 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
2932 
2933 	bzero(vie, sizeof (struct vie));
2934 
2935 	vie->base_register = VM_REG_LAST;
2936 	vie->index_register = VM_REG_LAST;
2937 	vie->segment_register = VM_REG_LAST;
2938 	vie->status = VIES_INIT | VIES_MMIO;
2939 
2940 	if (inst_length != 0) {
2941 		bcopy(inst_bytes, vie->inst, inst_length);
2942 		vie->num_valid = inst_length;
2943 		vie->status |= VIES_INST_FETCH;
2944 	}
2945 
2946 	vie->paging = *paging;
2947 	vie->mmio_gpa = gpa;
2948 }
2949 
2950 void
vie_init_inout(struct vie * vie,const struct vm_inout * inout,uint8_t inst_len,const struct vm_guest_paging * paging)2951 vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len,
2952     const struct vm_guest_paging *paging)
2953 {
2954 	bzero(vie, sizeof (struct vie));
2955 
2956 	vie->status = VIES_INIT | VIES_INOUT;
2957 
2958 	vie->inout = *inout;
2959 	vie->paging = *paging;
2960 
2961 	/*
2962 	 * Since VMX/SVM assists already decoded the nature of the in/out
2963 	 * instruction, let the status reflect that.
2964 	 */
2965 	vie->status |= VIES_INST_FETCH | VIES_INST_DECODE;
2966 	vie->num_processed = inst_len;
2967 }
2968 
2969 void
vie_init_other(struct vie * vie,const struct vm_guest_paging * paging)2970 vie_init_other(struct vie *vie, const struct vm_guest_paging *paging)
2971 {
2972 	bzero(vie, sizeof (struct vie));
2973 
2974 	vie->base_register = VM_REG_LAST;
2975 	vie->index_register = VM_REG_LAST;
2976 	vie->segment_register = VM_REG_LAST;
2977 	vie->status = VIES_INIT | VIES_OTHER;
2978 
2979 	vie->paging = *paging;
2980 }
2981 
2982 int
vie_fulfill_mmio(struct vie * vie,const struct vm_mmio * result)2983 vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result)
2984 {
2985 	struct vie_mmio *pending;
2986 
2987 	if ((vie->status & VIES_MMIO) == 0 ||
2988 	    (vie->status & VIES_PENDING_MMIO) == 0) {
2989 		return (EINVAL);
2990 	}
2991 
2992 	if (result->read) {
2993 		pending = &vie->mmio_req_read;
2994 	} else {
2995 		pending = &vie->mmio_req_write;
2996 	}
2997 
2998 	if (pending->state != VR_PENDING ||
2999 	    pending->bytes != result->bytes || pending->gpa != result->gpa) {
3000 		return (EINVAL);
3001 	}
3002 
3003 	if (result->read) {
3004 		pending->data = result->data & vie_size2mask(pending->bytes);
3005 	}
3006 	pending->state = VR_DONE;
3007 	vie->status &= ~VIES_PENDING_MMIO;
3008 
3009 	return (0);
3010 }
3011 
3012 int
vie_fulfill_inout(struct vie * vie,const struct vm_inout * result)3013 vie_fulfill_inout(struct vie *vie, const struct vm_inout *result)
3014 {
3015 	if ((vie->status & VIES_INOUT) == 0 ||
3016 	    (vie->status & VIES_PENDING_INOUT) == 0) {
3017 		return (EINVAL);
3018 	}
3019 	if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) ||
3020 	    vie->inout.bytes != result->bytes ||
3021 	    vie->inout.port != result->port) {
3022 		return (EINVAL);
3023 	}
3024 
3025 	if (result->flags & INOUT_IN) {
3026 		vie->inout_req_val = result->eax &
3027 		    vie_size2mask(vie->inout.bytes);
3028 	}
3029 	vie->inout_req_state = VR_DONE;
3030 	vie->status &= ~(VIES_PENDING_INOUT);
3031 
3032 	return (0);
3033 }
3034 
3035 uint64_t
vie_mmio_gpa(const struct vie * vie)3036 vie_mmio_gpa(const struct vie *vie)
3037 {
3038 	return (vie->mmio_gpa);
3039 }
3040 
3041 static int
pf_error_code(int usermode,int prot,int rsvd,uint64_t pte)3042 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
3043 {
3044 	int error_code = 0;
3045 
3046 	if (pte & PG_V)
3047 		error_code |= PGEX_P;
3048 	if (prot & PROT_WRITE)
3049 		error_code |= PGEX_W;
3050 	if (usermode)
3051 		error_code |= PGEX_U;
3052 	if (rsvd)
3053 		error_code |= PGEX_RSV;
3054 	if (prot & PROT_EXEC)
3055 		error_code |= PGEX_I;
3056 
3057 	return (error_code);
3058 }
3059 
3060 static void
ptp_release(vm_page_t ** vmp)3061 ptp_release(vm_page_t **vmp)
3062 {
3063 	if (*vmp != NULL) {
3064 		(void) vmp_release(*vmp);
3065 		*vmp = NULL;
3066 	}
3067 }
3068 
3069 static void *
ptp_hold(struct vm * vm,int vcpu,uintptr_t gpa,size_t len,vm_page_t ** vmp)3070 ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp)
3071 {
3072 	vm_client_t *vmc = vm_get_vmclient(vm, vcpu);
3073 	const uintptr_t hold_gpa = gpa & PAGEMASK;
3074 
3075 	/* Hold must not cross a page boundary */
3076 	VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE);
3077 
3078 	if (*vmp != NULL) {
3079 		(void) vmp_release(*vmp);
3080 	}
3081 
3082 	*vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE);
3083 	if (*vmp == NULL) {
3084 		return (NULL);
3085 	}
3086 
3087 	return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa));
3088 }
3089 
3090 static int
_vm_gla2gpa(struct vm * vm,int vcpuid,struct vm_guest_paging * paging,uint64_t gla,int prot,uint64_t * gpa,int * guest_fault,bool check_only)3091 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3092     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
3093 {
3094 	int nlevels, pfcode;
3095 	int ptpshift = 0, ptpindex = 0;
3096 	uint64_t ptpphys;
3097 	uint64_t *ptpbase = NULL, pte = 0, pgsize = 0;
3098 	vm_page_t *cookie = NULL;
3099 	const bool usermode = paging->cpl == 3;
3100 	const bool writable = (prot & PROT_WRITE) != 0;
3101 
3102 	*guest_fault = 0;
3103 restart:
3104 	ptpphys = paging->cr3;		/* root of the page tables */
3105 	ptp_release(&cookie);
3106 
3107 	if (vie_canonical_check(paging->cpu_mode, gla)) {
3108 		/*
3109 		 * XXX assuming a non-stack reference otherwise a stack fault
3110 		 * should be generated.
3111 		 */
3112 		if (!check_only)
3113 			vm_inject_gp(vm, vcpuid);
3114 		*guest_fault = 1;
3115 		return (0);
3116 	}
3117 
3118 	if (paging->paging_mode == PAGING_MODE_FLAT) {
3119 		*gpa = gla;
3120 		return (0);
3121 	}
3122 
3123 	if (paging->paging_mode == PAGING_MODE_32) {
3124 		uint32_t *ptpbase32, pte32;
3125 
3126 		nlevels = 2;
3127 		while (--nlevels >= 0) {
3128 			/* Zero out the lower 12 bits. */
3129 			ptpphys &= ~0xfff;
3130 
3131 			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
3132 			    &cookie);
3133 
3134 			if (ptpbase32 == NULL) {
3135 				return (EFAULT);
3136 			}
3137 
3138 			ptpshift = PAGE_SHIFT + nlevels * 10;
3139 			ptpindex = (gla >> ptpshift) & 0x3FF;
3140 			pgsize = 1UL << ptpshift;
3141 
3142 			pte32 = ptpbase32[ptpindex];
3143 
3144 			if ((pte32 & PG_V) == 0 ||
3145 			    (usermode && (pte32 & PG_U) == 0) ||
3146 			    (writable && (pte32 & PG_RW) == 0)) {
3147 				if (!check_only) {
3148 					pfcode = pf_error_code(usermode, prot,
3149 					    0, pte32);
3150 					vm_inject_pf(vm, vcpuid, pfcode, gla);
3151 				}
3152 
3153 				ptp_release(&cookie);
3154 				*guest_fault = 1;
3155 				return (0);
3156 			}
3157 
3158 			/*
3159 			 * Emulate the x86 MMU's management of the accessed
3160 			 * and dirty flags. While the accessed flag is set
3161 			 * at every level of the page table, the dirty flag
3162 			 * is only set at the last level providing the guest
3163 			 * physical address.
3164 			 */
3165 			if (!check_only && (pte32 & PG_A) == 0) {
3166 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
3167 				    pte32, pte32 | PG_A) == 0) {
3168 					goto restart;
3169 				}
3170 			}
3171 
3172 			/* XXX must be ignored if CR4.PSE=0 */
3173 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
3174 				break;
3175 
3176 			ptpphys = pte32;
3177 		}
3178 
3179 		/* Set the dirty bit in the page table entry if necessary */
3180 		if (!check_only && writable && (pte32 & PG_M) == 0) {
3181 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
3182 			    pte32, pte32 | PG_M) == 0) {
3183 				goto restart;
3184 			}
3185 		}
3186 
3187 		/* Zero out the lower 'ptpshift' bits */
3188 		pte32 >>= ptpshift; pte32 <<= ptpshift;
3189 		*gpa = pte32 | (gla & (pgsize - 1));
3190 		ptp_release(&cookie);
3191 		return (0);
3192 	}
3193 
3194 	if (paging->paging_mode == PAGING_MODE_PAE) {
3195 		/* Zero out the lower 5 bits and the upper 32 bits */
3196 		ptpphys &= 0xffffffe0UL;
3197 
3198 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4,
3199 		    &cookie);
3200 		if (ptpbase == NULL) {
3201 			return (EFAULT);
3202 		}
3203 
3204 		ptpindex = (gla >> 30) & 0x3;
3205 
3206 		pte = ptpbase[ptpindex];
3207 
3208 		if ((pte & PG_V) == 0) {
3209 			if (!check_only) {
3210 				pfcode = pf_error_code(usermode, prot, 0, pte);
3211 				vm_inject_pf(vm, vcpuid, pfcode, gla);
3212 			}
3213 
3214 			ptp_release(&cookie);
3215 			*guest_fault = 1;
3216 			return (0);
3217 		}
3218 
3219 		ptpphys = pte;
3220 
3221 		nlevels = 2;
3222 	} else {
3223 		nlevels = 4;
3224 	}
3225 
3226 	while (--nlevels >= 0) {
3227 		/* Zero out the lower 12 bits and the upper 12 bits */
3228 		ptpphys &= 0x000ffffffffff000UL;
3229 
3230 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
3231 		if (ptpbase == NULL) {
3232 			return (EFAULT);
3233 		}
3234 
3235 		ptpshift = PAGE_SHIFT + nlevels * 9;
3236 		ptpindex = (gla >> ptpshift) & 0x1FF;
3237 		pgsize = 1UL << ptpshift;
3238 
3239 		pte = ptpbase[ptpindex];
3240 
3241 		if ((pte & PG_V) == 0 ||
3242 		    (usermode && (pte & PG_U) == 0) ||
3243 		    (writable && (pte & PG_RW) == 0)) {
3244 			if (!check_only) {
3245 				pfcode = pf_error_code(usermode, prot, 0, pte);
3246 				vm_inject_pf(vm, vcpuid, pfcode, gla);
3247 			}
3248 
3249 			ptp_release(&cookie);
3250 			*guest_fault = 1;
3251 			return (0);
3252 		}
3253 
3254 		/* Set the accessed bit in the page table entry */
3255 		if (!check_only && (pte & PG_A) == 0) {
3256 			if (atomic_cmpset_64(&ptpbase[ptpindex],
3257 			    pte, pte | PG_A) == 0) {
3258 				goto restart;
3259 			}
3260 		}
3261 
3262 		if (nlevels > 0 && (pte & PG_PS) != 0) {
3263 			if (pgsize > 1 * GB) {
3264 				if (!check_only) {
3265 					pfcode = pf_error_code(usermode, prot,
3266 					    1, pte);
3267 					vm_inject_pf(vm, vcpuid, pfcode, gla);
3268 				}
3269 
3270 				ptp_release(&cookie);
3271 				*guest_fault = 1;
3272 				return (0);
3273 			}
3274 			break;
3275 		}
3276 
3277 		ptpphys = pte;
3278 	}
3279 
3280 	/* Set the dirty bit in the page table entry if necessary */
3281 	if (!check_only && writable && (pte & PG_M) == 0) {
3282 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
3283 			goto restart;
3284 	}
3285 	ptp_release(&cookie);
3286 
3287 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
3288 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
3289 	*gpa = pte | (gla & (pgsize - 1));
3290 	return (0);
3291 }
3292 
/*
 * Translate the guest linear address 'gla' by handing every argument to
 * _vm_gla2gpa() with its trailing boolean argument set to false.  The value
 * returned by _vm_gla2gpa() is passed back to the caller unchanged.
 */
int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	/* NOTE(review): presumably the callee's 'check_only' flag - confirm. */
	const bool check_only = false;

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    check_only));
}
3301 
/*
 * Same translation as vm_gla2gpa(), except that _vm_gla2gpa() is invoked
 * with its trailing boolean argument set to true.  The value returned by
 * _vm_gla2gpa() is passed back to the caller unchanged.
 */
int
vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	/* NOTE(review): presumably the callee's 'check_only' flag - confirm. */
	const bool check_only = true;

	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    check_only));
}
3310 
3311 int
vie_fetch_instruction(struct vie * vie,struct vm * vm,int vcpuid,uint64_t rip,int * faultptr)3312 vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip,
3313     int *faultptr)
3314 {
3315 	struct vm_copyinfo copyinfo[2];
3316 	int error, prot;
3317 
3318 	if ((vie->status & VIES_INIT) == 0) {
3319 		return (EINVAL);
3320 	}
3321 
3322 	prot = PROT_READ | PROT_EXEC;
3323 	error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE,
3324 	    prot, copyinfo, nitems(copyinfo), faultptr);
3325 	if (error || *faultptr)
3326 		return (error);
3327 
3328 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE);
3329 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
3330 	vie->num_valid = VIE_INST_SIZE;
3331 	vie->status |= VIES_INST_FETCH;
3332 	return (0);
3333 }
3334 
3335 static int
vie_peek(struct vie * vie,uint8_t * x)3336 vie_peek(struct vie *vie, uint8_t *x)
3337 {
3338 
3339 	if (vie->num_processed < vie->num_valid) {
3340 		*x = vie->inst[vie->num_processed];
3341 		return (0);
3342 	} else
3343 		return (-1);
3344 }
3345 
3346 static void
vie_advance(struct vie * vie)3347 vie_advance(struct vie *vie)
3348 {
3349 
3350 	vie->num_processed++;
3351 }
3352 
3353 static bool
segment_override(uint8_t x,int * seg)3354 segment_override(uint8_t x, int *seg)
3355 {
3356 
3357 	switch (x) {
3358 	case 0x2E:
3359 		*seg = VM_REG_GUEST_CS;
3360 		break;
3361 	case 0x36:
3362 		*seg = VM_REG_GUEST_SS;
3363 		break;
3364 	case 0x3E:
3365 		*seg = VM_REG_GUEST_DS;
3366 		break;
3367 	case 0x26:
3368 		*seg = VM_REG_GUEST_ES;
3369 		break;
3370 	case 0x64:
3371 		*seg = VM_REG_GUEST_FS;
3372 		break;
3373 	case 0x65:
3374 		*seg = VM_REG_GUEST_GS;
3375 		break;
3376 	default:
3377 		return (false);
3378 	}
3379 	return (true);
3380 }
3381 
3382 static int
decode_prefixes(struct vie * vie,enum vm_cpu_mode cpu_mode,int cs_d)3383 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
3384 {
3385 	uint8_t x;
3386 
3387 	while (1) {
3388 		if (vie_peek(vie, &x))
3389 			return (-1);
3390 
3391 		if (x == 0x66)
3392 			vie->opsize_override = 1;
3393 		else if (x == 0x67)
3394 			vie->addrsize_override = 1;
3395 		else if (x == 0xF3)
3396 			vie->repz_present = 1;
3397 		else if (x == 0xF2)
3398 			vie->repnz_present = 1;
3399 		else if (segment_override(x, &vie->segment_register))
3400 			vie->segment_override = 1;
3401 		else
3402 			break;
3403 
3404 		vie_advance(vie);
3405 	}
3406 
3407 	/*
3408 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
3409 	 * - Only one REX prefix is allowed per instruction.
3410 	 * - The REX prefix must immediately precede the opcode byte or the
3411 	 *   escape opcode byte.
3412 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
3413 	 *   the mandatory prefix must come before the REX prefix.
3414 	 */
3415 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
3416 		vie->rex_present = 1;
3417 		vie->rex_w = x & 0x8 ? 1 : 0;
3418 		vie->rex_r = x & 0x4 ? 1 : 0;
3419 		vie->rex_x = x & 0x2 ? 1 : 0;
3420 		vie->rex_b = x & 0x1 ? 1 : 0;
3421 		vie_advance(vie);
3422 	}
3423 
3424 	/*
3425 	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
3426 	 */
3427 	if ((cpu_mode == CPU_MODE_64BIT ||
3428 	    cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) {
3429 		const struct vie_op *optab;
3430 
3431 		/* 3-byte VEX prefix. */
3432 		vie->vex_present = 1;
3433 
3434 		vie_advance(vie);
3435 		if (vie_peek(vie, &x))
3436 			return (-1);
3437 
3438 		/*
3439 		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
3440 		 * relative to REX encoding.
3441 		 */
3442 		vie->rex_r = x & 0x80 ? 0 : 1;
3443 		vie->rex_x = x & 0x40 ? 0 : 1;
3444 		vie->rex_b = x & 0x20 ? 0 : 1;
3445 
3446 		switch (x & 0x1F) {
3447 		case 0x2:
3448 			/* 0F 38. */
3449 			optab = three_byte_opcodes_0f38;
3450 			break;
3451 		case 0x1:
3452 			/* 0F class - nothing handled here yet. */
3453 			/* FALLTHROUGH */
3454 		case 0x3:
3455 			/* 0F 3A class - nothing handled here yet. */
3456 			/* FALLTHROUGH */
3457 		default:
3458 			/* Reserved (#UD). */
3459 			return (-1);
3460 		}
3461 
3462 		vie_advance(vie);
3463 		if (vie_peek(vie, &x))
3464 			return (-1);
3465 
3466 		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
3467 		vie->rex_w = x & 0x80 ? 1 : 0;
3468 
3469 		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
3470 		vie->vex_l = !!(x & 0x4);
3471 		vie->vex_pp = (x & 0x3);
3472 
3473 		/* PP: 1=66 2=F3 3=F2 prefixes. */
3474 		switch (vie->vex_pp) {
3475 		case 0x1:
3476 			vie->opsize_override = 1;
3477 			break;
3478 		case 0x2:
3479 			vie->repz_present = 1;
3480 			break;
3481 		case 0x3:
3482 			vie->repnz_present = 1;
3483 			break;
3484 		}
3485 
3486 		vie_advance(vie);
3487 
3488 		/* Opcode, sans literal prefix prefix. */
3489 		if (vie_peek(vie, &x))
3490 			return (-1);
3491 
3492 		vie->op = optab[x];
3493 		if (vie->op.op_type == VIE_OP_TYPE_NONE)
3494 			return (-1);
3495 
3496 		vie_advance(vie);
3497 	}
3498 
3499 	/*
3500 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
3501 	 */
3502 	if (cpu_mode == CPU_MODE_64BIT) {
3503 		/*
3504 		 * Default address size is 64-bits and default operand size
3505 		 * is 32-bits.
3506 		 */
3507 		vie->addrsize = vie->addrsize_override ? 4 : 8;
3508 		if (vie->rex_w)
3509 			vie->opsize = 8;
3510 		else if (vie->opsize_override)
3511 			vie->opsize = 2;
3512 		else
3513 			vie->opsize = 4;
3514 	} else if (cs_d) {
3515 		/* Default address and operand sizes are 32-bits */
3516 		vie->addrsize = vie->addrsize_override ? 2 : 4;
3517 		vie->opsize = vie->opsize_override ? 2 : 4;
3518 	} else {
3519 		/* Default address and operand sizes are 16-bits */
3520 		vie->addrsize = vie->addrsize_override ? 4 : 2;
3521 		vie->opsize = vie->opsize_override ? 4 : 2;
3522 	}
3523 	return (0);
3524 }
3525 
3526 static int
decode_two_byte_opcode(struct vie * vie)3527 decode_two_byte_opcode(struct vie *vie)
3528 {
3529 	uint8_t x;
3530 
3531 	if (vie_peek(vie, &x))
3532 		return (-1);
3533 
3534 	vie->op = two_byte_opcodes[x];
3535 
3536 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
3537 		return (-1);
3538 
3539 	vie_advance(vie);
3540 	return (0);
3541 }
3542 
3543 static int
decode_opcode(struct vie * vie)3544 decode_opcode(struct vie *vie)
3545 {
3546 	uint8_t x;
3547 
3548 	if (vie_peek(vie, &x))
3549 		return (-1);
3550 
3551 	/* Already did this via VEX prefix. */
3552 	if (vie->op.op_type != VIE_OP_TYPE_NONE)
3553 		return (0);
3554 
3555 	vie->op = one_byte_opcodes[x];
3556 
3557 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
3558 		return (-1);
3559 
3560 	vie_advance(vie);
3561 
3562 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
3563 		return (decode_two_byte_opcode(vie));
3564 
3565 	return (0);
3566 }
3567 
3568 static int
decode_modrm(struct vie * vie,enum vm_cpu_mode cpu_mode)3569 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
3570 {
3571 	uint8_t x;
3572 	/*
3573 	 * Handling mov-to/from-cr is special since it is not issuing
3574 	 * mmio/pio requests and can be done in real mode.  We must bypass some
3575 	 * of the other existing decoding restrictions for it.
3576 	 */
3577 	const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0);
3578 
3579 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
3580 		return (0);
3581 
3582 	if (cpu_mode == CPU_MODE_REAL && !is_movcr)
3583 		return (-1);
3584 
3585 	if (vie_peek(vie, &x))
3586 		return (-1);
3587 
3588 	vie->mod = (x >> 6) & 0x3;
3589 	vie->rm =  (x >> 0) & 0x7;
3590 	vie->reg = (x >> 3) & 0x7;
3591 
3592 	/*
3593 	 * A direct addressing mode makes no sense in the context of an EPT
3594 	 * fault. There has to be a memory access involved to cause the
3595 	 * EPT fault.
3596 	 */
3597 	if (vie->mod == VIE_MOD_DIRECT && !is_movcr)
3598 		return (-1);
3599 
3600 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
3601 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
3602 		/*
3603 		 * Table 2-5: Special Cases of REX Encodings
3604 		 *
3605 		 * mod=0, r/m=5 is used in the compatibility mode to
3606 		 * indicate a disp32 without a base register.
3607 		 *
3608 		 * mod!=3, r/m=4 is used in the compatibility mode to
3609 		 * indicate that the SIB byte is present.
3610 		 *
3611 		 * The 'b' bit in the REX prefix is don't care in
3612 		 * this case.
3613 		 */
3614 	} else {
3615 		vie->rm |= (vie->rex_b << 3);
3616 	}
3617 
3618 	vie->reg |= (vie->rex_r << 3);
3619 
3620 	/* SIB */
3621 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
3622 		goto done;
3623 
3624 	vie->base_register = gpr_map[vie->rm];
3625 
3626 	switch (vie->mod) {
3627 	case VIE_MOD_INDIRECT_DISP8:
3628 		vie->disp_bytes = 1;
3629 		break;
3630 	case VIE_MOD_INDIRECT_DISP32:
3631 		vie->disp_bytes = 4;
3632 		break;
3633 	case VIE_MOD_INDIRECT:
3634 		if (vie->rm == VIE_RM_DISP32) {
3635 			vie->disp_bytes = 4;
3636 			/*
3637 			 * Table 2-7. RIP-Relative Addressing
3638 			 *
3639 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
3640 			 * whereas in compatibility mode it just implies disp32.
3641 			 */
3642 
3643 			if (cpu_mode == CPU_MODE_64BIT)
3644 				vie->base_register = VM_REG_GUEST_RIP;
3645 			else
3646 				vie->base_register = VM_REG_LAST;
3647 		}
3648 		break;
3649 	}
3650 
3651 done:
3652 	vie_advance(vie);
3653 
3654 	return (0);
3655 }
3656 
3657 static int
decode_sib(struct vie * vie)3658 decode_sib(struct vie *vie)
3659 {
3660 	uint8_t x;
3661 
3662 	/* Proceed only if SIB byte is present */
3663 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
3664 		return (0);
3665 
3666 	if (vie_peek(vie, &x))
3667 		return (-1);
3668 
3669 	/* De-construct the SIB byte */
3670 	vie->ss = (x >> 6) & 0x3;
3671 	vie->index = (x >> 3) & 0x7;
3672 	vie->base = (x >> 0) & 0x7;
3673 
3674 	/* Apply the REX prefix modifiers */
3675 	vie->index |= vie->rex_x << 3;
3676 	vie->base |= vie->rex_b << 3;
3677 
3678 	switch (vie->mod) {
3679 	case VIE_MOD_INDIRECT_DISP8:
3680 		vie->disp_bytes = 1;
3681 		break;
3682 	case VIE_MOD_INDIRECT_DISP32:
3683 		vie->disp_bytes = 4;
3684 		break;
3685 	}
3686 
3687 	if (vie->mod == VIE_MOD_INDIRECT &&
3688 	    (vie->base == 5 || vie->base == 13)) {
3689 		/*
3690 		 * Special case when base register is unused if mod = 0
3691 		 * and base = %rbp or %r13.
3692 		 *
3693 		 * Documented in:
3694 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
3695 		 * Table 2-5: Special Cases of REX Encodings
3696 		 */
3697 		vie->disp_bytes = 4;
3698 	} else {
3699 		vie->base_register = gpr_map[vie->base];
3700 	}
3701 
3702 	/*
3703 	 * All encodings of 'index' are valid except for %rsp (4).
3704 	 *
3705 	 * Documented in:
3706 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
3707 	 * Table 2-5: Special Cases of REX Encodings
3708 	 */
3709 	if (vie->index != 4)
3710 		vie->index_register = gpr_map[vie->index];
3711 
3712 	/* 'scale' makes sense only in the context of an index register */
3713 	if (vie->index_register < VM_REG_LAST)
3714 		vie->scale = 1 << vie->ss;
3715 
3716 	vie_advance(vie);
3717 
3718 	return (0);
3719 }
3720 
3721 static int
decode_displacement(struct vie * vie)3722 decode_displacement(struct vie *vie)
3723 {
3724 	int n, i;
3725 	uint8_t x;
3726 
3727 	union {
3728 		char	buf[4];
3729 		int8_t	signed8;
3730 		int32_t	signed32;
3731 	} u;
3732 
3733 	if ((n = vie->disp_bytes) == 0)
3734 		return (0);
3735 
3736 	if (n != 1 && n != 4)
3737 		panic("decode_displacement: invalid disp_bytes %d", n);
3738 
3739 	for (i = 0; i < n; i++) {
3740 		if (vie_peek(vie, &x))
3741 			return (-1);
3742 
3743 		u.buf[i] = x;
3744 		vie_advance(vie);
3745 	}
3746 
3747 	if (n == 1)
3748 		vie->displacement = u.signed8;		/* sign-extended */
3749 	else
3750 		vie->displacement = u.signed32;		/* sign-extended */
3751 
3752 	return (0);
3753 }
3754 
3755 static int
decode_immediate(struct vie * vie)3756 decode_immediate(struct vie *vie)
3757 {
3758 	int i, n;
3759 	uint8_t x;
3760 	union {
3761 		char	buf[4];
3762 		int8_t	signed8;
3763 		int16_t	signed16;
3764 		int32_t	signed32;
3765 	} u;
3766 
3767 	/* Figure out immediate operand size (if any) */
3768 	if (vie->op.op_flags & VIE_OP_F_IMM) {
3769 		/*
3770 		 * Section 2.2.1.5 "Immediates", Intel SDM:
3771 		 * In 64-bit mode the typical size of immediate operands
3772 		 * remains 32-bits. When the operand size if 64-bits, the
3773 		 * processor sign-extends all immediates to 64-bits prior
3774 		 * to their use.
3775 		 */
3776 		if (vie->opsize == 4 || vie->opsize == 8)
3777 			vie->imm_bytes = 4;
3778 		else
3779 			vie->imm_bytes = 2;
3780 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
3781 		vie->imm_bytes = 1;
3782 	}
3783 
3784 	if ((n = vie->imm_bytes) == 0)
3785 		return (0);
3786 
3787 	KASSERT(n == 1 || n == 2 || n == 4,
3788 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
3789 
3790 	for (i = 0; i < n; i++) {
3791 		if (vie_peek(vie, &x))
3792 			return (-1);
3793 
3794 		u.buf[i] = x;
3795 		vie_advance(vie);
3796 	}
3797 
3798 	/* sign-extend the immediate value before use */
3799 	if (n == 1)
3800 		vie->immediate = u.signed8;
3801 	else if (n == 2)
3802 		vie->immediate = u.signed16;
3803 	else
3804 		vie->immediate = u.signed32;
3805 
3806 	return (0);
3807 }
3808 
3809 static int
decode_moffset(struct vie * vie)3810 decode_moffset(struct vie *vie)
3811 {
3812 	int i, n;
3813 	uint8_t x;
3814 	union {
3815 		char	buf[8];
3816 		uint64_t u64;
3817 	} u;
3818 
3819 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
3820 		return (0);
3821 
3822 	/*
3823 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
3824 	 * The memory offset size follows the address-size of the instruction.
3825 	 */
3826 	n = vie->addrsize;
3827 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
3828 
3829 	u.u64 = 0;
3830 	for (i = 0; i < n; i++) {
3831 		if (vie_peek(vie, &x))
3832 			return (-1);
3833 
3834 		u.buf[i] = x;
3835 		vie_advance(vie);
3836 	}
3837 	vie->displacement = u.u64;
3838 	return (0);
3839 }
3840 
3841 /*
3842  * Verify that the 'guest linear address' provided as collateral of the nested
3843  * page table fault matches with our instruction decoding.
3844  */
3845 int
vie_verify_gla(struct vie * vie,struct vm * vm,int cpuid,uint64_t gla)3846 vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla)
3847 {
3848 	int error;
3849 	uint64_t base, segbase, idx, gla2;
3850 	enum vm_reg_name seg;
3851 	struct seg_desc desc;
3852 
3853 	ASSERT((vie->status & VIES_INST_DECODE) != 0);
3854 
3855 	/*
3856 	 * If there was no valid GLA context with the exit, or the decoded
3857 	 * instruction acts on more than one address, verification is done.
3858 	 */
3859 	if (gla == VIE_INVALID_GLA ||
3860 	    (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) {
3861 		return (0);
3862 	}
3863 
3864 	base = 0;
3865 	if (vie->base_register != VM_REG_LAST) {
3866 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
3867 		if (error) {
3868 			printf("verify_gla: error %d getting base reg %d\n",
3869 			    error, vie->base_register);
3870 			return (-1);
3871 		}
3872 
3873 		/*
3874 		 * RIP-relative addressing starts from the following
3875 		 * instruction
3876 		 */
3877 		if (vie->base_register == VM_REG_GUEST_RIP)
3878 			base += vie->num_processed;
3879 	}
3880 
3881 	idx = 0;
3882 	if (vie->index_register != VM_REG_LAST) {
3883 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
3884 		if (error) {
3885 			printf("verify_gla: error %d getting index reg %d\n",
3886 			    error, vie->index_register);
3887 			return (-1);
3888 		}
3889 	}
3890 
3891 	/*
3892 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
3893 	 *
3894 	 * In 64-bit mode, segmentation is generally (but not
3895 	 * completely) disabled.  The exceptions are the FS and GS
3896 	 * segments.
3897 	 *
3898 	 * In legacy IA-32 mode, when the ESP or EBP register is used
3899 	 * as the base, the SS segment is the default segment.  For
3900 	 * other data references, except when relative to stack or
3901 	 * string destination the DS segment is the default.  These
3902 	 * can be overridden to allow other segments to be accessed.
3903 	 */
3904 	if (vie->segment_override) {
3905 		seg = vie->segment_register;
3906 	} else if (vie->base_register == VM_REG_GUEST_RSP ||
3907 	    vie->base_register == VM_REG_GUEST_RBP) {
3908 		seg = VM_REG_GUEST_SS;
3909 	} else {
3910 		seg = VM_REG_GUEST_DS;
3911 	}
3912 	if (vie->paging.cpu_mode == CPU_MODE_64BIT &&
3913 	    seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) {
3914 		segbase = 0;
3915 	} else {
3916 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
3917 		if (error) {
3918 			printf("verify_gla: error %d getting segment"
3919 			    " descriptor %d", error, vie->segment_register);
3920 			return (-1);
3921 		}
3922 		segbase = desc.base;
3923 	}
3924 
3925 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
3926 	gla2 &= size2mask[vie->addrsize];
3927 	if (gla != gla2) {
3928 		printf("verify_gla mismatch: segbase(0x%0lx)"
3929 		    "base(0x%0lx), scale(%d), index(0x%0lx), "
3930 		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
3931 		    segbase, base, vie->scale, idx, vie->displacement,
3932 		    gla, gla2);
3933 		return (-1);
3934 	}
3935 
3936 	return (0);
3937 }
3938 
3939 int
vie_decode_instruction(struct vie * vie,struct vm * vm,int cpuid,int cs_d)3940 vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d)
3941 {
3942 	enum vm_cpu_mode cpu_mode;
3943 
3944 	if ((vie->status & VIES_INST_FETCH) == 0) {
3945 		return (EINVAL);
3946 	}
3947 
3948 	cpu_mode = vie->paging.cpu_mode;
3949 
3950 	if (decode_prefixes(vie, cpu_mode, cs_d))
3951 		return (-1);
3952 
3953 	if (decode_opcode(vie))
3954 		return (-1);
3955 
3956 	if (decode_modrm(vie, cpu_mode))
3957 		return (-1);
3958 
3959 	if (decode_sib(vie))
3960 		return (-1);
3961 
3962 	if (decode_displacement(vie))
3963 		return (-1);
3964 
3965 	if (decode_immediate(vie))
3966 		return (-1);
3967 
3968 	if (decode_moffset(vie))
3969 		return (-1);
3970 
3971 	vie->status |= VIES_INST_DECODE;
3972 
3973 	return (0);
3974 }
3975