/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */
/*
 * This implements the hypervisor multiplexor FPU API. Its purpose is to make
 * it easy to switch the FPU between the host and the guest while hiding all
 * the details about CR0.TS and how to save the host's state as required.
 */
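
/*
 * A sketch of the expected call sequence, assembled from the functions below
 * (illustrative only; the vcpu-loop shape is an assumption, not a contract
 * stated by this file):
 *
 *	hma_fpu_t *fpu = hma_fpu_alloc(KM_SLEEP);
 *	(void) hma_fpu_init(fpu);
 *	...
 *	hma_fpu_start_guest(fpu);	(load guest state into the FPU)
 *	(enter and exit the guest)
 *	hma_fpu_stop_guest(fpu);	(save guest state, restore host)
 *	...
 *	hma_fpu_free(fpu);
 */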

#include <sys/pcb.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/controlregs.h>
#include <sys/sysmacros.h>
#include <sys/stdbool.h>
#include <sys/ontrap.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>

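/*
 * hf_guest_fpu holds the saved guest FPU state, hf_curthread records the
 * thread that is between hma_fpu_start_guest() and hma_fpu_stop_guest(), and
 * hf_inguest tracks whether the guest state is currently live in the
 * hardware FPU.
 */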
struct hma_fpu {
	fpu_ctx_t	hf_guest_fpu;
	kthread_t	*hf_curthread;
	boolean_t	hf_inguest;
};

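/*
 * Set the guest FPU state to the architectural initial values appropriate to
 * the host's save mechanism, and mark it FPU_EN | FPU_VALID so that the first
 * hma_fpu_start_guest() call can restore it.
 */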
int
hma_fpu_init(hma_fpu_t *fpu)
{
	struct xsave_state *xs;

	ASSERT0(fpu->hf_inguest);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		bcopy(&sse_initial, fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx,
		    sizeof (struct fxsave_state));
		fpu->hf_guest_fpu.fpu_xsave_mask = 0;
		break;
	case FP_XSAVE:
		/*
		 * Zero everything in the xsave case as we may have data in
		 * the structure that's not part of the initial value (which
		 * only really deals with a small portion of the xsave state).
		 */
		xs = fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_xs;
		bzero(xs, cpuid_get_xsave_size());
		bcopy(&avx_initial, xs, sizeof (*xs));
		xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
		fpu->hf_guest_fpu.fpu_xsave_mask = XFEATURE_FP_ALL;
		break;
	default:
		panic("Invalid fp_save_mech");
	}

	fpu->hf_guest_fpu.fpu_flags = FPU_EN | FPU_VALID;

	return (0);
}

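/*
 * Free an hma_fpu_t along with its backing save area; a NULL argument is
 * tolerated.
 */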
void
hma_fpu_free(hma_fpu_t *fpu)
{
	if (fpu == NULL)
		return;

	ASSERT3P(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic, !=, NULL);
	kmem_cache_free(fpsave_cachep,
	    fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic);
	kmem_free(fpu, sizeof (*fpu));
}

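/*
 * Allocate an hma_fpu_t plus a save area from fpsave_cachep, zeroing the save
 * area to the size required by the host's save mechanism. Returns NULL if
 * either allocation fails (possible with KM_NOSLEEP).
 */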
hma_fpu_t *
hma_fpu_alloc(int kmflag)
{
	hma_fpu_t *fpu;

	fpu = kmem_zalloc(sizeof (hma_fpu_t), kmflag);
	if (fpu == NULL)
		return (NULL);

	fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic =
	    kmem_cache_alloc(fpsave_cachep, kmflag);
	if (fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic == NULL) {
		kmem_free(fpu, sizeof (hma_fpu_t));
		return (NULL);
	}
	fpu->hf_inguest = B_FALSE;

	/*
	 * Make sure the entire save area is zeroed.
	 */
	switch (fp_save_mech) {
	case FP_FXSAVE:
		bzero(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic,
		    sizeof (struct fxsave_state));
		break;
	case FP_XSAVE:
		bzero(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic,
		    cpuid_get_xsave_size());
		break;
	default:
		panic("Invalid fp_save_mech");
	}

	return (fpu);
}

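/*
 * Move the hardware FPU from host to guest: save the current thread's state
 * into its PCB, then load the guest state. Must be paired with
 * hma_fpu_stop_guest() on the same thread.
 */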
void
hma_fpu_start_guest(hma_fpu_t *fpu)
{
	/*
	 * Note, we don't check or assert whether t_preempt is non-zero,
	 * because there are contexts where this is safe to call (from a
	 * context op) where t_preempt may not be set.
	 */
	ASSERT3S(fpu->hf_inguest, ==, B_FALSE);
	ASSERT3P(fpu->hf_curthread, ==, NULL);
	ASSERT3P(curthread->t_lwp, !=, NULL);
	ASSERT3U(fpu->hf_guest_fpu.fpu_flags & FPU_EN, !=, 0);
	ASSERT3U(fpu->hf_guest_fpu.fpu_flags & FPU_VALID, !=, 0);

	fpu->hf_inguest = B_TRUE;
	fpu->hf_curthread = curthread;

	fp_save(&curthread->t_lwp->lwp_pcb.pcb_fpu);
	fp_restore(&fpu->hf_guest_fpu);
	fpu->hf_guest_fpu.fpu_flags &= ~FPU_VALID;
}

/*
 * Since fp_save() assumes a thread-centric view of the FPU usage -- it will
 * assert if attempting to save elsewhere than the thread PCB, and will elide
 * action if the FPU is not enabled -- we cannot use it for the manual saving
 * of FPU contents.  To work around that, we call the save mechanism directly.
 */
static void
do_fp_save(fpu_ctx_t *fpu)
{
	/*
	 * For our manual saving, we expect that the thread PCB is never the
	 * landing zone for the data.
	 */
	ASSERT(curthread->t_lwp == NULL ||
	    fpu != &curthread->t_lwp->lwp_pcb.pcb_fpu);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpxsave(fpu->fpu_regs.kfpu_u.kfpu_fx);
		break;
	case FP_XSAVE:
		xsavep(fpu->fpu_regs.kfpu_u.kfpu_xs, fpu->fpu_xsave_mask);
		break;
	default:
		panic("Invalid fp_save_mech");
	}
	fpu->fpu_flags |= FPU_VALID;
}

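/*
 * Move the hardware FPU back from guest to host: save the guest state into
 * hf_guest_fpu (marking it FPU_VALID again), then restore the thread's PCB
 * state.
 */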
void
hma_fpu_stop_guest(hma_fpu_t *fpu)
{
	ASSERT3S(fpu->hf_inguest, ==, B_TRUE);
	ASSERT3P(fpu->hf_curthread, ==, curthread);
	ASSERT3U(fpu->hf_guest_fpu.fpu_flags & FPU_EN, !=, 0);
	ASSERT3U(fpu->hf_guest_fpu.fpu_flags & FPU_VALID, ==, 0);

	do_fp_save(&fpu->hf_guest_fpu);

	fp_restore(&curthread->t_lwp->lwp_pcb.pcb_fpu);

	fpu->hf_inguest = B_FALSE;
	fpu->hf_curthread = NULL;
}

/*
 * Will output up to `ndesc` records into `descp`.  The required size for an
 * XSAVE area containing all of the data fields supported by the host will be
 * placed in `req_sizep` (if non-NULL).  Returns the number of feature bits
 * supported by the host.
 */
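
/*
 * A minimal usage sketch (the NULL sizing call mirrors the one made
 * internally by hma_fpu_get_xsave_state()); the descriptor allocation shown
 * is an assumption for illustration:
 *
 *	size_t req_size;
 *	uint_t nfeat = hma_fpu_describe_xsave_state(NULL, 0, &req_size);
 *	hma_xsave_state_desc_t *descs =
 *	    kmem_alloc(nfeat * sizeof (*descs), KM_SLEEP);
 *	(void) hma_fpu_describe_xsave_state(descs, nfeat, NULL);
 */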
uint_t
hma_fpu_describe_xsave_state(hma_xsave_state_desc_t *descp, uint_t ndesc,
    size_t *req_sizep)
{
	uint64_t features;

	switch (fp_save_mech) {
	case FP_FXSAVE:
		/*
		 * Even without xsave support, the FPU will have legacy x87
		 * float and SSE state contained within.
		 */
		features = XFEATURE_LEGACY_FP | XFEATURE_SSE;
		break;
	case FP_XSAVE:
		features = get_xcr(XFEATURE_ENABLED_MASK);
		break;
	default:
		panic("Invalid fp_save_mech");
	}

	uint_t count, pos;
	uint_t max_size = MIN_XSAVE_SIZE;
	for (count = 0, pos = 0; pos <= 63; pos++) {
		const uint64_t bit = ((uint64_t)1 << pos);
		uint32_t size, off;

		if ((features & bit) == 0) {
			continue;
		}

		if (bit == XFEATURE_LEGACY_FP || bit == XFEATURE_SSE) {
			size = sizeof (struct fxsave_state);
			off = 0;
		} else {
			/*
			 * The size and position of a data type within the
			 * XSAVE area are described by CPUID leaf 0xD in the
			 * subfunction corresponding to the bit position (for
			 * pos > 1).
			 */
			struct cpuid_regs regs = {
				.cp_eax = 0xD,
				.cp_ecx = pos,
			};

			ASSERT3U(pos, >, 1);

			(void) __cpuid_insn(&regs);
			size = regs.cp_eax;
			off = regs.cp_ebx;
		}
		max_size = MAX(max_size, off + size);

		if (count < ndesc) {
			hma_xsave_state_desc_t *desc = &descp[count];

			desc->hxsd_bit = bit;
			desc->hxsd_size = size;
			desc->hxsd_off = off;
		}
		count++;
	}
	if (req_sizep != NULL) {
		*req_sizep = max_size;
	}
	return (count);
}

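/*
 * Copy the guest FPU contents into `buf` in XSAVE format. Fails with
 * HFXR_NO_SPACE if `len` is smaller than the host-required size (obtainable
 * via hma_fpu_describe_xsave_state()). The guest state must not be live in
 * the hardware FPU when this is called.
 */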
hma_fpu_xsave_result_t
hma_fpu_get_xsave_state(const hma_fpu_t *fpu, void *buf, size_t len)
{
	ASSERT(!fpu->hf_inguest);

	size_t valid_len;
	switch (fp_save_mech) {
	case FP_FXSAVE: {
		if (len < MIN_XSAVE_SIZE) {
			return (HFXR_NO_SPACE);
		}
		bcopy(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic, buf,
		    sizeof (struct fxsave_state));

		struct xsave_header hdr = {
			.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE,
		};
		bcopy(&hdr, buf + sizeof (struct fxsave_state), sizeof (hdr));

		break;
	}
	case FP_XSAVE:
		(void) hma_fpu_describe_xsave_state(NULL, 0, &valid_len);
		if (len < valid_len) {
			return (HFXR_NO_SPACE);
		}
		bcopy(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic, buf,
		    valid_len);
		break;
	default:
		panic("Invalid fp_save_mech");
	}

	return (HFXR_OK);
}

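/*
 * Load guest FPU state from an XSAVE-format `buf`, validating it by loading
 * it into the hardware FPU under on_trap() protection so that malformed data
 * yields HFXR_INVALID_DATA rather than a panic. Other failure modes:
 * HFXR_NO_SPACE, HFXR_BAD_ALIGN, HFXR_UNSUP_FMT (compacted XSAVE), and
 * HFXR_UNSUP_FEAT.
 */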
hma_fpu_xsave_result_t
hma_fpu_set_xsave_state(hma_fpu_t *fpu, void *buf, size_t len)
{
	ASSERT(!fpu->hf_inguest);

	if (len < MIN_XSAVE_SIZE) {
		return (HFXR_NO_SPACE);
	}
	/* 64-byte alignment is demanded of the FPU-related operations */
	if (((uintptr_t)buf & 63) != 0) {
		return (HFXR_BAD_ALIGN);
	}

	struct xsave_header *hdr = buf + sizeof (struct fxsave_state);
	if (hdr->xsh_xcomp_bv != 0) {
		/* XSAVEC formatting not supported at this time */
		return (HFXR_UNSUP_FMT);
	}

	uint64_t allowed_bits;
	size_t save_area_size;
	switch (fp_save_mech) {
	case FP_FXSAVE:
		allowed_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE;
		save_area_size = sizeof (struct fxsave_state);
		break;
	case FP_XSAVE:
		allowed_bits = get_xcr(XFEATURE_ENABLED_MASK);
		save_area_size = cpuid_get_xsave_size();
		break;
	default:
		panic("Invalid fp_save_mech");
	}
	if ((hdr->xsh_xstate_bv & ~(allowed_bits)) != 0) {
		return (HFXR_UNSUP_FEAT);
	}

	/*
	 * We validate the incoming state with the FPU itself prior to saving
	 * it into the guest FPU context area.  In order to preserve any state
	 * currently housed in the FPU, we save it to a temporarily allocated
	 * FPU context.  It is important to note that we are not following the
	 * normal rules around state management detailed in
	 * uts/intel/os/fpu.c.  This saving is unconditional, regardless of
	 * the state in the FPU or the value of CR0_TS, which simplifies our
	 * process before returning to the caller (without needing to check
	 * for an lwp, etc).  To prevent interrupting threads from
	 * encountering this unusual FPU state, we keep interrupts disabled
	 * for the duration.
	 */
	fpu_ctx_t temp_ctx = {
		.fpu_xsave_mask = XFEATURE_FP_ALL,
	};
	temp_ctx.fpu_regs.kfpu_u.kfpu_generic =
	    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
	bzero(temp_ctx.fpu_regs.kfpu_u.kfpu_generic, save_area_size);

	ulong_t iflag;
	iflag = intr_clear();
	bool disable_when_done = (getcr0() & CR0_TS) != 0;
	do_fp_save(&temp_ctx);

	/*
	 * If the provided data is invalid, it will cause a #GP when we attempt
	 * to load it into the FPU, so protect against that with on_trap().
	 * Should the data load successfully, we can then be confident that its
	 * later use via hma_fpu_start_guest() will be safe.
	 */
	on_trap_data_t otd;
	volatile hma_fpu_xsave_result_t res = HFXR_OK;
	if (on_trap(&otd, OT_DATA_EC) != 0) {
		res = HFXR_INVALID_DATA;
		goto done;
	}

	switch (fp_save_mech) {
	case FP_FXSAVE:
		if (hdr->xsh_xstate_bv == 0) {
			/*
			 * An empty xstate_bv means we can simply load the
			 * legacy FP/SSE area with its initial state.
			 */
			bcopy(&sse_initial,
			    fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx,
			    sizeof (sse_initial));
		} else {
			fpxrestore(buf);
			fpxsave(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx);
		}
		break;
	case FP_XSAVE:
		xrestore(buf, XFEATURE_FP_ALL);
		xsavep(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_xs,
		    fpu->hf_guest_fpu.fpu_xsave_mask);
		break;
	default:
		panic("Invalid fp_save_mech");
	}

done:
	no_trap();
	fp_restore(&temp_ctx);
	if (disable_when_done) {
		fpdisable();
	}
	intr_restore(iflag);
	kmem_cache_free(fpsave_cachep, temp_ctx.fpu_regs.kfpu_u.kfpu_generic);

	return (res);
}

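/*
 * Copy the guest's legacy FP/SSE state into `fx`. The fxsave area is the
 * leading portion of the xsave area, so this works under either save
 * mechanism.
 */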
void
hma_fpu_get_fxsave_state(const hma_fpu_t *fpu, struct fxsave_state *fx)
{
	const struct fxsave_state *guest;

	ASSERT3S(fpu->hf_inguest, ==, B_FALSE);

	guest = fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx;
	bcopy(guest, fx, sizeof (*fx));
}

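/*
 * Replace the guest's legacy FP/SSE state with `fx`. Fails with EINVAL if
 * reserved fx_mxcsr bits are set, since restoring those would #GP. Under
 * FP_XSAVE, extended state is zeroed and xstate_bv is reset to just the
 * legacy FP/SSE bits.
 */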
int
hma_fpu_set_fxsave_state(hma_fpu_t *fpu, const struct fxsave_state *fx)
{
	struct fxsave_state *gfx;
	struct xsave_state *gxs;

	ASSERT3S(fpu->hf_inguest, ==, B_FALSE);

	/*
	 * If reserved bits are set in fx_mxcsr, then we will take a #GP when
	 * we restore them. Reject this outright.
	 *
	 * We do not need to check if we are dealing with state that has
	 * pending exceptions. That was only a concern with the original FPU
	 * save and restore mechanisms (fsave/frstor). With fxsave/fxrstor and
	 * xsave/xrstor, pending exceptions are deferred until the FPU is next
	 * used, which is what we want here (they would be delivered in guest
	 * context).
	 */
	if ((fx->fx_mxcsr & ~sse_mxcsr_mask) != 0)
		return (EINVAL);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		gfx = fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx;
		bcopy(fx, gfx, sizeof (*fx));
		break;
	case FP_XSAVE:
		gxs = fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_xs;
		bzero(gxs, cpuid_get_xsave_size());
		bcopy(fx, &gxs->xs_fxsave, sizeof (*fx));
		gxs->xs_header.xsh_xstate_bv =
		    XFEATURE_LEGACY_FP | XFEATURE_SSE;
		break;
	default:
		panic("Invalid fp_save_mech");
	}

	return (0);
}