1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2021 Joyent, Inc.
24 * Copyright 2021 RackTop Systems, Inc.
25 */
26
27 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
29 /* All Rights Reserved */
30
31 /* Copyright (c) 1987, 1988 Microsoft Corporation */
32 /* All Rights Reserved */
33
34 /*
35 * Copyright (c) 2009, Intel Corporation.
36 * All rights reserved.
37 */
38
39 #include <sys/types.h>
40 #include <sys/param.h>
41 #include <sys/signal.h>
42 #include <sys/regset.h>
43 #include <sys/privregs.h>
44 #include <sys/psw.h>
45 #include <sys/trap.h>
46 #include <sys/fault.h>
47 #include <sys/systm.h>
48 #include <sys/user.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/pcb.h>
52 #include <sys/lwp.h>
53 #include <sys/cpuvar.h>
54 #include <sys/thread.h>
55 #include <sys/disp.h>
56 #include <sys/fp.h>
57 #include <sys/siginfo.h>
58 #include <sys/archsystm.h>
59 #include <sys/kmem.h>
60 #include <sys/debug.h>
61 #include <sys/x86_archext.h>
62 #include <sys/sysmacros.h>
63 #include <sys/cmn_err.h>
64 #include <sys/kfpu.h>
65
66 /*
67 * FPU Management Overview
68 * -----------------------
69 *
70 * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
71 * however, many aspects of its life as a coprocessor are still around in x86.
72 *
73 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
74 * While that state still exists, there is much more that is covered by the FPU.
75 * Today, this includes not just traditional FPU state, but also supervisor only
76 * state. The following state is currently managed and covered logically by the
77 * idea of the FPU registers:
78 *
79 * o Traditional x87 FPU
80 * o Vector Registers (%xmm, %ymm, %zmm)
81 * o Memory Protection Extensions (MPX) Bounds Registers
82 * o Protection Key Rights Register (PKRU)
83 * o Processor Trace data
84 *
85 * The rest of this covers how the FPU is managed and controlled, how state is
86 * saved and restored between threads, interactions with hypervisors, and other
87 * information exported to user land through aux vectors. A lot of background
88 * information is here to synthesize major parts of the Intel SDM, but
89 * unfortunately, it is not a replacement for reading it.
90 *
91 * FPU Control Registers
92 * ---------------------
93 *
94 * Because the x87 FPU began its life as a co-processor and the FPU was
95 * optional, there are several bits that show up in %cr0 that we have to
96 * manipulate when dealing with the FPU. These are:
97 *
98 * o CR0.ET The 'extension type' bit. This was used originally to indicate
99 * that the FPU co-processor was present. Now it is forced on for
100 * compatibility. This is often used to verify whether or not the
101 * FPU is present.
102 *
103 * o CR0.NE The 'native error' bit. Used to indicate that native error
104 * mode should be enabled. This indicates that we should take traps
105 * on FPU errors. The OS enables this early in boot.
106 *
107 * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not
108 * wait/fwait instructions generate a #NM if CR0.TS is set.
109 *
110 * o CR0.EM The 'Emulation' bit. This is used to cause floating point
111 * operations (x87 through SSE4) to trap with a #UD so they can be
112 * emulated. The system never sets this bit, but makes sure it is
113 * clear on processor start up.
114 *
115 * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating
116 * point operation will generate a #NM. An fwait will as well,
117 * depending on the value in CR0.MP.
118 *
119 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
120 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
121 * complicated role. Historically it has been used to allow running systems to
122 * restore the FPU registers lazily. This will be discussed in greater depth
123 * later on.
124 *
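 * As a rough sketch of what that policy amounts to at processor start up
 * (using the getcr0()/setcr0() accessors and the CR0_* definitions from
 * sys/controlregs.h; treat this as illustrative rather than the literal
 * boot code):
 *
 *	ulong_t cr0 = getcr0();
 *
 *	cr0 |= CR0_NE | CR0_MP;		take native FPU errors; fwait honors TS
 *	cr0 &= ~CR0_EM;			never emulate the FPU
 *	setcr0(cr0);
 *
 * CR0.TS is then toggled separately at run time (e.g. by fpdisable() and the
 * restore paths) as described throughout this file.
 *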
125 * %cr4 is also used as part of the FPU control. Specifically we need to worry
126 * about the following bits in the system:
127 *
128 * o CR4.OSFXSR This bit is used to indicate that the OS understands and
129 * supports the execution of the fxsave and fxrstor
130 * instructions. This bit is required to be set to enable
131 * the use of the SSE->SSE4 instructions.
132 *
133 * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand
134 * and take a SIMD floating point exception (#XM). This bit
135 * is always enabled by the system.
136 *
137 * o CR4.OSXSAVE This bit is used to indicate that the OS understands and
138 * supports the execution of the xsave and xrstor family of
139 * instructions. This bit is required to use any of the AVX
140 * and newer feature sets.
141 *
142 * Because all supported processors are 64-bit, they'll always support the XMM
143 * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
144 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
145 *
146 * %xcr0 is used to manage the behavior of the xsave feature set and is only
147 * present on the system if xsave is supported. %xcr0 is read and written
148 * through the xgetbv and xsetbv instructions. Each bit in %xcr0 refers to a
149 * different component of the xsave state and controls whether or not that
150 * information is saved and restored when an xsave or xrstor is executed. For
151 * newer feature sets like AVX and MPX, it also controls whether or not the
152 * corresponding instructions can be executed (much like CR4.OSFXSR does for
153 * the SSE feature sets).
154 *
155 * Everything in %xcr0 is around features available to users. There is also the
156 * IA32_XSS MSR which is used to control supervisor-only features that are still
157 * part of the xsave state. Bits that can be set in %xcr0 are reserved in
158 * IA32_XSS and vice versa. This is an important property that is particularly
159 * relevant to how the xsave instructions operate.
160 *
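 * For example, enabling the xsave feature set at boot conceptually comes down
 * to setting CR4.OSXSAVE and then programming %xcr0. A simplified sketch,
 * assuming set_xcr() is the counterpart of the get_xcr() used later in this
 * file and that cpuid reported AVX support:
 *
 *	setcr4(getcr4() | CR4_OSXSAVE);
 *	set_xcr(XFEATURE_ENABLED_MASK,
 *	    XFEATURE_LEGACY_FP | XFEATURE_SSE | XFEATURE_AVX);
 *
 * The exact mask written depends on which components cpuid says the processor
 * supports; bit 0 (x87) must always be set.
 *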
161 * Save Mechanisms
162 * ---------------
163 *
164 * When switching between running threads the FPU state needs to be saved and
165 * restored by the OS. If this state was not saved, users would rightfully
166 * complain about corrupt state. There are three mechanisms that exist on the
167 * processor for saving and restoring these state images:
168 *
169 * o fsave
170 * o fxsave
171 * o xsave
172 *
173 * fsave saves and restores only the x87 FPU and is the oldest of these
174 * mechanisms. This mechanism is never used in the kernel today because we are
175 * always running on systems that support fxsave.
176 *
177 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
178 * state to be saved and restored to and from a struct fxsave_state. This is the
179 * default mechanism that is used to save and restore the FPU on amd64. An
180 * important aspect of fxsave that was different from the original i386 fsave
181 * mechanism is that the restoring of FPU state with pending exceptions will not
182 * generate an exception, it will be deferred to the next use of the FPU.
183 *
184 * The final and by far the most complex mechanism is that of the xsave set.
185 * xsave allows for saving and restoring all of the traditional x86 pieces (x87
186 * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
187 * registers.
188 *
189 * Data is saved and restored into and out of a struct xsave_state. The first
190 * part of the struct xsave_state is equivalent to the struct fxsave_state.
191 * After that, there is a header which is used to describe the remaining
192 * portions of the state. The header is a 64-byte value of which the first two
193 * uint64_t values are defined and the rest are reserved and must be zero. The
194 * first uint64_t is the xstate_bv member. This describes which values in the
195 * xsave_state are actually valid and present. This is updated on a save and
196 * used on restore. The second member is the xcomp_bv member. Its last bit
197 * determines whether or not a compressed version of the structure is used.
198 *
199 * When the uncompressed structure is used (currently the only format we
200 * support), then each state component is at a fixed offset in the structure,
201 * even if it is not being used. For example, if you only saved the AVX related
202 * state, but did not save the MPX related state, the offset would not change
203 * for any component. With the compressed format, components that aren't used
204 * are all elided (though the x87 and SSE state are always there).
205 *
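 * In other words, the uncompressed layout looks roughly like the following.
 * The authoritative definition lives in the fpu headers; this is only a
 * simplified view of the offsets described above:
 *
 *	struct xsave_state {
 *		struct fxsave_state	xs_fxsave;	legacy x87/SSE, 512 bytes
 *		uint64_t		xs_xstate_bv;	components present/valid
 *		uint64_t		xs_xcomp_bv;	bit 63: compressed format
 *		uint8_t			xs_rsvd[48];	rest of header, must be 0
 *		...					extended components (%ymm, ...)
 *	};
 *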
206 * Unlike fxsave which saves all state, the xsave family does not always save
207 * and restore all the state that could be covered by the xsave_state. The
208 * instructions all take an argument which is a mask of what to consider. This
209 * is the same mask that will be used in the xstate_bv vector and it is also the
210 * same values that are present in %xcr0 and IA32_XSS, though IA32_XSS is only
211 * considered by the xsaves and xrstors instructions.
212 *
213 * When a save or restore is requested, a bitwise and is performed between the
214 * requested bits and those that have been enabled in %xcr0. Only the bits
215 * present in both are then saved or restored; the others are silently ignored
216 * by the processor. The OS relies on this often: we will always request that
217 * we save and restore all of the state, but only those portions that are
218 * actually enabled in %xcr0 will be touched.
219 *
220 * If a feature has been asked to be restored that is not set in the xstate_bv
221 * feature vector of the save state, then it will be set to its initial state by
222 * the processor (usually zeros). Also, when asked to save state, the processor
223 * may not write out data that is in its initial state as an optimization. This
224 * optimization only applies to saving data and not to restoring data.
225 *
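 * Conceptually, the set of components an individual xsave or xrstor acts on
 * is just the intersection of the instruction's mask and the enabled bits:
 *
 *	to_touch = requested_mask & xcr0;		xsave/xrstor
 *	to_touch = requested_mask & (xcr0 | xss);	xsaves/xrstors
 *
 * which is why the kernel can always pass the full mask (XFEATURE_FP_ALL) and
 * let the hardware trim it down to what is actually enabled.
 *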
226 * There are a few different variants of the xsave and xrstor instruction. They
227 * are:
228 *
229 * o xsave This is the original save instruction. It will save all of the
230 * requested data in the xsave state structure. It only saves data
231 * in the uncompressed (xcomp_bv[63] is zero) format. It may be
232 * executed at all privilege levels.
233 *
234 * o xrstor This is the original restore instruction. It will restore all of
235 * the requested data. The xrstor function can handle both the
236 * compressed and uncompressed formats. It may be executed at all
237 * privilege levels.
238 *
239 * o xsaveopt This is a variant of the xsave instruction that employs
240 * optimizations to try and only write out state that has been
241 * modified since the last time an xrstor instruction was called.
242 * The processor tracks a tuple of information about the last
243 * xrstor and tries to ensure that the same buffer is being used
244 * when this optimization is being used. However, because it
245 * tracks the xrstor buffer based on its address, it is not
246 * suitable for use if that buffer can be easily reused.
247 * The most common case is trying to save data to the stack in
248 * rtld. It may be executed at all privilege levels.
249 *
250 * o xsavec This is a variant of the xsave instruction that writes out the
251 * compressed form of the xsave_state. Otherwise it behaves as
252 * xsave. It may be executed at all privilege levels.
253 *
254 * o xsaves This is a variant of the xsave instruction. It is similar to
255 * xsavec in that it always writes the compressed form of the
256 * buffer. Unlike all the other forms, this instruction looks at
257 * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
258 * what to save and restore. xsaves also implements the same
259 * optimization that xsaveopt does around modified pieces. User
260 * land may not execute the instruction.
261 *
262 * o xrstors This is a variant of the xrstor instruction. Similar to xsaves
263 * it can save and restore both the user and privileged states.
264 * Unlike xrstor it can only operate on the compressed form.
265 * User land may not execute the instruction.
266 *
267 * Based on all of these, the kernel has an order of precedence for what it
268 * will use. Basically, xsaves (not currently supported) is preferred to
269 * xsaveopt, which is preferred to xsave. A similar scheme is used when
270 * informing rtld (more later) about what it should use: xsavec is preferred
271 * to xsave, and xsaveopt is not recommended because its modified-state
272 * optimization is not appropriate for this use.
273 *
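 * A very rough sketch of the kind of selection fpu_probe() ends up making;
 * the feature checks are illustrative and the real code also sizes the save
 * area and patches the fpsave_ctxt/fprestore_ctxt and xsavep vectors:
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
 *		fp_save_mech = FP_XSAVE;
 *		if (is_x86_feature(x86_featureset, X86FSET_XSAVEOPT))
 *			xsavep = xsaveopt;
 *	} else {
 *		fp_save_mech = FP_FXSAVE;
 *	}
 *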
274 * Finally, there is one last gotcha with the xsave state. Importantly, some AMD
275 * processors did not always save and restore some of the FPU exception state
276 * the way that Intel did. In those cases the OS will make up for this fact
277 * itself.
278 *
279 * FPU Initialization
280 * ------------------
281 *
282 * One difference with the FPU registers is that not all threads have FPU state,
283 * only those that have an lwp. Generally this means kernel threads, which all
284 * share p0 and its lwp, do not have FPU state, though there are definitely
285 * exceptions such as kcfpoold. In the rest of this discussion we'll use thread
286 * and lwp interchangeably, just think of thread meaning a thread that has a
287 * lwp.
288 *
289 * Each lwp has its FPU state allocated in its pcb (process control block). The
290 * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
291 * dynamically at start up based on the save mechanism that we're using and the
292 * amount of memory required for it. This is dynamic because the xsave_state
293 * size varies based on the supported feature set.
294 *
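 * A sketch of what that sizing amounts to; the actual cache creation is done
 * in the fpu probe path and the details here (name, alignment) are
 * illustrative:
 *
 *	size_t sz = (fp_save_mech == FP_XSAVE) ?
 *	    cpuid_get_xsave_size() : sizeof (struct fxsave_state);
 *
 *	fpsave_cachep = kmem_cache_create("fpsave_cache", sz, 64,
 *	    NULL, NULL, NULL, NULL, NULL, 0);
 *
 * where the 64-byte alignment reflects what the xsave instructions require.
 *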
295 * The hardware side of the FPU is initialized early in boot before we mount the
296 * root file system. This is effectively done in fpu_probe(). This is where we
297 * make the final decision about which save and restore mechanisms we should
298 * use, create the fpsave_cachep kmem cache, and initialize a number of
299 * function pointers that implement the save and restore logic.
300 *
301 * The thread/lwp side is a little more involved. There are two different
302 * things that we need to concern ourselves with. The first is how the FPU
303 * resources are allocated and the second is how the FPU state is initialized
304 * for a given lwp.
305 *
306 * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
307 * This is always called unconditionally by the system as part of creating an
308 * LWP.
309 *
310 * There are three different initialization paths that we deal with. The first
311 * is when we are executing a new process. As part of exec all of the register
312 * state is reset. The exec case is particularly important because init is born
313 * like Athena, sprouting from the head of the kernel, without any true parent
314 * to fork from. The second is used whenever we fork or create a new lwp. The
315 * third is to deal with special lwps like the agent lwp.
316 *
317 * During exec, we will call fp_exec() which will initialize and set up the FPU
318 * state for the process. That will fill in the initial state for the FPU and
319 * also set that state in the FPU itself. As part of fp_exec() we also install a
320 * thread context operations vector that takes care of dealing with the saving
321 * and restoring of the FPU. These context handlers will also be called whenever
322 * an lwp is created or forked. In those cases, to initialize the FPU we will
323 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
324 * operations vector for the new thread.
325 *
326 * Next we'll end up in the context operation fp_new_lwp(). This saves the
327 * current thread's state, initializes the new thread's state, and copies over
328 * the relevant parts of the originating thread's state. It's at this point that
329 * we also install the FPU context operations into the new thread, which ensures
330 * that all future threads that are descendants of the current one get the
331 * thread context operations (unless they call exec).
332 *
333 * To deal with some things like the agent lwp, we double check the state of the
334 * FPU in sys_rtt_common() to make sure that it has been enabled before
335 * returning to user land. In general, this path should be rare, but it's useful
336 * for the odd lwp here and there.
337 *
338 * The FPU state will remain valid most of the time. There are times that
339 * the state will be rewritten, for example in restorecontext(), due to /proc,
340 * or when the lwp calls exec(). Whether the context is being freed or we are
341 * resetting the state, we will call fp_free() to disable the FPU and our context.
342 *
343 * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
344 * state by calling fp_lwp_cleanup().
345 *
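 * Putting the lwp lifecycle together, the usual sequence of events is:
 *
 *	lwp_fp_init(lwp)		allocate pcb_fpu from fpsave_cachep
 *	fp_exec() or fp_new_lwp()	seed the state, install the ctx ops
 *	fpsave_ctxt()/fprestore_ctxt()	run on every context switch
 *	fp_free()			disable the FPU for this context
 *	fp_lwp_cleanup()		free the save area with the lwp
 *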
346 * Kernel FPU Multiplexing
347 * -----------------------
348 *
349 * Just as the kernel has to maintain all of the general purpose registers when
350 * switching between scheduled threads, the same is true of the FPU registers.
351 *
352 * When a thread has FPU state, it also has a set of context operations
353 * installed. These context operations take care of making sure that the FPU is
354 * properly saved and restored during a context switch (fpsave_ctxt and
355 * fprestore_ctxt respectively). This means that the current implementation of
356 * the FPU is 'eager': when a thread is running, the CPU will have its FPU state
357 * loaded. While this is always true when executing in userland, there are a few
358 * cases where this is not true in the kernel.
359 *
360 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
361 * employed. This meant that the FPU would be saved on a context switch and the
362 * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
363 * then take a #NM trap, at which point we would restore the FPU from the save
364 * area and return to user land. Given the frequency of use of the FPU alone by
365 * libc, there's no point returning to user land just to trap again.
366 *
367 * There are a few cases though where the FPU state may need to be changed for a
368 * thread on its behalf. The most notable cases are processes
369 * using /proc, restorecontext, forking, etc. In all of these cases the kernel
370 * will force a thread's FPU state to be saved into the PCB through the fp_save()
371 * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
372 * pcb. This indicates that the save state holds currently valid data. As a side
373 * effect of this, CR0.TS will be set. To make sure that all of the state is
374 * updated before returning to user land, in these cases, we set a flag on the
375 * PCB that says the FPU needs to be updated. This will make sure that we take
376 * the slow path out of a system call to fix things up for the thread. Due to
377 * the fact that this is a rather rare case, effectively setting the equivalent
378 * of t_postsys is acceptable.
379 *
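 * For example, a /proc style modification of another thread's registers
 * conceptually looks like the following (a sketch, not the literal /proc
 * code):
 *
 *	fp_save(fp);			flush live state; sets FPU_VALID and
 *					flags the pcb for an update
 *	... modify the save area in the pcb ...
 *
 * and the flagged slow return path then restores the modified state before
 * the thread goes back to user land.
 *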
380 * CR0.TS will be set after a save occurs and cleared when a restore occurs.
381 * Generally this means it will be cleared immediately by the new thread that is
382 * running in a context switch. However, this isn't the case for kernel threads.
383 * They currently operate with CR0.TS set as no kernel state is restored for
384 * them. This means that using the FPU will cause a #NM and panic.
385 *
386 * The FPU_VALID flag on the currently executing thread's pcb is meant to track
387 * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
388 * However, because we eagerly restore, the only time that CR0.TS should be set
389 * for a non-kernel thread is during operations where it will be cleared before
390 * returning to user land and importantly, the only data that is in it is its
391 * own.
392 *
393 * Kernel FPU Usage
394 * ----------------
395 *
396 * Traditionally the kernel never used the FPU since it had no need for
397 * floating point operations. However, modern FPU hardware supports a variety
398 * of SIMD extensions which can speed up code such as parity calculations or
399 * encryption.
400 *
401 * To allow the kernel to take advantage of these features, the
402 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
403 * around any usage of the FPU by the kernel to ensure that user-level context
404 * is properly saved/restored, as well as to properly setup the FPU for use by
405 * the kernel. There are a variety of ways this wrapping can be used, as
406 * discussed in this section below.
407 *
408 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
409 * operations, the kernel_fpu_alloc() function should be used to allocate a
410 * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
411 * state. This structure is not tied to any thread. That is, different threads
412 * can reuse the same kfpu_state_t structure, although not concurrently. A
413 * kfpu_state_t structure is freed by the kernel_fpu_free() function.
414 *
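 * A minimal sketch of that pattern (error handling elided):
 *
 *	kfpu_state_t *kfpu = kernel_fpu_alloc(KM_SLEEP);
 *
 *	kernel_fpu_begin(kfpu, 0);
 *	... use SIMD/FPU instructions; may be context switched ...
 *	kernel_fpu_end(kfpu, 0);
 *
 *	kernel_fpu_free(kfpu);
 *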
415 * In some cases, the kernel may need to use the FPU for a short operation
416 * without the overhead of managing a kfpu_state_t structure and without
417 * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
418 * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
419 * parameter. This indicates that there is no kfpu_state_t. When used this way,
420 * kernel preemption should be disabled by the caller (kpreempt_disable) before
421 * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
422 * For this usage, it is important to limit the kernel's FPU use to short
423 * operations. The tradeoff between using the FPU without a kfpu_state_t
424 * structure vs. the overhead of allowing a context switch while using the FPU
425 * should be carefully considered on a case by case basis.
426 *
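 * A minimal sketch of the short-operation form; the caller owns the
 * preemption bracketing and must not block in between:
 *
 *	kpreempt_disable();
 *	kernel_fpu_begin(NULL, KFPU_NO_STATE);
 *	... short, non-blocking FPU sequence ...
 *	kernel_fpu_end(NULL, KFPU_NO_STATE);
 *	kpreempt_enable();
 *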
427 * In other cases, kernel threads have an LWP, but never execute in user space.
428 * In this situation, the LWP's pcb_fpu area can be used to save/restore the
429 * kernel's FPU state if the thread is context switched, instead of having to
430 * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
431 * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
432 * enable this behavior. It is the caller's responsibility to ensure that this
433 * is only used for a kernel thread which never executes in user space.
434 *
435 * FPU Exceptions
436 * --------------
437 *
438 * Certain operations can cause the kernel to take traps due to FPU activity.
439 * Generally these events will cause a user process to receive a SIGFPE and, if
440 * the exception occurs in kernel context, we will die. Traditionally the #NM
441 * (Device Not Available / No Math) exception generated by CR0.TS would have
442 * caused us to restore the FPU. Now it is a fatal event regardless of whether
443 * or not user land causes it.
444 *
445 * While there are some cases where the kernel uses the FPU, it is up to the
446 * kernel to use the FPU in a way such that it cannot receive a trap or to use
447 * the appropriate trap protection mechanisms.
448 *
449 * Hypervisors
450 * -----------
451 *
452 * When providing support for hypervisors, things are a little bit more
453 * complicated because the FPU is not virtualized at all. This means that they
454 * need to save and restore the FPU and %xcr0 across entry and exit to the
455 * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
456 * allow us to use the full native state to make sure that we are always saving
457 * and restoring the full FPU that the host sees, even when the guest is using a
458 * subset.
459 *
460 * One tricky aspect of this is that the guest may be using a subset of %xcr0
461 * and therefore changing our %xcr0 on the fly. It is vital that when we're
462 * saving and restoring the FPU we always use the largest %xcr0 contents;
463 * otherwise we will end up leaving behind data in it.
464 *
465 * ELF PLT Support
466 * ---------------
467 *
468 * rtld has to preserve a subset of the FPU when it is saving and restoring
469 * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
470 * more information. As a result, we set up an aux vector that contains
471 * information about what save and restore mechanisms it should be using and
472 * the sizing thereof based on what the kernel supports. This is passed down in
473 * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
474 * initialized in fpu_subr.c.
475 */
476
477 kmem_cache_t *fpsave_cachep;
478
479 /* Legacy fxsave layout + xsave header + ymm */
480 #define AVX_XSAVE_SIZE (512 + 64 + 256)
481
482 /*
483 * Various sanity checks.
484 */
485 CTASSERT(sizeof (struct fxsave_state) == 512);
486 CTASSERT(sizeof (struct fnsave_state) == 108);
487 CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
488 CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
489
490 /*
491 * This structure is the x86 implementation of the kernel FPU that is defined in
492 * uts/common/sys/kfpu.h.
493 */
494
495 typedef enum kfpu_flags {
496 /*
497 * This indicates that the save state has initial FPU data.
498 */
499 KFPU_F_INITIALIZED = 0x01
500 } kfpu_flags_t;
501
502 struct kfpu_state {
503 fpu_ctx_t kfpu_ctx;
504 kfpu_flags_t kfpu_flags;
505 kthread_t *kfpu_curthread;
506 };
507
508 /*
509 * Initial kfpu state for SSE/SSE2 used by fpinit()
510 */
511 const struct fxsave_state sse_initial = {
512 FPU_CW_INIT, /* fx_fcw */
513 0, /* fx_fsw */
514 0, /* fx_fctw */
515 0, /* fx_fop */
516 0, /* fx_rip */
517 0, /* fx_rdp */
518 SSE_MXCSR_INIT /* fx_mxcsr */
519 /* rest of structure is zero */
520 };
521
522 /*
523 * Initial kfpu state for AVX used by fpinit()
524 */
525 const struct xsave_state avx_initial = {
526 /*
527 * The definition below needs to be identical with sse_initial
528 * defined above.
529 */
530 {
531 FPU_CW_INIT, /* fx_fcw */
532 0, /* fx_fsw */
533 0, /* fx_fctw */
534 0, /* fx_fop */
535 0, /* fx_rip */
536 0, /* fx_rdp */
537 SSE_MXCSR_INIT /* fx_mxcsr */
538 /* rest of structure is zero */
539 },
540 /*
541 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are valid,
542 * and CPU should initialize XMM/YMM.
543 */
544 1,
545 0 /* xs_xcomp_bv */
546 /* rest of structure is zero */
547 };
548
549 /*
550 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
551 * the #gp exception caused by setting unsupported bits in the
552 * MXCSR register
553 */
554 uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
555
556 /*
557 * Initial kfpu state for x87 used by fpinit()
558 */
559 const struct fnsave_state x87_initial = {
560 FPU_CW_INIT, /* f_fcw */
561 0, /* __f_ign0 */
562 0, /* f_fsw */
563 0, /* __f_ign1 */
564 0xffff, /* f_ftw */
565 /* rest of structure is zero */
566 };
567
568 /*
569 * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
570 * have an XSAVE-capable chip in fpu_probe.
571 */
572 void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
573 void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
574
575 /*
576 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
577 */
578 void (*xsavep)(struct xsave_state *, uint64_t) = xsave;
579
580 static int fpe_sicode(uint_t);
581 static int fpe_simd_sicode(uint_t);
582
583 /*
584 * Copy the state of parent lwp's floating point context into the new lwp.
585 * Invoked for both fork() and lwp_create().
586 *
587 * Note that we inherit -only- the control state (e.g. exception masks,
588 * rounding, precision control, etc.); the FPU registers are otherwise
589 * reset to their initial state.
590 */
591 static void
592 fp_new_lwp(kthread_id_t t, kthread_id_t ct)
593 {
594 struct fpu_ctx *fp; /* parent fpu context */
595 struct fpu_ctx *cfp; /* new fpu context */
596 struct fxsave_state *fx, *cfx;
597 struct xsave_state *cxs;
598
599 ASSERT(fp_kind != FP_NO);
600
601 fp = &t->t_lwp->lwp_pcb.pcb_fpu;
602 cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;
603
604 /*
605 * If the parent FPU state is still in the FPU hw then save it;
606 * conveniently, fp_save() already does this for us nicely.
607 */
608 fp_save(fp);
609
610 cfp->fpu_flags = FPU_EN | FPU_VALID;
611 cfp->fpu_regs.kfpu_status = 0;
612 cfp->fpu_regs.kfpu_xstatus = 0;
613
614 /*
615 * Make sure that the child's FPU is cleaned up and made ready for user
616 * land.
617 */
618 PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);
619
620 switch (fp_save_mech) {
621 case FP_FXSAVE:
622 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
623 cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
624 bcopy(&sse_initial, cfx, sizeof (*cfx));
625 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
626 cfx->fx_fcw = fx->fx_fcw;
627 break;
628
629 case FP_XSAVE:
630 cfp->fpu_xsave_mask = fp->fpu_xsave_mask;
631
632 VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);
633
634 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
635 cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
636 cfx = &cxs->xs_fxsave;
637
638 bcopy(&avx_initial, cxs, sizeof (*cxs));
639 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
640 cfx->fx_fcw = fx->fx_fcw;
641 cxs->xs_xstate_bv |= (get_xcr(XFEATURE_ENABLED_MASK) &
642 XFEATURE_FP_INITIAL);
643 break;
644 default:
645 panic("Invalid fp_save_mech");
646 /*NOTREACHED*/
647 }
648
649 /*
650 * Mark that both the parent and child need to have the FPU cleaned up
651 * before returning to user land.
652 */
653
654 installctx(ct, cfp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
655 fp_new_lwp, NULL, fp_free, NULL);
656 }
657
658 /*
659 * Free any state associated with floating point context.
660 * Fp_free can be called in three cases:
661 * 1) from reaper -> thread_free -> freectx-> fp_free
662 * fp context belongs to a thread on deathrow
663 * nothing to do, thread will never be resumed
664 * thread calling ctxfree is reaper
665 *
666 * 2) from exec -> freectx -> fp_free
667 * fp context belongs to the current thread
668 * must disable fpu, thread calling ctxfree is curthread
669 *
670 * 3) from restorecontext -> setfpregs -> fp_free
671 * we have a modified context in the memory (lwp->pcb_fpu)
672 * disable fpu and release the fp context for the CPU
673 *
674 */
675 /*ARGSUSED*/
676 void
677 fp_free(struct fpu_ctx *fp, int isexec)
678 {
679 ASSERT(fp_kind != FP_NO);
680
681 if (fp->fpu_flags & FPU_VALID)
682 return;
683
684 kpreempt_disable();
685 /*
686 * We want to do fpsave rather than fpdisable so that we can
687 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
688 */
689 fp->fpu_flags |= FPU_VALID;
690 /* If for current thread disable FP to track FPU_VALID */
691 if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
692 /* Clear errors if any to prevent frstor from complaining */
693 (void) fperr_reset();
694 if (fp_kind & __FP_SSE)
695 (void) fpxerr_reset();
696 fpdisable();
697 }
698 kpreempt_enable();
699 }
700
701 /*
702 * Store the floating point state and disable the floating point unit.
703 */
704 void
705 fp_save(struct fpu_ctx *fp)
706 {
707 ASSERT(fp_kind != FP_NO);
708
709 kpreempt_disable();
710 if (!fp || fp->fpu_flags & FPU_VALID ||
711 (fp->fpu_flags & FPU_EN) == 0) {
712 kpreempt_enable();
713 return;
714 }
715 ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);
716
717 switch (fp_save_mech) {
718 case FP_FXSAVE:
719 fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
720 break;
721
722 case FP_XSAVE:
723 xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
724 break;
725 default:
726 panic("Invalid fp_save_mech");
727 /*NOTREACHED*/
728 }
729
730 fp->fpu_flags |= FPU_VALID;
731
732 /*
733 * We save the FPU as part of forking, execing, modifications via /proc,
734 * restorecontext, etc. As such, we need to make sure that we return to
735 * userland with valid state in the FPU. If we're context switched out
736 * before we hit sys_rtt_common() we'll end up having restored the FPU
737 * as part of the context ops operations. The restore logic always makes
738 * sure that FPU_VALID is set before doing a restore so we don't restore
739 * it a second time.
740 */
741 PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);
742
743 kpreempt_enable();
744 }
745
746 /*
747 * Restore the FPU context for the thread:
748 * The possibilities are:
749 * 1. No active FPU context: Load the new context into the FPU hw
750 * and enable the FPU.
751 */
752 void
753 fp_restore(struct fpu_ctx *fp)
754 {
755 switch (fp_save_mech) {
756 case FP_FXSAVE:
757 fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
758 break;
759
760 case FP_XSAVE:
761 xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
762 break;
763 default:
764 panic("Invalid fp_save_mech");
765 /*NOTREACHED*/
766 }
767
768 fp->fpu_flags &= ~FPU_VALID;
769 }
770
771 /*
772 * Reset the FPU such that it is in a valid state for a new thread that is
773 * coming out of exec. The FPU will be in a usable state at this point. We also
774 * know that the FPU state has already been allocated and, if this wasn't an
775 * init process, that fp_free() will have previously been called.
776 */
777 void
778 fp_exec(void)
779 {
780 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
781 struct ctxop *ctx = installctx_preallocate();
782
783 if (fp_save_mech == FP_XSAVE) {
784 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
785 }
786
787 /*
788 * Make sure that we're not preempted in the middle of initializing the
789 * FPU on CPU.
790 */
791 kpreempt_disable();
792 installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
793 fp_new_lwp, NULL, fp_free, ctx);
794 fpinit();
795 fp->fpu_flags = FPU_EN;
796 kpreempt_enable();
797 }
798
799
800 /*
801 * Seeds the initial state for the current thread. The possibilities are:
802 * 1. Another process has modified the FPU state before we have done any
803 * initialization: Load the FPU state from the LWP state.
804 * 2. The FPU state has not been externally modified: Load a clean state.
805 */
806 void
807 fp_seed(void)
808 {
809 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
810
811 ASSERT(curthread->t_preempt >= 1);
812 ASSERT((fp->fpu_flags & FPU_EN) == 0);
813
814 /*
815 * Always initialize a new context and initialize the hardware.
816 */
817 if (fp_save_mech == FP_XSAVE) {
818 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
819 }
820
821 installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
822 fp_new_lwp, NULL, fp_free, NULL);
823 fpinit();
824
825 /*
826 * If FPU_VALID is set, it means someone has modified registers via
827 * /proc. In this case, restore the current lwp's state.
828 */
829 if (fp->fpu_flags & FPU_VALID)
830 fp_restore(fp);
831
832 ASSERT((fp->fpu_flags & FPU_VALID) == 0);
833 fp->fpu_flags = FPU_EN;
834 }
835
836 /*
837 * When using xsave/xrstor, these three functions are used by the lwp code to
838 * manage the memory for the xsave area.
839 */
840 void
841 fp_lwp_init(struct _klwp *lwp)
842 {
843 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
844
845 /*
846 * We keep a copy of the pointer in lwp_fpu so that we can restore the
847 * value in forklwp() after we duplicate the parent's LWP state.
848 */
849 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
850 kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
851
852 if (fp_save_mech == FP_XSAVE) {
853 /*
854 * We bzero since the fpinit() code path will only
855 * partially initialize the xsave area using
856 * avx_initial.
857 */
858 ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
859 bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
860 }
861 }
862
863 void
864 fp_lwp_cleanup(struct _klwp *lwp)
865 {
866 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
867
868 if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
869 kmem_cache_free(fpsave_cachep,
870 fp->fpu_regs.kfpu_u.kfpu_generic);
871 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
872 }
873 }
874
875 /*
876 * Called during the process of forklwp(). The kfpu_u pointer will have been
877 * overwritten while copying the parent's LWP structure. We have a valid copy
878 * stashed in the child's lwp_fpu which we use to restore the correct value.
879 */
880 void
881 fp_lwp_dup(struct _klwp *lwp)
882 {
883 void *xp = lwp->lwp_fpu;
884 size_t sz;
885
886 switch (fp_save_mech) {
887 case FP_FXSAVE:
888 sz = sizeof (struct fxsave_state);
889 break;
890 case FP_XSAVE:
891 sz = cpuid_get_xsave_size();
892 break;
893 default:
894 panic("Invalid fp_save_mech");
895 /*NOTREACHED*/
896 }
897
898 /* copy the parent's values into the new lwp's struct */
899 bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
900 /* now restore the pointer */
901 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
902 }
903
904 /*
905 * Handle a processor extension error fault
906 * Returns non zero for error.
907 */
908
909 /*ARGSUSED*/
910 int
911 fpexterrflt(struct regs *rp)
912 {
913 uint32_t fpcw, fpsw;
914 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
915
916 ASSERT(fp_kind != FP_NO);
917
918 /*
919 * Now we can enable the interrupts.
920 * (NOTE: x87 fp exceptions come thru interrupt gate)
921 */
922 sti();
923
924 if (!fpu_exists)
925 return (FPE_FLTINV);
926
927 /*
928 * Do an unconditional save of the FP state. If it's dirty (TS=0),
929 * it'll be saved into the fpu context area passed in (that of the
930 * current thread). If it's not dirty (it may not be, due to
931 * an intervening save caused by a context switch between the sti()
932 * above and here), then it's safe to just use the stored values in
933 * the context save area to determine the cause of the fault.
934 */
935 fp_save(fp);
936
937 /* clear exception flags in saved state, as if by fnclex */
938 switch (fp_save_mech) {
939 case FP_FXSAVE:
940 fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
941 fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
942 fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
943 break;
944
945 case FP_XSAVE:
946 fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
947 fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
948 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
949 /*
950 * Always set LEGACY_FP as it may have been cleared by XSAVE
951 * instruction
952 */
953 fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
954 break;
955 default:
956 panic("Invalid fp_save_mech");
957 /*NOTREACHED*/
958 }
959
960 fp->fpu_regs.kfpu_status = fpsw;
961
962 if ((fpsw & FPS_ES) == 0)
963 return (0); /* No exception */
964
965 /*
966 * "and" the exception flags with the complement of the mask
967 * bits to determine which exception occurred
968 */
969 return (fpe_sicode(fpsw & ~fpcw & 0x3f));
970 }
971
972 /*
973 * Handle an SSE/SSE2 precise exception.
974 * Returns a non-zero sicode for error.
975 */
976 /*ARGSUSED*/
977 int
978 fpsimderrflt(struct regs *rp)
979 {
980 uint32_t mxcsr, xmask;
981 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
982
983 ASSERT(fp_kind & __FP_SSE);
984
985 /*
986 * NOTE: Interrupts are disabled during execution of this
987 * function. They are enabled by the caller in trap.c.
988 */
989
990 /*
991 * The only way we could have gotten here if there is no FP unit
992 * is via a user executing an INT $19 instruction, so there is
993 * no fault in that case.
994 */
995 if (!fpu_exists)
996 return (0);
997
998 /*
999 * Do an unconditional save of the FP state. If it's dirty (TS=0),
1000 * it'll be saved into the fpu context area passed in (that of the
1001 * current thread). If it's not dirty, then it's safe to just use
1002 * the stored values in the context save area to determine the
1003 * cause of the fault.
1004 */
1005 fp_save(fp); /* save the FPU state */
1006
1007 if (fp_save_mech == FP_XSAVE) {
1008 mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
1009 fp->fpu_regs.kfpu_status =
1010 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
1011 } else {
1012 mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
1013 fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
1014 }
1015 fp->fpu_regs.kfpu_xstatus = mxcsr;
1016
1017 /*
1018 * compute the mask that determines which conditions can cause
1019 * a #xm exception, and use this to clean the status bits so that
1020 * we can identify the true cause of this one.
1021 */
1022 xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
1023 return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
1024 }
1025
1026 /*
1027 * In the unlikely event that someone is relying on this subcode being
1028 * FPE_FLTILL for denormalize exceptions, it can always be patched back
1029 * again to restore old behaviour.
1030 */
1031 int fpe_fltden = FPE_FLTDEN;
1032
1033 /*
1034 * Map from the FPU status word to the FP exception si_code.
1035 */
1036 static int
1037 fpe_sicode(uint_t sw)
1038 {
1039 if (sw & FPS_IE)
1040 return (FPE_FLTINV);
1041 if (sw & FPS_ZE)
1042 return (FPE_FLTDIV);
1043 if (sw & FPS_DE)
1044 return (fpe_fltden);
1045 if (sw & FPS_OE)
1046 return (FPE_FLTOVF);
1047 if (sw & FPS_UE)
1048 return (FPE_FLTUND);
1049 if (sw & FPS_PE)
1050 return (FPE_FLTRES);
1051 return (FPE_FLTINV); /* default si_code for other exceptions */
1052 }
1053
1054 /*
1055 * Map from the SSE status word to the FP exception si_code.
1056 */
1057 static int
1058 fpe_simd_sicode(uint_t sw)
1059 {
1060 if (sw & SSE_IE)
1061 return (FPE_FLTINV);
1062 if (sw & SSE_ZE)
1063 return (FPE_FLTDIV);
1064 if (sw & SSE_DE)
1065 return (FPE_FLTDEN);
1066 if (sw & SSE_OE)
1067 return (FPE_FLTOVF);
1068 if (sw & SSE_UE)
1069 return (FPE_FLTUND);
1070 if (sw & SSE_PE)
1071 return (FPE_FLTRES);
1072 return (FPE_FLTINV); /* default si_code for other exceptions */
1073 }
1074
1075 /*
1076 * This routine is invoked as part of libc's __fpstart implementation
1077 * via sysi86(2).
1078 *
1079 * It may be called -before- any context has been assigned in which case
1080 * we try and avoid touching the hardware. Or it may be invoked well
1081 * after the context has been assigned and fiddled with, in which case
1082 * just tweak it directly.
1083 */
1084 void
1085 fpsetcw(uint16_t fcw, uint32_t mxcsr)
1086 {
1087 struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1088 struct fxsave_state *fx;
1089
1090 if (!fpu_exists || fp_kind == FP_NO)
1091 return;
1092
1093 if ((fp->fpu_flags & FPU_EN) == 0) {
1094 if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
1095 /*
1096 * Common case. Floating point unit not yet
1097 * enabled, and kernel already intends to initialize
1098 * the hardware the way the caller wants.
1099 */
1100 return;
1101 }
1102 /*
1103 * Hmm. Userland wants a different default.
1104 * Do a fake "first trap" to establish the context, then
1105 * handle as if we already had a context before we came in.
1106 */
1107 kpreempt_disable();
1108 fp_seed();
1109 kpreempt_enable();
1110 }
1111
1112 /*
1113 * Ensure that the current hardware state is flushed back to the
1114 * pcb, then modify that copy. Next use of the fp will
1115 * restore the context.
1116 */
1117 fp_save(fp);
1118
1119 switch (fp_save_mech) {
1120 case FP_FXSAVE:
1121 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
1122 fx->fx_fcw = fcw;
1123 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1124 break;
1125
1126 case FP_XSAVE:
1127 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1128 fx->fx_fcw = fcw;
1129 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1130 /*
1131 * Always set LEGACY_FP as it may have been cleared by XSAVE
1132 * instruction
1133 */
1134 fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
1135 break;
1136 default:
1137 panic("Invalid fp_save_mech");
1138 /*NOTREACHED*/
1139 }
1140 }
1141
1142 static void
1143 kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
1144 {
1145 struct xsave_state *xs;
1146
1147 switch (fp_save_mech) {
1148 case FP_FXSAVE:
1149 bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
1150 sizeof (struct fxsave_state));
1151 kfpu->kfpu_ctx.fpu_xsave_mask = 0;
1152 break;
1153 case FP_XSAVE:
1154 xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
1155 bzero(xs, cpuid_get_xsave_size());
1156 bcopy(&avx_initial, xs, sizeof (*xs));
1157 xs->xs_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
1158 kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
1159 break;
1160 default:
1161 panic("invalid fp_save_mech");
1162 }
1163
1164 /*
1165 * Set the corresponding flags that the system expects on the FPU state
1166 * to indicate that this is our state. The FPU_EN flag is required to
1167 * indicate that FPU usage is allowed. The FPU_KERNEL flag is explicitly
1168 * not set below as it represents that this state is being suppressed
1169 * by the kernel.
1170 */
1171 kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
1172 kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
1173 }
1174
1175 kfpu_state_t *
1176 kernel_fpu_alloc(int kmflags)
1177 {
1178 kfpu_state_t *kfpu;
1179
1180 if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
1181 return (NULL);
1182 }
1183
1184 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
1185 kmem_cache_alloc(fpsave_cachep, kmflags);
1186 if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
1187 kmem_free(kfpu, sizeof (kfpu_state_t));
1188 return (NULL);
1189 }
1190
1191 kernel_fpu_fpstate_init(kfpu);
1192
1193 return (kfpu);
1194 }
1195
1196 void
1197 kernel_fpu_free(kfpu_state_t *kfpu)
1198 {
1199 kmem_cache_free(fpsave_cachep,
1200 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
1201 kmem_free(kfpu, sizeof (kfpu_state_t));
1202 }
1203
1204 static void
1205 kernel_fpu_ctx_save(void *arg)
1206 {
1207 kfpu_state_t *kfpu = arg;
1208 fpu_ctx_t *pf;
1209
1210 if (kfpu == NULL) {
1211 /*
1212 * A NULL kfpu implies this is a kernel thread with an LWP and
1213 * no user-level FPU usage. Use the lwp fpu save area.
1214 */
1215 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1216
1217 ASSERT(curthread->t_procp->p_flag & SSYS);
1218 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1219
1220 fp_save(pf);
1221 } else {
1222 pf = &kfpu->kfpu_ctx;
1223
1224 ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1225 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1226
1227 /*
1228 * Note, we can't use fp_save because it assumes that we're
1229 * saving to the thread's PCB and not somewhere else. Because
1230 * this is a different FPU context, we instead have to do this
1231 * ourselves.
1232 */
1233 switch (fp_save_mech) {
1234 case FP_FXSAVE:
1235 fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
1236 break;
1237 case FP_XSAVE:
1238 xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
1239 break;
1240 default:
1241 panic("Invalid fp_save_mech");
1242 }
1243
1244 /*
1245 * Because we have saved context here, our save state is no
1246 * longer valid and therefore needs to be reinitialized.
1247 */
1248 kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
1249 }
1250
1251 pf->fpu_flags |= FPU_VALID;
1252
1253 /*
1254 * Clear KFPU flag. This allows swtch to check for improper kernel
1255 * usage of the FPU (i.e. switching to a new thread while the old
1256 * thread was in the kernel and using the FPU, but did not perform a
1257 * context save).
1258 */
1259 curthread->t_flag &= ~T_KFPU;
1260 }
1261
1262 static void
1263 kernel_fpu_ctx_restore(void *arg)
1264 {
1265 kfpu_state_t *kfpu = arg;
1266 fpu_ctx_t *pf;
1267
1268 if (kfpu == NULL) {
1269 /*
1270 * A NULL kfpu implies this is a kernel thread with an LWP and
1271 * no user-level FPU usage. Use the lwp fpu save area.
1272 */
1273 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1274
1275 ASSERT(curthread->t_procp->p_flag & SSYS);
1276 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1277 } else {
1278 pf = &kfpu->kfpu_ctx;
1279
1280 ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1281 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1282 }
1283
1284 fp_restore(pf);
1285 curthread->t_flag |= T_KFPU;
1286 }
1287
1288 /*
1289 * Validate that the thread is not switching off-cpu while actively using the
1290 * FPU within the kernel.
1291 */
1292 void
1293 kernel_fpu_no_swtch(void)
1294 {
1295 if ((curthread->t_flag & T_KFPU) != 0) {
1296 panic("curthread swtch-ing while the kernel is using the FPU");
1297 }
1298 }
1299
1300 void
1301 kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
1302 {
1303 klwp_t *pl = curthread->t_lwp;
1304 struct ctxop *ctx;
1305
1306 if ((curthread->t_flag & T_KFPU) != 0) {
1307 panic("curthread attempting to nest kernel FPU states");
1308 }
1309
1310 /* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
1311 ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
1312 (KFPU_USE_LWP | KFPU_NO_STATE));
1313
1314 if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
1315 /*
1316 * Since we don't have a kfpu_state or usable lwp pcb_fpu to
1317 * hold our kernel FPU context, we depend on the caller doing
1318 * kpreempt_disable for the duration of our FPU usage. This
1319 * should only be done for very short periods of time.
1320 */
1321 ASSERT(curthread->t_preempt > 0);
1322 ASSERT(kfpu == NULL);
1323
1324 if (pl != NULL) {
1325 /*
1326 * We might have already saved once so FPU_VALID could
1327 * be set. This is handled in fp_save.
1328 */
1329 fp_save(&pl->lwp_pcb.pcb_fpu);
1330 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1331 }
1332
1333 curthread->t_flag |= T_KFPU;
1334
1335 /* Always restore the fpu to the initial state. */
1336 fpinit();
1337
1338 return;
1339 }
1340
1341 /*
1342 * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
1343 */
1344
1345 if ((flags & KFPU_USE_LWP) == 0) {
1346 if (kfpu->kfpu_curthread != NULL)
1347 panic("attempting to reuse kernel FPU state at %p when "
1348 "another thread already is using", kfpu);
1349
1350 if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
1351 kernel_fpu_fpstate_init(kfpu);
1352
1353 kfpu->kfpu_curthread = curthread;
1354 }
1355
1356 /*
1357 * Not all threads may have an active LWP. If they do and we're not
1358 * going to re-use the LWP, then we should go ahead and save the state.
1359 * We must also note that the fpu is now being used by the kernel and
1360 * therefore we do not want to manage the fpu state via the user-level
1361 * thread's context handlers.
1362 *
1363 * We might have already saved once (due to a prior use of the kernel
1364 * FPU or another code path) so FPU_VALID could be set. This is handled
1365 * by fp_save, as is the FPU_EN check.
1366 */
1367 ctx = installctx_preallocate();
1368 kpreempt_disable();
1369 if (pl != NULL) {
1370 if ((flags & KFPU_USE_LWP) == 0)
1371 fp_save(&pl->lwp_pcb.pcb_fpu);
1372 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1373 }
1374
1375 /*
1376 * Set the context operations for kernel FPU usage. Note that this is
1377 * done with a preallocated buffer and under kpreempt_disable because
1378 * without a preallocated buffer, installctx does a sleeping
1379 * allocation. We haven't finished initializing our kernel FPU state
1380 * yet, and in the rare case that we happen to save/restore just as
1381 * installctx() exits its own kpreempt_enable() internal call, we
1382 * guard against restoring an uninitialized buffer (0xbaddcafe).
1383 */
1384 installctx(curthread, kfpu, kernel_fpu_ctx_save, kernel_fpu_ctx_restore,
1385 NULL, NULL, NULL, NULL, ctx);
1386
1387 curthread->t_flag |= T_KFPU;
1388
1389 if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
1390 /*
1391 * For pure kernel threads with an LWP, we can use the LWP's
1392 * pcb_fpu to save/restore context.
1393 */
1394 fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;
1395
1396 VERIFY(curthread->t_procp->p_flag & SSYS);
1397 VERIFY(kfpu == NULL);
1398 ASSERT((pf->fpu_flags & FPU_EN) == 0);
1399
1400 /* Always restore the fpu to the initial state. */
1401 if (fp_save_mech == FP_XSAVE)
1402 pf->fpu_xsave_mask = XFEATURE_FP_ALL;
1403 fpinit();
1404 pf->fpu_flags = FPU_EN | FPU_KERNEL;
1405 } else {
1406 /* initialize the kfpu state */
1407 kernel_fpu_ctx_restore(kfpu);
1408 }
1409 kpreempt_enable();
1410 }
1411
1412 void
1413 kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
1414 {
1415 ulong_t iflags;
1416
1417 if ((curthread->t_flag & T_KFPU) == 0) {
1418 panic("curthread attempting to clear kernel FPU state "
1419 "without using it");
1420 }
1421
1422 /*
1423 * General comments on why the rest of this function is structured the
1424 * way it is. Be aware that there is a lot of subtlety here.
1425 *
1426 * If a user-level thread ever uses the fpu while in the kernel, then
1427 * we cannot call fpdisable since that does STTS. That will set the
1428 * ts bit in %cr0 which will cause an exception if anything touches the
1429 * fpu. However, the user-level context switch handler (fpsave_ctxt)
1430 * needs to access the fpu to save the registers into the pcb.
1431 * fpsave_ctxt relies on CLTS having been done to clear the ts bit in
1432 * fprestore_ctxt when the thread context switched onto the CPU.
1433 *
1434 * Calling fpdisable only affects the current CPU's %cr0 register.
1435 *
1436 * During removectx and kpreempt_enable, we can voluntarily context
1437 * switch, so the CPU we were on when we entered this function might
1438 * not be the same one we're on when we return from removectx or end
1439 * the function. Note there can be user-level context switch handlers
1440 * still installed if this is a user-level thread.
1441 *
1442 * We also must be careful in the unlikely chance we're running in an
1443 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
1444 * incorrectly for the "real" thread to resume on this CPU.
1445 */
1446
1447 if ((flags & KFPU_NO_STATE) == 0) {
1448 kpreempt_disable();
1449 } else {
1450 ASSERT(curthread->t_preempt > 0);
1451 }
1452
1453 curthread->t_flag &= ~T_KFPU;
1454
1455 /*
1456 * When we are ending things, we explicitly don't save the current
1457 * kernel FPU state back to the temporary state. The kfpu API is not
1458 * intended to be a permanent save location.
1459 *
1460 * If this is a user-level thread and we were to context switch
1461 * before returning to user-land, fpsave_ctxt will be a no-op since we
1462 * already saved the user-level FPU state the first time we run
1463 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
1464 * the user-level fpu state). The fpsave_ctxt functions only save if
1465 * FPU_VALID is not already set. fp_save also sets PCB_SET_UPDATE_FPU so
1466 * fprestore_ctxt will be done in sys_rtt_common when the thread
1467 * finally returns to user-land.
1468 */
1469
1470 if ((curthread->t_procp->p_flag & SSYS) != 0 &&
1471 curthread->t_intr == NULL) {
1472 /*
1473 * A kernel thread which is not an interrupt thread, so we
1474 * STTS now.
1475 */
1476 fpdisable();
1477 }
1478
1479 if ((flags & KFPU_NO_STATE) == 0) {
1480 removectx(curthread, kfpu, kernel_fpu_ctx_save,
1481 kernel_fpu_ctx_restore, NULL, NULL, NULL, NULL);
1482
1483 if (kfpu != NULL) {
1484 if (kfpu->kfpu_curthread != curthread) {
1485 panic("attempting to end kernel FPU state "
1486 "for %p, but active thread is not "
1487 "curthread", kfpu);
1488 } else {
1489 kfpu->kfpu_curthread = NULL;
1490 }
1491 }
1492
1493 kpreempt_enable();
1494 }
1495
1496 if (curthread->t_lwp != NULL) {
1497 uint_t f;
1498
1499 if (flags & KFPU_USE_LWP) {
1500 f = FPU_EN | FPU_KERNEL;
1501 } else {
1502 f = FPU_KERNEL;
1503 }
1504 curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
1505 }
1506 }
1507