/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018, Joyent, Inc.
 */

/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
/*		All Rights Reserved				*/

/*	Copyright (c) 1987, 1988 Microsoft Corporation		*/
/*		All Rights Reserved				*/

/*
 * Copyright (c) 2009, Intel Corporation.
 * All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/trap.h>
#include <sys/fault.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/pcb.h>
#include <sys/lwp.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/fp.h>
#include <sys/siginfo.h>
#include <sys/archsystm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/x86_archext.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>

/*
 * FPU Management Overview
 * -----------------------
 *
 * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
 * however, many aspects of its life as a coprocessor are still around in x86.
 *
 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
 * While that state still exists, there is much more that is covered by the FPU.
 * Today, this includes not just traditional FPU state, but also supervisor-only
 * state. The following state is currently managed and covered logically by the
 * idea of the FPU registers:
 *
 *    o Traditional x87 FPU
 *    o Vector Registers (%xmm, %ymm, %zmm)
 *    o Memory Protection Extensions (MPX) Bounds Registers
 *    o Protection Key Rights Register (PKRU)
 *    o Processor Trace data
 *
 * The rest of this covers how the FPU is managed and controlled, how state is
 * saved and restored between threads, interactions with hypervisors, and other
 * information exported to user land through aux vectors. A lot of background
 * information is here to synthesize major parts of the Intel SDM, but
 * unfortunately, it is not a replacement for reading it.
 *
 * FPU Control Registers
 * ---------------------
 *
 * Because the x87 FPU began its life as a co-processor and the FPU was
 * optional, there are several bits that show up in %cr0 that we have to
 * manipulate when dealing with the FPU. These are:
 *
 *   o CR0.ET	The 'extension type' bit. This was used originally to indicate
 *		that the FPU co-processor was present. Now it is forced on for
 *		compatibility. This is often used to verify whether or not the
 *		FPU is present.
 *
 *   o CR0.NE	The 'native error' bit. Used to indicate that native error
 *		mode should be enabled. This indicates that we should take traps
 *		on FPU errors. The OS enables this early in boot.
 *
 *   o CR0.MP	The 'Monitor Coprocessor' bit. Used to control whether or not
 *		wait/fwait instructions generate a #NM if CR0.TS is set.
 *
 *   o CR0.EM	The 'Emulation' bit. This is used to cause floating point
 *		operations (x87 through SSE4) to trap with a #UD so they can be
 *		emulated. The system never sets this bit, but makes sure it is
 *		clear on processor start up.
 *
 *   o CR0.TS	The 'Task Switched' bit. When this is turned on, a floating
 *		point operation will generate a #NM. An fwait will as well,
 *		depending on the value in CR0.MP.
 *
 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
 * complicated role. Historically it has been used to allow running systems to
 * restore the FPU registers lazily. This will be discussed in greater depth
 * later on.
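 *
 * As a concrete illustration of that policy, the boot-time setup amounts to
 * roughly the following sketch. This is hedged: the real work is spread
 * across mlsetup() and fpu_probe(), and we assume the getcr0()/setcr0()
 * accessors and the CR0_* masks from <sys/controlregs.h>:
 *
 *	ulong_t cr0 = getcr0();
 *	cr0 |= CR0_ET | CR0_NE | CR0_MP;
 *	cr0 &= ~CR0_EM;
 *	setcr0(cr0);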
 *
 * %cr4 is also used as part of the FPU control. Specifically we need to worry
 * about the following bits in the system:
 *
 *   o CR4.OSFXSR	This bit is used to indicate that the OS understands and
 *			supports the execution of the fxsave and fxrstor
 *			instructions. This bit is required to be set to enable
 *			the use of the SSE->SSE4 instructions.
 *
 *   o CR4.OSXMMEXCPT	This bit is used to indicate that the OS can understand
 *			and take a SIMD floating point exception (#XM). This bit
 *			is always enabled by the system.
 *
 *   o CR4.OSXSAVE	This bit is used to indicate that the OS understands and
 *			supports the execution of the xsave and xrstor family of
 *			instructions. This bit is required to use any of the AVX
 *			and newer feature sets.
 *
 * Because all supported processors are 64-bit, they'll always support the XMM
 * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
 *
 * %xcr0 is used to manage the behavior of the xsave feature set and is only
 * present on the system if xsave is supported. %xcr0 is read and written
 * via the xgetbv and xsetbv instructions. Each bit in %xcr0 refers to a
 * different component of the xsave state and controls whether or not that
 * information is saved and restored. For newer feature sets like AVX and MPX,
 * it also controls whether or not the corresponding instructions can be
 * executed (much like CR4.OSFXSR does for the SSE feature sets).
 *
 * Everything in %xcr0 covers features available to user land. There is also
 * the IA32_XSS MSR which is used to control supervisor-only features that are
 * still part of the xsave state. Bits that can be set in %xcr0 are reserved in
 * IA32_XSS and vice versa. This is an important property that is particularly
 * relevant to how the xsave instructions operate.
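 *
 * To make the %xcr0 handshake concrete, here is a hedged sketch of enabling
 * the user-visible components once CR4.OSXSAVE is set. The real logic lives
 * in fpu_probe() and the cpuid code; get_xcr()/set_xcr() wrap xgetbv/xsetbv
 * (with %ecx selecting the register), and the avx_supported check below
 * stands in for the real cpuid-based test:
 *
 *	uint64_t xcr0 = XFEATURE_LEGACY_FP | XFEATURE_SSE;
 *	if (avx_supported)
 *		xcr0 |= XFEATURE_AVX;
 *	set_xcr(XFEATURE_ENABLED_MASK, xcr0);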
 *
 * Save Mechanisms
 * ---------------
 *
 * When switching between running threads the FPU state needs to be saved and
 * restored by the OS. If this state was not saved, users would rightfully
 * complain about corrupt state. There are three mechanisms that exist on the
 * processor for saving and restoring these state images:
 *
 *   o fsave
 *   o fxsave
 *   o xsave
 *
 * fsave saves and restores only the x87 FPU and is the oldest of these
 * mechanisms. This mechanism is never used in the kernel today because we are
 * always running on systems that support fxsave.
 *
 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
 * state to be saved and restored to and from a struct fxsave_state. This is the
 * default mechanism that is used to save and restore the FPU on amd64. An
 * important aspect of fxsave that was different from the original i386 fsave
 * mechanism is that restoring FPU state with pending exceptions will not
 * generate an exception; it will be deferred to the next use of the FPU.
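 *
 * In code, this mechanism reduces to the following sketch, using the
 * fpxsave()/fpxrestore() wrappers around the fxsave/fxrstor instructions
 * that the functions later in this file use:
 *
 *	struct fxsave_state *fx = fp->fpu_regs.kfpu_u.kfpu_fx;
 *	fpxsave(fx);		(on context switch out)
 *	fpxrestore(fx);		(on context switch in)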
 *
 * The final and by far the most complex mechanism is that of the xsave set.
 * xsave allows for saving and restoring all of the traditional x86 pieces (x87
 * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
 * registers.
 *
 * Data is saved and restored into and out of a struct xsave_state. The first
 * part of the struct xsave_state is equivalent to the struct fxsave_state.
 * After that, there is a header which is used to describe the remaining
 * portions of the state. The header is a 64-byte value of which the first two
 * uint64_t values are defined and the rest are reserved and must be zero. The
 * first uint64_t is the xstate_bv member. This describes which values in the
 * xsave_state are actually valid and present. This is updated on a save and
 * used on restore. The second member is the xcomp_bv member. Its most
 * significant bit (bit 63) determines whether or not a compressed version of
 * the structure is used.
 *
 * When the uncompressed structure is used (currently the only format we
 * support), then each state component is at a fixed offset in the structure,
 * even if it is not being used. For example, if you only saved the AVX related
 * state, but did not save the MPX related state, the offset would not change
 * for any component. With the compressed format, components that aren't used
 * are all elided (though the x87 and SSE state are always there).
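 *
 * The fixed offsets of the uncompressed format are enumerated by CPUID leaf
 * 0xD: sub-leaf n describes component n. A hedged sketch using the cpuid
 * interfaces in <sys/x86_archext.h>, querying the AVX component (component
 * 2; %ecx selects the sub-leaf):
 *
 *	struct cpuid_regs cp = { 0 };
 *	cp.cp_eax = 0xD;
 *	cp.cp_ecx = 2;
 *	(void) __cpuid_insn(&cp);
 *	(cp.cp_eax now holds the component's size in bytes and cp.cp_ebx
 *	its offset from the start of the xsave area)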
 *
 * Unlike fxsave which saves all state, the xsave family does not always save
 * and restore all the state that could be covered by the xsave_state. The
 * instructions all take an argument which is a mask of what to consider. This
 * is the same mask that will be used in the xstate_bv vector and it is also
 * the same set of values that are present in %xcr0 and IA32_XSS, though
 * IA32_XSS is only considered by the xsaves and xrstors instructions.
 *
 * When a save or restore is requested, a bitwise AND is performed between the
 * requested bits and those that have been enabled in %xcr0. Only the bits that
 * match that are then saved or restored. Others will be silently ignored by
 * the processor. This idea is used often in the OS. We will always request that
 * we save and restore all of the state, but only those portions that are
 * actually enabled in %xcr0 will be touched.
 *
 * If a feature has been asked to be restored that is not set in the xstate_bv
 * feature vector of the save state, then it will be set to its initial state by
 * the processor (usually zeros). Also, when asked to save state, the processor
 * may not write out data that is in its initial state as an optimization. This
 * optimization only applies to saving data and not to restoring data.
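 *
 * A practical consequence of the init optimization is that code inspecting
 * a saved image must consult xstate_bv before trusting a component's bytes.
 * A hedged sketch:
 *
 *	struct xsave_state *xs = fp->fpu_regs.kfpu_u.kfpu_xs;
 *	if ((xs->xs_xstate_bv & XFEATURE_AVX) == 0) {
 *		(the %ymm registers were in their initial state; the AVX
 *		area of the buffer may be stale and must be read as zeros)
 *	}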
 *
 * There are a few different variants of the xsave and xrstor instruction. They
 * are:
 *
 *   o xsave	This is the original save instruction. It will save all of the
 *		requested data in the xsave state structure. It only saves data
 *		in the uncompressed (xcomp_bv[63] is zero) format. It may be
 *		executed at all privilege levels.
 *
 *   o xrstor	This is the original restore instruction. It will restore all of
 *		the requested data. The xrstor function can handle both the
 *		compressed and uncompressed formats. It may be executed at all
 *		privilege levels.
 *
 *   o xsaveopt	This is a variant of the xsave instruction that employs
 *		optimizations to try to write out only state that has been
 *		modified since the last time an xrstor instruction was called.
 *		The processor tracks a tuple of information about the last
 *		xrstor and tries to ensure that the same buffer is being used
 *		when this optimization is being used. However, because it tracks
 *		the xrstor buffer by its address, it is not suitable for use if
 *		that buffer can be easily reused. The most common case is trying
 *		to save data to the stack in rtld. It may be executed at all
 *		privilege levels.
 *
 *   o xsavec	This is a variant of the xsave instruction that writes out the
 *		compressed form of the xsave_state. Otherwise it behaves as
 *		xsave. It may be executed at all privilege levels.
 *
 *   o xsaves	This is a variant of the xsave instruction. It is similar to
 *		xsavec in that it always writes the compressed form of the
 *		buffer. Unlike all the other forms, this instruction looks at
 *		both the user (%xcr0) and supervisor (IA32_XSS MSR) masks to
 *		determine what to save and restore. xsaves also implements the
 *		same optimization that xsaveopt does around modified pieces.
 *		User land may not execute the instruction.
 *
 *   o xrstors	This is a variant of the xrstor instruction. Similar to xsaves
 *		it can save and restore both the user and privileged states.
 *		Unlike xrstor it can only operate on the compressed form.
 *		User land may not execute the instruction.
 *
 * Based on all of these, the kernel has a precedence for what it will use.
 * Basically, xsaves (not currently supported) is preferred to xsaveopt, which
 * is preferred to xsave. A similar scheme is used when informing rtld (more
 * later) about what it should use: xsavec is preferred to xsave, and xsaveopt
 * is not recommended due to the modified optimization not being appropriate
 * for that use.
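 *
 * In code, that precedence is just boot-time pointer patching, in the same
 * style as the fpsave_ctxt and xsavep defaults further down in this file.
 * A sketch; the real selection happens in fpu_probe(), and the feature-set
 * flag name below follows the is_x86_feature() convention:
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_XSAVEOPT))
 *		xsavep = xsaveopt;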
 *
 * Finally, there is one last gotcha with the xsave state. Importantly some AMD
 * processors did not always save and restore some of the FPU exception state in
 * some cases like Intel did. In those cases the OS will make up for this fact
 * itself.
 *
 * FPU Initialization
 * ------------------
 *
 * One difference with the FPU registers is that not all threads have FPU state;
 * only those that have an lwp do. Generally this means kernel threads, which
 * all share p0 and its lwp, do not have FPU state, though there are exceptions
 * such as kcfpoold. In the rest of this discussion we'll use thread and lwp
 * interchangeably, just think of thread meaning a thread that has a lwp.
 *
 * Each lwp has its FPU state allocated in its pcb (process control block). The
 * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
 * dynamically at start up based on the save mechanism that we're using and the
 * amount of memory required for it. This is dynamic because the xsave_state
 * size varies based on the supported feature set.
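 *
 * A hedged sketch of that sizing, using the cpuid_get_xsave_size() helper
 * that appears later in this file (the real cache creation lives in
 * fpu_probe()/fpu_subr.c; the 64-byte alignment is what the xsave
 * instructions require):
 *
 *	size_t sz = (fp_save_mech == FP_XSAVE) ?
 *	    cpuid_get_xsave_size() : sizeof (struct fxsave_state);
 *	fpsave_cachep = kmem_cache_create("fpsave_cache", sz, 64,
 *	    NULL, NULL, NULL, NULL, NULL, 0);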
 *
 * The hardware side of the FPU is initialized early in boot before we mount the
 * root file system. This is effectively done in fpu_probe(). This is where we
 * make the final decision about what the save and restore mechanisms we should
 * use are, create the fpsave_cachep kmem cache, and initialize a number of
 * function pointers for the save and restore logic.
 *
 * The thread/lwp side is a little more involved. There are two different
 * things that we need to concern ourselves with. The first is how the FPU
 * resources are allocated and the second is how the FPU state is initialized
 * for a given lwp.
 *
 * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
 * This is always called unconditionally by the system as part of creating an
 * LWP.
 *
 * There are three different initialization paths that we deal with. The first
 * is when we are executing a new process. As part of exec all of the register
 * state is reset. The exec case is particularly important because init is born
 * like Athena, sprouting from the head of the kernel, without any true parent
 * to fork from. The second is used whenever we fork or create a new lwp.  The
 * third is to deal with special lwps like the agent lwp.
 *
 * During exec, we will call fp_exec() which will initialize and set up the FPU
 * state for the process. That will fill in the initial state for the FPU and
 * also set that state in the FPU itself. As part of fp_exec() we also install a
 * thread context operations vector that takes care of dealing with the saving
 * and restoring of the FPU. These context handlers will also be called whenever
 * an lwp is created or forked. In those cases, to initialize the FPU we will
 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
 * operations vector for the new thread.
 *
 * Next we'll end up in the context operation fp_new_lwp(). This saves the
 * current thread's state, initializes the new thread's state, and copies over
 * the relevant parts of the originating thread's state. It's at this point that
 * we also install the FPU context operations into the new thread, which ensures
 * that all future threads that are descendants of the current one get the
 * thread context operations (unless they call exec).
 *
 * To deal with some things like the agent lwp, we double check the state of the
 * FPU in sys_rtt_common() to make sure that it has been enabled before
 * returning to user land. In general, this path should be rare, but it's useful
 * for the odd lwp here and there.
 *
 * The FPU state will remain valid most of the time. There are times that
 * the state will be rewritten, for example in restorecontext, due to /proc, or
 * when the lwp calls exec(). Whether the context is being freed or we are
 * resetting the state, we will call fp_free() to disable the FPU and our
 * context.
 *
 * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
 * state by calling fp_lwp_cleanup().
 *
 * Kernel FPU Multiplexing
 * -----------------------
 *
 * Just as the kernel has to maintain all of the general purpose registers when
 * switching between scheduled threads, the same is true of the FPU registers.
 *
 * When a thread has FPU state, it also has a set of context operations
 * installed. These context operations take care of making sure that the FPU is
 * properly saved and restored during a context switch (fpsave_ctxt and
 * fprestore_ctxt respectively). This means that the current implementation of
 * the FPU is 'eager'; when a thread is running the CPU will have its FPU state
 * loaded. While this is always true when executing in userland, there are a few
 * cases where this is not true in the kernel.
 *
 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
 * employed. This meant that the FPU would be saved on a context switch and the
 * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
 * then take a #NM trap, at which point we would restore the FPU from the save
 * area and return to user land. Given the frequency of use of the FPU alone by
 * libc, there's no point returning to user land just to trap again.
 *
 * There are a few cases though where the FPU state may need to be changed for a
 * thread on its behalf. The most notable cases are in the case of processes
 * using /proc, restorecontext, forking, etc. In all of these cases the kernel
 * will force a thread's FPU state to be saved into the PCB through the
 * fp_save() function. Whenever the FPU is saved, the FPU_VALID flag is set on
 * the pcb. This indicates that the save state holds currently valid data. As a
 * side effect of this, CR0.TS will be set. To make sure that all of the state
 * is updated before returning to user land, in these cases, we set a flag on
 * the PCB that says the FPU needs to be updated. This will make sure that we
 * take the slow path out of a system call to fix things up for the thread. Due
 * to the fact that this is a rather rare case, effectively setting the
 * equivalent of t_postsys is acceptable.
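 *
 * The check on the slow path out is conceptually simple. A hedged sketch of
 * the sys_rtt_common() side, with the flag accessors approximating the
 * PCB_SET_UPDATE_FPU()-style macros used in this file:
 *
 *	if (PCB_NEED_UPDATE_FPU(pcb)) {
 *		fp_restore(&pcb->pcb_fpu);
 *		PCB_CLEAR_UPDATE_FPU(pcb);
 *	}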
 *
 * CR0.TS will be set after a save occurs and cleared when a restore occurs.
 * Generally this means it will be cleared immediately by the new thread that is
 * running in a context switch. However, this isn't the case for kernel threads.
 * They currently operate with CR0.TS set as no kernel state is restored for
 * them. This means that using the FPU will cause a #NM and panic.
 *
 * The FPU_VALID flag on the currently executing thread's pcb is meant to track
 * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
 * However, because we eagerly restore, the only time that CR0.TS should be set
 * for a non-kernel thread is during operations where it will be cleared before
 * returning to user land and importantly, the only data that is in it is its
 * own.
 *
 * FPU Exceptions
 * --------------
 *
 * Certain operations can cause the kernel to take traps due to FPU activity.
 * Generally these events will cause a user process to receive a SIGFPE and if
 * the kernel receives it in kernel context, we will die. Traditionally the #NM
 * (Device Not Available / No Math) exception generated by CR0.TS would have
 * caused us to restore the FPU. Now it is a fatal event regardless of whether
 * or not user land causes it.
 *
 * While there are some cases where the kernel uses the FPU, it is up to the
 * kernel to use the FPU in a way such that it cannot receive a trap or to use
 * the appropriate trap protection mechanisms.
 *
 * Hypervisors
 * -----------
 *
 * When providing support for hypervisors things are a little bit more
 * complicated because the FPU is not virtualized at all. This means that they
 * need to save and restore the FPU and %xcr0 across entry and exit to the
 * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
 * allow us to use the full native state to make sure that we are always saving
 * and restoring the full FPU that the host sees, even when the guest is using a
 * subset.
 *
 * One tricky aspect of this is that the guest may be using a subset of %xcr0
 * and therefore changing our %xcr0 on the fly. It is vital that when we're
 * saving and restoring the FPU that we always use the largest %xcr0 contents;
 * otherwise, we will end up leaving behind data in it.
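 *
 * Conceptually, a guest entry/exit pair brackets the FPU as in the sketch
 * below (the hma_fpu_* entry points are elided; host_xcr0 and guest_xcr0
 * are illustrative locals):
 *
 *	uint64_t host_xcr0 = get_xcr(XFEATURE_ENABLED_MASK);
 *	set_xcr(XFEATURE_ENABLED_MASK, guest_xcr0);
 *	(enter the guest, eventually exit)
 *	set_xcr(XFEATURE_ENABLED_MASK, host_xcr0);
 *	(save the guest FPU with the full host %xcr0 in place)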
 *
 * ELF PLT Support
 * ---------------
 *
 * rtld has to preserve a subset of the FPU when it is saving and restoring
 * registers due to the amd64 System V ABI. See cmd/sgs/rtld/amd64/boot_elf.s
 * for more information. As a result, we set up an aux vector that contains
 * information about what save and restore mechanisms it should be using and
 * the sizing thereof based on what the kernel supports. This is passed down in
 * a series of aux vectors AT_SUN_FPTYPE and AT_SUN_FPSIZE. This information is
 * initialized in fpu_subr.c.
 */

kmem_cache_t *fpsave_cachep;

/* Legacy fxsave layout + xsave header + ymm */
#define	AVX_XSAVE_SIZE		(512 + 64 + 256)

/*
 * Various sanity checks.
 */
CTASSERT(sizeof (struct fxsave_state) == 512);
CTASSERT(sizeof (struct fnsave_state) == 108);
CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);

/*
 * Initial kfpu state for SSE/SSE2 used by fpinit()
 */
const struct fxsave_state sse_initial = {
	FPU_CW_INIT,	/* fx_fcw */
	0,		/* fx_fsw */
	0,		/* fx_fctw */
	0,		/* fx_fop */
#if defined(__amd64)
	0,		/* fx_rip */
	0,		/* fx_rdp */
#else
	0,		/* fx_eip */
	0,		/* fx_cs */
	0,		/* __fx_ign0 */
	0,		/* fx_dp */
	0,		/* fx_ds */
	0,		/* __fx_ign1 */
#endif /* __amd64 */
	SSE_MXCSR_INIT	/* fx_mxcsr */
	/* rest of structure is zero */
};

/*
 * Initial kfpu state for AVX used by fpinit()
 */
const struct xsave_state avx_initial = {
	/*
	 * The definition below needs to be identical with sse_initial
	 * defined above.
	 */
	{
		FPU_CW_INIT,	/* fx_fcw */
		0,		/* fx_fsw */
		0,		/* fx_fctw */
		0,		/* fx_fop */
#if defined(__amd64)
		0,		/* fx_rip */
		0,		/* fx_rdp */
#else
		0,		/* fx_eip */
		0,		/* fx_cs */
		0,		/* __fx_ign0 */
		0,		/* fx_dp */
		0,		/* fx_ds */
		0,		/* __fx_ign1 */
#endif /* __amd64 */
		SSE_MXCSR_INIT	/* fx_mxcsr */
		/* rest of structure is zero */
	},
	/*
	 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are valid,
	 * and CPU should initialize XMM/YMM.
	 */
	1,
	0	/* xs_xcomp_bv */
	/* rest of structure is zero */
};

/*
 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
 * the #gp exception caused by setting unsupported bits in the
 * MXCSR register
 */
uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;

/*
 * Initial kfpu state for x87 used by fpinit()
 */
const struct fnsave_state x87_initial = {
	FPU_CW_INIT,	/* f_fcw */
	0,		/* __f_ign0 */
	0,		/* f_fsw */
	0,		/* __f_ign1 */
	0xffff,		/* f_ftw */
	/* rest of structure is zero */
};

/*
 * These vectors are patched to their xsave equivalents if we discover we
 * have an XSAVE-capable chip in fpu_probe.
 */
void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;

/*
 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
 */
void (*xsavep)(struct xsave_state *, uint64_t) = xsave;

static int fpe_sicode(uint_t);
static int fpe_simd_sicode(uint_t);

/*
 * Copy the state of parent lwp's floating point context into the new lwp.
 * Invoked for both fork() and lwp_create().
 *
 * Note that we inherit -only- the control state (e.g. exception masks,
 * rounding, precision control, etc.); the FPU registers are otherwise
 * reset to their initial state.
 */
static void
fp_new_lwp(kthread_id_t t, kthread_id_t ct)
{
	struct fpu_ctx *fp;		/* parent fpu context */
	struct fpu_ctx *cfp;		/* new fpu context */
	struct fxsave_state *fx, *cfx;
	struct xsave_state *cxs;

	ASSERT(fp_kind != FP_NO);

	fp = &t->t_lwp->lwp_pcb.pcb_fpu;
	cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;

	/*
	 * If the parent FPU state is still in the FPU hw then save it;
	 * conveniently, fp_save() already does this for us nicely.
	 */
	fp_save(fp);

	cfp->fpu_flags = FPU_EN | FPU_VALID;
	cfp->fpu_regs.kfpu_status = 0;
	cfp->fpu_regs.kfpu_xstatus = 0;

	/*
	 * Make sure that the child's FPU is cleaned up and made ready for user
	 * land.
	 */
	PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
		cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
		bcopy(&sse_initial, cfx, sizeof (*cfx));
		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
		cfx->fx_fcw = fx->fx_fcw;
		break;

	case FP_XSAVE:
		cfp->fpu_xsave_mask = fp->fpu_xsave_mask;

		VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);

		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
		cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
		cfx = &cxs->xs_fxsave;

		bcopy(&avx_initial, cxs, sizeof (*cxs));
		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
		cfx->fx_fcw = fx->fx_fcw;
		cxs->xs_xstate_bv |= (get_xcr(XFEATURE_ENABLED_MASK) &
		    XFEATURE_FP_INITIAL);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	/*
	 * Mark that both the parent and child need to have the FPU cleaned up
	 * before returning to user land.
	 */

	installctx(ct, cfp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
	    fp_new_lwp, NULL, fp_free);
}

/*
 * Free any state associated with floating point context.
 * fp_free() can be called in three cases:
 * 1) from reaper -> thread_free -> freectx -> fp_free
 *	fp context belongs to a thread on deathrow
 *	nothing to do, thread will never be resumed
 *	thread calling ctxfree is reaper
 *
 * 2) from exec -> freectx -> fp_free
 *	fp context belongs to the current thread
 *	must disable fpu, thread calling ctxfree is curthread
 *
 * 3) from restorecontext -> setfpregs -> fp_free
 *	we have a modified context in the memory (lwp->pcb_fpu)
 *	disable fpu and release the fp context for the CPU
 *
 */
/*ARGSUSED*/
void
fp_free(struct fpu_ctx *fp, int isexec)
{
	ASSERT(fp_kind != FP_NO);

	if (fp->fpu_flags & FPU_VALID)
		return;

	kpreempt_disable();
	/*
	 * We want to do fpsave rather than fpdisable so that we can
	 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
	 */
	fp->fpu_flags |= FPU_VALID;
	/* If for current thread disable FP to track FPU_VALID */
	if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
		/* Clear errors if any to prevent frstor from complaining */
		(void) fperr_reset();
		if (fp_kind & __FP_SSE)
			(void) fpxerr_reset();
		fpdisable();
	}
	kpreempt_enable();
}

/*
 * Store the floating point state and disable the floating point unit.
 */
void
fp_save(struct fpu_ctx *fp)
{
	ASSERT(fp_kind != FP_NO);

	kpreempt_disable();
	if (!fp || fp->fpu_flags & FPU_VALID) {
		kpreempt_enable();
		return;
	}
	ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
		break;

	case FP_XSAVE:
		xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	fp->fpu_flags |= FPU_VALID;

	/*
	 * We save the FPU as part of forking, execing, modifications via /proc,
	 * restorecontext, etc. As such, we need to make sure that we return to
	 * userland with valid state in the FPU. If we're context switched out
	 * before we hit sys_rtt_common() we'll end up having restored the FPU
	 * as part of the context ops operations. The restore logic always makes
	 * sure that FPU_VALID is set before doing a restore so we don't restore
	 * it a second time.
	 */
	PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);

	kpreempt_enable();
}

/*
 * Restore the FPU context for the thread:
 * The possibilities are:
 *	1. No active FPU context: Load the new context into the FPU hw
 *	   and enable the FPU.
 */
void
fp_restore(struct fpu_ctx *fp)
{
	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
		break;

	case FP_XSAVE:
		xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	fp->fpu_flags &= ~FPU_VALID;
}

/*
 * Reset the FPU such that it is in a valid state for a new thread that is
 * coming out of exec. The FPU will be in a usable state at this point. At this
 * point we know that the FPU state has already been allocated and if this
 * wasn't an init process, then it will have had fp_free() previously called.
 */
void
fp_exec(void)
{
	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	if (fp_save_mech == FP_XSAVE) {
		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
	}

	/*
	 * Make sure that we're not preempted in the middle of initializing the
	 * FPU on CPU.
	 */
	kpreempt_disable();
	installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
	    fp_new_lwp, NULL, fp_free);
	fpinit();
	fp->fpu_flags = FPU_EN;
	kpreempt_enable();
}

/*
 * Seeds the initial state for the current thread.  The possibilities are:
 *      1. Another process has modified the FPU state before we have done any
 *         initialization: Load the FPU state from the LWP state.
 *      2. The FPU state has not been externally modified:  Load a clean state.
 */
void
fp_seed(void)
{
	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(curthread->t_preempt >= 1);
	ASSERT((fp->fpu_flags & FPU_EN) == 0);

	/*
	 * Always initialize a new context and initialize the hardware.
	 */
	if (fp_save_mech == FP_XSAVE) {
		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
	}

	installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
	    fp_new_lwp, NULL, fp_free);
	fpinit();

	/*
	 * If FPU_VALID is set, it means someone has modified registers via
	 * /proc.  In this case, restore the current lwp's state.
	 */
	if (fp->fpu_flags & FPU_VALID)
		fp_restore(fp);

	ASSERT((fp->fpu_flags & FPU_VALID) == 0);
	fp->fpu_flags = FPU_EN;
}

/*
 * When using xsave/xrstor, these three functions are used by the lwp code to
 * manage the memory for the xsave area.
 */
void
fp_lwp_init(struct _klwp *lwp)
{
	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

	/*
	 * We keep a copy of the pointer in lwp_fpu so that we can restore the
	 * value in forklwp() after we duplicate the parent's LWP state.
	 */
	lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
	    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);

	if (fp_save_mech == FP_XSAVE) {
		/*
		 * We bzero since the fpinit() code path will only
		 * partially initialize the xsave area using avx_initial.
		 */
		ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
		bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
	}
}

void
fp_lwp_cleanup(struct _klwp *lwp)
{
	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

	if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
		kmem_cache_free(fpsave_cachep,
		    fp->fpu_regs.kfpu_u.kfpu_generic);
		lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
	}
}

/*
 * Called during the process of forklwp(). The kfpu_u pointer will have been
 * overwritten while copying the parent's LWP structure. We have a valid copy
 * stashed in the child's lwp_fpu which we use to restore the correct value.
 */
void
fp_lwp_dup(struct _klwp *lwp)
{
	void *xp = lwp->lwp_fpu;
	size_t sz;

	switch (fp_save_mech) {
	case FP_FXSAVE:
		sz = sizeof (struct fxsave_state);
		break;
	case FP_XSAVE:
		sz = cpuid_get_xsave_size();
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	/* copy the parent's values into the new lwp's struct */
	bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
	/* now restore the pointer */
	lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
}

/*
 * Handle a processor extension error fault
 * Returns non-zero for error.
 */

/*ARGSUSED*/
int
fpexterrflt(struct regs *rp)
{
	uint32_t fpcw, fpsw;
	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(fp_kind != FP_NO);

	/*
	 * Now we can enable the interrupts.
	 * (NOTE: x87 fp exceptions come thru interrupt gate)
	 */
	sti();

	if (!fpu_exists)
		return (FPE_FLTINV);

	/*
	 * Do an unconditional save of the FP state.  If it's dirty (TS=0),
	 * it'll be saved into the fpu context area passed in (that of the
	 * current thread).  If it's not dirty (it may not be, due to
	 * an intervening save due to a context switch between the sti()
	 * above and here), then it's safe to just use the stored values in
	 * the context save area to determine the cause of the fault.
	 */
	fp_save(fp);

	/* clear exception flags in saved state, as if by fnclex */
	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
		fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
		fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
		break;

	case FP_XSAVE:
		fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
		fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
		/*
		 * Always set LEGACY_FP as it may have been cleared by XSAVE
		 * instruction
		 */
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	fp->fpu_regs.kfpu_status = fpsw;

	if ((fpsw & FPS_ES) == 0)
		return (0);		/* No exception */

	/*
	 * "and" the exception flags with the complement of the mask
	 * bits to determine which exception occurred
	 */
	return (fpe_sicode(fpsw & ~fpcw & 0x3f));
}

/*
 * Handle an SSE/SSE2 precise exception.
 * Returns a non-zero sicode for error.
 */
/*ARGSUSED*/
int
fpsimderrflt(struct regs *rp)
{
	uint32_t mxcsr, xmask;
	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(fp_kind & __FP_SSE);

	/*
	 * NOTE: Interrupts are disabled during execution of this
	 * function.  They are enabled by the caller in trap.c.
	 */

	/*
	 * The only way we could have gotten here if there is no FP unit
	 * is via a user executing an INT $19 instruction, so there is
	 * no fault in that case.
	 */
	if (!fpu_exists)
		return (0);

	/*
	 * Do an unconditional save of the FP state.  If it's dirty (TS=0),
	 * it'll be saved into the fpu context area passed in (that of the
	 * current thread).  If it's not dirty, then it's safe to just use
	 * the stored values in the context save area to determine the
	 * cause of the fault.
	 */
	fp_save(fp);		/* save the FPU state */

	if (fp_save_mech == FP_XSAVE) {
		mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
		fp->fpu_regs.kfpu_status =
		    fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
	} else {
		mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
		fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
	}
	fp->fpu_regs.kfpu_xstatus = mxcsr;

	/*
	 * Compute the mask that determines which conditions can cause
	 * a #XM exception, and use this to clean the status bits so that
	 * we can identify the true cause of this one.
	 */
	xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
	return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
}

/*
 * In the unlikely event that someone is relying on this subcode being
 * FPE_FLTILL for denormalize exceptions, it can always be patched back
 * again to restore old behaviour.
 */
int fpe_fltden = FPE_FLTDEN;

/*
 * Map from the FPU status word to the FP exception si_code.
 */
static int
fpe_sicode(uint_t sw)
{
	if (sw & FPS_IE)
		return (FPE_FLTINV);
	if (sw & FPS_ZE)
		return (FPE_FLTDIV);
	if (sw & FPS_DE)
		return (fpe_fltden);
	if (sw & FPS_OE)
		return (FPE_FLTOVF);
	if (sw & FPS_UE)
		return (FPE_FLTUND);
	if (sw & FPS_PE)
		return (FPE_FLTRES);
	return (FPE_FLTINV);	/* default si_code for other exceptions */
}

/*
 * Map from the SSE status word to the FP exception si_code.
 */
static int
fpe_simd_sicode(uint_t sw)
{
	if (sw & SSE_IE)
		return (FPE_FLTINV);
	if (sw & SSE_ZE)
		return (FPE_FLTDIV);
	if (sw & SSE_DE)
		return (FPE_FLTDEN);
	if (sw & SSE_OE)
		return (FPE_FLTOVF);
	if (sw & SSE_UE)
		return (FPE_FLTUND);
	if (sw & SSE_PE)
		return (FPE_FLTRES);
	return (FPE_FLTINV);	/* default si_code for other exceptions */
}
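
/*
 * For reference, a hedged sketch of how a trap handler consumes the two
 * mappings above: the returned si_code is folded into a SIGFPE that is
 * posted to the thread. The real flow, including the zero-return (no
 * exception) handling, lives in trap.c:
 *
 *	siginfo_t si;
 *	bzero(&si, sizeof (si));
 *	si.si_signo = SIGFPE;
 *	si.si_code = fpexterrflt(rp);
 *	si.si_addr = (caddr_t)rp->r_pc;
 *	(deliver si to curthread)
 */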

/*
 * This routine is invoked as part of libc's __fpstart implementation
 * via sysi86(2).
 *
 * It may be called -before- any context has been assigned, in which case
 * we try to avoid touching the hardware.  Or it may be invoked well
 * after the context has been assigned and fiddled with, in which case
 * just tweak it directly.
 */
void
fpsetcw(uint16_t fcw, uint32_t mxcsr)
{
	struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
	struct fxsave_state *fx;

	if (!fpu_exists || fp_kind == FP_NO)
		return;

	if ((fp->fpu_flags & FPU_EN) == 0) {
		if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
			/*
			 * Common case.  Floating point unit not yet
			 * enabled, and kernel already intends to initialize
			 * the hardware the way the caller wants.
			 */
			return;
		}
		/*
		 * Hmm.  Userland wants a different default.
		 * Do a fake "first trap" to establish the context, then
		 * handle as if we already had a context before we came in.
		 */
		kpreempt_disable();
		fp_seed();
		kpreempt_enable();
	}

	/*
	 * Ensure that the current hardware state is flushed back to the
	 * pcb, then modify that copy.  Next use of the fp will
	 * restore the context.
	 */
	fp_save(fp);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
		fx->fx_fcw = fcw;
		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
		break;

	case FP_XSAVE:
		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
		fx->fx_fcw = fcw;
		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
		/*
		 * Always set LEGACY_FP as it may have been cleared by XSAVE
		 * instruction
		 */
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}
}