17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate * CDDL HEADER START
37c478bd9Sstevel@tonic-gate *
47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the
5ae115bc7Smrj * Common Development and Distribution License (the "License").
6ae115bc7Smrj * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate *
87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate * and limitations under the License.
127c478bd9Sstevel@tonic-gate *
137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate *
197c478bd9Sstevel@tonic-gate * CDDL HEADER END
207c478bd9Sstevel@tonic-gate */
217c478bd9Sstevel@tonic-gate /*
227af88ac7SKuriakose Kuruvilla * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23c21bd51dSDan McDonald * Copyright 2021 Joyent, Inc.
2428816667SJerry Jelinek * Copyright 2021 RackTop Systems, Inc.
25*ed093b41SRobert Mustacchi * Copyright 2023 Oxide Computer Company
267c478bd9Sstevel@tonic-gate */
277c478bd9Sstevel@tonic-gate
287c478bd9Sstevel@tonic-gate /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
297c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
307c478bd9Sstevel@tonic-gate /* All Rights Reserved */
317c478bd9Sstevel@tonic-gate
327c478bd9Sstevel@tonic-gate /* Copyright (c) 1987, 1988 Microsoft Corporation */
337c478bd9Sstevel@tonic-gate /* All Rights Reserved */
347c478bd9Sstevel@tonic-gate
357af88ac7SKuriakose Kuruvilla /*
367af88ac7SKuriakose Kuruvilla * Copyright (c) 2009, Intel Corporation.
377af88ac7SKuriakose Kuruvilla * All rights reserved.
387af88ac7SKuriakose Kuruvilla */
397c478bd9Sstevel@tonic-gate
407c478bd9Sstevel@tonic-gate #include <sys/types.h>
417c478bd9Sstevel@tonic-gate #include <sys/param.h>
427c478bd9Sstevel@tonic-gate #include <sys/signal.h>
437c478bd9Sstevel@tonic-gate #include <sys/regset.h>
447c478bd9Sstevel@tonic-gate #include <sys/privregs.h>
457c478bd9Sstevel@tonic-gate #include <sys/psw.h>
467c478bd9Sstevel@tonic-gate #include <sys/trap.h>
477c478bd9Sstevel@tonic-gate #include <sys/fault.h>
487c478bd9Sstevel@tonic-gate #include <sys/systm.h>
497c478bd9Sstevel@tonic-gate #include <sys/user.h>
507c478bd9Sstevel@tonic-gate #include <sys/file.h>
517c478bd9Sstevel@tonic-gate #include <sys/proc.h>
527c478bd9Sstevel@tonic-gate #include <sys/pcb.h>
537c478bd9Sstevel@tonic-gate #include <sys/lwp.h>
547c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
557c478bd9Sstevel@tonic-gate #include <sys/thread.h>
567c478bd9Sstevel@tonic-gate #include <sys/disp.h>
577c478bd9Sstevel@tonic-gate #include <sys/fp.h>
587c478bd9Sstevel@tonic-gate #include <sys/siginfo.h>
597c478bd9Sstevel@tonic-gate #include <sys/archsystm.h>
607c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
617c478bd9Sstevel@tonic-gate #include <sys/debug.h>
627c478bd9Sstevel@tonic-gate #include <sys/x86_archext.h>
637c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
647af88ac7SKuriakose Kuruvilla #include <sys/cmn_err.h>
652fc9ab6eSJerry Jelinek #include <sys/kfpu.h>
66*ed093b41SRobert Mustacchi #include <sys/stdbool.h>
67*ed093b41SRobert Mustacchi #include <sys/stdalign.h>
68*ed093b41SRobert Mustacchi #include <sys/procfs_isa.h>
69*ed093b41SRobert Mustacchi #include <sys/sunddi.h>
707af88ac7SKuriakose Kuruvilla
714c28a617SRobert Mustacchi /*
724c28a617SRobert Mustacchi * FPU Management Overview
734c28a617SRobert Mustacchi * -----------------------
744c28a617SRobert Mustacchi *
754c28a617SRobert Mustacchi * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
764c28a617SRobert Mustacchi * however, many aspects of its life as a coprocessor are still around in x86.
774c28a617SRobert Mustacchi *
784c28a617SRobert Mustacchi * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
794c28a617SRobert Mustacchi * While that state still exists, there is much more that is covered by the FPU.
804c28a617SRobert Mustacchi * Today, this includes not just traditional FPU state, but also supervisor only
814c28a617SRobert Mustacchi * state. The following state is currently managed and covered logically by the
82*ed093b41SRobert Mustacchi * idea of the FPU registers and more generally is called the Extended Processor
83*ed093b41SRobert Mustacchi * States:
844c28a617SRobert Mustacchi *
854c28a617SRobert Mustacchi * o Traditional x87 FPU
864c28a617SRobert Mustacchi * o Vector Registers (%xmm, %ymm, %zmm)
874c28a617SRobert Mustacchi * o Memory Protection Extensions (MPX) Bounds Registers
884c28a617SRobert Mustacchi * o Protection Key Rights Register (PKRU)
894c28a617SRobert Mustacchi * o Processor Trace data
90*ed093b41SRobert Mustacchi * o Control-Flow Enforcement state
91*ed093b41SRobert Mustacchi * o Hardware Duty Cycle
92*ed093b41SRobert Mustacchi * o Hardware P-states
934c28a617SRobert Mustacchi *
944c28a617SRobert Mustacchi * The rest of this covers how the FPU is managed and controlled, how state is
954c28a617SRobert Mustacchi * saved and restored between threads, interactions with hypervisors, and other
96*ed093b41SRobert Mustacchi * information exported to userland through aux vectors. A lot of background
974c28a617SRobert Mustacchi * information is here to synthesize major parts of the Intel SDM, but
984c28a617SRobert Mustacchi * unfortunately, it is not a replacement for reading it.
994c28a617SRobert Mustacchi *
1004c28a617SRobert Mustacchi * FPU Control Registers
1014c28a617SRobert Mustacchi * ---------------------
1024c28a617SRobert Mustacchi *
1034c28a617SRobert Mustacchi * Because the x87 FPU began its life as a co-processor and the FPU was
1044c28a617SRobert Mustacchi * optional there are several bits that show up in %cr0 that we have to
1054c28a617SRobert Mustacchi * manipulate when dealing with the FPU. These are:
1064c28a617SRobert Mustacchi *
1074c28a617SRobert Mustacchi * o CR0.ET The 'extension type' bit. This was used originally to indicate
108da8e4073SToomas Soome * that the FPU co-processor was present. Now it is forced on for
109da8e4073SToomas Soome * compatibility. This is often used to verify whether or not the
110da8e4073SToomas Soome * FPU is present.
1114c28a617SRobert Mustacchi *
1124c28a617SRobert Mustacchi * o CR0.NE The 'native error' bit. Used to indicate that native error
1134c28a617SRobert Mustacchi * mode should be enabled. This indicates that we should take traps
1144c28a617SRobert Mustacchi * on FPU errors. The OS enables this early in boot.
1154c28a617SRobert Mustacchi *
1164c28a617SRobert Mustacchi * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not
117da8e4073SToomas Soome * wait/fwait instructions generate a #NM if CR0.TS is set.
1184c28a617SRobert Mustacchi *
1194c28a617SRobert Mustacchi * o CR0.EM The 'Emulation' bit. This is used to cause floating point
1204c28a617SRobert Mustacchi * operations (x87 through SSE4) to trap with a #UD so they can be
1214c28a617SRobert Mustacchi * emulated. The system never sets this bit, but makes sure it is
1224c28a617SRobert Mustacchi * clear on processor start up.
1234c28a617SRobert Mustacchi *
1244c28a617SRobert Mustacchi * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating
1254c28a617SRobert Mustacchi * point operation will generate a #NM. An fwait will as well,
1264c28a617SRobert Mustacchi * depending on the value in CR0.MP.
1274c28a617SRobert Mustacchi *
1284c28a617SRobert Mustacchi * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
1294c28a617SRobert Mustacchi * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
1304c28a617SRobert Mustacchi * complicated role. Historically it has been used to allow running systems to
1314c28a617SRobert Mustacchi * restore the FPU registers lazily. This will be discussed in greater depth
1324c28a617SRobert Mustacchi * later on.
1334c28a617SRobert Mustacchi *
1344c28a617SRobert Mustacchi * %cr4 is also used as part of the FPU control. Specifically we need to worry
1354c28a617SRobert Mustacchi * about the following bits in the system:
1364c28a617SRobert Mustacchi *
1374c28a617SRobert Mustacchi * o CR4.OSFXSR This bit is used to indicate that the OS understands and
1384c28a617SRobert Mustacchi * supports the execution of the fxsave and fxrstor
1394c28a617SRobert Mustacchi * instructions. This bit is required to be set to enable
1404c28a617SRobert Mustacchi * the use of the SSE->SSE4 instructions.
1414c28a617SRobert Mustacchi *
1424c28a617SRobert Mustacchi * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand
1434c28a617SRobert Mustacchi * and take a SIMD floating point exception (#XM). This bit
1444c28a617SRobert Mustacchi * is always enabled by the system.
1454c28a617SRobert Mustacchi *
1464c28a617SRobert Mustacchi * o CR4.OSXSAVE This bit is used to indicate that the OS understands and
1474c28a617SRobert Mustacchi * supports the execution of the xsave and xrstor family of
1484c28a617SRobert Mustacchi * instructions. This bit is required to use any of the AVX
1494c28a617SRobert Mustacchi * and newer feature sets.
1504c28a617SRobert Mustacchi *
1514c28a617SRobert Mustacchi * Because all supported processors are 64-bit, they'll always support the XMM
1524c28a617SRobert Mustacchi * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
1534c28a617SRobert Mustacchi * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
1544c28a617SRobert Mustacchi *
1554c28a617SRobert Mustacchi * %xcr0 is used to manage the behavior of the xsave feature set and is only
1564c28a617SRobert Mustacchi * present on the system if xsave is supported. %xcr0 is read and written
1574c28a617SRobert Mustacchi * via the xgetbv and xsetbv instructions. This register is present
1584c28a617SRobert Mustacchi * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a
1594c28a617SRobert Mustacchi * different component of the xsave state and controls whether or not that
1604c28a617SRobert Mustacchi * information is saved and restored. For newer feature sets like AVX and MPX,
1614c28a617SRobert Mustacchi * it also controls whether or not the corresponding instructions can be
1624c28a617SRobert Mustacchi * executed (much like CR4.OSFXSR does for the SSE feature sets).
1634c28a617SRobert Mustacchi *
1644c28a617SRobert Mustacchi * Everything in %xcr0 is around features available to users. There is also the
1654c28a617SRobert Mustacchi * IA32_XSS MSR which is used to control supervisor-only features that are still
1664c28a617SRobert Mustacchi * part of the xsave state. Bits that can be set in %xcr0 are reserved in
1674c28a617SRobert Mustacchi * IA32_XSS and vice versa. This is an important property that is particularly
1684c28a617SRobert Mustacchi * relevant to how the xsave instructions operate.
1694c28a617SRobert Mustacchi *
1704c28a617SRobert Mustacchi * Save Mechanisms
1714c28a617SRobert Mustacchi * ---------------
1724c28a617SRobert Mustacchi *
1734c28a617SRobert Mustacchi * When switching between running threads the FPU state needs to be saved and
1744c28a617SRobert Mustacchi * restored by the OS. If this state was not saved, users would rightfully
1754c28a617SRobert Mustacchi * complain about corrupt state. There are three mechanisms that exist on the
1764c28a617SRobert Mustacchi * processor for saving and restoring these state images:
1774c28a617SRobert Mustacchi *
1784c28a617SRobert Mustacchi * o fsave
1794c28a617SRobert Mustacchi * o fxsave
1804c28a617SRobert Mustacchi * o xsave
1814c28a617SRobert Mustacchi *
1824c28a617SRobert Mustacchi * fsave saves and restores only the x87 FPU and is the oldest of these
1834c28a617SRobert Mustacchi * mechanisms. This mechanism is never used in the kernel today because we are
1844c28a617SRobert Mustacchi * always running on systems that support fxsave.
1854c28a617SRobert Mustacchi *
1864c28a617SRobert Mustacchi * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
1874c28a617SRobert Mustacchi * state to be saved and restored to and from a struct fxsave_state. This is the
1884c28a617SRobert Mustacchi * default mechanism that is used to save and restore the FPU on amd64. An
1894c28a617SRobert Mustacchi * important aspect of fxsave that was different from the original i386 fsave
1904c28a617SRobert Mustacchi * mechanism is that the restoring of FPU state with pending exceptions will not
1914c28a617SRobert Mustacchi * generate an exception, it will be deferred to the next use of the FPU.
1924c28a617SRobert Mustacchi *
1934c28a617SRobert Mustacchi * The final and by far the most complex mechanism is that of the xsave set.
1944c28a617SRobert Mustacchi * xsave allows for saving and restoring all of the traditional x86 pieces (x87
1954c28a617SRobert Mustacchi * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
1964c28a617SRobert Mustacchi * registers.
1974c28a617SRobert Mustacchi *
1984c28a617SRobert Mustacchi * Data is saved and restored into and out of a struct xsave_state. The first
1994c28a617SRobert Mustacchi * part of the struct xsave_state is equivalent to the struct fxsave_state.
2004c28a617SRobert Mustacchi * After that, there is a header which is used to describe the remaining
2014c28a617SRobert Mustacchi * portions of the state. The header is a 64-byte value of which the first two
2024c28a617SRobert Mustacchi * uint64_t values are defined and the rest are reserved and must be zero. The
2034c28a617SRobert Mustacchi * first uint64_t is the xstate_bv member. This describes which values in the
2044c28a617SRobert Mustacchi * xsave_state are actually valid and present. This is updated on a save and
2054c28a617SRobert Mustacchi * used on restore. The second member is the xcomp_bv member. Its last bit
2064c28a617SRobert Mustacchi * determines whether or not a compressed version of the structure is used.
2074c28a617SRobert Mustacchi *
2084c28a617SRobert Mustacchi * When the uncompressed structure is used (currently the only format we
2094c28a617SRobert Mustacchi * support), then each state component is at a fixed offset in the structure,
2104c28a617SRobert Mustacchi * even if it is not being used. For example, if you only saved the AVX related
2114c28a617SRobert Mustacchi * state, but did not save the MPX related state, the offset would not change
2124c28a617SRobert Mustacchi * for any component. With the compressed format, components that aren't used
2134c28a617SRobert Mustacchi * are all elided (though the x87 and SSE state are always there).
2144c28a617SRobert Mustacchi *
2154c28a617SRobert Mustacchi * Unlike fxsave which saves all state, the xsave family does not always save
2164c28a617SRobert Mustacchi * and restore all the state that could be covered by the xsave_state. The
2174c28a617SRobert Mustacchi * instructions all take an argument which is a mask of what to consider. This
2184c28a617SRobert Mustacchi * is the same mask that will be used in the xstate_bv vector and it is also the
2194c28a617SRobert Mustacchi * same values that are present in %xcr0 and IA32_XSS. Though IA32_XSS is only
2204c28a617SRobert Mustacchi * considered with the xsaves and xrstors instructions.
2214c28a617SRobert Mustacchi *
2224c28a617SRobert Mustacchi * When a save or restore is requested, a bitwise and is performed between the
2234c28a617SRobert Mustacchi * requested bits and those that have been enabled in %xcr0. Only the bits that
2244c28a617SRobert Mustacchi * match that are then saved or restored. Others will be silently ignored by
2254c28a617SRobert Mustacchi * the processor. This idea is used often in the OS. We will always request that
2264c28a617SRobert Mustacchi * we save and restore all of the state, but only those portions that are
2274c28a617SRobert Mustacchi * actually enabled in %xcr0 will be touched.
2284c28a617SRobert Mustacchi *
2294c28a617SRobert Mustacchi * If a feature has been asked to be restored that is not set in the xstate_bv
2304c28a617SRobert Mustacchi * feature vector of the save state, then it will be set to its initial state by
2314c28a617SRobert Mustacchi * the processor (usually zeros). Also, when asked to save state, the processor
2324c28a617SRobert Mustacchi * may not write out data that is in its initial state as an optimization. This
2334c28a617SRobert Mustacchi * optimization only applies to saving data and not to restoring data.
2344c28a617SRobert Mustacchi *
2354c28a617SRobert Mustacchi * There are a few different variants of the xsave and xrstor instruction. They
2364c28a617SRobert Mustacchi * are:
2374c28a617SRobert Mustacchi *
2384c28a617SRobert Mustacchi * o xsave This is the original save instruction. It will save all of the
2394c28a617SRobert Mustacchi * requested data in the xsave state structure. It only saves data
2404c28a617SRobert Mustacchi * in the uncompressed (xcomp_bv[63] is zero) format. It may be
2414c28a617SRobert Mustacchi * executed at all privilege levels.
2424c28a617SRobert Mustacchi *
2434c28a617SRobert Mustacchi * o xrstor This is the original restore instruction. It will restore all of
2444c28a617SRobert Mustacchi * the requested data. The xrstor function can handle both the
2454c28a617SRobert Mustacchi * compressed and uncompressed formats. It may be executed at all
2464c28a617SRobert Mustacchi * privilege levels.
2474c28a617SRobert Mustacchi *
2484c28a617SRobert Mustacchi * o xsaveopt This is a variant of the xsave instruction that employs
2494c28a617SRobert Mustacchi * optimizations to try and only write out state that has been
2504c28a617SRobert Mustacchi * modified since the last time an xrstor instruction was called.
2514c28a617SRobert Mustacchi * The processor tracks a tuple of information about the last
2524c28a617SRobert Mustacchi * xrstor and tries to ensure that the same buffer is being used
2534c28a617SRobert Mustacchi * when this optimization is being used. However, because of the
2544c28a617SRobert Mustacchi * way that it tracks the xrstor buffer based on the address of it,
2554c28a617SRobert Mustacchi * it is not suitable for use if that buffer can be easily reused.
2564c28a617SRobert Mustacchi * The most common case is trying to save data to the stack in
2574c28a617SRobert Mustacchi * rtld. It may be executed at all privilege levels.
2584c28a617SRobert Mustacchi *
2594c28a617SRobert Mustacchi * o xsavec This is a variant of the xsave instruction that writes out the
2604c28a617SRobert Mustacchi * compressed form of the xsave_state. Otherwise it behaves as
2614c28a617SRobert Mustacchi * xsave. It may be executed at all privilege levels.
2624c28a617SRobert Mustacchi *
2634c28a617SRobert Mustacchi * o xsaves This is a variant of the xsave instruction. It is similar to
2644c28a617SRobert Mustacchi * xsavec in that it always writes the compressed form of the
2654c28a617SRobert Mustacchi * buffer. Unlike all the other forms, this instruction looks at
2664c28a617SRobert Mustacchi * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
2674c28a617SRobert Mustacchi * what to save and restore. xsaves also implements the same
2684c28a617SRobert Mustacchi * optimization that xsaveopt does around modified pieces. User
2694c28a617SRobert Mustacchi * land may not execute the instruction.
2704c28a617SRobert Mustacchi *
2714c28a617SRobert Mustacchi * o xrstors This is a variant of the xrstor instruction. Similar to xsaves
2724c28a617SRobert Mustacchi * it can save and restore both the user and privileged states.
2734c28a617SRobert Mustacchi * Unlike xrstor it can only operate on the compressed form.
2744c28a617SRobert Mustacchi * User land may not execute the instruction.
2754c28a617SRobert Mustacchi *
2764c28a617SRobert Mustacchi * Based on all of these, the kernel has a precedence for what it will use.
2774c28a617SRobert Mustacchi * Basically, xsaves (not supported) is preferred to xsaveopt, which is
2784c28a617SRobert Mustacchi * preferred to xsave. A similar scheme is used when informing rtld (more later)
2794c28a617SRobert Mustacchi * about what it should use. xsavec is preferred to xsave. xsaveopt is not
2804c28a617SRobert Mustacchi * recommended due to the modified optimization not being appropriate for this
2814c28a617SRobert Mustacchi * use.
2824c28a617SRobert Mustacchi *
2834c28a617SRobert Mustacchi * Finally, there is one last gotcha with the xsave state. Importantly some AMD
2844c28a617SRobert Mustacchi * processors did not always save and restore some of the FPU exception state in
2854c28a617SRobert Mustacchi * some cases like Intel did. In those cases the OS will make up for this fact
2864c28a617SRobert Mustacchi * itself.
2874c28a617SRobert Mustacchi *
2884c28a617SRobert Mustacchi * FPU Initialization
2894c28a617SRobert Mustacchi * ------------------
2904c28a617SRobert Mustacchi *
2914c28a617SRobert Mustacchi * One difference with the FPU registers is that not all threads have FPU state,
2924c28a617SRobert Mustacchi * only those that have an lwp. Generally this means kernel threads, which all
2934c28a617SRobert Mustacchi * share p0 and its lwp, do not have FPU state. Though there are definitely
2944c28a617SRobert Mustacchi * exceptions such as kcfpoold. In the rest of this discussion we'll use thread
2954c28a617SRobert Mustacchi * and lwp interchangeably, just think of thread meaning a thread that has a
2964c28a617SRobert Mustacchi * lwp.
2974c28a617SRobert Mustacchi *
2984c28a617SRobert Mustacchi * Each lwp has its FPU state allocated in its pcb (process control block). The
2994c28a617SRobert Mustacchi * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
3004c28a617SRobert Mustacchi * dynamically at start up based on the save mechanism that we're using and the
3014c28a617SRobert Mustacchi * amount of memory required for it. This is dynamic because the xsave_state
3024c28a617SRobert Mustacchi * size varies based on the supported feature set.
3034c28a617SRobert Mustacchi *
3044c28a617SRobert Mustacchi * The hardware side of the FPU is initialized early in boot before we mount the
3054c28a617SRobert Mustacchi * root file system. This is effectively done in fpu_probe(). This is where we
3064c28a617SRobert Mustacchi * make the final decision about what the save and restore mechanisms we should
3074c28a617SRobert Mustacchi * use are, create the fpsave_cachep kmem cache, and initialize a number of
3084c28a617SRobert Mustacchi * function pointers that implement the save and restore logic.
3094c28a617SRobert Mustacchi *
3104c28a617SRobert Mustacchi * The thread/lwp side is a little more involved. There are two different
3114c28a617SRobert Mustacchi * things that we need to concern ourselves with. The first is how the FPU
3124c28a617SRobert Mustacchi * resources are allocated and the second is how the FPU state is initialized
3134c28a617SRobert Mustacchi * for a given lwp.
3144c28a617SRobert Mustacchi *
3154c28a617SRobert Mustacchi * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
3164c28a617SRobert Mustacchi * This is always called unconditionally by the system as part of creating an
3174c28a617SRobert Mustacchi * LWP.
3184c28a617SRobert Mustacchi *
3194c28a617SRobert Mustacchi * There are three different initialization paths that we deal with. The first
3204c28a617SRobert Mustacchi * is when we are executing a new process. As part of exec all of the register
3214c28a617SRobert Mustacchi * state is reset. The exec case is particularly important because init is born
3224c28a617SRobert Mustacchi * like Athena, sprouting from the head of the kernel, without any true parent
3234c28a617SRobert Mustacchi * to fork from. The second is used whenever we fork or create a new lwp. The
3244c28a617SRobert Mustacchi * third is to deal with special lwps like the agent lwp.
3254c28a617SRobert Mustacchi *
3264c28a617SRobert Mustacchi * During exec, we will call fp_exec() which will initialize and set up the FPU
3274c28a617SRobert Mustacchi * state for the process. That will fill in the initial state for the FPU and
3284c28a617SRobert Mustacchi * also set that state in the FPU itself. As part of fp_exec() we also install a
3294c28a617SRobert Mustacchi * thread context operations vector that takes care of dealing with the saving
3304c28a617SRobert Mustacchi * and restoring of the FPU. These context handlers will also be called whenever
3314c28a617SRobert Mustacchi * an lwp is created or forked. In those cases, to initialize the FPU we will
3324c28a617SRobert Mustacchi * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
3334c28a617SRobert Mustacchi * operations vector for the new thread.
3344c28a617SRobert Mustacchi *
3354c28a617SRobert Mustacchi * Next we'll end up in the context operation fp_new_lwp(). This saves the
3364c28a617SRobert Mustacchi * current thread's state, initializes the new thread's state, and copies over
3374c28a617SRobert Mustacchi * the relevant parts of the originating thread's state. It's at this point that
3384c28a617SRobert Mustacchi * we also install the FPU context operations into the new thread, which ensures
3394c28a617SRobert Mustacchi * that all future threads that are descendants of the current one get the
3404c28a617SRobert Mustacchi * thread context operations (unless they call exec).
3414c28a617SRobert Mustacchi *
3424c28a617SRobert Mustacchi * To deal with some things like the agent lwp, we double check the state of the
3434c28a617SRobert Mustacchi * FPU in sys_rtt_common() to make sure that it has been enabled before
344*ed093b41SRobert Mustacchi * returning to userland. In general, this path should be rare, but it's useful
3454c28a617SRobert Mustacchi * for the odd lwp here and there.
3464c28a617SRobert Mustacchi *
3474c28a617SRobert Mustacchi * The FPU state will remain valid most of the time. There are times that
3484c28a617SRobert Mustacchi * the state will be rewritten. For example in restorecontext, due to /proc, or
3494c28a617SRobert Mustacchi * the lwp calls exec(). Whether the context is being freed or we are resetting
3504c28a617SRobert Mustacchi * the state, we will call fp_free() to disable the FPU and our context.
3514c28a617SRobert Mustacchi *
3524c28a617SRobert Mustacchi * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
3534c28a617SRobert Mustacchi * state by calling fp_lwp_cleanup().
3544c28a617SRobert Mustacchi *
3554c28a617SRobert Mustacchi * Kernel FPU Multiplexing
3564c28a617SRobert Mustacchi * -----------------------
3574c28a617SRobert Mustacchi *
3584c28a617SRobert Mustacchi * Just as the kernel has to maintain all of the general purpose registers when
3594c28a617SRobert Mustacchi * switching between scheduled threads, the same is true of the FPU registers.
3604c28a617SRobert Mustacchi *
3614c28a617SRobert Mustacchi * When a thread has FPU state, it also has a set of context operations
3624c28a617SRobert Mustacchi * installed. These context operations take care of making sure that the FPU is
3634c28a617SRobert Mustacchi * properly saved and restored during a context switch (fpsave_ctxt and
3644c28a617SRobert Mustacchi * fprestore_ctxt respectively). This means that the current implementation of
3654c28a617SRobert Mustacchi * the FPU is 'eager', when a thread is running the CPU will have its FPU state
3664c28a617SRobert Mustacchi * loaded. While this is always true when executing in userland, there are a few
3674c28a617SRobert Mustacchi * cases where this is not true in the kernel.
3684c28a617SRobert Mustacchi *
3694c28a617SRobert Mustacchi * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
3704c28a617SRobert Mustacchi * employed. This meant that the FPU would be saved on a context switch and the
3714c28a617SRobert Mustacchi * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
3724c28a617SRobert Mustacchi * then take a #NM trap, at which point we would restore the FPU from the save
373*ed093b41SRobert Mustacchi * area and return to userland. Given the frequency of use of the FPU alone by
374*ed093b41SRobert Mustacchi * libc, there's no point returning to userland just to trap again.
3754c28a617SRobert Mustacchi *
3764c28a617SRobert Mustacchi * There are a few cases though where the FPU state may need to be changed for a
3774c28a617SRobert Mustacchi * thread on its behalf. The most notable cases are in the case of processes
3784c28a617SRobert Mustacchi * using /proc, restorecontext, forking, etc. In all of these cases the kernel
3794c28a617SRobert Mustacchi * will force a thread's FPU state to be saved into the PCB through the fp_save()
3804c28a617SRobert Mustacchi * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
3814c28a617SRobert Mustacchi * pcb. This indicates that the save state holds currently valid data. As a side
3824c28a617SRobert Mustacchi * effect of this, CR0.TS will be set. To make sure that all of the state is
383*ed093b41SRobert Mustacchi * updated before returning to userland, in these cases, we set a flag on the
3844c28a617SRobert Mustacchi * PCB that says the FPU needs to be updated. This will make sure that we take
3854c28a617SRobert Mustacchi * the slow path out of a system call to fix things up for the thread. Due to
3864c28a617SRobert Mustacchi * the fact that this is a rather rare case, effectively setting the equivalent
3874c28a617SRobert Mustacchi * of t_postsys is acceptable.
3884c28a617SRobert Mustacchi *
3894c28a617SRobert Mustacchi * CR0.TS will be set after a save occurs and cleared when a restore occurs.
3904c28a617SRobert Mustacchi * Generally this means it will be cleared immediately by the new thread that is
3914c28a617SRobert Mustacchi * running in a context switch. However, this isn't the case for kernel threads.
3924c28a617SRobert Mustacchi * They currently operate with CR0.TS set as no kernel state is restored for
3934c28a617SRobert Mustacchi * them. This means that using the FPU will cause a #NM and panic.
3944c28a617SRobert Mustacchi *
3954c28a617SRobert Mustacchi * The FPU_VALID flag on the currently executing thread's pcb is meant to track
3964c28a617SRobert Mustacchi * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
3974c28a617SRobert Mustacchi * However, because we eagerly restore, the only time that CR0.TS should be set
3984c28a617SRobert Mustacchi * for a non-kernel thread is during operations where it will be cleared before
399*ed093b41SRobert Mustacchi * returning to userland and importantly, the only data that is in it is its
4004c28a617SRobert Mustacchi * own.
4014c28a617SRobert Mustacchi *
4022fc9ab6eSJerry Jelinek * Kernel FPU Usage
4032fc9ab6eSJerry Jelinek * ----------------
4042fc9ab6eSJerry Jelinek *
4052fc9ab6eSJerry Jelinek * Traditionally the kernel never used the FPU since it had no need for
4062fc9ab6eSJerry Jelinek * floating point operations. However, modern FPU hardware supports a variety
4072fc9ab6eSJerry Jelinek * of SIMD extensions which can speed up code such as parity calculations or
4082fc9ab6eSJerry Jelinek * encryption.
4092fc9ab6eSJerry Jelinek *
4102fc9ab6eSJerry Jelinek * To allow the kernel to take advantage of these features, the
4112fc9ab6eSJerry Jelinek * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
4122fc9ab6eSJerry Jelinek * around any usage of the FPU by the kernel to ensure that user-level context
4132fc9ab6eSJerry Jelinek * is properly saved/restored, as well as to properly setup the FPU for use by
4142fc9ab6eSJerry Jelinek * the kernel. There are a variety of ways this wrapping can be used, as
4152fc9ab6eSJerry Jelinek * discussed in this section below.
4162fc9ab6eSJerry Jelinek *
4172fc9ab6eSJerry Jelinek * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
4182fc9ab6eSJerry Jelinek * operations, the kernel_fpu_alloc() function should be used to allocate a
4192fc9ab6eSJerry Jelinek * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
4202fc9ab6eSJerry Jelinek * state. This structure is not tied to any thread. That is, different threads
4212fc9ab6eSJerry Jelinek * can reuse the same kfpu_state_t structure, although not concurrently. A
4222fc9ab6eSJerry Jelinek * kfpu_state_t structure is freed by the kernel_fpu_free() function.
4232fc9ab6eSJerry Jelinek *
4242fc9ab6eSJerry Jelinek * In some cases, the kernel may need to use the FPU for a short operation
4252fc9ab6eSJerry Jelinek * without the overhead to manage a kfpu_state_t structure and without
4262fc9ab6eSJerry Jelinek * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
4272fc9ab6eSJerry Jelinek * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
4282fc9ab6eSJerry Jelinek * parameter. This indicates that there is no kfpu_state_t. When used this way,
4292fc9ab6eSJerry Jelinek * kernel preemption should be disabled by the caller (kpreempt_disable) before
4302fc9ab6eSJerry Jelinek * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
4312fc9ab6eSJerry Jelinek * For this usage, it is important to limit the kernel's FPU use to short
4322fc9ab6eSJerry Jelinek * operations. The tradeoff between using the FPU without a kfpu_state_t
4332fc9ab6eSJerry Jelinek * structure vs. the overhead of allowing a context switch while using the FPU
4342fc9ab6eSJerry Jelinek * should be carefully considered on a case by case basis.
4352fc9ab6eSJerry Jelinek *
4362fc9ab6eSJerry Jelinek * In other cases, kernel threads have an LWP, but never execute in user space.
4372fc9ab6eSJerry Jelinek * In this situation, the LWP's pcb_fpu area can be used to save/restore the
4382fc9ab6eSJerry Jelinek * kernel's FPU state if the thread is context switched, instead of having to
4392fc9ab6eSJerry Jelinek * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
4402fc9ab6eSJerry Jelinek * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
4412fc9ab6eSJerry Jelinek * enable this behavior. It is the caller's responsibility to ensure that this
4422fc9ab6eSJerry Jelinek * is only used for a kernel thread which never executes in user space.
4432fc9ab6eSJerry Jelinek *
4444c28a617SRobert Mustacchi * FPU Exceptions
4454c28a617SRobert Mustacchi * --------------
4464c28a617SRobert Mustacchi *
4474c28a617SRobert Mustacchi * Certain operations can cause the kernel to take traps due to FPU activity.
4484c28a617SRobert Mustacchi * Generally these events will cause a user process to receive a SIGFPE and if
4494c28a617SRobert Mustacchi * the kernel receives it in kernel context, we will die. Traditionally the #NM
4504c28a617SRobert Mustacchi * (Device Not Available / No Math) exception generated by CR0.TS would have
4514c28a617SRobert Mustacchi * caused us to restore the FPU. Now it is a fatal event regardless of whether
452*ed093b41SRobert Mustacchi * or not userland causes it.
4534c28a617SRobert Mustacchi *
4544c28a617SRobert Mustacchi * While there are some cases where the kernel uses the FPU, it is up to the
4554c28a617SRobert Mustacchi * kernel to use the FPU in a way such that it cannot receive a trap or to use
4564c28a617SRobert Mustacchi * the appropriate trap protection mechanisms.
4574c28a617SRobert Mustacchi *
4584c28a617SRobert Mustacchi * Hypervisors
4594c28a617SRobert Mustacchi * -----------
4604c28a617SRobert Mustacchi *
4614c28a617SRobert Mustacchi * When providing support for hypervisors things are a little bit more
4624c28a617SRobert Mustacchi * complicated because the FPU is not virtualized at all. This means that they
4634c28a617SRobert Mustacchi * need to save and restore the FPU and %xcr0 across entry and exit to the
4644c28a617SRobert Mustacchi * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
4654c28a617SRobert Mustacchi * allow us to use the full native state to make sure that we are always saving
4664c28a617SRobert Mustacchi * and restoring the full FPU that the host sees, even when the guest is using a
4674c28a617SRobert Mustacchi * subset.
4684c28a617SRobert Mustacchi *
4694c28a617SRobert Mustacchi * One tricky aspect of this is that the guest may be using a subset of %xcr0
4704c28a617SRobert Mustacchi * and therefore changing our %xcr0 on the fly. It is vital that when we're
4714c28a617SRobert Mustacchi * saving and restoring the FPU that we always use the largest %xcr0 contents
4724c28a617SRobert Mustacchi * otherwise we will end up leaving behind data in it.
4734c28a617SRobert Mustacchi *
4744c28a617SRobert Mustacchi * ELF PLT Support
4754c28a617SRobert Mustacchi * ---------------
4764c28a617SRobert Mustacchi *
4774c28a617SRobert Mustacchi * rtld has to preserve a subset of the FPU when it is saving and restoring
4784c28a617SRobert Mustacchi * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
4794c28a617SRobert Mustacchi * more information. As a result, we set up an aux vector that contains
4804c28a617SRobert Mustacchi * information about what save and restore mechanisms it should be using and
4814c28a617SRobert Mustacchi * the sizing thereof based on what the kernel supports. This is passed down in
4824c28a617SRobert Mustacchi * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
4834c28a617SRobert Mustacchi * initialized in fpu_subr.c.
484*ed093b41SRobert Mustacchi *
485*ed093b41SRobert Mustacchi * Signal Handling and the ucontext_t
486*ed093b41SRobert Mustacchi * ----------------------------------
487*ed093b41SRobert Mustacchi *
488*ed093b41SRobert Mustacchi * One of the many gifts that signals give us is the twofold fact that when a
489*ed093b41SRobert Mustacchi * signal occurs, the signal handler is allowed to change the CPU's state
490*ed093b41SRobert Mustacchi * arbitrarily and when the signal handler is done executing, we must restore it
491*ed093b41SRobert Mustacchi * back to the original state. However, the second part of this is that the
492*ed093b41SRobert Mustacchi * signal handler is actually allowed to modify the state that the thread will
493*ed093b41SRobert Mustacchi * return to! To create this facade, the kernel will create a full ucontext_t
494*ed093b41SRobert Mustacchi * state, effectively calling getcontext(2) on the thread's behalf, and a
495*ed093b41SRobert Mustacchi * pointer to that is given to the signal handler (the void * argument for the
496*ed093b41SRobert Mustacchi * sa_sigaction function pointer in sigaction(2)). When libc is done with a
497*ed093b41SRobert Mustacchi * signal, it will call setcontext(2) with that same ucontext_t.
498*ed093b41SRobert Mustacchi *
499*ed093b41SRobert Mustacchi * Now, the ucontext_t has a fixed ABI for both ILP32 and LP64 environments and
500*ed093b41SRobert Mustacchi * it's often declared on the stack itself, with the signal handler spilling all
501*ed093b41SRobert Mustacchi * this state to the stack. The ucontext_t machine portion was broken into the
502*ed093b41SRobert Mustacchi * general purpose and floating point registers. In 64-bit code, the floating
503*ed093b41SRobert Mustacchi * point registers were mostly the same as the results of the fxsave instruction
504*ed093b41SRobert Mustacchi * (i.e. struct fxsave_state). While the 64-bit kernel still uses the equivalent
505*ed093b41SRobert Mustacchi * starting point for information, it is transformed into a different shape to
506*ed093b41SRobert Mustacchi * deal with the history of the 32-bit SYS V ABI.
507*ed093b41SRobert Mustacchi *
508*ed093b41SRobert Mustacchi * While this worked, if you're reading this, you're aware that the x86 FPU and
509*ed093b41SRobert Mustacchi * extended register states didn't stop at the initial 16 128-bit %xmm
510*ed093b41SRobert Mustacchi * registers. Since then we have added 256-bit %ymm, 512-bit %zmm, and the %k
511*ed093b41SRobert Mustacchi * opmask registers. None of these fit inside the standard ucontext_t; however,
512*ed093b41SRobert Mustacchi * they must all be preserved and restored across a signal. While the various
513*ed093b41SRobert Mustacchi * x86 platform-specific ABIs all suggest that these registers are not preserved
514*ed093b41SRobert Mustacchi * across a function call, receiving a signal is not a function call and must be
515*ed093b41SRobert Mustacchi * thought of like a process receiving an interrupt. In other words, this
516*ed093b41SRobert Mustacchi * extended state must be preserved.
517*ed093b41SRobert Mustacchi *
518*ed093b41SRobert Mustacchi * To facilitate this, we have extended the ucontext_t structure with an
519*ed093b41SRobert Mustacchi * additional flag, UC_XSAVE, which indicates that the traditional padding
520*ed093b41SRobert Mustacchi * member, uc_xsave, actually is a pointer to the extended state. While this is
521*ed093b41SRobert Mustacchi * accessible outside of a signal handling context through the combination of
522*ed093b41SRobert Mustacchi * ucontext_alloc(3C) and getcontext_extd(2), our design around saving this
523*ed093b41SRobert Mustacchi * state is focused on signal handling. Signal handling spills all this state to
524*ed093b41SRobert Mustacchi * the stack and if we cannot spill the entire state to the stack then our
525*ed093b41SRobert Mustacchi * inability to deliver the signal results in the process being killed! While
526*ed093b41SRobert Mustacchi * there are separate efforts to ensure that the signal stack sizing that is
527*ed093b41SRobert Mustacchi * used for the minimum and maximum signal sizes are sufficient, we still need
528*ed093b41SRobert Mustacchi * to do our part to minimize the likelihood here.
529*ed093b41SRobert Mustacchi *
530*ed093b41SRobert Mustacchi * In designing this, we make the following observations which have helped us
531*ed093b41SRobert Mustacchi * focus our design:
532*ed093b41SRobert Mustacchi *
533*ed093b41SRobert Mustacchi * o While the start of an xsave area is the traditional 512-byte fxsave XMM
534*ed093b41SRobert Mustacchi * region, we already have that in the fpregs. Thus there is no reason to
535*ed093b41SRobert Mustacchi * duplicate it. This not only saves 512 bytes of additional stack space,
536*ed093b41SRobert Mustacchi * but it also means we don't have to ask which version of it to take
537*ed093b41SRobert Mustacchi * if they were to differ.
538*ed093b41SRobert Mustacchi *
539*ed093b41SRobert Mustacchi * o Many applications out there aren't necessarily using the extended vectors
540*ed093b41SRobert Mustacchi * and even when we do make libc and others take advantage of it, it will
541*ed093b41SRobert Mustacchi * behoove us to ensure that they are put back into their initial state
542*ed093b41SRobert Mustacchi * after use. This leads us to expect that in a number of cases, the actual
543*ed093b41SRobert Mustacchi * extended register state will be in its initial state.
544*ed093b41SRobert Mustacchi *
545*ed093b41SRobert Mustacchi * o While the signal handler does allow contents to be modified, we are
546*ed093b41SRobert Mustacchi * starting with making the interface private and thus allowing us to excise
547*ed093b41SRobert Mustacchi * components that are in their initial state.
548*ed093b41SRobert Mustacchi *
549*ed093b41SRobert Mustacchi * o There are similarities to what we want to create with the compressed
550*ed093b41SRobert Mustacchi * xsave format; however, because we don't always have support for the
551*ed093b41SRobert Mustacchi * compressed format, we can't just arbitrarily say let's do a compressed
552*ed093b41SRobert Mustacchi * save to the user stack.
553*ed093b41SRobert Mustacchi *
554*ed093b41SRobert Mustacchi * o Because we are not handing this state directly to and from hardware, we
555*ed093b41SRobert Mustacchi * don't need to meet some of the constraints of the compressed xsave format
556*ed093b41SRobert Mustacchi * around wanting alignment for the initial save or additional components.
557*ed093b41SRobert Mustacchi *
558*ed093b41SRobert Mustacchi * All of the above lead us to our own unique format for this data. When the
559*ed093b41SRobert Mustacchi * UC_XSAVE flag is set in the ucontext_t, the uc_xsave member points to a
560*ed093b41SRobert Mustacchi * uc_xsave_t structure which has a magic version number, a 32-bit length of the
561*ed093b41SRobert Mustacchi * overall structure, and the 64-bit state bit-vector to represent which
562*ed093b41SRobert Mustacchi * components are valid. Following this 8-byte header, each component that is
563*ed093b41SRobert Mustacchi * present in the bit vector is immediately written out in roughly ascending bit
564*ed093b41SRobert Mustacchi * order (the order is determined based on the order of the fpu_xsave_info
565*ed093b41SRobert Mustacchi * array).
566*ed093b41SRobert Mustacchi *
567*ed093b41SRobert Mustacchi * This makes the rough logic that we have here when taking a signal and writing
568*ed093b41SRobert Mustacchi * out this state as:
569*ed093b41SRobert Mustacchi *
570*ed093b41SRobert Mustacchi * 1. Ensure that the FPU is saved and that the contents of the pcb save area
571*ed093b41SRobert Mustacchi * are valid. That is, call fp_save() if the state is not already flagged
572*ed093b41SRobert Mustacchi * with FPU_VALID.
573*ed093b41SRobert Mustacchi *
574*ed093b41SRobert Mustacchi * 2. Copy the bit-vector from the save area and remove the XFEATURE_LEGACY_FP
575*ed093b41SRobert Mustacchi * and XFEATURE_SSE bits as these will be placed in the xsave area.
576*ed093b41SRobert Mustacchi *
577*ed093b41SRobert Mustacchi * 3. Initialize the uc_xsave_t by setting our version field, initializing the
578*ed093b41SRobert Mustacchi * length to the length of the current structure, and then setting the
579*ed093b41SRobert Mustacchi * modified bit vector above.
580*ed093b41SRobert Mustacchi *
581*ed093b41SRobert Mustacchi * 4. Walk each remaining bit of the bit-vector. For each set bit, copy out
582*ed093b41SRobert Mustacchi * its extended state starting at the current length in the header and then
583*ed093b41SRobert Mustacchi * increase the header size by that length.
584*ed093b41SRobert Mustacchi *
585*ed093b41SRobert Mustacchi * 5. Finally write out the final uc_xsave_t structure.
586*ed093b41SRobert Mustacchi *
587*ed093b41SRobert Mustacchi * The above process is also used when someone manually calls getcontext_extd(2)
588*ed093b41SRobert Mustacchi * to get this state. The main difference between the two is which copyout
589*ed093b41SRobert Mustacchi * function we use. This deserves some explanation. Our main starting point for
590*ed093b41SRobert Mustacchi * all the logic here is fpu_signal_copyout(). It takes a copyfunc that allows
591*ed093b41SRobert Mustacchi * the signal handling context to operate with a different copyout than we
592*ed093b41SRobert Mustacchi * normally use in say getcontext_extd(2).
593*ed093b41SRobert Mustacchi *
594*ed093b41SRobert Mustacchi * When we've received a signal, we're at the intersection of several different
595*ed093b41SRobert Mustacchi * gotchas. Normal copyout (or ddi_copyout()) will trigger watchpoints. That is,
596*ed093b41SRobert Mustacchi * the watchpoints effectively set a copyout override function (t_copyops) that
597*ed093b41SRobert Mustacchi * we end up vectoring to rather than a normal copyout. This allows the data to
598*ed093b41SRobert Mustacchi * be modified and for the watchpoint to fire. While this is all well and good
599*ed093b41SRobert Mustacchi * normally, it is problematic if we are trying to handle a signal. The signal
600*ed093b41SRobert Mustacchi * delivery logic, sendsig(), goes through and disables the watchpoint for the
601*ed093b41SRobert Mustacchi * region of the stack that we are copying out to. However, disabling
602*ed093b41SRobert Mustacchi * watchpoints is not sufficient, we also need to use the copyout_noerr
603*ed093b41SRobert Mustacchi * variants.
604*ed093b41SRobert Mustacchi *
605*ed093b41SRobert Mustacchi * These variants also require the use of on_fault() and no_fault() for error
606*ed093b41SRobert Mustacchi * handling. While it is tempting to try and on_fault() the entire
607*ed093b41SRobert Mustacchi * fpu_signal_copyout() operation, that is actually fraught for a few reasons.
608*ed093b41SRobert Mustacchi * The first is that we don't want to disable faults during the entire operation
609*ed093b41SRobert Mustacchi * as if the kernel messes up we will treat that as a user error. That isn't
610*ed093b41SRobert Mustacchi * theoretical and happened during development. The second and perhaps more
611*ed093b41SRobert Mustacchi * important issue is that correctly bounding the on_fault() / no_fault() means
612*ed093b41SRobert Mustacchi * being careful about state. For example, kernel pre-emption is often disabled
613*ed093b41SRobert Mustacchi * during parts of these operations, but it needs to be re-enabled when we're
614*ed093b41SRobert Mustacchi * done. This would require tracking in some volatile variable that this had
615*ed093b41SRobert Mustacchi * been enabled and disabled and tracking that.
616*ed093b41SRobert Mustacchi *
617*ed093b41SRobert Mustacchi * Instead, this is why fpu_signal_copyout() takes a copy out function as an
618*ed093b41SRobert Mustacchi * argument. When we're in signal handling context, the function will use
619*ed093b41SRobert Mustacchi * copyout_noerr() and wrap it in the appropriate on_fault() mechanisms.
620*ed093b41SRobert Mustacchi *
621*ed093b41SRobert Mustacchi * RESTORING STATE
622*ed093b41SRobert Mustacchi *
623*ed093b41SRobert Mustacchi * Copying out our current state is the easier half of this problem. When the
624*ed093b41SRobert Mustacchi * kernel is done with a signal it calls setcontext(2) with the ucontext_t we
625*ed093b41SRobert Mustacchi * assembled for it as described above. setcontext(2) isn't just used for
626*ed093b41SRobert Mustacchi * returning from signals.
627*ed093b41SRobert Mustacchi *
628*ed093b41SRobert Mustacchi * The process for this goes in two steps. The first step is to copy in,
629*ed093b41SRobert Mustacchi * validate, and transform the ucontext_t UC_XSAVE that we created above into an
630*ed093b41SRobert Mustacchi * equivalent xsave format that we can use the appropriate xrstor function on.
631*ed093b41SRobert Mustacchi * This first phase is implemented in fpu_signal_copyin(). Once that is done, we
632*ed093b41SRobert Mustacchi * come back through a second phase that is driven out of restorecontext() and
633*ed093b41SRobert Mustacchi * is implemented in fpu_set_xsave().
634*ed093b41SRobert Mustacchi *
635*ed093b41SRobert Mustacchi * Let's start by discussing the second part of this, which is more
636*ed093b41SRobert Mustacchi * straightforward. In particular, the second phase assumes that all of the
637*ed093b41SRobert Mustacchi * validation and error handling has been done by the first phase. This means
638*ed093b41SRobert Mustacchi * here, we have a buffer that is already the appropriate size
639*ed093b41SRobert Mustacchi * (cpuid_get_xsave_size()) and all we need to do is make sure that we can
640*ed093b41SRobert Mustacchi * replace the actual save state with the current one.
641*ed093b41SRobert Mustacchi *
642*ed093b41SRobert Mustacchi * The only piece of shenanigans we have to do is around the kernel provided
643*ed093b41SRobert Mustacchi * notion of 'status' and 'xstatus', which are cached versions of the x87 and
644*ed093b41SRobert Mustacchi * SSE exception vectors. These are part of the fpregset ABI and therefore we
645*ed093b41SRobert Mustacchi * need to propagate them from the temporary storage that part 1 sets up in the
646*ed093b41SRobert Mustacchi * ignored region of the fxsave data. We use that because it is not persisted by
647*ed093b41SRobert Mustacchi * the CPU, so clobbering it is generally alright.
648*ed093b41SRobert Mustacchi *
649*ed093b41SRobert Mustacchi * Once that is done, we simply note that we need a PCB update to occur to
650*ed093b41SRobert Mustacchi * refresh the FPU state before we return to userland. Given that someone has
651*ed093b41SRobert Mustacchi * called setcontext(2), this was always going to happen because we have to
652*ed093b41SRobert Mustacchi * update segment registers and related, so this isn't so bad. With that, let's
653*ed093b41SRobert Mustacchi * move onto the more nuanced part (1).
654*ed093b41SRobert Mustacchi *
655*ed093b41SRobert Mustacchi * When we're handling a setcontext(2) we have, in userland, a data structure
656*ed093b41SRobert Mustacchi * that should match one we serialized out, though we cannot assume that a user
657*ed093b41SRobert Mustacchi * has not modified it either accidentally or maliciously. Our goal is to set up
658*ed093b41SRobert Mustacchi * the appropriate xsave state that can be passed to the CPU's xrstor. The first
659*ed093b41SRobert Mustacchi * problem we have to deal with is where do we actually put this state?
660*ed093b41SRobert Mustacchi *
661*ed093b41SRobert Mustacchi * While not many programs actually call setcontext(2) of their own volition,
662*ed093b41SRobert Mustacchi * this is going to get hit every time we take a signal. The first thought was
663*ed093b41SRobert Mustacchi * to re-use the existing thread's save area; however, that's a bit challenging
664*ed093b41SRobert Mustacchi * for a few reasons. In particular, we would need to ensure that we don't go
665*ed093b41SRobert Mustacchi * off-CPU for any reason, which we cannot assume with a copyin from a user
666*ed093b41SRobert Mustacchi * address space. In particular, it is trivial for us to hit a case where the
667*ed093b41SRobert Mustacchi * stack has been paged out for some reason, which eschews that path.
668*ed093b41SRobert Mustacchi *
669*ed093b41SRobert Mustacchi * Instead, whenever a thread first calls setcontext(2), generally from signal
670*ed093b41SRobert Mustacchi * context, we will at that time allocate another entry from the 'fpsave_cachep'
671*ed093b41SRobert Mustacchi * kmem cache, giving us a buffer of the appropriate space to handle this. Once
672*ed093b41SRobert Mustacchi * this buffer has been allocated, we leave it assigned to the thread's pcb and
673*ed093b41SRobert Mustacchi * only tear it down when the thread itself finally exits. We reason that a
674*ed093b41SRobert Mustacchi * thread that takes a signal once is either going to have the process exit
675*ed093b41SRobert Mustacchi * shortly thereafter or is much more likely to take a signal again in the
676*ed093b41SRobert Mustacchi * future. Many daemons and other processes set things up so signals are
677*ed093b41SRobert Mustacchi * dispatched via one location, masking signals in other threads, using
678*ed093b41SRobert Mustacchi * sigsuspend(2), signalfd(3C), or something similar.
679*ed093b41SRobert Mustacchi *
680*ed093b41SRobert Mustacchi * With this buffer in hand, we begin our task of reassembling state. Note, all
681*ed093b41SRobert Mustacchi * of this is conditional on UC_XSAVE being set in the uc_flags member of the
682*ed093b41SRobert Mustacchi * ucontext_t. If it is not set, then we assume that there is no extended state
683*ed093b41SRobert Mustacchi * and will use the traditional path of setting the fpregset_t into the system
684*ed093b41SRobert Mustacchi * via setfpregs().
685*ed093b41SRobert Mustacchi *
686*ed093b41SRobert Mustacchi * We first will copyin and validate the uc_xsave_t. In particular, we need to
687*ed093b41SRobert Mustacchi * make sure the version makes sense, that the xsave component bit-vector
688*ed093b41SRobert Mustacchi * doesn't have anything unexpected and more importantly unsupported in it, and
689*ed093b41SRobert Mustacchi * that the addresses we've been given are within the user address space. At
690*ed093b41SRobert Mustacchi * this point we can walk through our table of implemented bits and process
691*ed093b41SRobert Mustacchi * them.
692*ed093b41SRobert Mustacchi *
693*ed093b41SRobert Mustacchi * For most components in here, the processing is straightforward. We continue
694*ed093b41SRobert Mustacchi * walking our cursor and copy data into the kernel and place it in the
695*ed093b41SRobert Mustacchi * appropriate place in our xsave state. If a xsave state component bit-vector
696*ed093b41SRobert Mustacchi * isn't set, then we must ensure that we have the item in the initial state,
697*ed093b41SRobert Mustacchi * which for everything other than the x87/SSE state is the memory being zeroed.
698*ed093b41SRobert Mustacchi *
699*ed093b41SRobert Mustacchi * The most unique case in the copyin state is that of the x87/SSE state. You
700*ed093b41SRobert Mustacchi * might recall that we didn't copy it out explicitly as part of the uc_xsave_t,
701*ed093b41SRobert Mustacchi * but instead have opted to use the single definition in the fpregset_t. Thus
702*ed093b41SRobert Mustacchi * here, we copy it out of the fpregset_t, which the kernel has helpfully
703*ed093b41SRobert Mustacchi * already unified into the 64-bit fxsave version prior to calling us, and
704*ed093b41SRobert Mustacchi * install that into the save area we're building up.
705*ed093b41SRobert Mustacchi *
706*ed093b41SRobert Mustacchi * As part of this, there are two important pieces to be aware of. The first is
707*ed093b41SRobert Mustacchi * that because the fpregset_t has both the status and xstatus members
708*ed093b41SRobert Mustacchi * mentioned earlier, we temporarily copy them to the software-usable ignored
709*ed093b41SRobert Mustacchi * areas of the fxsave state so we can corral this extra state into part (2)
710*ed093b41SRobert Mustacchi * without needing to allocate additional space. The second piece is that when
711*ed093b41SRobert Mustacchi * we're done processing this we explicitly remove the UC_FPU flag that would
712*ed093b41SRobert Mustacchi * tell the kernel to proceed with updating that region. The problem is that
713*ed093b41SRobert Mustacchi * that goes directly into the pcb's save area and not to the intermediate
714*ed093b41SRobert Mustacchi * buffer as it uses the same entry point as /proc, mainly setfpregs().
715*ed093b41SRobert Mustacchi *
716*ed093b41SRobert Mustacchi * We don't do much validation of the actual contents of the registers that are
717*ed093b41SRobert Mustacchi * being set with the exception of ensuring that no reserved bits of the mxcsr
718*ed093b41SRobert Mustacchi * are used. This is not as strict as /proc, but failure here means the process
719*ed093b41SRobert Mustacchi * is likely going to die (returning from setcontext() in a signal handler is
720*ed093b41SRobert Mustacchi * fatal).
721*ed093b41SRobert Mustacchi *
722*ed093b41SRobert Mustacchi * /proc xregs
723*ed093b41SRobert Mustacchi * -----------
724*ed093b41SRobert Mustacchi *
725*ed093b41SRobert Mustacchi * Observability of the state of the extended registers is important for
726*ed093b41SRobert Mustacchi * understanding the system. While on the surface this is similar to signal
727*ed093b41SRobert Mustacchi * handling, it is crucially different in a number of ways:
728*ed093b41SRobert Mustacchi *
729*ed093b41SRobert Mustacchi * o In signal handling, we're trying to conserve every byte of stack that we
730*ed093b41SRobert Mustacchi * can.
731*ed093b41SRobert Mustacchi * o The /proc xregs file will end up in core files, which means that we need
732*ed093b41SRobert Mustacchi * a way of knowing what components are present and not present in it,
733*ed093b41SRobert Mustacchi * because this will vary from CPU to CPU due to the addition of
734*ed093b41SRobert Mustacchi * architectural features. For example, some CPUs support AVX-512, but
735*ed093b41SRobert Mustacchi * others do not.
736*ed093b41SRobert Mustacchi *
737*ed093b41SRobert Mustacchi * o The signal handling structure (uc_xsave_t) is private and we're not
738*ed093b41SRobert Mustacchi * trying to have software modify it, on the other hand, the /proc
739*ed093b41SRobert Mustacchi * interfaces that we support we do want software to be able to interrogate
740*ed093b41SRobert Mustacchi * and manipulate. These need to be something that we can introduce
741*ed093b41SRobert Mustacchi * additional components into and make other changes that still allow it to
742*ed093b41SRobert Mustacchi * work.
743*ed093b41SRobert Mustacchi *
744*ed093b41SRobert Mustacchi * The x86 xregs format is documented in proc(5). The short form is that the
745*ed093b41SRobert Mustacchi * prxregset_hdr_t has a number of information entries, which are of the type
746*ed093b41SRobert Mustacchi * prxregset_info_t. Each of the information headers has a type, size, and
747*ed093b41SRobert Mustacchi * offset which indicate where to find the additional data.
748*ed093b41SRobert Mustacchi *
749*ed093b41SRobert Mustacchi * Each entry is described as one of the entries in the fpu_xsave_info[]. These
750*ed093b41SRobert Mustacchi * items either are a 1:1 correspondence with a xsave related feature (e.g.
751*ed093b41SRobert Mustacchi * there is one entry for each of the three AVX-512 components) or it is
752*ed093b41SRobert Mustacchi * something synthetic that we provide as additional information such as the
753*ed093b41SRobert Mustacchi * PRX_INFO_XCR, which is a way of getting information about the system such as
754*ed093b41SRobert Mustacchi * what is enabled in %xcr0 out there.
755*ed093b41SRobert Mustacchi *
756*ed093b41SRobert Mustacchi * Unlike signal handling, we are given the buffer to place everything that
757*ed093b41SRobert Mustacchi * needs to be written out. This is partially the design of the /proc APIs. That
758*ed093b41SRobert Mustacchi * is, we will always assemble everything into the entire buffer that /proc asks
759*ed093b41SRobert Mustacchi * us to, and then it will use as much or as little of it as is required.
760*ed093b41SRobert Mustacchi * Similarly, when setting things, we don't have to worry about copying in
761*ed093b41SRobert Mustacchi * information in the same way as signal handling does, because /proc takes care
762*ed093b41SRobert Mustacchi * of it and always hands us a full buffer. Sizing that is a little nuanced, but
763*ed093b41SRobert Mustacchi * is all handled in prmachdep.c.
764*ed093b41SRobert Mustacchi *
765*ed093b41SRobert Mustacchi * When someone performs a read of the xregs and thus is asking us for the
766*ed093b41SRobert Mustacchi * current state, there is a little bit of nuance that we need to deal with.
767*ed093b41SRobert Mustacchi * The first, is whether or not the FPU is enabled and the second is if the FPU
768*ed093b41SRobert Mustacchi * is enabled, whether a given component is noted as being in its initial state.
769*ed093b41SRobert Mustacchi * This basically gives us three possible states for a given component:
770*ed093b41SRobert Mustacchi *
771*ed093b41SRobert Mustacchi * 1. FPU_EN is not set and FPU_VALID is not set. This means we need to take
772*ed093b41SRobert Mustacchi * the illumos FPU default for an item. More on that in a moment.
773*ed093b41SRobert Mustacchi * 2. The saved xsave state indicates that the bit for a given component is
774*ed093b41SRobert Mustacchi * zero -- specifically the xsh_xstate_bv member of the struct xsave_state.
775*ed093b41SRobert Mustacchi * In this case, we must take the CPU's default for an item. This is
776*ed093b41SRobert Mustacchi * usually the same as illumos, but not always.
777*ed093b41SRobert Mustacchi * 3. The saved xsave state indicates that a given component's state bit is
778*ed093b41SRobert Mustacchi * valid. The simplest of our cases. We can just take what we have from the
779*ed093b41SRobert Mustacchi * xsave state.
780*ed093b41SRobert Mustacchi *
781*ed093b41SRobert Mustacchi * The CPU's default state for most components other than the x87/SSE state is
782*ed093b41SRobert Mustacchi * to have it be zeroed. This is what we treat as our default state as well. The
783*ed093b41SRobert Mustacchi * primary difference is in the initialization of the x87/SSE state. The SYS V
784*ed093b41SRobert Mustacchi * ABI requires that we enable a different floating point control word than the
785*ed093b41SRobert Mustacchi * hardware default. This means that when we're dealing with case (1) for
786*ed093b41SRobert Mustacchi * x87/SSE we have to be more careful than the other components. Thankfully for
787*ed093b41SRobert Mustacchi * everything else this is just keeping it zeroed.
788*ed093b41SRobert Mustacchi *
789*ed093b41SRobert Mustacchi * A reasonable question would be why not just skip components that aren't
790*ed093b41SRobert Mustacchi * marked as present. There are a few reasons we take a different approach and
791*ed093b41SRobert Mustacchi * always include them. Both of these are to make lives simpler for consumers.
792*ed093b41SRobert Mustacchi * In the first case, when someone is performing a read and wants to reassemble
793*ed093b41SRobert Mustacchi * and answer the question of 'what is the value of %ymm0 or %zmm15', they have
794*ed093b41SRobert Mustacchi * to combine multiple disparate parts. If one knows that the data we put into
795*ed093b41SRobert Mustacchi * there is always valid and represents what is in hardware and doesn't have to
796*ed093b41SRobert Mustacchi * keep track of what are the defaults in different circumstances, then that
797*ed093b41SRobert Mustacchi * greatly simplifies consumers' lives. It also helps us for core files and other
798*ed093b41SRobert Mustacchi * observability cases because the answer to what is the operating system's
799*ed093b41SRobert Mustacchi * default may change over time.
800*ed093b41SRobert Mustacchi *
801*ed093b41SRobert Mustacchi * Similarly, including all the possible structures means that we have
802*ed093b41SRobert Mustacchi * simplified writes. Writes are always setting the full state of a thread,
803*ed093b41SRobert Mustacchi * meaning that if someone wants to modify only a single register they must do a
804*ed093b41SRobert Mustacchi * read, modify, and write. By including everything that they might need, it
805*ed093b41SRobert Mustacchi * makes it easier for consumers to do this and not have to cons up the whole
806*ed093b41SRobert Mustacchi * structure on their own.
807*ed093b41SRobert Mustacchi *
808*ed093b41SRobert Mustacchi * When we're setting state, things change around a little bit. We have a few
809*ed093b41SRobert Mustacchi * constraints that are laid out in proc(5). In particular, we require that the
810*ed093b41SRobert Mustacchi * PRX_INFO_XSAVE component always be present to tell us which other components
811*ed093b41SRobert Mustacchi * we expect to be here and which ones we don't. We also are much stricter about
812*ed093b41SRobert Mustacchi * writes in several ways. Of all the components, the PRX_INFO_XCR is read-only
813*ed093b41SRobert Mustacchi * and may not be modified by a calling process. In addition, when we have
814*ed093b41SRobert Mustacchi * 32-bit applications which have reserved registers in the %ymm, %zmm, etc.
815*ed093b41SRobert Mustacchi * components, if they are being written to and have modifications, then we will
816*ed093b41SRobert Mustacchi * indicate an error there.
817*ed093b41SRobert Mustacchi *
818*ed093b41SRobert Mustacchi * Because we are given the entire buffer from userland and don't need to have
819*ed093b41SRobert Mustacchi * an intermediate place to copy it in, we will validate the entire thing in
820*ed093b41SRobert Mustacchi * advance. Once it has been validated and we consider it legal, then we will
821*ed093b41SRobert Mustacchi * translate each entry into its corresponding entry in pcb's normal floating
822*ed093b41SRobert Mustacchi * point state. This is different from signal handling mostly because of the
823*ed093b41SRobert Mustacchi * fact that we are not using copyin, and once we get to this point, there is
824*ed093b41SRobert Mustacchi * no more validation, so we don't have the same concerns around blocking while
825*ed093b41SRobert Mustacchi * pre-emption is disabled.
826*ed093b41SRobert Mustacchi *
827*ed093b41SRobert Mustacchi * The Wrinkle with fpregs
828*ed093b41SRobert Mustacchi * -----------------------
829*ed093b41SRobert Mustacchi *
830*ed093b41SRobert Mustacchi * When we instead turn our attention to the fpregs, whether we're gathering
831*ed093b41SRobert Mustacchi * them as part of the ucontext_t or as part of /proc, there are a few
832*ed093b41SRobert Mustacchi * complications that we need to be aware of when we're operating on a kernel
833*ed093b41SRobert Mustacchi * that is using xsave as the save mechanism. When we're using fxsave as the
834*ed093b41SRobert Mustacchi * save mechanism, the CPU will always save the entire 512-byte fxsave region.
835*ed093b41SRobert Mustacchi * The fpregs ABI that the kernel expects is basically this structure itself,
836*ed093b41SRobert Mustacchi * which is transformed into a 32-bit compatible form in archdep.c.
837*ed093b41SRobert Mustacchi *
838*ed093b41SRobert Mustacchi * But xsave makes this much more complex and has historically been a source of
839*ed093b41SRobert Mustacchi * bugs in the system. In particular, unlike fxsave, xsave has its component bit
840*ed093b41SRobert Mustacchi * vector that is written out to indicate validity. This means that blindly
841*ed093b41SRobert Mustacchi * copying the fxsave area without checking those bits will lead us to do the
842*ed093b41SRobert Mustacchi * wrong thing. The XMM state flag mostly covers the 16 128-bit %xmm registers,
843*ed093b41SRobert Mustacchi * while the x87 legacy fp flag covers the rest of the state. This is all good,
844*ed093b41SRobert Mustacchi * aside from the MXCSR.
845*ed093b41SRobert Mustacchi *
846*ed093b41SRobert Mustacchi * One of the more complicated pieces of xsave state management is correctly
847*ed093b41SRobert Mustacchi * answering the question of when the MXCSR is written out to xsave_state. In
848*ed093b41SRobert Mustacchi * practice, this is rather convoluted and varies. If either the XMM or AVX
849*ed093b41SRobert Mustacchi * feature bits are set then the CPU will write out the MXCSR and its mask
850*ed093b41SRobert Mustacchi * register into the traditional fxsave state region. This behavior is dependent
851*ed093b41SRobert Mustacchi * on the type of save function that we use. xsave and xsaveopt will look at the
852*ed093b41SRobert Mustacchi * AVX feature bit; however, xsavec does not and only considers the SSE feature
853*ed093b41SRobert Mustacchi * bit. This means that when we're retrieving things, we need to check both of
854*ed093b41SRobert Mustacchi * those bits to determine if we should use the initial state or the value
855*ed093b41SRobert Mustacchi * written out.
856*ed093b41SRobert Mustacchi *
857*ed093b41SRobert Mustacchi * When we come to someone trying to set the fpregs through /proc, the main
858*ed093b41SRobert Mustacchi * question we have is what happens to the extended registers. We have opted to
859*ed093b41SRobert Mustacchi * implement and document it such that a write to the fpregs only impacts the
860*ed093b41SRobert Mustacchi * fpregs. Put differently, we will save the FPU state with fp_save() ahead of
861*ed093b41SRobert Mustacchi * copying the data into the save area, set the state bits for x87 and XMM
862*ed093b41SRobert Mustacchi * state, and then set the FPU to be restored. All in all, this basically means
863*ed093b41SRobert Mustacchi * that writing to fpregs does not touch any of the %ymm, %zmm, or other state
864*ed093b41SRobert Mustacchi * that we might have present.
865*ed093b41SRobert Mustacchi *
866*ed093b41SRobert Mustacchi * Forward Looking: Adding Intel AMX Support
867*ed093b41SRobert Mustacchi * -----------------------------------------
868*ed093b41SRobert Mustacchi *
869*ed093b41SRobert Mustacchi * Nothing can stop the march of features being added into the FPU. One of the
870*ed093b41SRobert Mustacchi * larger chunks that we will need to wrangle with is Intel's Advanced Matrix
871*ed093b41SRobert Mustacchi * Extensions (AMX), which add a large chunk of xsave state to each process.
872*ed093b41SRobert Mustacchi * While things like AVX and AVX-512 have been enabled by default, the broader
873*ed093b41SRobert Mustacchi * OS community has not been wanting to do this for AMX, because of the size of
874*ed093b41SRobert Mustacchi * the state which exceeds 8 KiB. While the signal handling state went out of
875*ed093b41SRobert Mustacchi * its way to minimize the size it wrote to the stack, if this is used, it would
876*ed093b41SRobert Mustacchi * need to be preserved.
877*ed093b41SRobert Mustacchi *
878*ed093b41SRobert Mustacchi * To deal with this reality and the fact that folks don't really want to
879*ed093b41SRobert Mustacchi * enable it by default for all purposes when its use will be quite special
880*ed093b41SRobert Mustacchi * purpose, Intel has also added a MSR around extended feature disable or xfd.
881*ed093b41SRobert Mustacchi * This is what we represent in the PRX_INFO_XCR prx_xfd member. Our starting
882*ed093b41SRobert Mustacchi * assumption, and the reason that so much of the /proc and signal logic ensures
883*ed093b41SRobert Mustacchi * that we have the thread and process around, taking as an example the unused
884*ed093b41SRobert Mustacchi * process argument in fpu_proc_xregs_info(), is that we will follow suit and
885*ed093b41SRobert Mustacchi * default to having support disabled, but that a process will be able to opt
886*ed093b41SRobert Mustacchi * into it, which will result in several different assumptions around signal
887*ed093b41SRobert Mustacchi * stack sizing and cause us to reallocate and extend the pcb's FPU save state.
888*ed093b41SRobert Mustacchi *
889*ed093b41SRobert Mustacchi * The following is a list of items to pay attention to for future folks who
890*ed093b41SRobert Mustacchi * work on this:
891*ed093b41SRobert Mustacchi *
892*ed093b41SRobert Mustacchi * o We will want to confirm whether other systems have opted to make this
893*ed093b41SRobert Mustacchi * process-wide or thread-wide. Assuming process-wide, we will need to do a
894*ed093b41SRobert Mustacchi * hold of all lwps while making a change. The interface for that probably
895*ed093b41SRobert Mustacchi * doesn't want to be /proc, as a process probably doesn't want to write to
896*ed093b41SRobert Mustacchi * its own control file. Changing it for another process could be done
897*ed093b41SRobert Mustacchi * through the agent-lwp.
898*ed093b41SRobert Mustacchi * o Opting into this should probably be a one-way street.
899*ed093b41SRobert Mustacchi * o Opting into this will need to evaluate all threads and in particular
900*ed093b41SRobert Mustacchi * stack sizes to confirm they adhere to the new minimum.
901*ed093b41SRobert Mustacchi * o We will need to make sure that setting and clearing the xfd MSR is part
902*ed093b41SRobert Mustacchi * of the FPU context ops and something we set by default on every CPU.
903*ed093b41SRobert Mustacchi * o We will need to add a new interface to allow opting into this feature.
904*ed093b41SRobert Mustacchi * o We will need to ensure that all subsequently created signal stacks adhere
905*ed093b41SRobert Mustacchi * to a required minimum size that we communicate through libc.
906*ed093b41SRobert Mustacchi * o We will need to make sure that both rtld and libc no longer rely on a
907*ed093b41SRobert Mustacchi * static value of the AT_SUN_FPSIZE, but rather realize that this can be
908*ed093b41SRobert Mustacchi * dynamic. At that time, we should evaluate if we can get away with not
909*ed093b41SRobert Mustacchi * needing to save this for rtld, even though signal handlers should assume
910*ed093b41SRobert Mustacchi * they will.
911*ed093b41SRobert Mustacchi * o The various components (because there is more than one) will want to be
912*ed093b41SRobert Mustacchi * added to the fpu_xsave_info[]. Consulting the process's xfd will be
913*ed093b41SRobert Mustacchi * required and probably require logic changes.
914*ed093b41SRobert Mustacchi *
915*ed093b41SRobert Mustacchi * The above is not exhaustive. We'll probably have some other issues and fun
916*ed093b41SRobert Mustacchi * while doing this.
917*ed093b41SRobert Mustacchi */
918*ed093b41SRobert Mustacchi
/*
 * The kind of FPU we advertise to rtld so it knows what to do when working
 * through the PLT. Defaults to the fxsave flavor; NOTE(review): presumably
 * updated during FPU probing when a more capable mechanism is found — confirm
 * in fpu_probe.
 */
int fp_elf = AT_386_FPINFO_FXSAVE;
924*ed093b41SRobert Mustacchi
/*
 * Mechanism to save FPU state. All of the save/restore dispatch below
 * (fp_save(), fp_restore(), fp_new_lwp()) keys off this value. Defaults to
 * fxsave; NOTE(review): presumably switched to FP_XSAVE in fpu_probe on
 * XSAVE-capable hardware — confirm.
 */
int fp_save_mech = FP_FXSAVE;
9294c28a617SRobert Mustacchi
/* kmem cache from which per-lwp FPU save areas are allocated (see fp_lwp_init()). */
kmem_cache_t *fpsave_cachep;
931088d69f8SJerry Jelinek
/*
 * Legacy fxsave layout (512 bytes) + xsave header (64 bytes) + ymm component
 * (256 bytes). Used as a lower bound on struct xsave_state below.
 */
#define	AVX_XSAVE_SIZE		(512 + 64 + 256)
9347c478bd9Sstevel@tonic-gate
/*
 * Various compile-time sanity checks:
 *  - fxsave_state must match the architectural 512-byte fxsave image;
 *  - fnsave_state must match the legacy 108-byte fnsave image;
 *  - the %xmm save area must be 16-byte aligned within fxsave_state;
 *  - xsave_state must be able to hold at least the AVX layout.
 */
CTASSERT(sizeof (struct fxsave_state) == 512);
CTASSERT(sizeof (struct fnsave_state) == 108);
CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
9424c28a617SRobert Mustacchi
/*
 * Basic architectural alignment information: required alignment, in bytes, of
 * the %xmm, %ymm, and %zmm register save areas.
 */
#define	FPU_ALIGN_XMM	16
#define	FPU_ALIGN_YMM	32
#define	FPU_ALIGN_ZMM	64
949*ed093b41SRobert Mustacchi
/*
 * This structure is the x86 implementation of the kernel FPU that is defined in
 * uts/common/sys/kfpu.h.
 */

typedef enum kfpu_flags {
	/*
	 * This indicates that the save state has initial FPU data.
	 */
	KFPU_F_INITIALIZED = 0x01
} kfpu_flags_t;

struct kfpu_state {
	fpu_ctx_t	kfpu_ctx;	/* per-state FPU context */
	kfpu_flags_t	kfpu_flags;	/* see kfpu_flags_t above */
	/*
	 * NOTE(review): presumably the thread currently bound to this kernel
	 * FPU state, if any — confirm against uts/common/sys/kfpu.h users.
	 */
	kthread_t	*kfpu_curthread;
};
9672fc9ab6eSJerry Jelinek
/*
 * Initial kfpu state for SSE/SSE2 used by fpinit(). Note that FPU_CW_INIT is
 * the SYS V ABI control word rather than the hardware reset default (see the
 * discussion of illumos FPU defaults in the big theory comment above).
 */
const struct fxsave_state sse_initial = {
	FPU_CW_INIT,	/* fx_fcw */
	0,		/* fx_fsw */
	0,		/* fx_fctw */
	0,		/* fx_fop */
	0,		/* fx_rip */
	0,		/* fx_rdp */
	SSE_MXCSR_INIT	/* fx_mxcsr */
	/* rest of structure is zero */
};
9817c478bd9Sstevel@tonic-gate
/*
 * Initial kfpu state for AVX used by fpinit().
 */
const struct xsave_state avx_initial = {
	/*
	 * The definition below needs to be identical with sse_initial
	 * defined above. All fields not named here are zero per C aggregate
	 * initialization rules.
	 */
	.xs_fxsave = {
		.fx_fcw = FPU_CW_INIT,
		.fx_mxcsr = SSE_MXCSR_INIT,
	},
	.xs_header = {
		/*
		 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
		 * valid, and CPU should initialize XMM/YMM.
		 */
		.xsh_xstate_bv = 1,
		.xsh_xcomp_bv = 0,
	},
};
10037af88ac7SKuriakose Kuruvilla
/*
 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
 * the #gp exception caused by setting unsupported bits in the
 * MXCSR register.
 */
uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
10107c478bd9Sstevel@tonic-gate
/*
 * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
 * have an XSAVE-capable chip in fpu_probe. These two pointers are installed as
 * the ctxop save/restore callbacks by fp_ctxop_allocate() below.
 * NOTE(review): presumably fprestore_ctxt is patched to the matching xrstor
 * variant at the same time — confirm in fpu_probe.
 */
void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
10177c478bd9Sstevel@tonic-gate
/*
 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
 * It is the raw save primitive used by fp_save() when fp_save_mech is FP_XSAVE.
 */
void (*xsavep)(struct xsave_state *, uint64_t) = xsave;
1022088d69f8SJerry Jelinek
/* Forward declarations; the definitions appear later in this file. */
static int fpe_sicode(uint_t);
static int fpe_simd_sicode(uint_t);
static void fp_new_lwp(void *, void *);
static void fp_free_ctx(void *, int);
10275a469116SPatrick Mooney
10285a469116SPatrick Mooney static struct ctxop *
fp_ctxop_allocate(struct fpu_ctx * fp)10295a469116SPatrick Mooney fp_ctxop_allocate(struct fpu_ctx *fp)
10305a469116SPatrick Mooney {
10315a469116SPatrick Mooney const struct ctxop_template tpl = {
10325a469116SPatrick Mooney .ct_rev = CTXOP_TPL_REV,
10335a469116SPatrick Mooney .ct_save = fpsave_ctxt,
10345a469116SPatrick Mooney .ct_restore = fprestore_ctxt,
10355a469116SPatrick Mooney .ct_fork = fp_new_lwp,
10365a469116SPatrick Mooney .ct_lwp_create = fp_new_lwp,
10375a469116SPatrick Mooney .ct_free = fp_free_ctx,
10385a469116SPatrick Mooney };
10395a469116SPatrick Mooney return (ctxop_allocate(&tpl, fp));
10405a469116SPatrick Mooney }
10417c478bd9Sstevel@tonic-gate
/*
 * Copy the state of parent lwp's floating point context into the new lwp.
 * Invoked for both fork() and lwp_create().
 *
 * Note that we inherit -only- the control state (e.g. exception masks,
 * rounding, precision control, etc.); the FPU registers are otherwise
 * reset to their initial state.
 *
 * 'parent' and 'child' are kthread_id_t values; the ctxop fork/lwp-create
 * callbacks hand them to us as void pointers (see fp_ctxop_allocate()).
 */
static void
fp_new_lwp(void *parent, void *child)
{
	kthread_id_t t = parent, ct = child;
	struct fpu_ctx *fp;		/* parent fpu context */
	struct fpu_ctx *cfp;		/* new fpu context */
	struct fxsave_state *fx, *cfx;
	struct xsave_state *cxs;

	ASSERT(fp_kind != FP_NO);

	fp = &t->t_lwp->lwp_pcb.pcb_fpu;
	cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;

	/*
	 * If the parent FPU state is still in the FPU hw then save it;
	 * conveniently, fp_save() already does this for us nicely.
	 */
	fp_save(fp);

	/*
	 * The child starts with a valid in-memory image (constructed below)
	 * and no live hardware state, hence FPU_VALID alongside FPU_EN.
	 */
	cfp->fpu_flags = FPU_EN | FPU_VALID;
	cfp->fpu_regs.kfpu_status = 0;
	cfp->fpu_regs.kfpu_xstatus = 0;

	/*
	 * Make sure that the child's FPU is cleaned up and made ready for user
	 * land.
	 */
	PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
		cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
		/*
		 * Start from the canonical initial image, then inherit only
		 * the parent's control state: MXCSR minus the sticky
		 * exception flags, and the x87 control word.
		 */
		bcopy(&sse_initial, cfx, sizeof (*cfx));
		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
		cfx->fx_fcw = fx->fx_fcw;
		break;

	case FP_XSAVE:
		cfp->fpu_xsave_mask = fp->fpu_xsave_mask;

		VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);

		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
		cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
		cfx = &cxs->xs_fxsave;

		/*
		 * As above: the initial image plus the parent's control
		 * state. Also mark the enabled legacy x87/SSE components as
		 * present in the child's xstate bit vector.
		 */
		bcopy(&avx_initial, cxs, sizeof (*cxs));
		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
		cfx->fx_fcw = fx->fx_fcw;
		cxs->xs_header.xsh_xstate_bv |=
		    (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	/*
	 * Mark that both the parent and child need to have the FPU cleaned up
	 * before returning to userland.
	 */

	ctxop_attach(ct, fp_ctxop_allocate(cfp));
}
11167c478bd9Sstevel@tonic-gate
/*
 * Free any state associated with floating point context.
 * Fp_free can be called in three cases:
 * 1) from reaper -> thread_free -> freectx -> fp_free
 *	fp context belongs to a thread on deathrow
 *	nothing to do, thread will never be resumed
 *	thread calling ctxfree is reaper
 *
 * 2) from exec -> freectx -> fp_free
 *	fp context belongs to the current thread
 *	must disable fpu, thread calling ctxfree is curthread
 *
 * 3) from restorecontext -> setfpregs -> fp_free
 *	we have a modified context in the memory (lwp->pcb_fpu)
 *	disable fpu and release the fp context for the CPU
 */
void
fp_free(struct fpu_ctx *fp)
{
	ASSERT(fp_kind != FP_NO);

	/* State already saved and hardware already disabled; nothing to do. */
	if (fp->fpu_flags & FPU_VALID)
		return;

	kpreempt_disable();
	/*
	 * We want to do fpsave rather than fpdisable so that we can
	 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
	 */
	fp->fpu_flags |= FPU_VALID;
	/* If for current thread disable FP to track FPU_VALID */
	if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
		/* Clear errors if any to prevent frstor from complaining */
		(void) fperr_reset();
		if (fp_kind & __FP_SSE)
			(void) fpxerr_reset();
		fpdisable();
	}
	kpreempt_enable();
}
11587c478bd9Sstevel@tonic-gate
11595a469116SPatrick Mooney /*
11605a469116SPatrick Mooney * Wrapper for freectx to make the types line up for fp_free()
11615a469116SPatrick Mooney */
11625a469116SPatrick Mooney static void
fp_free_ctx(void * arg,int isexec __unused)11635a469116SPatrick Mooney fp_free_ctx(void *arg, int isexec __unused)
11645a469116SPatrick Mooney {
11655a469116SPatrick Mooney fp_free((struct fpu_ctx *)arg);
11665a469116SPatrick Mooney }
11675a469116SPatrick Mooney
/*
 * Store the floating point state and disable the floating point unit.
 *
 * This is a no-op if there is no context (fp == NULL), the state is already
 * saved (FPU_VALID), or the FPU was never enabled for this lwp (no FPU_EN).
 * Otherwise the caller must be saving its own lwp's context (asserted below).
 */
void
fp_save(struct fpu_ctx *fp)
{
	ASSERT(fp_kind != FP_NO);

	kpreempt_disable();
	if (!fp || fp->fpu_flags & FPU_VALID ||
	    (fp->fpu_flags & FPU_EN) == 0) {
		kpreempt_enable();
		return;
	}
	ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
		break;

	case FP_XSAVE:
		/* xsavep may be xsave or xsaveopt, chosen in fpu_probe. */
		xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	fp->fpu_flags |= FPU_VALID;

	/*
	 * We save the FPU as part of forking, execing, modifications via /proc,
	 * restorecontext, etc. As such, we need to make sure that we return to
	 * userland with valid state in the FPU. If we're context switched out
	 * before we hit sys_rtt_common() we'll end up having restored the FPU
	 * as part of the context ops operations. The restore logic always makes
	 * sure that FPU_VALID is set before doing a restore so we don't restore
	 * it a second time.
	 */
	PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);

	kpreempt_enable();
}
12127c478bd9Sstevel@tonic-gate
12137c478bd9Sstevel@tonic-gate /*
12147c478bd9Sstevel@tonic-gate * Restore the FPU context for the thread:
12157c478bd9Sstevel@tonic-gate * The possibilities are:
12167c478bd9Sstevel@tonic-gate * 1. No active FPU context: Load the new context into the FPU hw
12177c478bd9Sstevel@tonic-gate * and enable the FPU.
12187c478bd9Sstevel@tonic-gate */
12197c478bd9Sstevel@tonic-gate void
fp_restore(struct fpu_ctx * fp)12207c478bd9Sstevel@tonic-gate fp_restore(struct fpu_ctx *fp)
12217c478bd9Sstevel@tonic-gate {
12227af88ac7SKuriakose Kuruvilla switch (fp_save_mech) {
12237af88ac7SKuriakose Kuruvilla case FP_FXSAVE:
1224088d69f8SJerry Jelinek fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
12257af88ac7SKuriakose Kuruvilla break;
12267af88ac7SKuriakose Kuruvilla
12277af88ac7SKuriakose Kuruvilla case FP_XSAVE:
1228088d69f8SJerry Jelinek xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
12297af88ac7SKuriakose Kuruvilla break;
12307af88ac7SKuriakose Kuruvilla default:
12317af88ac7SKuriakose Kuruvilla panic("Invalid fp_save_mech");
12327af88ac7SKuriakose Kuruvilla /*NOTREACHED*/
12337af88ac7SKuriakose Kuruvilla }
12347af88ac7SKuriakose Kuruvilla
12357c478bd9Sstevel@tonic-gate fp->fpu_flags &= ~FPU_VALID;
12367c478bd9Sstevel@tonic-gate }
12377c478bd9Sstevel@tonic-gate
/*
 * Reset the FPU such that it is in a valid state for a new thread that is
 * coming out of exec. The FPU will be in a usable state at this point. At this
 * point we know that the FPU state has already been allocated and if this
 * wasn't an init process, then it will have had fp_free() previously called.
 */
void
fp_exec(void)
{
	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	if (fp_save_mech == FP_XSAVE) {
		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
	}

	struct ctxop *ctx = fp_ctxop_allocate(fp);
	/*
	 * Make sure that we're not preempted in the middle of initializing the
	 * FPU on CPU.
	 */
	kpreempt_disable();
	ctxop_attach(curthread, ctx);
	fpinit();
	fp->fpu_flags = FPU_EN;
	kpreempt_enable();
}
12644c28a617SRobert Mustacchi
12657c478bd9Sstevel@tonic-gate
/*
 * Seeds the initial state for the current thread. The possibilities are:
 * 1. Another process has modified the FPU state before we have done any
 *    initialization: Load the FPU state from the LWP state.
 * 2. The FPU state has not been externally modified: Load a clean state.
 *
 * Must be called with preemption disabled (asserted below) and with the FPU
 * not yet enabled for this lwp.
 */
void
fp_seed(void)
{
	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(curthread->t_preempt >= 1);
	ASSERT((fp->fpu_flags & FPU_EN) == 0);

	/*
	 * Always initialize a new context and initialize the hardware.
	 */
	if (fp_save_mech == FP_XSAVE) {
		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
	}

	ctxop_attach(curthread, fp_ctxop_allocate(fp));
	fpinit();

	/*
	 * If FPU_VALID is set, it means someone has modified registers via
	 * /proc. In this case, restore the current lwp's state.
	 */
	if (fp->fpu_flags & FPU_VALID)
		fp_restore(fp);

	ASSERT((fp->fpu_flags & FPU_VALID) == 0);
	fp->fpu_flags = FPU_EN;
}
13007c478bd9Sstevel@tonic-gate
1301088d69f8SJerry Jelinek /*
1302088d69f8SJerry Jelinek * When using xsave/xrstor, these three functions are used by the lwp code to
1303088d69f8SJerry Jelinek * manage the memory for the xsave area.
1304088d69f8SJerry Jelinek */
void
fp_lwp_init(klwp_t *lwp)
{
	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

	/*
	 * We keep a copy of the pointer in lwp_fpu so that we can restore the
	 * value in forklwp() after we duplicate the parent's LWP state.
	 */
	lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
	    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
	/* A new LWP starts with no saved signal FPU state. */
	fp->fpu_signal = NULL;

	if (fp_save_mech == FP_XSAVE) {
		/*
		 * We bzero since the fpinit() code path will only
		 * partially initialize the xsave area using avx_initial.
		 */
		ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
		bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
	}
}
1328088d69f8SJerry Jelinek
1329088d69f8SJerry Jelinek void
fp_lwp_cleanup(klwp_t * lwp)1330*ed093b41SRobert Mustacchi fp_lwp_cleanup(klwp_t *lwp)
1331088d69f8SJerry Jelinek {
1332088d69f8SJerry Jelinek struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
1333088d69f8SJerry Jelinek
1334088d69f8SJerry Jelinek if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
1335088d69f8SJerry Jelinek kmem_cache_free(fpsave_cachep,
1336088d69f8SJerry Jelinek fp->fpu_regs.kfpu_u.kfpu_generic);
1337088d69f8SJerry Jelinek lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
1338088d69f8SJerry Jelinek }
1339*ed093b41SRobert Mustacchi
1340*ed093b41SRobert Mustacchi if (fp->fpu_signal != NULL) {
1341*ed093b41SRobert Mustacchi kmem_cache_free(fpsave_cachep, fp->fpu_signal);
1342*ed093b41SRobert Mustacchi fp->fpu_signal = NULL;
1343*ed093b41SRobert Mustacchi }
1344088d69f8SJerry Jelinek }
1345088d69f8SJerry Jelinek
1346088d69f8SJerry Jelinek /*
1347088d69f8SJerry Jelinek * Called during the process of forklwp(). The kfpu_u pointer will have been
1348088d69f8SJerry Jelinek * overwritten while copying the parent's LWP structure. We have a valid copy
1349088d69f8SJerry Jelinek * stashed in the child's lwp_fpu which we use to restore the correct value.
1350088d69f8SJerry Jelinek */
1351088d69f8SJerry Jelinek void
fp_lwp_dup(klwp_t * lwp)1352*ed093b41SRobert Mustacchi fp_lwp_dup(klwp_t *lwp)
1353088d69f8SJerry Jelinek {
1354088d69f8SJerry Jelinek void *xp = lwp->lwp_fpu;
1355088d69f8SJerry Jelinek size_t sz;
1356088d69f8SJerry Jelinek
1357088d69f8SJerry Jelinek switch (fp_save_mech) {
1358088d69f8SJerry Jelinek case FP_FXSAVE:
1359088d69f8SJerry Jelinek sz = sizeof (struct fxsave_state);
1360088d69f8SJerry Jelinek break;
1361088d69f8SJerry Jelinek case FP_XSAVE:
1362088d69f8SJerry Jelinek sz = cpuid_get_xsave_size();
1363088d69f8SJerry Jelinek break;
1364088d69f8SJerry Jelinek default:
1365088d69f8SJerry Jelinek panic("Invalid fp_save_mech");
1366088d69f8SJerry Jelinek /*NOTREACHED*/
1367088d69f8SJerry Jelinek }
1368088d69f8SJerry Jelinek
1369088d69f8SJerry Jelinek /* copy the parent's values into the new lwp's struct */
1370088d69f8SJerry Jelinek bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
1371088d69f8SJerry Jelinek /* now restore the pointer */
1372088d69f8SJerry Jelinek lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
1373*ed093b41SRobert Mustacchi /* Ensure that we don't inherit our parent's signal state */
1374*ed093b41SRobert Mustacchi lwp->lwp_pcb.pcb_fpu.fpu_signal = NULL;
1375088d69f8SJerry Jelinek }
1376088d69f8SJerry Jelinek
13777c478bd9Sstevel@tonic-gate /*
13787c478bd9Sstevel@tonic-gate * Handle a processor extension error fault
13797c478bd9Sstevel@tonic-gate * Returns non zero for error.
13807c478bd9Sstevel@tonic-gate */
13817c478bd9Sstevel@tonic-gate
/*ARGSUSED*/
int
fpexterrflt(struct regs *rp)
{
	uint32_t fpcw, fpsw;
	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(fp_kind != FP_NO);

	/*
	 * Now we can enable the interrupts.
	 * (NOTE: x87 fp exceptions come thru interrupt gate)
	 */
	sti();

	/* No FPU hardware: treat as an invalid-operation fault. */
	if (!fpu_exists)
		return (FPE_FLTINV);

	/*
	 * Do an unconditional save of the FP state. If it's dirty (TS=0),
	 * it'll be saved into the fpu context area passed in (that of the
	 * current thread). If it's not dirty (it may not be, due to
	 * an intervening save due to a context switch between the sti(),
	 * above and here, then it's safe to just use the stored values in
	 * the context save area to determine the cause of the fault.
	 */
	fp_save(fp);

	/* clear exception flags in saved state, as if by fnclex */
	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
		fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
		fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
		break;

	case FP_XSAVE:
		fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
		fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
		/*
		 * Always set LEGACY_FP as it may have been cleared by XSAVE
		 * instruction
		 */
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
		    XFEATURE_LEGACY_FP;
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	/* Remember the raw status word for debuggers/core dumps. */
	fp->fpu_regs.kfpu_status = fpsw;

	if ((fpsw & FPS_ES) == 0)
		return (0);	/* No exception */

	/*
	 * "and" the exception flags with the complement of the mask
	 * bits to determine which exception occurred
	 */
	return (fpe_sicode(fpsw & ~fpcw & 0x3f));
}
14457c478bd9Sstevel@tonic-gate
14467c478bd9Sstevel@tonic-gate /*
14477c478bd9Sstevel@tonic-gate * Handle an SSE/SSE2 precise exception.
14487c478bd9Sstevel@tonic-gate * Returns a non-zero sicode for error.
14497c478bd9Sstevel@tonic-gate */
/*ARGSUSED*/
int
fpsimderrflt(struct regs *rp)
{
	uint32_t mxcsr, xmask;
	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(fp_kind & __FP_SSE);

	/*
	 * NOTE: Interrupts are disabled during execution of this
	 * function. They are enabled by the caller in trap.c.
	 */

	/*
	 * The only way we could have gotten here if there is no FP unit
	 * is via a user executing an INT $19 instruction, so there is
	 * no fault in that case.
	 */
	if (!fpu_exists)
		return (0);

	/*
	 * Do an unconditional save of the FP state. If it's dirty (TS=0),
	 * it'll be saved into the fpu context area passed in (that of the
	 * current thread). If it's not dirty, then it's safe to just use
	 * the stored values in the context save area to determine the
	 * cause of the fault.
	 */
	fp_save(fp);		/* save the FPU state */

	/* Fetch MXCSR and FSW from wherever the save mechanism put them. */
	if (fp_save_mech == FP_XSAVE) {
		mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
		fp->fpu_regs.kfpu_status =
		    fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
	} else {
		mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
		fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
	}
	fp->fpu_regs.kfpu_xstatus = mxcsr;

	/*
	 * compute the mask that determines which conditions can cause
	 * a #xm exception, and use this to clean the status bits so that
	 * we can identify the true cause of this one.
	 */
	xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
	return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
}
14997c478bd9Sstevel@tonic-gate
15007c478bd9Sstevel@tonic-gate /*
15017c478bd9Sstevel@tonic-gate * In the unlikely event that someone is relying on this subcode being
15027c478bd9Sstevel@tonic-gate * FPE_FLTILL for denormalize exceptions, it can always be patched back
15037c478bd9Sstevel@tonic-gate * again to restore old behaviour.
15047c478bd9Sstevel@tonic-gate */
int fpe_fltden = FPE_FLTDEN;	/* patchable back to FPE_FLTILL; see above */
15067c478bd9Sstevel@tonic-gate
15077c478bd9Sstevel@tonic-gate /*
15087c478bd9Sstevel@tonic-gate * Map from the FPU status word to the FP exception si_code.
15097c478bd9Sstevel@tonic-gate */
15107c478bd9Sstevel@tonic-gate static int
fpe_sicode(uint_t sw)15117c478bd9Sstevel@tonic-gate fpe_sicode(uint_t sw)
15127c478bd9Sstevel@tonic-gate {
15137c478bd9Sstevel@tonic-gate if (sw & FPS_IE)
15147c478bd9Sstevel@tonic-gate return (FPE_FLTINV);
15157c478bd9Sstevel@tonic-gate if (sw & FPS_ZE)
15167c478bd9Sstevel@tonic-gate return (FPE_FLTDIV);
15177c478bd9Sstevel@tonic-gate if (sw & FPS_DE)
15187c478bd9Sstevel@tonic-gate return (fpe_fltden);
15197c478bd9Sstevel@tonic-gate if (sw & FPS_OE)
15207c478bd9Sstevel@tonic-gate return (FPE_FLTOVF);
15217c478bd9Sstevel@tonic-gate if (sw & FPS_UE)
15227c478bd9Sstevel@tonic-gate return (FPE_FLTUND);
15237c478bd9Sstevel@tonic-gate if (sw & FPS_PE)
15247c478bd9Sstevel@tonic-gate return (FPE_FLTRES);
15257c478bd9Sstevel@tonic-gate return (FPE_FLTINV); /* default si_code for other exceptions */
15267c478bd9Sstevel@tonic-gate }
15277c478bd9Sstevel@tonic-gate
15287c478bd9Sstevel@tonic-gate /*
15297c478bd9Sstevel@tonic-gate * Map from the SSE status word to the FP exception si_code.
15307c478bd9Sstevel@tonic-gate */
15317c478bd9Sstevel@tonic-gate static int
fpe_simd_sicode(uint_t sw)15327c478bd9Sstevel@tonic-gate fpe_simd_sicode(uint_t sw)
15337c478bd9Sstevel@tonic-gate {
15347c478bd9Sstevel@tonic-gate if (sw & SSE_IE)
15357c478bd9Sstevel@tonic-gate return (FPE_FLTINV);
15367c478bd9Sstevel@tonic-gate if (sw & SSE_ZE)
15377c478bd9Sstevel@tonic-gate return (FPE_FLTDIV);
15387c478bd9Sstevel@tonic-gate if (sw & SSE_DE)
15397c478bd9Sstevel@tonic-gate return (FPE_FLTDEN);
15407c478bd9Sstevel@tonic-gate if (sw & SSE_OE)
15417c478bd9Sstevel@tonic-gate return (FPE_FLTOVF);
15427c478bd9Sstevel@tonic-gate if (sw & SSE_UE)
15437c478bd9Sstevel@tonic-gate return (FPE_FLTUND);
15447c478bd9Sstevel@tonic-gate if (sw & SSE_PE)
15457c478bd9Sstevel@tonic-gate return (FPE_FLTRES);
15467c478bd9Sstevel@tonic-gate return (FPE_FLTINV); /* default si_code for other exceptions */
15477c478bd9Sstevel@tonic-gate }
15487c478bd9Sstevel@tonic-gate
15497c478bd9Sstevel@tonic-gate /*
15507c478bd9Sstevel@tonic-gate * This routine is invoked as part of libc's __fpstart implementation
15517c478bd9Sstevel@tonic-gate * via sysi86(2).
15527c478bd9Sstevel@tonic-gate *
15537c478bd9Sstevel@tonic-gate * It may be called -before- any context has been assigned in which case
15547c478bd9Sstevel@tonic-gate * we try and avoid touching the hardware. Or it may be invoked well
15557c478bd9Sstevel@tonic-gate * after the context has been assigned and fiddled with, in which case
15567c478bd9Sstevel@tonic-gate * just tweak it directly.
15577c478bd9Sstevel@tonic-gate */
void
fpsetcw(uint16_t fcw, uint32_t mxcsr)
{
	struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
	struct fxsave_state *fx;

	if (!fpu_exists || fp_kind == FP_NO)
		return;

	if ((fp->fpu_flags & FPU_EN) == 0) {
		if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
			/*
			 * Common case. Floating point unit not yet
			 * enabled, and kernel already intends to initialize
			 * the hardware the way the caller wants.
			 */
			return;
		}
		/*
		 * Hmm. Userland wants a different default.
		 * Do a fake "first trap" to establish the context, then
		 * handle as if we already had a context before we came in.
		 */
		kpreempt_disable();
		fp_seed();
		kpreempt_enable();
	}

	/*
	 * Ensure that the current hardware state is flushed back to the
	 * pcb, then modify that copy. Next use of the fp will
	 * restore the context.
	 */
	fp_save(fp);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
		fx->fx_fcw = fcw;
		/*
		 * NOTE(review): sse_mxcsr_mask presumably filters out
		 * unsupported/reserved MXCSR bits -- confirm where it is set.
		 */
		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
		break;

	case FP_XSAVE:
		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
		fx->fx_fcw = fcw;
		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
		/*
		 * Always set LEGACY_FP as it may have been cleared by XSAVE
		 * instruction
		 */
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
		    XFEATURE_LEGACY_FP;
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}
}
16162fc9ab6eSJerry Jelinek
/*
 * (Re)initialize a kernel FPU save area to the pristine hardware defaults so
 * that a subsequent kernel_fpu_begin() starts from a known-clean state.
 */
static void
kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
{
	struct xsave_state *xs;

	switch (fp_save_mech) {
	case FP_FXSAVE:
		bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
		    sizeof (struct fxsave_state));
		kfpu->kfpu_ctx.fpu_xsave_mask = 0;
		break;
	case FP_XSAVE:
		xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
		/* avx_initial is smaller than the full area; clear the rest */
		bzero(xs, cpuid_get_xsave_size());
		bcopy(&avx_initial, xs, sizeof (*xs));
		xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
		kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
		break;
	default:
		panic("invalid fp_save_mech");
	}

	/*
	 * Set the corresponding flags that the system expects on the FPU state
	 * to indicate that this is our state. The FPU_EN flag is required to
	 * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly
	 * not set below as it represents that this state is being suppressed
	 * by the kernel.
	 */
	kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
	kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
}
16492fc9ab6eSJerry Jelinek
16502fc9ab6eSJerry Jelinek kfpu_state_t *
kernel_fpu_alloc(int kmflags)16512fc9ab6eSJerry Jelinek kernel_fpu_alloc(int kmflags)
16522fc9ab6eSJerry Jelinek {
16532fc9ab6eSJerry Jelinek kfpu_state_t *kfpu;
16542fc9ab6eSJerry Jelinek
16552fc9ab6eSJerry Jelinek if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
16562fc9ab6eSJerry Jelinek return (NULL);
16572fc9ab6eSJerry Jelinek }
16582fc9ab6eSJerry Jelinek
16592fc9ab6eSJerry Jelinek kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
16602fc9ab6eSJerry Jelinek kmem_cache_alloc(fpsave_cachep, kmflags);
16612fc9ab6eSJerry Jelinek if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
16622fc9ab6eSJerry Jelinek kmem_free(kfpu, sizeof (kfpu_state_t));
16632fc9ab6eSJerry Jelinek return (NULL);
16642fc9ab6eSJerry Jelinek }
16652fc9ab6eSJerry Jelinek
16662fc9ab6eSJerry Jelinek kernel_fpu_fpstate_init(kfpu);
16672fc9ab6eSJerry Jelinek
16682fc9ab6eSJerry Jelinek return (kfpu);
16692fc9ab6eSJerry Jelinek }
16702fc9ab6eSJerry Jelinek
16712fc9ab6eSJerry Jelinek void
kernel_fpu_free(kfpu_state_t * kfpu)16722fc9ab6eSJerry Jelinek kernel_fpu_free(kfpu_state_t *kfpu)
16732fc9ab6eSJerry Jelinek {
16742fc9ab6eSJerry Jelinek kmem_cache_free(fpsave_cachep,
16752fc9ab6eSJerry Jelinek kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
16762fc9ab6eSJerry Jelinek kmem_free(kfpu, sizeof (kfpu_state_t));
16772fc9ab6eSJerry Jelinek }
16782fc9ab6eSJerry Jelinek
/*
 * Save handler installed via kfpu_ctxop_tpl: preserves the kernel's FPU state
 * when a thread inside a kernel_fpu_begin()/end() section goes off-CPU.
 */
static void
kernel_fpu_ctx_save(void *arg)
{
	kfpu_state_t *kfpu = arg;
	fpu_ctx_t *pf;

	if (kfpu == NULL) {
		/*
		 * A NULL kfpu implies this is a kernel thread with an LWP and
		 * no user-level FPU usage. Use the lwp fpu save area.
		 */
		pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;

		ASSERT(curthread->t_procp->p_flag & SSYS);
		ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);

		fp_save(pf);
	} else {
		pf = &kfpu->kfpu_ctx;

		ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
		ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);

		/*
		 * Note, we can't use fp_save because it assumes that we're
		 * saving to the thread's PCB and not somewhere else. Because
		 * this is a different FPU context, we instead have to do this
		 * ourselves.
		 */
		switch (fp_save_mech) {
		case FP_FXSAVE:
			fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
			break;
		case FP_XSAVE:
			xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
			break;
		default:
			panic("Invalid fp_save_mech");
		}

		/*
		 * Because we have saved context here, our save state is no
		 * longer valid and therefore needs to be reinitialized.
		 */
		kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
	}

	/* The saved copy is now the authoritative state. */
	pf->fpu_flags |= FPU_VALID;

	/*
	 * Clear KFPU flag. This allows swtch to check for improper kernel
	 * usage of the FPU (i.e. switching to a new thread while the old
	 * thread was in the kernel and using the FPU, but did not perform a
	 * context save).
	 */
	curthread->t_flag &= ~T_KFPU;
}
17362fc9ab6eSJerry Jelinek
/*
 * Restore handler installed via kfpu_ctxop_tpl: reloads the kernel's FPU
 * state as the thread comes back on-CPU.  Pairs with kernel_fpu_ctx_save().
 */
static void
kernel_fpu_ctx_restore(void *arg)
{
	kfpu_state_t *kfpu = arg;
	fpu_ctx_t *pf;

	if (kfpu == NULL) {
		/*
		 * A NULL kfpu implies this is a kernel thread with an LWP and
		 * no user-level FPU usage. Use the lwp fpu save area.
		 */
		pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;

		ASSERT(curthread->t_procp->p_flag & SSYS);
		ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
	} else {
		pf = &kfpu->kfpu_ctx;

		ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
		ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
	}

	fp_restore(pf);
	/* Mark the thread as actively using the FPU in the kernel again. */
	curthread->t_flag |= T_KFPU;
}
17622fc9ab6eSJerry Jelinek
17632fc9ab6eSJerry Jelinek /*
17642fc9ab6eSJerry Jelinek * Validate that the thread is not switching off-cpu while actively using the
17652fc9ab6eSJerry Jelinek * FPU within the kernel.
17662fc9ab6eSJerry Jelinek */
17672fc9ab6eSJerry Jelinek void
kernel_fpu_no_swtch(void)17682fc9ab6eSJerry Jelinek kernel_fpu_no_swtch(void)
17692fc9ab6eSJerry Jelinek {
17702fc9ab6eSJerry Jelinek if ((curthread->t_flag & T_KFPU) != 0) {
17712fc9ab6eSJerry Jelinek panic("curthread swtch-ing while the kernel is using the FPU");
17722fc9ab6eSJerry Jelinek }
17732fc9ab6eSJerry Jelinek }
17742fc9ab6eSJerry Jelinek
/*
 * Context operations attached while a thread holds kernel FPU state via
 * kernel_fpu_begin(); they save/restore that state across off-CPU switches.
 */
static const struct ctxop_template kfpu_ctxop_tpl = {
	.ct_rev = CTXOP_TPL_REV,
	.ct_save = kernel_fpu_ctx_save,
	.ct_restore = kernel_fpu_ctx_restore,
};
17805a469116SPatrick Mooney
/*
 * Begin a kernel FPU session for curthread.  'flags' selects how the kernel
 * FPU context is managed:
 *  - KFPU_NO_STATE: no save area is used at all; the caller must hold
 *    kpreempt_disable() for the entire duration of FPU use.
 *  - KFPU_USE_LWP: a system (SSYS) thread with an LWP borrows its pcb_fpu
 *    as the save area; 'kfpu' must be NULL.
 *  - otherwise: the caller-supplied kfpu_state_t holds the kernel context
 *    and is handed to the ctxop save/restore handlers.
 * Nesting is not permitted; each begin must be balanced by kernel_fpu_end().
 */
void
kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
{
	klwp_t *pl = curthread->t_lwp;
	struct ctxop *ctx;

	if ((curthread->t_flag & T_KFPU) != 0) {
		panic("curthread attempting to nest kernel FPU states");
	}

	/* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
	ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
	    (KFPU_USE_LWP | KFPU_NO_STATE));

	if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
		/*
		 * Since we don't have a kfpu_state or usable lwp pcb_fpu to
		 * hold our kernel FPU context, we depend on the caller doing
		 * kpreempt_disable for the duration of our FPU usage. This
		 * should only be done for very short periods of time.
		 */
		ASSERT(curthread->t_preempt > 0);
		ASSERT(kfpu == NULL);

		if (pl != NULL) {
			/*
			 * We might have already saved once so FPU_VALID could
			 * be set. This is handled in fp_save.
			 */
			fp_save(&pl->lwp_pcb.pcb_fpu);
			pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
		}

		curthread->t_flag |= T_KFPU;

		/* Always restore the fpu to the initial state. */
		fpinit();

		return;
	}

	/*
	 * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
	 */

	if ((flags & KFPU_USE_LWP) == 0) {
		if (kfpu->kfpu_curthread != NULL)
			panic("attempting to reuse kernel FPU state at %p when "
			    "another thread already is using", kfpu);

		if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
			kernel_fpu_fpstate_init(kfpu);

		kfpu->kfpu_curthread = curthread;
	}

	/*
	 * Not all threads may have an active LWP. If they do and we're not
	 * going to re-use the LWP, then we should go ahead and save the state.
	 * We must also note that the fpu is now being used by the kernel and
	 * therefore we do not want to manage the fpu state via the user-level
	 * thread's context handlers.
	 *
	 * We might have already saved once (due to a prior use of the kernel
	 * FPU or another code path) so FPU_VALID could be set. This is handled
	 * by fp_save, as is the FPU_EN check.
	 */
	ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu);
	kpreempt_disable();
	if (pl != NULL) {
		if ((flags & KFPU_USE_LWP) == 0)
			fp_save(&pl->lwp_pcb.pcb_fpu);
		pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
	}

	/*
	 * Set the context operations for kernel FPU usage. Because kernel FPU
	 * setup and ctxop attachment needs to happen under the protection of
	 * kpreempt_disable(), we allocate the ctxop outside the guard so its
	 * sleeping allocation will not cause a voluntary swtch(). This allows
	 * the rest of the initialization to proceed, ensuring valid state for
	 * the ctxop handlers.
	 */
	ctxop_attach(curthread, ctx);
	curthread->t_flag |= T_KFPU;

	if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
		/*
		 * For pure kernel threads with an LWP, we can use the LWP's
		 * pcb_fpu to save/restore context.
		 */
		fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;

		VERIFY(curthread->t_procp->p_flag & SSYS);
		VERIFY(kfpu == NULL);
		ASSERT((pf->fpu_flags & FPU_EN) == 0);

		/* Always restore the fpu to the initial state. */
		if (fp_save_mech == FP_XSAVE)
			pf->fpu_xsave_mask = XFEATURE_FP_ALL;
		fpinit();
		pf->fpu_flags = FPU_EN | FPU_KERNEL;
	} else {
		/* initialize the kfpu state */
		kernel_fpu_ctx_restore(kfpu);
	}
	kpreempt_enable();
}
18892fc9ab6eSJerry Jelinek
/*
 * Tear down kernel FPU usage established by a matching kernel_fpu_begin()
 * call. 'kfpu' and 'flags' must mirror the values that were passed to
 * kernel_fpu_begin(); in particular KFPU_NO_STATE implies no ctxop was
 * installed and that the caller already holds preemption disabled.
 */
void
kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
{
	if ((curthread->t_flag & T_KFPU) == 0) {
		panic("curthread attempting to clear kernel FPU state "
		    "without using it");
	}

	/*
	 * General comments on why the rest of this function is structured the
	 * way it is. Be aware that there is a lot of subtlety here.
	 *
	 * If a user-level thread ever uses the fpu while in the kernel, then
	 * we cannot call fpdisable since that does STTS. That will set the
	 * ts bit in %cr0 which will cause an exception if anything touches the
	 * fpu. However, the user-level context switch handler (fpsave_ctxt)
	 * needs to access the fpu to save the registers into the pcb.
	 * fpsave_ctxt relies on CLTS having been done to clear the ts bit in
	 * fprestore_ctxt when the thread context switched onto the CPU.
	 *
	 * Calling fpdisable only effects the current CPU's %cr0 register.
	 *
	 * During ctxop_remove and kpreempt_enable, we can voluntarily context
	 * switch, so the CPU we were on when we entered this function might
	 * not be the same one we're on when we return from ctxop_remove or end
	 * the function. Note there can be user-level context switch handlers
	 * still installed if this is a user-level thread.
	 *
	 * We also must be careful in the unlikely chance we're running in an
	 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
	 * incorrectly for the "real" thread to resume on this CPU.
	 */

	if ((flags & KFPU_NO_STATE) == 0) {
		kpreempt_disable();
	} else {
		/* KFPU_NO_STATE callers must already have preemption off. */
		ASSERT(curthread->t_preempt > 0);
	}

	curthread->t_flag &= ~T_KFPU;

	/*
	 * When we are ending things, we explicitly don't save the current
	 * kernel FPU state back to the temporary state. The kfpu API is not
	 * intended to be a permanent save location.
	 *
	 * If this is a user-level thread and we were to context switch
	 * before returning to user-land, fpsave_ctxt will be a no-op since we
	 * already saved the user-level FPU state the first time we run
	 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
	 * the user-level fpu state). The fpsave_ctxt functions only save if
	 * FPU_VALID is not already set. fp_save also set PCB_SET_UPDATE_FPU so
	 * fprestore_ctxt will be done in sys_rtt_common when the thread
	 * finally returns to user-land.
	 */

	if ((curthread->t_procp->p_flag & SSYS) != 0 &&
	    curthread->t_intr == NULL) {
		/*
		 * A kernel thread which is not an interrupt thread, so we
		 * STTS now.
		 */
		fpdisable();
	}

	if ((flags & KFPU_NO_STATE) == 0) {
		ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);

		if (kfpu != NULL) {
			if (kfpu->kfpu_curthread != curthread) {
				panic("attempting to end kernel FPU state "
				    "for %p, but active thread is not "
				    "curthread", kfpu);
			} else {
				kfpu->kfpu_curthread = NULL;
			}
		}

		kpreempt_enable();
	}

	/* Drop the kernel-FPU flags we set on the LWP's pcb in begin(). */
	if (curthread->t_lwp != NULL) {
		uint_t f;

		if (flags & KFPU_USE_LWP) {
			f = FPU_EN | FPU_KERNEL;
		} else {
			f = FPU_KERNEL;
		}
		curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
	}
}
1982*ed093b41SRobert Mustacchi
1983*ed093b41SRobert Mustacchi /*
1984*ed093b41SRobert Mustacchi * Fill in FPU information that is required by exec.
1985*ed093b41SRobert Mustacchi */
1986*ed093b41SRobert Mustacchi void
fpu_auxv_info(int * typep,size_t * lenp)1987*ed093b41SRobert Mustacchi fpu_auxv_info(int *typep, size_t *lenp)
1988*ed093b41SRobert Mustacchi {
1989*ed093b41SRobert Mustacchi *typep = fp_elf;
1990*ed093b41SRobert Mustacchi switch (fp_save_mech) {
1991*ed093b41SRobert Mustacchi case FP_FXSAVE:
1992*ed093b41SRobert Mustacchi *lenp = sizeof (struct fxsave_state);
1993*ed093b41SRobert Mustacchi break;
1994*ed093b41SRobert Mustacchi case FP_XSAVE:
1995*ed093b41SRobert Mustacchi *lenp = cpuid_get_xsave_size();
1996*ed093b41SRobert Mustacchi break;
1997*ed093b41SRobert Mustacchi default:
1998*ed093b41SRobert Mustacchi *lenp = 0;
1999*ed093b41SRobert Mustacchi break;
2000*ed093b41SRobert Mustacchi }
2001*ed093b41SRobert Mustacchi }
2002*ed093b41SRobert Mustacchi
2003*ed093b41SRobert Mustacchi /*
2004*ed093b41SRobert Mustacchi * This function exists to transform an xsave_state into an fxsave_state. The
2005*ed093b41SRobert Mustacchi * way that we have to do this is nuanced. We assume that callers have already
2006*ed093b41SRobert Mustacchi * handled FPU_EN and thus we only need to consider the xsave_state and its
2007*ed093b41SRobert Mustacchi * component vector itself. This results in the following cases that we need to
2008*ed093b41SRobert Mustacchi * consider:
2009*ed093b41SRobert Mustacchi *
2010*ed093b41SRobert Mustacchi * o Neither the x87 / XMM state bits are set. We use the hardware default and
2011*ed093b41SRobert Mustacchi * need to ensure to copy the xsave header.
2012*ed093b41SRobert Mustacchi * o Both x87 / XMM state bits are set. We can copy everything.
2013*ed093b41SRobert Mustacchi * o Only the x87 bit is set. We need to copy the x87 state but make the XMM
2014*ed093b41SRobert Mustacchi * state be in the initial case.
2015*ed093b41SRobert Mustacchi * o Only the XMM bit is set. The reverse of the above case.
2016*ed093b41SRobert Mustacchi *
2017*ed093b41SRobert Mustacchi * The illumos and hardware defaults in 'sse_initial' and 'avx_initial' are
2018*ed093b41SRobert Mustacchi * generally the same; however, the default floating point control word is
2019*ed093b41SRobert Mustacchi * different.
2020*ed093b41SRobert Mustacchi *
 * Finally, we have the complication of the MXCSR and MXCSR_MASK registers.
2022*ed093b41SRobert Mustacchi * Because we are using xsave and xsaveopt in the kernel right now and not
2023*ed093b41SRobert Mustacchi * xsavec, the hardware may write out the MXCSR and MXCSR_MASK registers if the
2024*ed093b41SRobert Mustacchi * XFEATURE_AVX bit is set. Therefore if we don't have the XMM bit set but AVX
2025*ed093b41SRobert Mustacchi * is set, we must also come back and copy out the MXCSR register. Sorry, we
2026*ed093b41SRobert Mustacchi * don't make the rules.
2027*ed093b41SRobert Mustacchi */
2028*ed093b41SRobert Mustacchi static void
fpu_xsave_to_fxsave(const struct xsave_state * xsave,struct fxsave_state * fx)2029*ed093b41SRobert Mustacchi fpu_xsave_to_fxsave(const struct xsave_state *xsave, struct fxsave_state *fx)
2030*ed093b41SRobert Mustacchi {
2031*ed093b41SRobert Mustacchi const uint64_t comps = xsave->xs_header.xsh_xstate_bv;
2032*ed093b41SRobert Mustacchi
2033*ed093b41SRobert Mustacchi switch (comps & (XFEATURE_LEGACY_FP | XFEATURE_SSE)) {
2034*ed093b41SRobert Mustacchi case XFEATURE_LEGACY_FP | XFEATURE_SSE:
2035*ed093b41SRobert Mustacchi bcopy(xsave, fx, sizeof (*fx));
2036*ed093b41SRobert Mustacchi return;
2037*ed093b41SRobert Mustacchi case XFEATURE_LEGACY_FP:
2038*ed093b41SRobert Mustacchi bcopy(xsave, fx, offsetof(struct fxsave_state, fx_xmm));
2039*ed093b41SRobert Mustacchi fx->fx_mxcsr = SSE_MXCSR_INIT;
2040*ed093b41SRobert Mustacchi fx->fx_mxcsr_mask = 0;
2041*ed093b41SRobert Mustacchi break;
2042*ed093b41SRobert Mustacchi case XFEATURE_SSE:
2043*ed093b41SRobert Mustacchi bcopy(&sse_initial, fx, offsetof(struct fxsave_state,
2044*ed093b41SRobert Mustacchi fx_mxcsr));
2045*ed093b41SRobert Mustacchi
2046*ed093b41SRobert Mustacchi fx->fx_fcw = FPU_CW_INIT_HW;
2047*ed093b41SRobert Mustacchi fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
2048*ed093b41SRobert Mustacchi fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
2049*ed093b41SRobert Mustacchi bcopy(xsave->xs_fxsave.fx_xmm, fx->fx_xmm, sizeof (fx->fx_xmm));
2050*ed093b41SRobert Mustacchi break;
2051*ed093b41SRobert Mustacchi default:
2052*ed093b41SRobert Mustacchi bcopy(&sse_initial, fx, sizeof (*fx));
2053*ed093b41SRobert Mustacchi fx->fx_fcw = FPU_CW_INIT_HW;
2054*ed093b41SRobert Mustacchi break;
2055*ed093b41SRobert Mustacchi }
2056*ed093b41SRobert Mustacchi
2057*ed093b41SRobert Mustacchi /*
2058*ed093b41SRobert Mustacchi * Account for the AVX causing MXCSR to be valid.
2059*ed093b41SRobert Mustacchi */
2060*ed093b41SRobert Mustacchi if ((xsave->xs_header.xsh_xstate_bv & XFEATURE_AVX) != 0 &&
2061*ed093b41SRobert Mustacchi (xsave->xs_header.xsh_xstate_bv & XFEATURE_SSE) == 0) {
2062*ed093b41SRobert Mustacchi fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
2063*ed093b41SRobert Mustacchi fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
2064*ed093b41SRobert Mustacchi }
2065*ed093b41SRobert Mustacchi }
2066*ed093b41SRobert Mustacchi
2067*ed093b41SRobert Mustacchi /*
2068*ed093b41SRobert Mustacchi * This function is designed to answer the question of are we using any xsave
2069*ed093b41SRobert Mustacchi * family of instructions in context switch and therefore we have this state.
2070*ed093b41SRobert Mustacchi * This should still remain true if we are using xsavec or xsaves in the kernel
2071*ed093b41SRobert Mustacchi * in the future.
2072*ed093b41SRobert Mustacchi */
2073*ed093b41SRobert Mustacchi boolean_t
fpu_xsave_enabled(void)2074*ed093b41SRobert Mustacchi fpu_xsave_enabled(void)
2075*ed093b41SRobert Mustacchi {
2076*ed093b41SRobert Mustacchi return (fp_save_mech == FP_XSAVE);
2077*ed093b41SRobert Mustacchi }
2078*ed093b41SRobert Mustacchi
2079*ed093b41SRobert Mustacchi /*
2080*ed093b41SRobert Mustacchi * The following structure is used to track and manage the programmatic
2081*ed093b41SRobert Mustacchi * construction of /proc and signal stack spilling of xsave information. All
2082*ed093b41SRobert Mustacchi * known xsave types that the kernel supports must be included here.
2083*ed093b41SRobert Mustacchi */
/*
 * The following structure is used to track and manage the programmatic
 * construction of /proc and signal stack spilling of xsave information. All
 * known xsave types that the kernel supports must be included here.
 */
typedef struct xsave_proc_info {
	/*
	 * This matches the /proc xregs type that this data represents. This is
	 * used for /proc only.
	 */
	uint32_t xi_type;
	/*
	 * This indicates the size of the /proc data that we're operating on.
	 * This is only used for /proc.
	 */
	size_t xi_size;
	/*
	 * This indicates the alignment that we want to have for the member when
	 * we're writing out. This is not used when setting data. This is only
	 * used for /proc.
	 */
	size_t xi_align;
	/*
	 * This indicates whether this member must always be considered or not.
	 * This is used in both /proc and context/signal handling.
	 */
	bool xi_always;
	/*
	 * This contains the corresponding bits in the xsave bit vector that
	 * corresponds to this entry. This is used for both /proc and
	 * context/signal handling.
	 */
	uint64_t xi_bits;
	/*
	 * The xi_fill function pointer is used to write out the /proc regset
	 * data (e.g. when a user reads xregs). This is only used for the /proc
	 * handling. The xi_valid function pointer is used instead to validate a
	 * given set of data that we've read in, while the xi_set pointer is
	 * used to actually transform the data in the underlying fpu save area.
	 */
	void (*xi_fill)(const fpu_ctx_t *, const struct xsave_proc_info *,
	    void *);
	bool (*xi_valid)(model_t, const void *);
	void (*xi_set)(fpu_ctx_t *, const struct xsave_proc_info *,
	    uint64_t, const void *);
	/*
	 * The xi_signal_in and xi_signal_out function pointers are used for
	 * extended context and signal handling information. They are used when
	 * reading in data from a ucontext_t and writing it out respectively.
	 * These are only used for context/signal handling.
	 */
	int (*xi_signal_in)(const struct xsave_proc_info *,
	    const ucontext_t *, const uc_xsave_t *, void *, uintptr_t *,
	    const uintptr_t);
	int (*xi_signal_out)(const struct xsave_proc_info *, fpu_copyout_f,
	    uc_xsave_t *, const void *fpup, uintptr_t);
} xsave_proc_info_t;
2136*ed093b41SRobert Mustacchi
2137*ed093b41SRobert Mustacchi static bool
fpu_proc_xregs_initial_state(const fpu_ctx_t * fpu,uint64_t feats)2138*ed093b41SRobert Mustacchi fpu_proc_xregs_initial_state(const fpu_ctx_t *fpu, uint64_t feats)
2139*ed093b41SRobert Mustacchi {
2140*ed093b41SRobert Mustacchi const struct xsave_state *xs = fpu->fpu_regs.kfpu_u.kfpu_xs;
2141*ed093b41SRobert Mustacchi
2142*ed093b41SRobert Mustacchi if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
2143*ed093b41SRobert Mustacchi return (true);
2144*ed093b41SRobert Mustacchi }
2145*ed093b41SRobert Mustacchi
2146*ed093b41SRobert Mustacchi return ((xs->xs_header.xsh_xstate_bv & feats) == 0);
2147*ed093b41SRobert Mustacchi }
2148*ed093b41SRobert Mustacchi
2149*ed093b41SRobert Mustacchi static void
fpu_proc_xregs_xcr_fill(const fpu_ctx_t * fpu,const xsave_proc_info_t * info,void * datap)2150*ed093b41SRobert Mustacchi fpu_proc_xregs_xcr_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2151*ed093b41SRobert Mustacchi void *datap)
2152*ed093b41SRobert Mustacchi {
2153*ed093b41SRobert Mustacchi prxregset_xcr_t *xcr = datap;
2154*ed093b41SRobert Mustacchi
2155*ed093b41SRobert Mustacchi xcr->prx_xcr_xcr0 = xsave_bv_all;
2156*ed093b41SRobert Mustacchi }
2157*ed093b41SRobert Mustacchi
2158*ed093b41SRobert Mustacchi /*
2159*ed093b41SRobert Mustacchi * Unlike other instruction portions, we treat the xsave header and the legacy
2160*ed093b41SRobert Mustacchi * XMM section together as both are somewhat tied at the instruction hip. Unlike
 * when dealing with other xsave regions like the ymm and zmm components, the
 * initial state here is much more nuanced, as it has to match what we actually
 * do in the OS and depends on the components that are present.
2164*ed093b41SRobert Mustacchi */
2165*ed093b41SRobert Mustacchi static void
fpu_proc_xregs_xsave_fill(const fpu_ctx_t * fpu,const xsave_proc_info_t * info,void * datap)2166*ed093b41SRobert Mustacchi fpu_proc_xregs_xsave_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2167*ed093b41SRobert Mustacchi void *datap)
2168*ed093b41SRobert Mustacchi {
2169*ed093b41SRobert Mustacchi prxregset_xsave_t *prxsave = datap;
2170*ed093b41SRobert Mustacchi const struct xsave_state *xsave = fpu->fpu_regs.kfpu_u.kfpu_xs;
2171*ed093b41SRobert Mustacchi size_t hdr_off;
2172*ed093b41SRobert Mustacchi
2173*ed093b41SRobert Mustacchi /*
2174*ed093b41SRobert Mustacchi * In the x87/XMM case, the no device vs. initial state is different
2175*ed093b41SRobert Mustacchi * because the initial state case still wants us to copy the real xsave
2176*ed093b41SRobert Mustacchi * header. It's also worth calling out that the actual illumos default
2177*ed093b41SRobert Mustacchi * fxsave state is not the same as what Intel documents. The main
2178*ed093b41SRobert Mustacchi * difference is in what the x87 FPU control word is. This results in
2179*ed093b41SRobert Mustacchi * the following different cases that we need to think about:
2180*ed093b41SRobert Mustacchi *
2181*ed093b41SRobert Mustacchi * o FPU_EN is not set. So we use the illumos default.
2182*ed093b41SRobert Mustacchi */
2183*ed093b41SRobert Mustacchi if ((fpu->fpu_flags & FPU_EN) == 0) {
2184*ed093b41SRobert Mustacchi bcopy(&avx_initial, prxsave, sizeof (*prxsave));
2185*ed093b41SRobert Mustacchi return;
2186*ed093b41SRobert Mustacchi }
2187*ed093b41SRobert Mustacchi
2188*ed093b41SRobert Mustacchi /*
2189*ed093b41SRobert Mustacchi * Convert all the fxsave region while taking into account the validity
2190*ed093b41SRobert Mustacchi * of the xsave bits. The prxregset_xsave_t structure is the same as the
2191*ed093b41SRobert Mustacchi * xsave structure in our ABI and Intel designed the xsave header to
2192*ed093b41SRobert Mustacchi * begin with the 512-bit fxsave structure.
2193*ed093b41SRobert Mustacchi */
2194*ed093b41SRobert Mustacchi fpu_xsave_to_fxsave(xsave, (struct fxsave_state *)prxsave);
2195*ed093b41SRobert Mustacchi
2196*ed093b41SRobert Mustacchi /*
2197*ed093b41SRobert Mustacchi * Now that we've dealt with the x87 and XMM state, take care of the
2198*ed093b41SRobert Mustacchi * header.
2199*ed093b41SRobert Mustacchi */
2200*ed093b41SRobert Mustacchi hdr_off = offsetof(prxregset_xsave_t, prx_xsh_xstate_bv);
2201*ed093b41SRobert Mustacchi bcopy((const void *)((uintptr_t)xsave + hdr_off),
2202*ed093b41SRobert Mustacchi (void *)((uintptr_t)prxsave + hdr_off),
2203*ed093b41SRobert Mustacchi sizeof (struct xsave_header));
2204*ed093b41SRobert Mustacchi }
2205*ed093b41SRobert Mustacchi
2206*ed093b41SRobert Mustacchi static void
fpu_proc_xregs_std_fill(const fpu_ctx_t * fpu,const xsave_proc_info_t * info,void * datap)2207*ed093b41SRobert Mustacchi fpu_proc_xregs_std_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2208*ed093b41SRobert Mustacchi void *datap)
2209*ed093b41SRobert Mustacchi {
2210*ed093b41SRobert Mustacchi if (!fpu_proc_xregs_initial_state(fpu, info->xi_bits)) {
2211*ed093b41SRobert Mustacchi size_t size, off;
2212*ed093b41SRobert Mustacchi const void *xsave_off;
2213*ed093b41SRobert Mustacchi
2214*ed093b41SRobert Mustacchi cpuid_get_xsave_info(info->xi_bits, &size, &off);
2215*ed093b41SRobert Mustacchi ASSERT3U(size, ==, info->xi_size);
2216*ed093b41SRobert Mustacchi xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2217*ed093b41SRobert Mustacchi off);
2218*ed093b41SRobert Mustacchi bcopy(xsave_off, datap, info->xi_size);
2219*ed093b41SRobert Mustacchi }
2220*ed093b41SRobert Mustacchi }
2221*ed093b41SRobert Mustacchi
2222*ed093b41SRobert Mustacchi /*
2223*ed093b41SRobert Mustacchi * Users are not allowed to actually set the xcr information this way. However,
2224*ed093b41SRobert Mustacchi * to make it easier for someone to just do a read, modify, write, of the xregs
2225*ed093b41SRobert Mustacchi * data, if it is identical, then we will accept it (and do nothing).
2226*ed093b41SRobert Mustacchi */
2227*ed093b41SRobert Mustacchi static bool
fpu_proc_xregs_xcr_valid(model_t model,const void * datap)2228*ed093b41SRobert Mustacchi fpu_proc_xregs_xcr_valid(model_t model, const void *datap)
2229*ed093b41SRobert Mustacchi {
2230*ed093b41SRobert Mustacchi const prxregset_xcr_t *xcr = datap;
2231*ed093b41SRobert Mustacchi
2232*ed093b41SRobert Mustacchi return (xcr->prx_xcr_xcr0 == xsave_bv_all && xcr->prx_xcr_xfd == 0 &&
2233*ed093b41SRobert Mustacchi xcr->prx_xcr_pad[0] == 0 && xcr->prx_xcr_pad[1] == 0);
2234*ed093b41SRobert Mustacchi }
2235*ed093b41SRobert Mustacchi
2236*ed093b41SRobert Mustacchi /*
2237*ed093b41SRobert Mustacchi * To match traditional /proc semantics, we do not error if reserved bits of
2238*ed093b41SRobert Mustacchi * MXCSR are set, they will be masked off when writing data. We do not allow
2239*ed093b41SRobert Mustacchi * someone to indicate that they are asking for compressed xsave data, hence the
2240*ed093b41SRobert Mustacchi * check that prx_xsh_comp_bv is zero. Separately, in fpu_proc_xregs_set() we
2241*ed093b41SRobert Mustacchi * check that each component that was indicated in the xstate_bv is actually
2242*ed093b41SRobert Mustacchi * present.
2243*ed093b41SRobert Mustacchi */
2244*ed093b41SRobert Mustacchi static bool
fpu_proc_xregs_xsave_valid(model_t model,const void * datap)2245*ed093b41SRobert Mustacchi fpu_proc_xregs_xsave_valid(model_t model, const void *datap)
2246*ed093b41SRobert Mustacchi {
2247*ed093b41SRobert Mustacchi const prxregset_xsave_t *xsave = datap;
2248*ed093b41SRobert Mustacchi uint64_t rsvd[6] = { 0 };
2249*ed093b41SRobert Mustacchi
2250*ed093b41SRobert Mustacchi if (bcmp(rsvd, xsave->prx_xsh_reserved, sizeof (rsvd)) != 0 ||
2251*ed093b41SRobert Mustacchi xsave->prx_xsh_xcomp_bv != 0) {
2252*ed093b41SRobert Mustacchi return (false);
2253*ed093b41SRobert Mustacchi }
2254*ed093b41SRobert Mustacchi
2255*ed093b41SRobert Mustacchi if ((xsave->prx_xsh_xstate_bv & ~xsave_bv_all) != 0) {
2256*ed093b41SRobert Mustacchi return (false);
2257*ed093b41SRobert Mustacchi }
2258*ed093b41SRobert Mustacchi
2259*ed093b41SRobert Mustacchi return (true);
2260*ed093b41SRobert Mustacchi }
2261*ed093b41SRobert Mustacchi
2262*ed093b41SRobert Mustacchi /*
2263*ed093b41SRobert Mustacchi * The YMM, ZMM, and Hi-ZMM registers are all valid when in an LP64 environment
2264*ed093b41SRobert Mustacchi * on x86; however, when operating in ILP32, subsets are reserved. We require
2265*ed093b41SRobert Mustacchi * that all reserved portions are set to zero.
2266*ed093b41SRobert Mustacchi */
2267*ed093b41SRobert Mustacchi static bool
fpu_proc_xregs_ymm_valid(model_t model,const void * datap)2268*ed093b41SRobert Mustacchi fpu_proc_xregs_ymm_valid(model_t model, const void *datap)
2269*ed093b41SRobert Mustacchi {
2270*ed093b41SRobert Mustacchi upad128_t ymm_zero[8];
2271*ed093b41SRobert Mustacchi const prxregset_ymm_t *ymm = datap;
2272*ed093b41SRobert Mustacchi
2273*ed093b41SRobert Mustacchi if (model == DATAMODEL_LP64) {
2274*ed093b41SRobert Mustacchi return (true);
2275*ed093b41SRobert Mustacchi }
2276*ed093b41SRobert Mustacchi
2277*ed093b41SRobert Mustacchi bzero(&ymm_zero, sizeof (ymm_zero));
2278*ed093b41SRobert Mustacchi return (bcmp(&ymm->prx_ymm[8], &ymm_zero, sizeof (ymm_zero)) == 0);
2279*ed093b41SRobert Mustacchi }
2280*ed093b41SRobert Mustacchi
2281*ed093b41SRobert Mustacchi static bool
fpu_proc_xregs_zmm_valid(model_t model,const void * datap)2282*ed093b41SRobert Mustacchi fpu_proc_xregs_zmm_valid(model_t model, const void *datap)
2283*ed093b41SRobert Mustacchi {
2284*ed093b41SRobert Mustacchi upad256_t zmm_zero[8];
2285*ed093b41SRobert Mustacchi const prxregset_zmm_t *zmm = datap;
2286*ed093b41SRobert Mustacchi
2287*ed093b41SRobert Mustacchi if (model == DATAMODEL_LP64) {
2288*ed093b41SRobert Mustacchi return (true);
2289*ed093b41SRobert Mustacchi }
2290*ed093b41SRobert Mustacchi
2291*ed093b41SRobert Mustacchi bzero(&zmm_zero, sizeof (zmm_zero));
2292*ed093b41SRobert Mustacchi return (bcmp(&zmm->prx_zmm[8], &zmm_zero, sizeof (zmm_zero)) == 0);
2293*ed093b41SRobert Mustacchi }
2294*ed093b41SRobert Mustacchi
2295*ed093b41SRobert Mustacchi static bool
fpu_proc_xregs_hi_zmm_valid(model_t model,const void * datap)2296*ed093b41SRobert Mustacchi fpu_proc_xregs_hi_zmm_valid(model_t model, const void *datap)
2297*ed093b41SRobert Mustacchi {
2298*ed093b41SRobert Mustacchi prxregset_hi_zmm_t hi_zmm_zero;
2299*ed093b41SRobert Mustacchi const prxregset_hi_zmm_t *hi_zmm = datap;
2300*ed093b41SRobert Mustacchi
2301*ed093b41SRobert Mustacchi if (model == DATAMODEL_LP64) {
2302*ed093b41SRobert Mustacchi return (true);
2303*ed093b41SRobert Mustacchi }
2304*ed093b41SRobert Mustacchi
2305*ed093b41SRobert Mustacchi bzero(&hi_zmm_zero, sizeof (hi_zmm_zero));
2306*ed093b41SRobert Mustacchi return (bcmp(hi_zmm, &hi_zmm_zero, sizeof (hi_zmm_zero)) == 0);
2307*ed093b41SRobert Mustacchi }
2308*ed093b41SRobert Mustacchi
2309*ed093b41SRobert Mustacchi /*
2310*ed093b41SRobert Mustacchi * The xsave state consists of the first 512 bytes of the XMM state and then the
2311*ed093b41SRobert Mustacchi * xsave header itself. Because of the xsave header, this structure is marked
2312*ed093b41SRobert Mustacchi * with xi_always, so we must always process and consider it.
2313*ed093b41SRobert Mustacchi *
2314*ed093b41SRobert Mustacchi * Semantically if either of the bits around SSE / x87 is set, then we will copy
2315*ed093b41SRobert Mustacchi * the entire thing. This may mean that we end up copying a region that is not
2316*ed093b41SRobert Mustacchi * valid into the save area; however, that should be OK as we still have the
2317*ed093b41SRobert Mustacchi * specific bit flags that indicate what we should consider or not.
2318*ed093b41SRobert Mustacchi *
2319*ed093b41SRobert Mustacchi * There is one additional wrinkle we need to consider and honor here. The CPU
2320*ed093b41SRobert Mustacchi * will load the MXCSR values if the AVX bit is set in an xrstor regardless of
2321*ed093b41SRobert Mustacchi * anything else. So if this is set and we do not have a valid x87/XMM bits
2322*ed093b41SRobert Mustacchi * set then we will set the MXCSR to its default state in case the processor
2323*ed093b41SRobert Mustacchi * tries to load it. For reference see:
2324*ed093b41SRobert Mustacchi *
2325*ed093b41SRobert Mustacchi * o Intel SDM Volume 1: 13.8.1 Standard Form of XRSTOR
2326*ed093b41SRobert Mustacchi * o AMD64 Volume 2: Section 11.5.9 MXCSR State Management
2327*ed093b41SRobert Mustacchi *
2328*ed093b41SRobert Mustacchi * Note, the behavior around this changes depending on whether using the
2329*ed093b41SRobert Mustacchi * compressed xrstor or not. We are not, but it's worth being aware of. We do
2330*ed093b41SRobert Mustacchi * not worry about MXCSR_MASK because the instructions ignore it.
2331*ed093b41SRobert Mustacchi */
2332*ed093b41SRobert Mustacchi static void
fpu_proc_xregs_xsave_set(fpu_ctx_t * fpu,const xsave_proc_info_t * info,uint64_t xsave_bv,const void * datap)2333*ed093b41SRobert Mustacchi fpu_proc_xregs_xsave_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2334*ed093b41SRobert Mustacchi uint64_t xsave_bv, const void *datap)
2335*ed093b41SRobert Mustacchi {
2336*ed093b41SRobert Mustacchi const struct xsave_state *src_xs = datap;
2337*ed093b41SRobert Mustacchi struct xsave_state *targ_xs = fpu->fpu_regs.kfpu_u.kfpu_xs;
2338*ed093b41SRobert Mustacchi
2339*ed093b41SRobert Mustacchi if ((xsave_bv & info->xi_bits) != 0) {
2340*ed093b41SRobert Mustacchi bcopy(&src_xs->xs_fxsave, &targ_xs->xs_fxsave,
2341*ed093b41SRobert Mustacchi sizeof (struct fxsave_state));
2342*ed093b41SRobert Mustacchi } else if ((xsave_bv & XFEATURE_AVX) != 0) {
2343*ed093b41SRobert Mustacchi targ_xs->xs_fxsave.fx_mxcsr = SSE_MXCSR_INIT;
2344*ed093b41SRobert Mustacchi }
2345*ed093b41SRobert Mustacchi
2346*ed093b41SRobert Mustacchi bcopy(&src_xs->xs_header, &targ_xs->xs_header,
2347*ed093b41SRobert Mustacchi sizeof (struct xsave_header));
2348*ed093b41SRobert Mustacchi targ_xs->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
2349*ed093b41SRobert Mustacchi }
2350*ed093b41SRobert Mustacchi
2351*ed093b41SRobert Mustacchi static void
fpu_proc_xregs_std_set(fpu_ctx_t * fpu,const xsave_proc_info_t * info,uint64_t xsave_bv,const void * datap)2352*ed093b41SRobert Mustacchi fpu_proc_xregs_std_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2353*ed093b41SRobert Mustacchi uint64_t xsave_bv, const void *datap)
2354*ed093b41SRobert Mustacchi {
2355*ed093b41SRobert Mustacchi size_t size, off;
2356*ed093b41SRobert Mustacchi void *xsave_off;
2357*ed093b41SRobert Mustacchi
2358*ed093b41SRobert Mustacchi cpuid_get_xsave_info(info->xi_bits, &size, &off);
2359*ed093b41SRobert Mustacchi xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2360*ed093b41SRobert Mustacchi off);
2361*ed093b41SRobert Mustacchi bcopy(datap, xsave_off, size);
2362*ed093b41SRobert Mustacchi }
2363*ed093b41SRobert Mustacchi
2364*ed093b41SRobert Mustacchi /*
2365*ed093b41SRobert Mustacchi * Dealing with XMM data is a little more annoying in signal context. If UC_FPU
2366*ed093b41SRobert Mustacchi * is set, the ucontext_t's fpregset_t contains a copy of the XMM region. That
2367*ed093b41SRobert Mustacchi * must take priority over an XMM region that showed up in the uc_xsave_t data.
2368*ed093b41SRobert Mustacchi * In the signal copyout code we do not save XMM region in the uc_xsave_t or set
2369*ed093b41SRobert Mustacchi * it as a present component because of it being kept in the fpregset_t. Because
2370*ed093b41SRobert Mustacchi * of this behavior, if we find the XMM (or x87) state bits present, we treat
2371*ed093b41SRobert Mustacchi * that as an error.
2372*ed093b41SRobert Mustacchi *
2373*ed093b41SRobert Mustacchi * The system has always gone through and cleaned up the reserved bits in the
2374*ed093b41SRobert Mustacchi * fxsave state when someone calls setcontext(). Therefore we need to do the
2375*ed093b41SRobert Mustacchi * same thing which is why you see the masking of the mxcsr below.
2376*ed093b41SRobert Mustacchi *
2377*ed093b41SRobert Mustacchi * Finally, there is one last wrinkle here that we need to consider. The
2378*ed093b41SRobert Mustacchi * fpregset_t has two private words which cache the status/exception
 * information. Therefore, we will... cheat. Intel has left bytes 464 (0x1d0)
2380*ed093b41SRobert Mustacchi * through 511 (0x1ff) available for us to do what we want. So we will pass this
2381*ed093b41SRobert Mustacchi * through that for the moment to help us pass this state around without too
2382*ed093b41SRobert Mustacchi * much extra allocation.
2383*ed093b41SRobert Mustacchi */
2384*ed093b41SRobert Mustacchi static int
fpu_signal_copyin_xmm(const xsave_proc_info_t * info,const ucontext_t * kuc,const uc_xsave_t * ucx,void * fpup,uintptr_t * udatap,const uintptr_t max_udata)2385*ed093b41SRobert Mustacchi fpu_signal_copyin_xmm(const xsave_proc_info_t *info, const ucontext_t *kuc,
2386*ed093b41SRobert Mustacchi const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2387*ed093b41SRobert Mustacchi const uintptr_t max_udata)
2388*ed093b41SRobert Mustacchi {
2389*ed093b41SRobert Mustacchi struct xsave_state *xsave = fpup;
2390*ed093b41SRobert Mustacchi
2391*ed093b41SRobert Mustacchi if ((ucx->ucx_bv & info->xi_bits) != 0) {
2392*ed093b41SRobert Mustacchi return (EINVAL);
2393*ed093b41SRobert Mustacchi }
2394*ed093b41SRobert Mustacchi
2395*ed093b41SRobert Mustacchi if ((kuc->uc_flags & UC_FPU) != 0) {
2396*ed093b41SRobert Mustacchi bcopy(&kuc->uc_mcontext.fpregs, &xsave->xs_fxsave,
2397*ed093b41SRobert Mustacchi sizeof (struct fxsave_state));
2398*ed093b41SRobert Mustacchi xsave->xs_fxsave.__fx_ign2[3]._l[0] =
2399*ed093b41SRobert Mustacchi kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.status;
2400*ed093b41SRobert Mustacchi xsave->xs_fxsave.__fx_ign2[3]._l[1] =
2401*ed093b41SRobert Mustacchi kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xstatus;
2402*ed093b41SRobert Mustacchi xsave->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
2403*ed093b41SRobert Mustacchi xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2404*ed093b41SRobert Mustacchi }
2405*ed093b41SRobert Mustacchi
2406*ed093b41SRobert Mustacchi return (0);
2407*ed093b41SRobert Mustacchi }
2408*ed093b41SRobert Mustacchi
2409*ed093b41SRobert Mustacchi static int
fpu_signal_copyin_std(const xsave_proc_info_t * info,const ucontext_t * kuc,const uc_xsave_t * ucx,void * fpup,uintptr_t * udatap,const uintptr_t max_udata)2410*ed093b41SRobert Mustacchi fpu_signal_copyin_std(const xsave_proc_info_t *info, const ucontext_t *kuc,
2411*ed093b41SRobert Mustacchi const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2412*ed093b41SRobert Mustacchi const uintptr_t max_udata)
2413*ed093b41SRobert Mustacchi {
2414*ed093b41SRobert Mustacchi size_t len, xsave_off;
2415*ed093b41SRobert Mustacchi void *copy_to;
2416*ed093b41SRobert Mustacchi struct xsave_state *xsave = fpup;
2417*ed093b41SRobert Mustacchi
2418*ed093b41SRobert Mustacchi cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2419*ed093b41SRobert Mustacchi if (*udatap + len > max_udata) {
2420*ed093b41SRobert Mustacchi return (EOVERFLOW);
2421*ed093b41SRobert Mustacchi }
2422*ed093b41SRobert Mustacchi
2423*ed093b41SRobert Mustacchi copy_to = (void *)((uintptr_t)fpup + xsave_off);
2424*ed093b41SRobert Mustacchi if (ddi_copyin((void *)*udatap, copy_to, len, 0) != 0) {
2425*ed093b41SRobert Mustacchi return (EFAULT);
2426*ed093b41SRobert Mustacchi }
2427*ed093b41SRobert Mustacchi
2428*ed093b41SRobert Mustacchi xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2429*ed093b41SRobert Mustacchi *udatap = *udatap + len;
2430*ed093b41SRobert Mustacchi
2431*ed093b41SRobert Mustacchi return (0);
2432*ed093b41SRobert Mustacchi }
2433*ed093b41SRobert Mustacchi
2434*ed093b41SRobert Mustacchi static int
fpu_signal_copyout_std(const xsave_proc_info_t * info,fpu_copyout_f copyfunc,uc_xsave_t * ucx,const void * fpup,uintptr_t udatap)2435*ed093b41SRobert Mustacchi fpu_signal_copyout_std(const xsave_proc_info_t *info, fpu_copyout_f copyfunc,
2436*ed093b41SRobert Mustacchi uc_xsave_t *ucx, const void *fpup, uintptr_t udatap)
2437*ed093b41SRobert Mustacchi {
2438*ed093b41SRobert Mustacchi size_t len, xsave_off;
2439*ed093b41SRobert Mustacchi const void *copy_from;
2440*ed093b41SRobert Mustacchi void *copy_to;
2441*ed093b41SRobert Mustacchi int ret;
2442*ed093b41SRobert Mustacchi
2443*ed093b41SRobert Mustacchi cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2444*ed093b41SRobert Mustacchi copy_from = (void *)(uintptr_t)fpup + xsave_off;
2445*ed093b41SRobert Mustacchi copy_to = (void *)(udatap + ucx->ucx_len);
2446*ed093b41SRobert Mustacchi
2447*ed093b41SRobert Mustacchi ret = copyfunc(copy_from, copy_to, len);
2448*ed093b41SRobert Mustacchi if (ret != 0) {
2449*ed093b41SRobert Mustacchi return (ret);
2450*ed093b41SRobert Mustacchi }
2451*ed093b41SRobert Mustacchi
2452*ed093b41SRobert Mustacchi ucx->ucx_len += len;
2453*ed093b41SRobert Mustacchi ucx->ucx_bv |= info->xi_bits;
2454*ed093b41SRobert Mustacchi return (0);
2455*ed093b41SRobert Mustacchi }
2456*ed093b41SRobert Mustacchi
2457*ed093b41SRobert Mustacchi /*
2458*ed093b41SRobert Mustacchi * This table contains information about the extended FPU states and synthetic
2459*ed093b41SRobert Mustacchi * information we create for /proc, the ucontext_t, and signal handling. The
2460*ed093b41SRobert Mustacchi * definition of the xsave_proc_info_t describes how each member is used.
2461*ed093b41SRobert Mustacchi *
2462*ed093b41SRobert Mustacchi * In general, this table is expected to be in the order of the xsave data
2463*ed093b41SRobert Mustacchi * structure itself. Synthetic elements that we create can go anywhere and new
2464*ed093b41SRobert Mustacchi * ones should be inserted at the end. This structure is walked in order to
2465*ed093b41SRobert Mustacchi * produce the /proc and signal handling logic, so changing the order is
2466*ed093b41SRobert Mustacchi * meaningful for those and should not be done lightly.
2467*ed093b41SRobert Mustacchi */
static const xsave_proc_info_t fpu_xsave_info[] = { {
	/*
	 * Synthetic %xcr0 entry: always present, read-only for /proc
	 * consumers (no xi_set), and backed by no xsave feature bits.
	 */
	.xi_type = PRX_INFO_XCR,
	.xi_size = sizeof (prxregset_xcr_t),
	.xi_align = alignof (prxregset_xcr_t),
	.xi_always = true,
	.xi_bits = 0,
	.xi_fill = fpu_proc_xregs_xcr_fill,
	.xi_valid = fpu_proc_xregs_xcr_valid
}, {
	/*
	 * The XSAVE entry covers both the xsave header and the %xmm registers.
	 * Note, there is no signal copyout information for the %xmm registers
	 * because it is expected that that data is already in the fpregset_t.
	 */
	.xi_type = PRX_INFO_XSAVE,
	.xi_size = sizeof (prxregset_xsave_t),
	.xi_align = FPU_ALIGN_XMM,
	.xi_always = true,
	.xi_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE,
	.xi_fill = fpu_proc_xregs_xsave_fill,
	.xi_set = fpu_proc_xregs_xsave_set,
	.xi_valid = fpu_proc_xregs_xsave_valid,
	.xi_signal_in = fpu_signal_copyin_xmm
}, {
	/* AVX: upper 128 bits of the %ymm registers. */
	.xi_type = PRX_INFO_YMM,
	.xi_size = sizeof (prxregset_ymm_t),
	.xi_align = FPU_ALIGN_YMM,
	.xi_always = false,
	.xi_bits = XFEATURE_AVX,
	.xi_fill = fpu_proc_xregs_std_fill,
	.xi_set = fpu_proc_xregs_std_set,
	.xi_signal_in = fpu_signal_copyin_std,
	.xi_valid = fpu_proc_xregs_ymm_valid,
	.xi_signal_out = fpu_signal_copyout_std
}, {
	/*
	 * There is no /proc validation function for the mask registers because
	 * they are the same in ILP32 / LP64 and there is nothing for us to
	 * actually validate.
	 */
	.xi_type = PRX_INFO_OPMASK,
	.xi_size = sizeof (prxregset_opmask_t),
	.xi_align = alignof (prxregset_opmask_t),
	.xi_always = false,
	.xi_bits = XFEATURE_AVX512_OPMASK,
	.xi_fill = fpu_proc_xregs_std_fill,
	.xi_set = fpu_proc_xregs_std_set,
	.xi_signal_in = fpu_signal_copyin_std,
	.xi_signal_out = fpu_signal_copyout_std
}, {
	/* AVX-512: upper 256 bits of %zmm0-%zmm15. */
	.xi_type = PRX_INFO_ZMM,
	.xi_size = sizeof (prxregset_zmm_t),
	.xi_align = FPU_ALIGN_ZMM,
	.xi_always = false,
	.xi_bits = XFEATURE_AVX512_ZMM,
	.xi_fill = fpu_proc_xregs_std_fill,
	.xi_set = fpu_proc_xregs_std_set,
	.xi_valid = fpu_proc_xregs_zmm_valid,
	.xi_signal_in = fpu_signal_copyin_std,
	.xi_signal_out = fpu_signal_copyout_std
}, {
	/* AVX-512: the full %zmm16-%zmm31 registers. */
	.xi_type = PRX_INFO_HI_ZMM,
	.xi_size = sizeof (prxregset_hi_zmm_t),
	.xi_align = FPU_ALIGN_ZMM,
	.xi_always = false,
	.xi_bits = XFEATURE_AVX512_HI_ZMM,
	.xi_fill = fpu_proc_xregs_std_fill,
	.xi_set = fpu_proc_xregs_std_set,
	.xi_valid = fpu_proc_xregs_hi_zmm_valid,
	.xi_signal_in = fpu_signal_copyin_std,
	.xi_signal_out = fpu_signal_copyout_std
} };
2540*ed093b41SRobert Mustacchi
2541*ed093b41SRobert Mustacchi static bool
fpu_proc_xregs_include(const xsave_proc_info_t * infop)2542*ed093b41SRobert Mustacchi fpu_proc_xregs_include(const xsave_proc_info_t *infop)
2543*ed093b41SRobert Mustacchi {
2544*ed093b41SRobert Mustacchi return (infop->xi_always || (xsave_bv_all & infop->xi_bits) != 0);
2545*ed093b41SRobert Mustacchi }
2546*ed093b41SRobert Mustacchi
/*
 * Compute, for /proc, the number of xregs information entries (ninfop), the
 * total note buffer size (sizep), and the offset of the first data region
 * (dstart). Any output pointer may be NULL if the caller does not need it.
 */
void
fpu_proc_xregs_info(struct proc *p __unused, uint32_t *ninfop, uint32_t *sizep,
    uint32_t *dstart)
{
	size_t ret = sizeof (prxregset_hdr_t);
	uint32_t ninfo = 0;

	ASSERT(fpu_xsave_enabled());

	/*
	 * Right now the set of flags that are enabled in the FPU is global.
	 * That is, while the pcb's fcpu_ctx_t has the fpu_xsave_mask, the
	 * actual things that might show up and we care about are all about what
	 * is set up in %xcr0 which is stored in the global xsave_bv_all. If we
	 * move to per-process FPU enablement which is likely to come with AMX,
	 * then this will need the proc_t to look at, hence why we've set things
	 * up with the unused variable above.
	 *
	 * We take two passes through the array. The first is just to count up
	 * how many informational entries we need.
	 */
	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
		if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
			continue;
		ninfo++;
	}

	ASSERT3U(ninfo, >, 0);
	ret += sizeof (prxregset_info_t) * ninfo;

	/*
	 * Second pass: lay out each included member's data region, accounting
	 * for its alignment requirement, and accumulate the total size.
	 */
	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
		size_t curphase;
		if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
			continue;

		/*
		 * NOTE(review): when ret is misaligned, this adds curphase
		 * (ret % align) rather than (align - curphase), so the
		 * resulting offset is not actually a multiple of xi_align in
		 * general. fpu_proc_xregs_get() repeats this exact
		 * computation, so the size reported here and the offsets used
		 * there remain mutually consistent, but the alignment intent
		 * should be confirmed before changing either side.
		 */
		curphase = ret % fpu_xsave_info[i].xi_align;
		if (ret < fpu_xsave_info[i].xi_align) {
			ret = fpu_xsave_info[i].xi_align;
		} else if (curphase != 0) {
			ret += curphase;
		}

		/* The first included entry's offset is where data begins. */
		if (i == 0 && dstart != NULL) {
			*dstart = ret;
		}

		ret += fpu_xsave_info[i].xi_size;
	}

	/* pr_size and pri_offset are 32-bit fields; the total must fit. */
	VERIFY3U(ret, <=, UINT32_MAX);
	if (sizep != NULL) {
		*sizep = ret;
	}

	if (ninfop != NULL) {
		*ninfop = ninfo;
	}
}
2605*ed093b41SRobert Mustacchi
2606*ed093b41SRobert Mustacchi /*
2607*ed093b41SRobert Mustacchi * This function supports /proc. Because /proc does not have a process locked
2608*ed093b41SRobert Mustacchi * while processing a PCSXREG, this tries to establish an upper bound that we
2609*ed093b41SRobert Mustacchi * will validate later in fpu_proc_xregs_set(). We basically say that if you
 * take the maximum xsave size and add 4 KiB (0x1000, the slop the code below
 * actually adds) that is a good enough approximation for the maximum size. The
 * extra space is us basically trying to rationalize the overhead of our
 * structures that we're adding, while being cognisant of
2613*ed093b41SRobert Mustacchi * differing alignments and the fact that the full xsave size is in some cases
2614*ed093b41SRobert Mustacchi * (when supervisor states or features we don't support are present) going to be
2615*ed093b41SRobert Mustacchi * larger than we would need for this.
2616*ed093b41SRobert Mustacchi */
size_t
fpu_proc_xregs_max_size(void)
{
	VERIFY(fpu_xsave_enabled());
	/*
	 * Full hardware xsave size plus 0x1000 bytes (4 KiB) of slop to cover
	 * our prxregset header, info entries, and alignment padding.
	 */
	return (cpuid_get_xsave_size() + 0x1000);
}
2623*ed093b41SRobert Mustacchi
2624*ed093b41SRobert Mustacchi /*
2625*ed093b41SRobert Mustacchi * This functions supports /proc. In particular, it's meant to perform the
2626*ed093b41SRobert Mustacchi * following:
2627*ed093b41SRobert Mustacchi *
2628*ed093b41SRobert Mustacchi * o Potentially save the current thread's registers.
2629*ed093b41SRobert Mustacchi * o Write out the x86 xsave /proc xregs format data from the xsave data we
2630*ed093b41SRobert Mustacchi * actually have. Note, this can be a little weird for cases where the FPU is
2631*ed093b41SRobert Mustacchi * not actually enabled, which happens for system processes.
2632*ed093b41SRobert Mustacchi */
void
fpu_proc_xregs_get(klwp_t *lwp, void *buf)
{
	uint32_t size, ninfo, curinfo, dstart;
	fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
	prxregset_hdr_t *hdr = buf;

	ASSERT(fpu_xsave_enabled());
	/* Recompute the layout the same way the buffer was sized. */
	fpu_proc_xregs_info(lwp->lwp_procp, &ninfo, &size, &dstart);

	/*
	 * Before we get going, defensively zero out all the data buffer so that
	 * the rest of the fill functions can assume a specific base.
	 */
	bzero(buf, size);

	kpreempt_disable();
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		/*
		 * This case suggests that thread in question doesn't have a
		 * valid FPU save state which should only happen when it is on
		 * CPU. If this is the case, we must ensure that we save the
		 * current FPU state before proceeding. We also sanity check
		 * several things here before doing this as using /proc on
		 * yourself is always exciting. fp_save() will ensure that the
		 * thread is flagged to go back to being an eager FPU before
		 * returning back to userland.
		 */
		VERIFY3P(curthread, ==, lwptot(lwp));
		VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
		fp_save(fpu);
	}
	kpreempt_enable();

	/* Fill out the fixed header ahead of the per-member info entries. */
	hdr->pr_type = PR_TYPE_XSAVE;
	hdr->pr_size = size;
	hdr->pr_flags = hdr->pr_pad[0] = hdr->pr_pad[1] = hdr->pr_pad[2] =
	    hdr->pr_pad[3] = 0;
	hdr->pr_ninfo = ninfo;

	curinfo = 0;
	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
		void *startp;
		uint32_t phase;

		if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
			continue;

		/*
		 * Mirror the offset/alignment walk in fpu_proc_xregs_info()
		 * so the offsets recorded here match the size it reported.
		 * NOTE(review): when dstart is misaligned this adds phase
		 * (dstart % align) rather than (align - phase); because
		 * fpu_proc_xregs_info() performs the identical computation,
		 * producer and consumer agree, but the offsets are not truly
		 * xi_align-aligned in general — confirm intent.
		 */
		phase = dstart % fpu_xsave_info[i].xi_align;
		if (dstart < fpu_xsave_info[i].xi_align) {
			ASSERT3U(i, !=, 0);
			dstart = fpu_xsave_info[i].xi_align;
		} else if (phase != 0) {
			ASSERT3U(i, !=, 0);
			dstart += phase;
		}

		hdr->pr_info[curinfo].pri_type = fpu_xsave_info[i].xi_type;
		hdr->pr_info[curinfo].pri_flags = 0;
		hdr->pr_info[curinfo].pri_size = fpu_xsave_info[i].xi_size;
		hdr->pr_info[curinfo].pri_offset = dstart;

		startp = (void *)((uintptr_t)buf + dstart);
		fpu_xsave_info[i].xi_fill(fpu, &fpu_xsave_info[i], startp);
		dstart += fpu_xsave_info[i].xi_size;
		/*
		 * NOTE(review): this is checked after pr_info[curinfo] was
		 * written above; a stricter invariant would be
		 * (curinfo < ninfo) before the stores.
		 */
		ASSERT3U(curinfo, <=, ninfo);
		curinfo++;
	}
}
2702*ed093b41SRobert Mustacchi
2703*ed093b41SRobert Mustacchi /*
2704*ed093b41SRobert Mustacchi * We have been asked to set the data in the FPU for a given thread. Our
2705*ed093b41SRobert Mustacchi * prmachdep code has already validated that the raw semantics of the data that
2706*ed093b41SRobert Mustacchi * we have are valid (that is the appropriate sizes, offsets, and flags). We now
2707*ed093b41SRobert Mustacchi * apply additional checking here:
2708*ed093b41SRobert Mustacchi *
2709*ed093b41SRobert Mustacchi * o The xsave structure is present and only valid bits are set.
2710*ed093b41SRobert Mustacchi * o If the xsave component bit-vector is set, we have the corresponding proc
2711*ed093b41SRobert Mustacchi * info item.
2712*ed093b41SRobert Mustacchi * o Read-only items are ignored if and only if they actually match what we
2713*ed093b41SRobert Mustacchi * gave the user mostly as a courtesy to simplify things here.
2714*ed093b41SRobert Mustacchi * o ILP32 processes which can't support many of the regions are allowed to
2715*ed093b41SRobert Mustacchi * have the items here (as we likely gave them to them), but they must be
2716*ed093b41SRobert Mustacchi * zero if they are set.
2717*ed093b41SRobert Mustacchi *
2718*ed093b41SRobert Mustacchi * We take a first pass through all the data, validating it makes sense for the
2719*ed093b41SRobert Mustacchi * FPU. Only after that point do we ensure that we have the FPU data in question
2720*ed093b41SRobert Mustacchi * and then we clobber all the FPU data. Part of the semantics of setting this
2721*ed093b41SRobert Mustacchi * is that we're setting the entire extended FPU.
2722*ed093b41SRobert Mustacchi */
int
fpu_proc_xregs_set(klwp_t *lwp, void *buf)
{
	prxregset_hdr_t *prx = buf;
	model_t model = lwp_getdatamodel(lwp);
	uint64_t bv_found = 0;
	const prxregset_xsave_t *xsave = NULL;
	fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;

	VERIFY(fpu_xsave_enabled());

	/*
	 * First, walk each note info header that we have from the user and
	 * proceed to validate it. The prmachdep code has already validated that
	 * the size, type, and offset information is valid, but it has not
	 * validated the semantic contents of this or if someone is trying to
	 * write something they shouldn't.
	 *
	 * While we walk this, we keep track of where the xsave header is. We
	 * also track all of the bits that we have found along the way so we can
	 * match up and ensure that everything that was set has a corresponding
	 * bit in the xsave bitmap. If we have something in the xsave bitmap,
	 * but not its corresponding data, then that is an error. However, we
	 * allow folks to write data regions without the bit set in the xsave
	 * data to make the read, modify, write process simpler.
	 */
	for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
		const prxregset_info_t *info = &prx->pr_info[i];
		bool found = false;

		for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
			void *data;
			if (info->pri_type != fpu_xsave_info[pt].xi_type)
				continue;

			found = true;
			data = (void *)((uintptr_t)buf + info->pri_offset);
			/* Run the per-type semantic check, if one exists. */
			if (fpu_xsave_info[pt].xi_valid != NULL &&
			    !fpu_xsave_info[pt].xi_valid(model, data)) {
				return (EINVAL);
			}

			if (info->pri_type == PRX_INFO_XSAVE) {
				xsave = data;
			}
			bv_found |= fpu_xsave_info[pt].xi_bits;
			break;
		}

		/* An info entry with a type we do not know is an error. */
		if (!found) {
			return (EINVAL);
		}
	}

	/*
	 * No xsave data, no dice.
	 */
	if (xsave == NULL) {
		return (EINVAL);
	}

	/*
	 * If anything is set in the xsave header that was not found as we
	 * walked structures, then that is an error. The opposite is not true as
	 * discussed above.
	 */
	if ((xsave->prx_xsh_xstate_bv & ~bv_found) != 0) {
		return (EINVAL);
	}

	/*
	 * At this point, we consider all the data actually valid. Now we must
	 * set up this information in the save area. If this is our own lwp, we
	 * must disable it first. Otherwise, we expect that it is already valid.
	 * To try to sanitize this, we will defensively zero the entire region
	 * as we are setting everything that will result in here.
	 */
	kpreempt_disable();
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		/*
		 * This case suggests that thread in question doesn't have a
		 * valid FPU save state which should only happen when it is on
		 * CPU. If this is the case, we explicitly disable the FPU, but
		 * do not save it before proceeding. We also sanity check
		 * several things here before doing this as using /proc on
		 * yourself is always exciting. Unlike fp_save(), fp_free() does
		 * not signal that an update is required, so we unconditionally
		 * set that for all threads.
		 */
		VERIFY3P(curthread, ==, lwptot(lwp));
		VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
		fp_free(fpu);
	}
	PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
	/* Start from a clean slate; unset components stay in initial state. */
	bzero(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
	    cpuid_get_xsave_size());

	for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
		const prxregset_info_t *info = &prx->pr_info[i];
		bool found = false;

		for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
			const void *data;
			if (info->pri_type != fpu_xsave_info[pt].xi_type)
				continue;

			/*
			 * Check if we have a set function and if we should
			 * include this. We may not if this is something like
			 * PRX_INFO_XCR which is read-only.
			 *
			 * We may not include a given entry as it may not have
			 * been set in the actual xsave state that we have been
			 * asked to restore, in which case to not break the
			 * xsaveopt logic, we must leave it in its initial
			 * state, e.g. zeroed (generally). XMM data initial
			 * state is not zeroed, but is marked with xi_always to
			 * help account for this.
			 */
			found = true;
			if (fpu_xsave_info[pt].xi_set == NULL)
				break;
			if (!fpu_xsave_info[pt].xi_always &&
			    (xsave->prx_xsh_xstate_bv &
			    fpu_xsave_info[pt].xi_bits) !=
			    fpu_xsave_info[pt].xi_bits) {
				break;
			}

			data = (void *)((uintptr_t)buf + info->pri_offset);
			fpu_xsave_info[pt].xi_set(fpu, &fpu_xsave_info[pt],
			    xsave->prx_xsh_xstate_bv, data);
		}

		/* The first validation pass guaranteed a matching type. */
		VERIFY(found);
	}
	kpreempt_enable();

	return (0);
}
2863*ed093b41SRobert Mustacchi
2864*ed093b41SRobert Mustacchi /*
2865*ed093b41SRobert Mustacchi * To be included in the signal copyout logic we must have a copy function and
2866*ed093b41SRobert Mustacchi * the bit in question must be included. Note, we don't consult xi_always here
2867*ed093b41SRobert Mustacchi * as that is really part of what is always present for xsave logic and
2868*ed093b41SRobert Mustacchi * therefore isn't really pertinent here because of our custom format. See the
2869*ed093b41SRobert Mustacchi * big theory statement for more info.
2870*ed093b41SRobert Mustacchi */
2871*ed093b41SRobert Mustacchi static bool
fpu_signal_include(const xsave_proc_info_t * infop,uint64_t xs_bv)2872*ed093b41SRobert Mustacchi fpu_signal_include(const xsave_proc_info_t *infop, uint64_t xs_bv)
2873*ed093b41SRobert Mustacchi {
2874*ed093b41SRobert Mustacchi return ((infop->xi_bits & xs_bv) == infop->xi_bits &&
2875*ed093b41SRobert Mustacchi infop->xi_signal_out != NULL);
2876*ed093b41SRobert Mustacchi }
2877*ed093b41SRobert Mustacchi
2878*ed093b41SRobert Mustacchi /*
2879*ed093b41SRobert Mustacchi * We need to fill out the xsave related data into the ucontext_t that we've
2880*ed093b41SRobert Mustacchi * been given. We should have a valid user pointer at this point in the uc_xsave
2881*ed093b41SRobert Mustacchi * member. This is much simpler than the copyin that we have. Here are the
2882*ed093b41SRobert Mustacchi * current assumptions:
2883*ed093b41SRobert Mustacchi *
2884*ed093b41SRobert Mustacchi * o This is being called for the current thread. This is not meant to operate
2885*ed093b41SRobert Mustacchi * on an arbitrary thread's state.
2886*ed093b41SRobert Mustacchi * o We cannot assume whether the FPU is valid in the pcb or not. While most
2887*ed093b41SRobert Mustacchi * callers will have just called getfpregs() which saved the state, don't
2888*ed093b41SRobert Mustacchi * assume that.
2889*ed093b41SRobert Mustacchi * o We assume that the user address has the requisite required space for this
2890*ed093b41SRobert Mustacchi * to be copied out.
2891*ed093b41SRobert Mustacchi * o We assume that copyfunc() will ensure we are not copying into a kernel
2892*ed093b41SRobert Mustacchi * address.
2893*ed093b41SRobert Mustacchi *
2894*ed093b41SRobert Mustacchi * For more information on the format of the data, see the 'Signal Handling and
2895*ed093b41SRobert Mustacchi * the ucontext_t' portion of the big theory statement. We copy out all the
2896*ed093b41SRobert Mustacchi * constituent parts and then come back and write out the actual final header
2897*ed093b41SRobert Mustacchi * information.
2898*ed093b41SRobert Mustacchi */
2899*ed093b41SRobert Mustacchi int
fpu_signal_copyout(klwp_t * lwp,uintptr_t uaddr,fpu_copyout_f copyfunc)2900*ed093b41SRobert Mustacchi fpu_signal_copyout(klwp_t *lwp, uintptr_t uaddr, fpu_copyout_f copyfunc)
2901*ed093b41SRobert Mustacchi {
2902*ed093b41SRobert Mustacchi struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
2903*ed093b41SRobert Mustacchi uint64_t xs_bv;
2904*ed093b41SRobert Mustacchi uc_xsave_t ucx;
2905*ed093b41SRobert Mustacchi int ret;
2906*ed093b41SRobert Mustacchi
2907*ed093b41SRobert Mustacchi VERIFY3P(curthread, ==, lwptot(lwp));
2908*ed093b41SRobert Mustacchi VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2909*ed093b41SRobert Mustacchi VERIFY3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
2910*ed093b41SRobert Mustacchi
2911*ed093b41SRobert Mustacchi if (!fpu_xsave_enabled()) {
2912*ed093b41SRobert Mustacchi return (ENOTSUP);
2913*ed093b41SRobert Mustacchi }
2914*ed093b41SRobert Mustacchi
2915*ed093b41SRobert Mustacchi /*
2916*ed093b41SRobert Mustacchi * Unlike when we're dealing with /proc, we can unconditionally call
2917*ed093b41SRobert Mustacchi * fp_save() because this is always called in the context where the lwp
2918*ed093b41SRobert Mustacchi * we're operating on is always the one on CPU (which is what fp_save()
2919*ed093b41SRobert Mustacchi * asserts).
2920*ed093b41SRobert Mustacchi */
2921*ed093b41SRobert Mustacchi fp_save(fpu);
2922*ed093b41SRobert Mustacchi
2923*ed093b41SRobert Mustacchi bzero(&ucx, sizeof (ucx));
2924*ed093b41SRobert Mustacchi ucx.ucx_vers = UC_XSAVE_VERS;
2925*ed093b41SRobert Mustacchi ucx.ucx_len += sizeof (uc_xsave_t);
2926*ed093b41SRobert Mustacchi
2927*ed093b41SRobert Mustacchi xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
2928*ed093b41SRobert Mustacchi for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2929*ed093b41SRobert Mustacchi const xsave_proc_info_t *info = &fpu_xsave_info[i];
2930*ed093b41SRobert Mustacchi
2931*ed093b41SRobert Mustacchi if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
2932*ed093b41SRobert Mustacchi continue;
2933*ed093b41SRobert Mustacchi ret = info->xi_signal_out(info, copyfunc, &ucx,
2934*ed093b41SRobert Mustacchi lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
2935*ed093b41SRobert Mustacchi uaddr);
2936*ed093b41SRobert Mustacchi if (ret != 0) {
2937*ed093b41SRobert Mustacchi kpreempt_enable();
2938*ed093b41SRobert Mustacchi return (ret);
2939*ed093b41SRobert Mustacchi }
2940*ed093b41SRobert Mustacchi }
2941*ed093b41SRobert Mustacchi
2942*ed093b41SRobert Mustacchi /*
2943*ed093b41SRobert Mustacchi * Now that everything has been copied out, we should have an accurate
2944*ed093b41SRobert Mustacchi * value in the uc_xsave_t header and we can copy that out at the start
2945*ed093b41SRobert Mustacchi * of the user data.
2946*ed093b41SRobert Mustacchi */
2947*ed093b41SRobert Mustacchi ret = copyfunc(&ucx, (void *)uaddr, sizeof (ucx));
2948*ed093b41SRobert Mustacchi return (ret);
2949*ed093b41SRobert Mustacchi }
2950*ed093b41SRobert Mustacchi
/*
 * Here we've been given a ucontext_t which potentially has a user pointer to
 * xsave state that we've copied out previously. In this case we need to do the
 * following, assuming UC_XSAVE is present:
 *
 *   o Copy in our header and validate it.
 *   o Allocate an fpu context to use as a holding ground for all this data.
 *   o If UC_FPU is set, override the xsave structure with the saved XMM state,
 *     clear UC_FPU, and make sure that the correct xsave_bv bits are set.
 *
 * Currently we always allocate the additional state as a holding ground for the
 * FPU. What we're copying in may not be valid and we don't want to clobber the
 * existing FPU state or deal with merging it until we believe it's reasonable
 * enough. The proc_t is here to set us up for when we have per-process settings
 * in the extended feature disable MSRs.
 *
 * Returns 0 on success (including the no-op case where UC_XSAVE is not set),
 * EINVAL on a bad pointer or malformed header, ENOTSUP if xsave is not in use,
 * EFAULT on copyin failure, or a component handler's error.
 */
int
fpu_signal_copyin(klwp_t *lwp, ucontext_t *kuc)
{
	uc_xsave_t ucx;
	uint64_t bv;
	uintptr_t data, max_data;
	void *fpu;
	proc_t *p = lwp->lwp_procp;
	size_t ksize;

	/*
	 * Because this has been opaque filler and the kernel has never
	 * historically looked at it, we don't really care about the uc_xsave
	 * pointer being garbage in the case that the flag is not set. While
	 * this isn't perhaps the most sporting choice in some cases, this is on
	 * the other hand, pragmatic.
	 */
	if ((kuc->uc_flags & UC_XSAVE) != 0) {
		if (kuc->uc_xsave == 0) {
			return (EINVAL);
		}

		if (!fpu_xsave_enabled()) {
			return (ENOTSUP);
		}
	} else {
		return (0);
	}

	if (ddi_copyin((const void *)kuc->uc_xsave, &ucx, sizeof (ucx), 0) !=
	    0) {
		return (EFAULT);
	}

	/*
	 * Validate the header we copied in: the version must match, the
	 * declared length must be at least the header itself but no more
	 * than a full kernel xsave image, the bit-vector may only name
	 * features we actually support, and the entire user region
	 * [uc_xsave, uc_xsave + ucx_len) must sit below the address-space
	 * limit (the subtraction form also guards against wrap-around).
	 */
	ksize = cpuid_get_xsave_size();
	if (ucx.ucx_vers != UC_XSAVE_VERS || ucx.ucx_len < sizeof (ucx) ||
	    ucx.ucx_len > ksize ||
	    (ucx.ucx_bv & ~xsave_bv_all) != 0 ||
	    (uintptr_t)p->p_as->a_userlimit - ucx.ucx_len <
	    (uintptr_t)kuc->uc_xsave) {
		return (EINVAL);
	}

	/*
	 * OK, our goal right now is to recreate a valid xsave_state structure
	 * that we'll ultimately end up having to merge with our existing one in
	 * the FPU save state. The reason we describe this as a merge is to help
	 * future us when we want to retain supervisor state which will never be
	 * part of userland signal state. The design of the userland signal
	 * state is basically to compress it as much as we can. This is done for
	 * two reasons:
	 *
	 *   1) We currently consider this a private interface.
	 *   2) We really want to minimize the actual amount of stack space we
	 *	use as much as possible. Most applications aren't using AVX-512
	 *	right now, so doing our own compression style is worthwhile. If
	 *	libc adopts AVX-512 routines, we may want to change this.
	 *
	 * On the allocation below, our assumption is that if a thread has taken
	 * a signal, then it is likely to take a signal again in the future (or
	 * be shortly headed to its demise). As such, when that happens we will
	 * leave the allocated signal stack around for the process. Most
	 * applications don't allow all threads to take signals, so this should
	 * hopefully help amortize the cost of the allocation.
	 */
	max_data = (uintptr_t)kuc->uc_xsave + ucx.ucx_len;
	data = (uintptr_t)kuc->uc_xsave + sizeof (ucx);
	bv = ucx.ucx_bv;
	if (lwp->lwp_pcb.pcb_fpu.fpu_signal == NULL) {
		lwp->lwp_pcb.pcb_fpu.fpu_signal =
		    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
	}
	fpu = lwp->lwp_pcb.pcb_fpu.fpu_signal;

	/*
	 * Unconditionally initialize the memory we get in here to ensure that
	 * it is in a reasonable state for ourselves. This ensures that unused
	 * regions are mostly left in their initial state (the main exception
	 * here is the x87/XMM state, but that should be OK). We don't fill in
	 * the initial xsave state as we expect that to happen as part of our
	 * processing.
	 */
	bzero(fpu, ksize);

	/*
	 * Walk every component handler; `data` advances through the user
	 * buffer as each handler consumes its chunk, bounded by max_data.
	 * xi_always components run even if their bits were absent from the
	 * copied-in bit-vector.
	 */
	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
		int ret;
		const xsave_proc_info_t *info = &fpu_xsave_info[i];
		if (!info->xi_always && (info->xi_bits & bv) == 0)
			continue;
		/* Clear this component's bits so we can verify full coverage */
		bv &= ~info->xi_bits;

		if (info->xi_signal_in == NULL)
			continue;
		ret = info->xi_signal_in(info, kuc, &ucx, fpu, &data, max_data);
		if (ret != 0) {
			return (ret);
		}
	}
	/* Every bit in the validated bit-vector must have had a handler */
	ASSERT0(bv);

	/*
	 * As described in the big theory statement section 'Signal Handling and
	 * the ucontext_t', we always remove UC_FPU from here as we've taken
	 * care of reassembling it ourselves. Note, uc_xsave now points at the
	 * kernel staging buffer, not user memory; the caller consumes it.
	 */
	kuc->uc_flags &= ~UC_FPU;
	kuc->uc_xsave = (uintptr_t)fpu;

	return (0);
}
3077*ed093b41SRobert Mustacchi
3078*ed093b41SRobert Mustacchi /*
3079*ed093b41SRobert Mustacchi * This determines the size of the signal stack that we need for our custom form
3080*ed093b41SRobert Mustacchi * of the xsave state.
3081*ed093b41SRobert Mustacchi */
3082*ed093b41SRobert Mustacchi size_t
fpu_signal_size(klwp_t * lwp)3083*ed093b41SRobert Mustacchi fpu_signal_size(klwp_t *lwp)
3084*ed093b41SRobert Mustacchi {
3085*ed093b41SRobert Mustacchi struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3086*ed093b41SRobert Mustacchi size_t len = sizeof (uc_xsave_t);
3087*ed093b41SRobert Mustacchi uint64_t xs_bv;
3088*ed093b41SRobert Mustacchi
3089*ed093b41SRobert Mustacchi VERIFY3P(curthread, ==, lwptot(lwp));
3090*ed093b41SRobert Mustacchi VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3091*ed093b41SRobert Mustacchi VERIFY3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
3092*ed093b41SRobert Mustacchi
3093*ed093b41SRobert Mustacchi if (!fpu_xsave_enabled()) {
3094*ed093b41SRobert Mustacchi return (0);
3095*ed093b41SRobert Mustacchi }
3096*ed093b41SRobert Mustacchi
3097*ed093b41SRobert Mustacchi kpreempt_disable();
3098*ed093b41SRobert Mustacchi if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3099*ed093b41SRobert Mustacchi fp_save(fpu);
3100*ed093b41SRobert Mustacchi }
3101*ed093b41SRobert Mustacchi
3102*ed093b41SRobert Mustacchi xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
3103*ed093b41SRobert Mustacchi for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
3104*ed093b41SRobert Mustacchi size_t comp_size;
3105*ed093b41SRobert Mustacchi
3106*ed093b41SRobert Mustacchi if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
3107*ed093b41SRobert Mustacchi continue;
3108*ed093b41SRobert Mustacchi
3109*ed093b41SRobert Mustacchi cpuid_get_xsave_info(fpu_xsave_info[i].xi_bits, &comp_size,
3110*ed093b41SRobert Mustacchi NULL);
3111*ed093b41SRobert Mustacchi len += comp_size;
3112*ed093b41SRobert Mustacchi }
3113*ed093b41SRobert Mustacchi
3114*ed093b41SRobert Mustacchi kpreempt_enable();
3115*ed093b41SRobert Mustacchi return (len);
3116*ed093b41SRobert Mustacchi }
3117*ed093b41SRobert Mustacchi
/*
 * This function is used in service of restorecontext() to set the specified
 * thread's extended FPU state to the passed in data. Our assumptions at this
 * point from the system are:
 *
 *   o Someone has already verified that the actual xsave header is correct.
 *   o Any traditional XMM state that causes a #gp has been clamped.
 *   o That data is basically the correct sized xsave state structure. Right now
 *     that means it is not compressed and follows the CPUID-based rules for
 *     constructing and laying out data.
 *   o That the lwp argument refers to the current thread.
 *
 * Our primary purpose here is to merge the current FPU state with what exists
 * here. Right now, "merge", strictly speaking is just "replace". We can get
 * away with just replacing everything because all we currently save are user
 * states. If we start saving kernel states in here, this will get more nuanced
 * and we will need to be more careful about how we store data here.
 */
void
fpu_set_xsave(klwp_t *lwp, const void *data)
{
	struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
	uint32_t status, xstatus;
	struct xsave_state *dst_xsave;

	VERIFY(fpu_xsave_enabled());
	VERIFY3P(curthread, ==, lwptot(lwp));
	VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
	ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);

	/*
	 * We use fp_save() here rather than a stock fpdisable() so we can
	 * attempt to honor our invariants that when the thread state has been
	 * saved, the valid flag is set, even though we're going to be
	 * overwriting it shortly. If we just called fpdisable() then we would
	 * basically be asking for trouble.
	 *
	 * Because we are modifying the state here and we don't want the system
	 * to end up in an odd state, we are being a little paranoid and
	 * disabling preemption across this operation. In particular, once the
	 * state is properly tagged with FPU_VALID, there should be no other way
	 * that this thread can return to userland and get cleared out because
	 * we're resetting its context; however, we let paranoia win out.
	 */
	kpreempt_disable();
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		fp_save(fpu);
	}

	/*
	 * Replace the entire saved image with the caller's data, then pull
	 * the status/xstatus words out of the __fx_ign2[3] region of the
	 * fxsave area (a CPU-ignored region the caller stashes them in —
	 * presumably arranged by the xregs copyin path; see the comment
	 * below) and zero that region so the hardware image stays pristine.
	 */
	bcopy(data, lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
	    cpuid_get_xsave_size());
	dst_xsave = lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic;
	status = dst_xsave->xs_fxsave.__fx_ign2[3]._l[0];
	xstatus = dst_xsave->xs_fxsave.__fx_ign2[3]._l[1];
	dst_xsave->xs_fxsave.__fx_ign2[3]._l[0] = 0;
	dst_xsave->xs_fxsave.__fx_ign2[3]._l[1] = 0;

	/*
	 * These two status words are information that the kernel itself uses to
	 * track additional information and is part of the traditional fpregset,
	 * but is not part of our xregs information. Because we are setting this
	 * state, we leave it up to the rest of the kernel to determine whether
	 * this came from an fpregset_t or is being reset to the default of 0.
	 */
	fpu->fpu_regs.kfpu_status = status;
	fpu->fpu_regs.kfpu_xstatus = xstatus;

	/*
	 * Mark the in-memory copy authoritative so it is restored to the
	 * hardware on the way back to userland.
	 */
	fpu->fpu_flags |= FPU_VALID;
	PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
	kpreempt_enable();
}
3189*ed093b41SRobert Mustacchi
/*
 * Convert the current FPU state to the traditional fpregset_t. In the 64-bit
 * kernel, this is just an fxsave_state with additional values for the status
 * and xstatus members.
 *
 * This has the same nuance as the xregs cases discussed above, but is simpler
 * in that we only need to handle the fxsave state, but more complicated because
 * we need to check our save mechanism.
 */
void
fpu_get_fpregset(klwp_t *lwp, fpregset_t *fp)
{
	struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;

	kpreempt_disable();
	fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status;
	fp->fp_reg_set.fpchip_state.xstatus = fpu->fpu_regs.kfpu_xstatus;

	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		/*
		 * If we're requesting the fpregs of a thread that isn't
		 * currently valid and isn't the one that we're executing, then
		 * we consider getting this information to be a best-effort and
		 * we will not stop the thread in question to serialize it,
		 * which means possibly getting stale data. This is the
		 * traditional semantics that the system has used to service
		 * this for /proc.
		 */
		if (curthread == lwptot(lwp)) {
			VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
			fp_save(fpu);
		}
	}

	/*
	 * If the FPU is not enabled and the state isn't valid (due to someone
	 * else setting it), just copy the initial state. Note this only
	 * overwrites sizeof (sse_initial) bytes at the start of fp, so the
	 * status/xstatus members written above are presumably laid out past
	 * that region and survive — NOTE(review): confirm against the
	 * fpregset_t layout.
	 */
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
		bcopy(&sse_initial, fp, sizeof (sse_initial));
		kpreempt_enable();
		return;
	}

	/*
	 * Given that we have an enabled FPU, we must look at the type of FPU
	 * save mechanism to clean this up. In particular, while we can just
	 * copy the save area with FXSAVE, with XSAVE we must carefully copy
	 * only the bits that are valid and reset the rest to their default
	 * state.
	 */
	switch (fp_save_mech) {
	case FP_FXSAVE:
		bcopy(fpu->fpu_regs.kfpu_u.kfpu_fx, fp,
		    sizeof (struct fxsave_state));
		break;
	case FP_XSAVE:
		fpu_xsave_to_fxsave(fpu->fpu_regs.kfpu_u.kfpu_xs,
		    (struct fxsave_state *)fp);
		break;
	default:
		panic("Invalid fp_save_mech");
	}

	kpreempt_enable();
}
3256*ed093b41SRobert Mustacchi
/*
 * This is a request to set the ABI fpregset_t into our actual hardware state.
 * In the 64-bit kernel the first 512 bytes of the fpregset_t is the same as the
 * 512-byte fxsave area.
 */
void
fpu_set_fpregset(klwp_t *lwp, const fpregset_t *fp)
{
	struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;

	kpreempt_disable();
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		/*
		 * We always save the entire FPU. This is required if we're
		 * using xsave. If we're using fxsave, we could skip the
		 * 512-byte write and instead just disable the FPU since we'd be
		 * replacing it all. For now we don't bother with more
		 * conditional logic.
		 */
		VERIFY3P(curthread, ==, lwptot(lwp));
		VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
		fp_save(fpu);
	}

	/* The kernel-side status words live beside, not inside, the image */
	fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus;
	fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status;
	switch (fp_save_mech) {
	case FP_FXSAVE:
		bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_fx,
		    sizeof (struct fxsave_state));
		break;
	case FP_XSAVE:
		/*
		 * Only the leading fxsave-sized portion of the xsave image is
		 * replaced; flag the x87/SSE components in xstate_bv so the
		 * hardware restore picks up the new values rather than
		 * treating those components as in their initial state.
		 */
		bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_xs,
		    sizeof (struct fxsave_state));
		fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
		    XFEATURE_LEGACY_FP | XFEATURE_SSE;
		break;
	default:
		panic("Invalid fp_save_mech");
	}

	/* The saved copy is now authoritative; restore it on return to user */
	fpu->fpu_flags |= FPU_VALID;
	PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
	kpreempt_enable();
}
3302