xref: /illumos-gate/usr/src/uts/i86pc/os/cpuid.c (revision 9514ab44)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2022 Oxide Computer Company
28  */
29 /*
30  * Copyright (c) 2010, Intel Corporation.
31  * All rights reserved.
32  */
33 /*
34  * Portions Copyright 2009 Advanced Micro Devices, Inc.
35  */
36 
37 /*
38  * CPU Identification logic
39  *
40  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
41  * with the identification of CPUs, their features, and their topologies. More
42  * specifically, this file helps drive the following:
43  *
44  * 1. Enumeration of features of the processor which are used by the kernel to
45  *    determine what features to enable or disable. These may be instruction set
46  *    enhancements or features that we use.
47  *
48  * 2. Enumeration of instruction set architecture (ISA) additions that userland
49  *    will be told about through the auxiliary vector.
50  *
51  * 3. Understanding the physical topology of the CPU such as the number of
52  *    caches, how many cores it has, whether or not it supports simultaneous
53  *    multi-threading (SMT), etc.
54  *
55  * ------------------------
56  * CPUID History and Basics
57  * ------------------------
58  *
59  * The cpuid instruction was added by Intel roughly around the time that the
60  * original Pentium was introduced. The purpose of cpuid was to provide, in a
61  * programmatic fashion, information about the CPU that previously had to be
62  * guessed at. For example, an important part of cpuid is that we can know what
63  * extensions to the ISA exist. If you used an invalid opcode you would get a
64  * #UD, so this method allows a program (whether a user program or the kernel)
65  * to determine what exists without crashing or getting a SIGILL. Of course,
66  * this was also during the era of the clones and the AMD Am5x86. The vendor
67  * name shows up first in cpuid for a reason.
68  *
69  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
70  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
71  * its own meaning. The different leaves are broken down into different regions:
72  *
73  *	[ 0, 7fffffff ]			This region is called the 'basic'
74  *					region. This region is generally defined
75  *					by Intel, though some of the original
76  *					portions have different meanings based
77  *					on the manufacturer. These days, Intel
78  *					adds most new features to this region.
79  *					AMD adds non-Intel compatible
80  *					information in the third, extended
81  *					region. Intel uses this for everything
82  *					including ISA extensions, CPU
83  *					features, cache information, topology,
84  *					and more.
85  *
86  *					There is a hole carved out of this
87  *					region which is reserved for
88  *					hypervisors.
89  *
90  *	[ 40000000, 4fffffff ]		This region, which is found in the
91  *					middle of the previous region, is
92  *					explicitly promised to never be used by
93  *					CPUs. Instead, it is used by hypervisors
94  *					to communicate information about
95  *					themselves to the operating system. The
96  *					values and details are unique for each
97  *					hypervisor.
98  *
99  *	[ 80000000, ffffffff ]		This region is called the 'extended'
100  *					region. Some of the low leaves mirror
101  *					parts of the basic leaves. This region
102  *					has generally been used by AMD for
103  *					various extensions. For example, AMD-
104  *					specific information about caches,
105  *					features, and topology are found in this
106  *					region.
107  *
108  * To query a leaf, you place the desired leaf into %eax, zero %ebx, %ecx,
109  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
110  * the ranges, one of the primary things returned is the maximum valid leaf in
111  * that range. This allows for discovery of what range of CPUID is valid.
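 *
 * As a concrete illustration, here is a minimal sketch of issuing cpuid from C
 * using GCC-style inline assembly and discovering the maximum basic and
 * extended leaves. The cpuid_query() helper is a hypothetical name used only
 * for the examples in this comment; it is not part of this file.
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	// Hypothetical helper: run cpuid for a given leaf and sub-leaf.
 *	static void
 *	cpuid_query(uint32_t leaf, uint32_t subleaf, uint32_t r[4])
 *	{
 *		__asm__ __volatile__("cpuid"
 *		    : "=a" (r[0]), "=b" (r[1]), "=c" (r[2]), "=d" (r[3])
 *		    : "a" (leaf), "c" (subleaf));
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		uint32_t r[4];
 *
 *		cpuid_query(0x0, 0, r);		// first basic leaf
 *		(void) printf("max basic leaf: 0x%x\n", r[0]);
 *		cpuid_query(0x80000000, 0, r);	// first extended leaf
 *		(void) printf("max extended leaf: 0x%x\n", r[0]);
 *		return (0);
 *	}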
112  *
113  * The CPUs have potentially surprising behavior when using an invalid leaf or
114  * unimplemented leaf. If the requested leaf is within the valid basic or
115  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
116  * set to zero. However, if you specify a leaf that is outside of a valid range,
117  * then instead it will be filled with the last valid _basic_ leaf. For example,
118  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
119  * an invalid extended leaf will return the information for leaf 3.
120  *
121  * Some leaves are broken down into sub-leaves. This means that the value
122  * depends on both the leaf asked for in %eax and a secondary register. For
123  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
124  * additional information. Or when getting topology information in leaf 0xb, the
125  * initial value in %ecx changes which level of the topology that you are
126  * getting information about.
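 *
 * For instance, leaf 7 reports the maximum valid sub-leaf in %eax of sub-leaf
 * 0, so its sub-leaves can be walked as in the following fragment (reusing the
 * hypothetical cpuid_query() helper from the earlier sketch):
 *
 *	uint32_t r[4], max_subleaf, i;
 *
 *	cpuid_query(0x7, 0, r);		// sub-leaf 0: %eax = max sub-leaf
 *	max_subleaf = r[0];
 *	for (i = 1; i <= max_subleaf; i++) {
 *		cpuid_query(0x7, i, r);
 *		// feature bits for this sub-leaf are now in r[1..3]
 *	}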
127  *
128  * cpuid values are always kept to 32 bits regardless of whether or not the
129  * program is in 64-bit mode. When executing in 64-bit mode, the upper
130  * 32 bits of the register are always set to zero so that the values are the
131  * same regardless of execution mode.
132  *
133  * ----------------------
134  * Identifying Processors
135  * ----------------------
136  *
137  * We can identify a processor in two steps. The first step looks at cpuid leaf
138  * 0. Leaf 0 contains the processor's vendor information. This is done by
139  * putting a 12 character string in %ebx, %edx, and %ecx (in that order). On
140  * AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
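 *
 * A small sketch of assembling that string, reusing the hypothetical
 * cpuid_query() helper from above (memcpy() comes from <string.h>):
 *
 *	uint32_t r[4];
 *	char vendor[13];
 *
 *	cpuid_query(0x0, 0, r);
 *	(void) memcpy(&vendor[0], &r[1], 4);	// %ebx: "Genu" / "Auth"
 *	(void) memcpy(&vendor[4], &r[3], 4);	// %edx: "ineI" / "enti"
 *	(void) memcpy(&vendor[8], &r[2], 4);	// %ecx: "ntel" / "cAMD"
 *	vendor[12] = '\0';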
141  *
142  * From there, a processor is identified by a combination of three different
143  * values:
144  *
145  *  1. Family
146  *  2. Model
147  *  3. Stepping
148  *
149  * Each vendor uses the family and model to uniquely identify a processor. The
150  * way that family and model are changed depends on the vendor. For example,
151  * Intel has been using family 0x6 for almost all of their processors since the
152  * Pentium Pro/Pentium II era, often called the P6. The model is used to
153  * identify the exact processor. Different models are often used for the client
154  * (consumer) and server parts. Even though each processor often has major
155  * architectural differences, they still are considered the same family by
156  * Intel.
157  *
158  * On the other hand, each major AMD architecture generally has its own family.
159  * For example, the K8 is family 0xf, K10 0x10, Bulldozer 0x15, and Zen 0x17.
160  * Within a family, the model number helps identify specific processors.
161  *
162  * The stepping is used to refer to a revision of a specific microprocessor. The
163  * term comes from equipment used to produce masks that are used to create
164  * integrated circuits.
165  *
166  * The information is present in leaf 1, %eax. In technical documentation you
167  * will see the terms extended model and extended family. The original family,
168  * model, and stepping fields were each 4 bits wide. If the base family is 0xf,
169  * then the extended family field (previously reserved bits) is added to it to
170  * form the actual family. Similarly, when the family calls for it (0xf on AMD,
171  * 0x6 or 0xf on Intel), the extended model supplies the model's upper four bits.
172  *
173  * When we process this information, we store the full family, model, and
174  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
175  * cpi_step, respectively. Whenever you are performing comparisons with the
176  * family, model, and stepping, you should use these members and not the raw
177  * values from cpuid. If you must use the raw values from cpuid directly, you
178  * must make sure that you add the extended model and family to the base model
179  * and family.
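 *
 * A simplified sketch of that computation follows, taking the raw leaf 1 %eax
 * value as input. decode_fms() is a hypothetical name; the family 0x6 rule for
 * the extended model is Intel's, while AMD only applies the extended model when
 * the base family is 0xf. Consult the vendor manuals for the authoritative
 * rules.
 *
 *	static void
 *	decode_fms(uint32_t eax, uint32_t *family, uint32_t *model,
 *	    uint32_t *step)
 *	{
 *		uint32_t base_family = (eax >> 8) & 0xf;
 *		uint32_t base_model = (eax >> 4) & 0xf;
 *		uint32_t ext_family = (eax >> 20) & 0xff;
 *		uint32_t ext_model = (eax >> 16) & 0xf;
 *
 *		*step = eax & 0xf;
 *		*family = base_family;
 *		*model = base_model;
 *		if (base_family == 0xf)
 *			*family = base_family + ext_family;
 *		// The 0x6 case below is Intel's rule only.
 *		if (base_family == 0xf || base_family == 0x6)
 *			*model = (ext_model << 4) | base_model;
 *	}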
180  *
181  * In general, we do not use information about the family, model, and stepping
182  * to determine whether or not a feature is present; that is generally driven by
183  * specific leaves. However, when something we care about on the processor is
184  * not considered 'architectural', meaning that it is specific to a set of
185  * processors and not promised in the architecture model to be consistent from
186  * generation to generation, then we will fall back on this information. The
187  * most common cases where this comes up are when we have to work around errata in
188  * the processor, are dealing with processor-specific features such as CPU
189  * performance counters, or we want to provide additional information for things
190  * such as fault management.
191  *
192  * While processors also do have a brand string, which is the name that people
193  * are familiar with when buying the processor, it is not meant for
194  * programmatic consumption. That is what the family, model, and stepping are
195  * for.
196  *
197  * ------------
198  * CPUID Passes
199  * ------------
200  *
201  * As part of performing feature detection, we break this into several different
202  * passes. The passes are as follows:
203  *
204  *	Pass 0		This is a primordial pass done in locore.s to deal with
205  *			Cyrix CPUs that don't support cpuid. The reality is that
206  *			we likely don't run on them any more, but there is still
207  *			logic for handling them.
208  *
209  *	Pass 1		This is the primary pass and is responsible for doing a
210  *			large number of different things:
211  *
212  *			1. Determining which vendor manufactured the CPU and
213  *			the family, model, and stepping information.
214  *
215  *			2. Gathering a large number of feature flags to
216  *			determine which features the CPU supports and which
217  *			indicate things that we need to do other work in the OS
218  *			to enable. Features detected this way are added to the
219  *			x86_featureset which can be queried to
220  *			determine what we should do. This includes processing
221  *			all of the basic and extended CPU features that we care
222  *			about.
223  *
224  *			3. Determining the CPU's topology. This includes
225  *			information about how many cores and threads are present
226  *			in the package. It also is responsible for figuring out
227  *			which logical CPUs are potentially part of the same core
228  *			and what other resources they might share. For more
229  *			information see the 'Topology' section.
230  *
231  *			4. Determining the set of CPU security-specific features
232  *			that we need to worry about and determine the
233  *			appropriate set of workarounds.
234  *
235  *			Pass 1 on the boot CPU occurs before KMDB is started.
236  *
237  *	Pass 2		The second pass is done after startup(). Here, we check
238  *			other miscellaneous features. Most of this is gathering
239  *			additional basic and extended features that we'll use in
240  *			later passes or for debugging support.
241  *
242  *	Pass 3		The third pass occurs after the kernel memory allocator
243  *			has been fully initialized. This gathers information
244  *			where we might need dynamic memory available for our
245  *			uses. This includes several varying width leaves that
246  *			have cache information and the processor's brand string.
247  *
248  *	Pass 4		The fourth and final normal pass is performed after the
249  *			kernel has brought most everything online. This is
250  *			invoked from post_startup(). In this pass, we go through
251  *			the set of features that we have enabled and turn that
252  *			into the hardware auxiliary vector features that
253  *			userland receives. This is used by userland, primarily
254  *			by the run-time link-editor (RTLD), though userland
255  *			software could also refer to it directly.
256  *
257  *	Microcode	After a microcode update, we do a selective rescan of
258  *			the cpuid leaves to determine what features have
259  *			changed. Microcode updates can provide more details
260  *			about security related features to deal with issues like
261  *			Spectre and L1TF. On occasion, vendors have violated
262  *			their contract and removed bits. However, we don't try
263  *			to detect that because that puts us in a situation that
264  *			we really can't deal with. As such, the only thing we
265  *			rescan are security related features today. See
266  *			cpuid_pass_ucode().
267  *
268  * All of the passes (except pass 0) are run on all CPUs. However, for the most
269  * part we only care about what the boot CPU says about this information and use
270  * the other CPUs as a rough guide to sanity check that we have the same feature
271  * set.
272  *
273  * We do not support running multiple logical CPUs with different, let alone
274  * disjoint, feature sets.
275  *
276  * ------------------
277  * Processor Topology
278  * ------------------
279  *
280  * One of the important things that we need to do is to understand the topology
281  * of the underlying processor. When we say topology in this case, we're trying
282  * to understand the relationship between the logical CPUs that the operating
283  * system sees and the underlying physical layout. Different logical CPUs may
284  * share different resources which can have important consequences for the
285  * performance of the system. For example, they may share caches, execution
286  * units, and more.
287  *
288  * The topology of the processor changes from generation to generation and
289  * vendor to vendor.  Along with that, different vendors use different
290  * terminology, and the operating system itself uses occasionally overlapping
291  * terminology. It's important to understand what this topology looks like so
292  * one can understand the different things that we try to calculate and
293  * determine.
294  *
295  * To get started, let's talk about a little bit of terminology that we've used
296  * so far, is used throughout this file, and is fairly generic across multiple
297  * vendors:
298  *
299  * CPU
300  *	A central processing unit (CPU) refers to a logical and/or virtual
301  *	entity that the operating system can execute instructions on. The
302  *	underlying resources for this CPU may be shared between multiple
303  *	entities; however, to the operating system it is a discrete unit.
304  *
305  * PROCESSOR and PACKAGE
306  *
307  *	Generally, when we use the term 'processor' on its own, we are referring
308  *	to the physical entity that one buys and plugs into a board. However,
309  *	because processor has been overloaded and one might see it used to mean
310  *	multiple different levels, we will instead use the term 'package' for
311  *	the rest of this file. The term package comes from the electrical
312  *	engineering side and refers to the physical entity that encloses the
313  *	electronics inside. Strictly speaking the package can contain more than
314  *	just the CPU, for example, on many processors it may also have what's
315  *	called an 'integrated graphics processing unit (GPU)'. Because the
316  *	package can encapsulate multiple units, it is the largest physical unit
317  *	that we refer to.
318  *
319  * SOCKET
320  *
321  *	A socket refers to a unit on a system board (generally the motherboard)
322  *	that can receive a package. A single package, or processor, is plugged
323  *	into a single socket. A system may have multiple sockets. Often times,
324  *	the term socket is used interchangeably with package and refers to the
325  *	electrical component that has been plugged in, not the receptacle itself.
326  *
327  * CORE
328  *
329  *	A core refers to the physical instantiation of a CPU, generally, with a
330  *	full set of hardware resources available to it. A package may contain
331  *	multiple cores inside of it or it may just have a single one. A
332  *	processor with more than one core is often referred to as 'multi-core'.
333  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
334  *	that has 'multi-core' processors.
335  *
336  *	A core may expose a single logical CPU to the operating system, or it
337  *	may expose multiple CPUs, which we call threads, defined below.
338  *
339  *	Some resources may still be shared by cores in the same package. For
340  *	example, many processors will share the level 3 cache between cores.
341  *	Some AMD generations share hardware resources between cores. For more
342  *	information on that see the section 'AMD Topology'.
343  *
344  * THREAD and STRAND
345  *
346  *	In this file, generally a thread refers to a hardware resource and not
347  *	the operating system's logical abstraction. A thread is always exposed
348  *	as an independent logical CPU to the operating system. A thread belongs
349  *	to a specific core. A core may have more than one thread. When that is
350  *	the case, the threads that are part of the same core are often referred
351  *	to as 'siblings'.
352  *
353  *	When multiple threads exist, this is generally referred to as
354  *	simultaneous multi-threading (SMT). When Intel introduced this in their
355  *	processors they called it hyper-threading (HT). When multiple threads
356  *	are active in a core, they split the resources of the core. For example,
357  *	two threads may share the same set of hardware execution units.
358  *
359  *	The operating system often uses the term 'strand' to refer to a thread.
360  *	This helps disambiguate it from the software concept.
361  *
362  * CHIP
363  *
364  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
365  *	basic, it is used to refer to a single integrated circuit, which
366  *	may or may not be the only thing in the package. In illumos, when you
367  *	see the term 'chip' it is almost always referring to the same thing as
368  *	the 'package'. However, many vendors may use chip to refer to one of
369  *	many integrated circuits that have been placed in the package. As an
370  *	example, see the subsequent definition.
371  *
372  *	To try and keep things consistent, we will only use chip when referring
373  *	to the entire integrated circuit package, with the exception of the
374  *	definition of multi-chip module (because it is in the name) and use the
375  *	term 'die' when we want the more general, potential sub-component
376  *	definition.
377  *
378  * DIE
379  *
380  *	A die refers to an integrated circuit. Inside of the package there may
381  *	be a single die or multiple dies. This is sometimes called a 'chip' in
382  *	vendor's parlance, but in this file, we use the term die to refer to a
383  *	subcomponent.
384  *
385  * MULTI-CHIP MODULE
386  *
387  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
388  *	are connected together in the same package. When a multi-chip design is
389  *	used, generally each chip is manufactured independently and then joined
390  *	together in the package. For example, on AMD's Zen microarchitecture
391  *	(family 0x17), the package contains several dies (the second meaning of
392  *	chip from above) that are connected together.
393  *
394  * CACHE
395  *
396  *	A cache is a part of the processor that maintains copies of recently
397  *	accessed memory. Caches are split into levels and then into types.
398  *	Commonly there are one to three levels, called level one, two, and
399  *	three. The lower the level, the smaller it is, the closer it is to the
400  *	execution units of the CPU, and the faster it is to access. The layout
401  *	and design of the cache come in many different flavors, consult other
402  *	resources for a discussion of those.
403  *
404  *	Caches are generally split into two types, the instruction and data
405  *	cache. The caches contain what their names suggest, the instruction
406  *	cache has executable program text, while the data cache has all other
407  *	memory that the processor accesses. As of this writing, data is kept
408  *	coherent between all of the caches on x86, so if one modifies program
409  *	text before it is executed, that will be in the data cache, and the
410  *	instruction cache will be synchronized with that change when the
411  *	processor actually executes those instructions. This coherency also
412  *	covers the fact that data could show up in multiple caches.
413  *
414  *	Generally, the lowest level caches are specific to a core. However, the
415  *	last level cache is shared between some number of cores. The number of
416  *	CPUs sharing this last level cache is important. This has implications
417  *	for the choices that the scheduler makes, as accessing memory that might
418  *	be in a remote cache after thread migration can be quite expensive.
419  *
420  *	Sometimes, the word cache is abbreviated with a '$', because in US
421  *	English the word cache is pronounced the same as cash. So L1D$ refers to
422  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
423  *	in the rest of this theory statement for clarity.
424  *
425  * MEMORY CONTROLLER
426  *
427  *	The memory controller is a component that provides access to DRAM. Each
428  *	memory controller can access a set number of DRAM channels. Each channel
429  *	can have a number of DIMMs (sticks of memory) associated with it. A
430  *	given package may have more than one memory controller. The association
431  *	of the memory controller to a group of cores is important as it is
432  *	cheaper to access memory on the controller that you are associated with.
433  *
434  * NUMA
435  *
436  *	NUMA, or non-uniform memory access, describes a way that systems are
437  *	built. On x86, any processor core can address all of the memory in the
438  *	system. However, when using multiple sockets or possibly within a
439  *	multi-chip module, some of that memory is physically closer and some of
440  *	it is further. Memory that is further away is more expensive to access.
441  *	Consider the following image of multiple sockets with memory:
442  *
443  *	+--------+                                                +--------+
444  *	| DIMM A |         +----------+      +----------+         | DIMM D |
445  *	+--------+-+       |          |      |          |       +-+------+-+
446  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
447  *	  +--------+-+     |          |      |          |     +-+------+-+
448  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
449  *	    +--------+                                        +--------+
450  *
451  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
452  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
453  *	access DIMMs A-C and more expensive to access D-F as it has to go
454  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
455  *	D-F are cheaper than A-C. While the socket form is the most common, when
456  *	using multi-chip modules, this can also sometimes occur. For another
457  *	example of this that's more involved, see the AMD topology section.
458  *
459  *
460  * Intel Topology
461  * --------------
462  *
463  * Most Intel processors since Nehalem (as of this writing the current gen
464  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
465  * the package is a single monolithic die. MCMs currently aren't used. Most
466  * parts have three levels of caches, with the L3 cache being shared between
467  * all of the cores on the package. The L1/L2 cache is generally specific to
468  * an individual core. The following image shows at a simplified level what
469  * this looks like. The memory controller is commonly part of something called
470  * the 'Uncore', which used to be a separate physical chip that was not a part
471  * of the package, but is now part of the same chip.
472  *
473  *  +-----------------------------------------------------------------------+
474  *  | Package                                                               |
475  *  |  +-------------------+  +-------------------+  +-------------------+  |
476  *  |  | Core              |  | Core              |  | Core              |  |
477  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
478  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
479  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
480  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
481  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
482  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
483  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
484  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
485  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
486  *  |  +-------------------+  +-------------------+  +-------------------+  |
487  *  | +-------------------------------------------------------------------+ |
488  *  | |                         Shared L3 Cache                           | |
489  *  | +-------------------------------------------------------------------+ |
490  *  | +-------------------------------------------------------------------+ |
491  *  | |                        Memory Controller                          | |
492  *  | +-------------------------------------------------------------------+ |
493  *  +-----------------------------------------------------------------------+
494  *
495  * A side effect of this current architecture is that what we care about from a
496  * scheduling and topology perspective is simplified. In general we care about
497  * understanding which logical CPUs are part of the same core and socket.
498  *
499  * To determine the relationship between threads and cores, Intel initially used
500  * the identifier in the advanced programmable interrupt controller (APIC). They
501  * also added cpuid leaf 4 to give additional information about the number of
502  * threads and CPUs in the processor. With the addition of x2apic (which
503  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
504  * additional cpuid topology leaf 0xB was added.
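 *
 * As an illustrative sketch (again using the hypothetical cpuid_query() helper
 * from earlier), leaf 0xB is walked one sub-leaf at a time; each sub-leaf
 * reports a level type in %ecx[15:8] and, in %eax[4:0], how far to shift the
 * x2apic ID to strip that level's bits. Enumeration stops at an invalid level:
 *
 *	uint32_t r[4], i, smt_shift = 0, core_shift = 0;
 *
 *	for (i = 0; ; i++) {
 *		uint32_t type;
 *
 *		cpuid_query(0xb, i, r);
 *		type = (r[2] >> 8) & 0xff;
 *		if (type == 0)			// invalid: no more levels
 *			break;
 *		if (type == 1)			// SMT (thread) level
 *			smt_shift = r[0] & 0x1f;
 *		else if (type == 2)		// core level
 *			core_shift = r[0] & 0x1f;
 *	}
 *	// (x2apic ID >> smt_shift) identifies the core, while
 *	// (x2apic ID >> core_shift) identifies the package.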
505  *
506  * AMD Topology
507  * ------------
508  *
509  * When discussing AMD topology, we want to break this into three distinct
510  * generations of topology. There's the basic topology that has been used in
511  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
512  * with family 0x15 (Bulldozer), and there's the topology that was introduced
513  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
514  * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
515  * additional terminology that's worth talking about.
516  *
517  * Until the introduction of family 0x17 (Zen), AMD did not implement something
518  * that they considered SMT. Whether or not the AMD processors have SMT
519  * influences many things including scheduling and reliability, availability,
520  * and serviceability (RAS) features.
521  *
522  * NODE
523  *
524  *	AMD uses the term node to refer to a die that contains a number of cores
525  *	and I/O resources. Depending on the processor family and model, more
526  *	than one node can be present in the package. When there is more than one
527  *	node this indicates a multi-chip module. Usually each node has its own
528  *	access to memory and I/O devices. This is important and generally
529  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
530  *	result, we track this relationship in the operating system.
531  *
532  *	In processors with an L3 cache, the L3 cache is generally shared across
533  *	the entire node, though the way this is carved up varies from generation
534  *	to generation.
535  *
536  * BULLDOZER
537  *
538  *	Starting with the Bulldozer family (0x15) and continuing until the
539  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
540  *	compute unit. In a compute unit, two traditional cores share a number of
541  *	hardware resources. Critically, they share the FPU, L1 instruction
542  *	cache, and the L2 cache. Several compute units were then combined inside
543  *	of a single node.  Because the integer execution units, L1 data cache,
544  *	and some other resources were not shared between the cores, AMD never
545  *	considered this to be SMT.
546  *
547  * ZEN
548  *
549  *	The Zen family (0x17) uses a multi-chip module (MCM) design; the module
550  *	is called Zeppelin. These modules are similar to the idea of nodes used
551  *	previously. Each of these nodes has two DRAM channels which all of the
552  *	cores in the node can access uniformly. These nodes are linked together
553  *	in the package, creating a NUMA environment.
554  *
555  *	The Zeppelin die itself contains two different 'core complexes'. Each
556  *	core complex consists of four cores which each have two threads, for a
557  *	total of 8 logical CPUs per complex. Unlike other generations,
558  *	where all the logical CPUs in a given node share the L3 cache, here each
559  *	core complex has its own shared L3 cache.
560  *
561  *	A further thing that we need to consider is that in some configurations,
562  *	particularly with the Threadripper line of processors, not every die
563  *	actually has its memory controllers wired up to actual memory channels.
564  *	This means that some cores have memory attached to them and others
565  *	don't.
566  *
567  *	To put Zen in perspective, consider the following images:
568  *
569  *      +--------------------------------------------------------+
570  *      | Core Complex                                           |
571  *      | +-------------------+    +-------------------+  +---+  |
572  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
573  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
574  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
575  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
576  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
577  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
578  *      | +-------------------+    +-------------------+  | C |  |
579  *      | +-------------------+    +-------------------+  | a |  |
580  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
581  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
582  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
583  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
584  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
585  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
586  *      | +-------------------+    +-------------------+  +---+  |
587  *      |                                                        |
588  *	+--------------------------------------------------------+
589  *
590  *  This first image represents a single Zen core complex that consists of four
591  *  cores.
592  *
593  *
594  *	+--------------------------------------------------------+
595  *	| Zeppelin Die                                           |
596  *	|  +--------------------------------------------------+  |
597  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
598  *	|  +--------------------------------------------------+  |
599  *      |                           HH                           |
600  *	|          +-----------+    HH    +-----------+          |
601  *	|          |           |    HH    |           |          |
602  *	|          |    Core   |==========|    Core   |          |
603  *	|          |  Complex  |==========|  Complex  |          |
604  *	|          |           |    HH    |           |          |
605  *	|          +-----------+    HH    +-----------+          |
606  *      |                           HH                           |
607  *	|  +--------------------------------------------------+  |
608  *	|  |                Memory Controller                 |  |
609  *	|  +--------------------------------------------------+  |
610  *      |                                                        |
611  *	+--------------------------------------------------------+
612  *
613  *  This image represents a single Zeppelin Die. Note how both cores are
614  *  connected to the same memory controller and I/O units. While each core
615  *  complex has its own L3 cache as seen in the first image, they both have
616  *  uniform access to memory.
617  *
618  *
619  *                      PP                     PP
620  *                      PP                     PP
621  *           +----------PP---------------------PP---------+
622  *           |          PP                     PP         |
623  *           |    +-----------+          +-----------+    |
624  *           |    |           |          |           |    |
625  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
626  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
627  *           |    |           |          |           |    |
628  *           |    +-----------+ooo    ...+-----------+    |
629  *           |          HH      ooo  ...       HH         |
630  *           |          HH        oo..         HH         |
631  *           |          HH        ..oo         HH         |
632  *           |          HH      ...  ooo       HH         |
633  *           |    +-----------+...    ooo+-----------+    |
634  *           |    |           |          |           |    |
635  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
636  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
637  *           |    |           |          |           |    |
638  *           |    +-----------+          +-----------+    |
639  *           |          PP                     PP         |
640  *           +----------PP---------------------PP---------+
641  *                      PP                     PP
642  *                      PP                     PP
643  *
644  *  This image represents a single Zen package. In this example, it has four
645  *  Zeppelin dies, though some configurations only have a single one. In this
646  *  example, each die is directly connected to the next. Also, each die is
647  *  represented as being connected to memory by the 'M' character and connected
648  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
649  *  die is made up of two core complexes, we have multiple different NUMA
650  *  domains that we care about for these systems.
651  *
652  * ZEN 2
653  *
654  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
655  *	each Zeppelin Die contained its own I/O and memory controllers, those have
656  *	been moved onto a separate die in Zen 2. The core complex looks similar, but
657  *	now the die actually looks much simpler:
658  *
659  *      +--------------------------------------------------------+
660  *      | Zen 2 Core Complex Die    HH                           |
661  *      |                           HH                           |
662  *      |          +-----------+    HH    +-----------+          |
663  *      |          |           |    HH    |           |          |
664  *      |          |    Core   |==========|    Core   |          |
665  *      |          |  Complex  |==========|  Complex  |          |
666  *      |          |           |    HH    |           |          |
667  *      |          +-----------+    HH    +-----------+          |
668  *      |                           HH                           |
669  *      |                           HH                           |
670  *      +--------------------------------------------------------+
671  *
672  *	From here, when we add the central I/O die, this changes things a bit.
673  *	Each die is connected to the I/O die, rather than trying to interconnect
674  *	them directly. The following image takes the same Zen 1 image that we
675  *	had earlier and shows what it looks like with the I/O die instead:
676  *
677  *                                 PP    PP
678  *                                 PP    PP
679  *           +---------------------PP----PP---------------------+
680  *           |                     PP    PP                     |
681  *           |  +-----------+      PP    PP      +-----------+  |
682  *           |  |           |      PP    PP      |           |  |
683  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
684  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
685  *           |  |         |o|oooo|          |oooo|o|         |  |
686  *           |  +-----------+    |          |    +-----------+  |
687  *           |                   |   I/O    |                   |
688  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
689  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
690  *           |                   |          |                   |
691  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
692  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
693  *           |                   |          |                   |
694  *           |  +-----------+    |          |    +-----------+  |
695  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
696  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
697  *           |  |    Die    |      PP    PP      |    Die    |  |
698  *           |  |           |      PP    PP      |           |  |
699  *           |  +-----------+      PP    PP      +-----------+  |
700  *           |                     PP    PP                     |
701  *           +---------------------PP----PP---------------------+
702  *                                 PP    PP
703  *                                 PP    PP
704  *
705  *	The above has four core complex dies installed, though the Zen 2 EPYC
706  *	and ThreadRipper parts allow for up to eight, while the Ryzen parts
707  *	generally only have one to two. The more notable difference here is how
708  *	everything communicates. Note that memory and PCIe come out of the
709  *	central die. This changes the way that one die accesses a resource. It
710  *	basically always has to go to the I/O die, whereas in Zen 1 it may have
711  *	satisfied it locally. In general, this ends up being a better strategy
712  *	for most things, though it is possible to still treat everything as four
713  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
714  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
715  *	now there is only one 'node' present.
716  *
717  * ZEN 3
718  *
719  *	From an architectural perspective, Zen 3 is a much smaller change from
720  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
721  *	its microarchitectural changes. The biggest thing for us is how the die
722  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
723  *	cache. However, in Zen 3, the L3 is now shared across the entire core
724  *	complex die and is no longer partitioned among the core complexes. This
725  *	means that all cores on the die can share the same L3 cache. Otherwise,
726  *	the general layout of the overall package with various core complexes
727  *	and an I/O die stays the same. Here's what the Core Complex Die looks
728  *	like in a bit more detail:
729  *
730  *               +-------------------------------------------------+
731  *               | Zen 3 Core Complex Die                          |
732  *               | +-------------------+    +-------------------+  |
733  *               | | Core       +----+ |    | Core       +----+ |  |
734  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
735  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
736  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
737  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
738  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
739  *               | +-------------------+    +-------------------+  |
740  *               | +-------------------+    +-------------------+  |
741  *               | | Core       +----+ |    | Core       +----+ |  |
742  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
743  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
744  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
745  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
746  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
747  *               | +-------------------+    +-------------------+  |
748  *               |                                                 |
749  *               | +--------------------------------------------+  |
750  *               | |                 L3 Cache                   |  |
751  *               | +--------------------------------------------+  |
752  *               |                                                 |
753  *               | +-------------------+    +-------------------+  |
754  *               | | Core       +----+ |    | Core       +----+ |  |
755  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
756  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
757  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
758  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
759  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
760  *               | +-------------------+    +-------------------+  |
761  *               | +-------------------+    +-------------------+  |
762  *               | | Core       +----+ |    | Core       +----+ |  |
763  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
764  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
765  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
766  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
767  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
768  *               | +-------------------+    +-------------------+  |
769  *               +-------------------------------------------------+
770  *
771  *	While it is not pictured, there are connections from the die to the
772  *	broader data fabric and additional functional blocks to support that
773  *	communication and coherency.
774  *
775  * CPUID LEAVES
776  *
777  * There are a few different CPUID leaves that we can use to try and understand
778  * the actual state of the world. As part of the introduction of family 0xf, AMD
779  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
780  * processors that are in the package. Because families before Zen didn't have
781  * SMT, this was always the number of cores in the package. However, it
782  * should always be thought of as the number of logical threads to be consistent
783  * between generations. In addition we also get the size of the APIC ID that is
784  * used to represent the number of logical processors. This is important for
785  * deriving topology information.
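 *
 * A sketch of decoding that leaf, taking the raw %ecx value of leaf 0x80000008
 * as input. decode_ext_size() is a hypothetical name and this is illustrative
 * only; it is not the code used in this file.
 *
 *	static void
 *	decode_ext_size(uint32_t ecx, uint32_t *nthreads, uint32_t *apic_bits)
 *	{
 *		*nthreads = (ecx & 0xff) + 1;	// logical CPUs in the package
 *		*apic_bits = (ecx >> 12) & 0xf;	// APIC ID bits for the package
 *		if (*apic_bits == 0) {
 *			// Older parts leave this as zero; fall back to the
 *			// number of bits needed to represent nthreads.
 *			while ((1u << *apic_bits) < *nthreads)
 *				(*apic_bits)++;
 *		}
 *	}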
786  *
787  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
788  * bit between Bulldozer and later families, but it is quite useful in
789  * determining the topology information. Because this information has changed
790  * across family generations, it's worth calling out what these mean
791  * explicitly. The registers have the following meanings:
792  *
793  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
794  *		APIC ID, even though on systems without x2apic support, it will
795  *		be limited to 8 bits.
796  *
797  *	%ebx	On Bulldozer-era systems this contains information about the
798  *		number of cores that are in a compute unit (cores that share
799  *		resources). It also contains a per-package compute unit ID that
800  *		identifies which compute unit the logical CPU is a part of.
801  *
802  *		On Zen-era systems this instead contains the number of threads
803  *		per core and the ID of the core that the logical CPU is a part
804  *		of. Note, this ID is unique only to the package, it is not
805  *		globally unique across the entire system.
806  *
807  *	%ecx	This contains the number of nodes that exist in the package. It
808  *		also contains an ID that identifies which node the logical CPU
809  *		is a part of.
810  *
811  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
812  * cache layout to determine which logical CPUs are sharing which caches.
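 *
 * To make the Zen-era encoding concrete, here is a sketch that pulls the
 * fields apart, taking the raw register values from leaf 0x8000001E and one
 * sub-leaf of leaf 0x8000001D as input. decode_amd_topo() is a hypothetical
 * name, and the layout of %ebx differs on Bulldozer-era parts as described
 * above, so treat this as illustrative only.
 *
 *	static void
 *	decode_amd_topo(uint32_t ebx_1e, uint32_t ecx_1e, uint32_t eax_1d)
 *	{
 *		uint32_t core_id = ebx_1e & 0xff;	// per-package core ID
 *		uint32_t threads_per_core = ((ebx_1e >> 8) & 0xff) + 1;
 *		uint32_t node_id = ecx_1e & 0xff;
 *		uint32_t nodes_per_pkg = ((ecx_1e >> 8) & 0x7) + 1;
 *		// Logical CPUs sharing the cache described by this
 *		// particular 0x8000001D sub-leaf:
 *		uint32_t ncpu_sharing = ((eax_1d >> 14) & 0xfff) + 1;
 *	}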
813  *
814  * illumos Topology
815  * ----------------
816  *
817  * Based on the above we synthesize the information into several different
818  * variables that we store in the 'struct cpuid_info'. We'll go into the details
819  * of what each member is supposed to represent and their uniqueness. In
820  * general, there are two levels of uniqueness that we care about. We care about
821  * an ID that is globally unique. That means that it will be unique across all
822  * entities in the system. For example, the default logical CPU ID is globally
823  * unique. On the other hand, there is some information that we only care about
824  * being unique within the context of a single package / socket. Here are the
825  * variables that we keep track of and their meaning.
826  *
827  * Several of the values that represent an identifier, with the exception
828  * of cpi_apicid, are allowed to be synthetic.
829  *
830  *
831  * cpi_apicid
832  *
833  *	This is the value of the CPU's APIC id. This should be the full 32-bit
834  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
835  *	APIC ID. This value is globally unique between all logical CPUs across
836  *	all packages. This is usually required by the APIC.
837  *
838  * cpi_chipid
839  *
840  *	This value indicates the ID of the package that the logical CPU is a
841  *	part of. This value is allowed to be synthetic. It is usually derived by
842  *	taking the CPU's APIC ID and determining how many bits are used to
843  *	represent CPU cores in the package. All logical CPUs that are part of
844  *	the same package must have the same value.
845  *
846  * cpi_coreid
847  *
848  *	This represents the ID of a CPU core. Two logical CPUs should only have
849  *	the same cpi_coreid value if they are part of the same core. These
850  *	values may be synthetic. On systems that support SMT, this value is
851  *	usually derived from the APIC ID, otherwise it is often synthetic and
852  *	just set to the value of the cpu_id in the cpu_t.
853  *
854  * cpi_pkgcoreid
855  *
856  *	This is similar to the cpi_coreid in that logical CPUs that are part of
857  *	the same core should have the same ID. The main difference is that these
858  *	values are only required to be unique to a given socket.
859  *
860  * cpi_clogid
861  *
862  *	This represents the logical ID of a logical CPU. This value should be
863  *	unique within a given socket for each logical CPU. This is allowed to be
864  *	synthetic, though it is usually based off of the CPU's apic ID. The
865  *	broader system expects that logical CPUs that are part of the same
866  *	core have contiguous numbers. For example, if there were two threads per
867  *	core, then the two siblings' IDs divided by two should be the same, and
868  *	modulo two the first should be zero and the second one. For example, IDs
869  *	4 and 5 indicate two logical CPUs that are part of the same core, but IDs
870  *	5 and 6 represent two logical CPUs that are part of different cores.
871  *
872  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
873  *	from the same source, strictly speaking, they don't have to be and the
874  *	two values should be considered logically independent. One should not
875  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
876  *	some kind of relationship. While this is tempting, we've seen cases on
877  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
878  *
879  * cpi_ncpu_per_chip
880  *
881  *	This value indicates the total number of logical CPUs that exist in the
882  *	physical package. Critically, this is not the number of logical CPUs
883  *	that exist for just the single core.
884  *
885  *	This value should be the same for all logical CPUs in the same package.
886  *
887  * cpi_ncore_per_chip
888  *
889  *	This value indicates the total number of physical CPU cores that exist
890  *	in the package. The system compares this value with cpi_ncpu_per_chip to
891  *	determine if simultaneous multi-threading (SMT) is enabled. When
892  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
893  *	the X86FSET_HTT feature is not set. If cpi_ncore_per_chip is greater
894  *	than one, then we consider the processor to have the X86FSET_CMP feature,
895  *	indicating support for more than one core (see the sketch after this list).
896  *
897  *	This value should be the same for all logical CPUs in the same package.
898  *
899  * cpi_procnodes_per_pkg
900  *
901  *	This value indicates the number of 'nodes' that exist in the package.
902  *	When processors are actually a multi-chip module, this represents the
903  *	number of such modules that exist in the package. Currently, on Intel
904  *	based systems this member is always set to 1.
905  *
906  *	This value should be the same for all logical CPUs in the same package.
907  *
908  * cpi_procnodeid
909  *
910  *	This value indicates the ID of the node that the logical CPU is a part
911  *	of. All logical CPUs that are in the same node must have the same value
912  *	here. This value must be unique across all of the packages in the
913  *	system.  On Intel based systems, this is currently set to the value in
914  *	cpi_chipid because there is only one node.
915  *
916  * cpi_cores_per_compunit
917  *
918  *	This value indicates the number of cores that are part of a compute
919  *	unit. See the AMD topology section for this. This member only has real
920  *	meaning currently for AMD Bulldozer family processors. For all other
921  *	processors, this should currently be set to 1.
922  *
923  * cpi_compunitid
924  *
925  *	This indicates the compute unit that the logical CPU belongs to. For
926  *	processors without AMD Bulldozer-style compute units this should be set
927  *	to the value of cpi_coreid.
928  *
929  * cpi_ncpu_shr_last_cache
930  *
931  *	This indicates the number of logical CPUs that are sharing the same last
932  *	level cache. This value should be the same for all CPUs that are sharing
933  *	that cache. The last cache refers to the cache that is closest to memory
934  *	and furthest away from the CPU.
935  *
936  * cpi_last_lvl_cacheid
937  *
938  *	This indicates the ID of the last cache that the logical CPU uses. This
939  *	cache is often shared between multiple logical CPUs and is the cache
940  *	that is closest to memory and furthest away from the CPU. This value
941  *	should be the same for a group of logical CPUs only if they actually
942  *	share the same last level cache. IDs should not overlap between
943  *	packages.
944  *
945  * cpi_ncore_bits
946  *
947  *	This indicates the number of bits that are required to represent all of
948  *	the cores in the system. As cores are derived based on their APIC IDs,
949  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
950  *	this value to be larger than the actual number of IDs that are present
951  *	in the system. This is used to size tables by the CMI framework. It is
952  *	only filled in for Intel and AMD CPUs.
953  *
954  * cpi_nthread_bits
955  *
956  *	This indicates the number of bits required to represent all of the IDs
957  *	that cover the logical CPUs that exist on a given core. It's OK for this
958  *	value to be larger than the actual number of IDs that are present in the
959  *	system.  This is used to size tables by the CMI framework. It is
960  *	only filled in for Intel and AMD CPUs.
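 *
 * To tie several of these members together, the following is a simplified,
 * hypothetical sketch of the kind of derivation described above; id_bits() is
 * an invented helper and none of this mirrors the actual code in this file.
 *
 *	static uint32_t
 *	id_bits(uint32_t nids)
 *	{
 *		uint32_t bits = 0;
 *
 *		// Bits needed to represent nids distinct IDs; this may
 *		// exceed what a particular system actually populates.
 *		while ((1u << bits) < nids)
 *			bits++;
 *		return (bits);
 *	}
 *
 *	// With two threads per core on a 16-core package we might see:
 *	//   cpi_ncpu_per_chip = 32, cpi_ncore_per_chip = 16
 *	//   SMT (X86FSET_HTT): cpi_ncpu_per_chip > cpi_ncore_per_chip
 *	//   CMP (X86FSET_CMP): cpi_ncore_per_chip > 1
 *	//   cpi_ncore_bits = id_bits(16) = 4
 *	//   cpi_nthread_bits = id_bits(32 / 16) = 1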
961  *
962  * -----------
963  * Hypervisors
964  * -----------
965  *
966  * If trying to manage the differences between vendors wasn't bad enough, it can
967  * get worse thanks to our friend hardware virtualization. Hypervisors are given
968  * the ability to interpose on all cpuid instructions and change them to suit
969  * their purposes. In general, this is necessary as the hypervisor wants to be
970  * able to present a more uniform set of features or not necessarily give the
971  * guest operating system kernel knowledge of all features so it can be
972  * more easily migrated between systems.
973  *
974  * When it comes to trying to determine topology information, this can be a
975  * double edged sword. When a hypervisor doesn't actually implement a cpuid
976  * leaf, it'll often return all zeros. Because of that, you'll often see various
977  * checks scattered about that verify fields are non-zero before we assume we
978  * can use them.
979  *
980  * When it comes to topology information, the hypervisor is often incentivized
981  * to lie to you about topology. This is because it doesn't always actually
982  * guarantee that topology at all. The topology path we take in the system
983  * depends on how the CPU advertises itself. If it advertises itself as an Intel
984  * or AMD CPU, then we basically do our normal path. However, when it doesn't
985  * advertise an actual vendor, we usually end up enumerating multiple one-core
986  * CPUs that appear to be on different sockets. The actual behavior
987  * depends greatly on what the hypervisor actually exposes to us.
988  *
989  * --------------------
990  * Exposing Information
991  * --------------------
992  *
993  * We expose CPUID information in three different forms in the system.
994  *
995  * The first is through the x86_featureset variable. This is used in conjunction
996  * with the is_x86_feature() function. This is queried by x86-specific functions
997  * to determine which features are or aren't present in the system and to make
998  * decisions based upon them. For example, users of this include everything from
999  * parts of the system dedicated to reliability, availability, and
1000  * serviceability (RAS), to making decisions about how to handle security
1001  * mitigations, to various x86-specific drivers. General purpose or
1002  * architecture independent drivers should never be calling this function.
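 *
 * As a brief, hedged illustration of how a kernel consumer might check for a
 * feature (the AVX test and the can_use_avx() name are only examples; see the
 * X86FSET_* definitions in <sys/x86_archext.h> for the full set):
 *
 *	#include <sys/x86_archext.h>
 *
 *	static boolean_t
 *	can_use_avx(void)
 *	{
 *		// Subject to the usual kernel FPU usage rules.
 *		return (is_x86_feature(x86_featureset, X86FSET_AVX));
 *	}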
1003  *
1004  * The second means is through the auxiliary vector. The auxiliary vector is a
1005  * series of tagged data that the kernel passes down to a user program when it
1006  * begins executing. This information is used to indicate to programs what
1007  * instruction set extensions are present. For example, information about the
1008  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1009  * since user programs cannot make use of it. However, things like the AVX
1010  * instruction sets are. Programs use this information to make run-time
1011  * decisions about what features they should use. As an example, the run-time
1012  * link-editor (rtld) can relocate different functions depending on the hardware
1013  * support available.
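 *
 * For example, a userland consumer can check the hardware capability words via
 * getisax(3C) rather than issuing cpuid itself. The have_avx() name is
 * hypothetical and AV_386_AVX is just one of the AV_386_* bits; see
 * <sys/auxv_386.h>:
 *
 *	#include <sys/auxv.h>
 *
 *	static int
 *	have_avx(void)
 *	{
 *		uint32_t hwcap[2] = { 0, 0 };
 *
 *		(void) getisax(hwcap, 2);
 *		return ((hwcap[0] & AV_386_AVX) != 0);
 *	}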
1014  *
1015  * The final form is through a series of accessor functions that all have the
1016  * form cpuid_get*. This is used by a number of different subsystems in the
1017  * kernel to determine more detailed information about what we're running on,
1018  * topology information, etc. Some of these subsystems include processor groups
1019  * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1020  * microcode, and performance monitoring. These functions all ASSERT that the
1021  * CPU they're being called on has reached a certain cpuid pass. If the passes
1022  * are rearranged, then this needs to be adjusted.
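 *
 * As a brief sketch of the accessor style, using two of the cpuid_get*
 * accessors defined later in this file (run only after the required cpuid
 * pass has completed; 'apicid' is just a local variable here):
 *
 *	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD)
 *		apicid = cpuid_get_apicid(CPU);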
1023  *
1024  * -----------------------------------------------
1025  * Speculative Execution CPU Side Channel Security
1026  * -----------------------------------------------
1027  *
1028  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1029  * execution in the CPU to create side channels there have been a number of
1030  * different attacks and corresponding issues that the operating system needs to
1031  * mitigate against. The following is a common, though not exhaustive, set of
1032  * issues that we know about and for which we have done some work, or still
1033  * need to do more work, in the system to mitigate:
1034  *
1035  *   - Spectre v1
1036  *   - swapgs (Spectre v1 variant)
1037  *   - Spectre v2
1038  *   - Meltdown (Spectre v3)
1039  *   - Rogue Register Read (Spectre v3a)
1040  *   - Speculative Store Bypass (Spectre v4)
1041  *   - ret2spec, SpectreRSB
1042  *   - L1 Terminal Fault (L1TF)
1043  *   - Microarchitectural Data Sampling (MDS)
1044  *
1045  * Each of these requires different sets of mitigations and has different attack
1046  * surfaces. For the most part, this discussion is about protecting the kernel
1047  * from non-kernel executing environments such as user processes and hardware
1048  * virtual machines. Unfortunately, there are a number of user vs. user
1049  * scenarios that exist with these. The rest of this section will describe the
1050  * overall approach that the system has taken to address these as well as their
1051  * shortcomings. Unfortunately, not all of the above have been handled today.
1052  *
1053  * SPECTRE v2, ret2spec, SpectreRSB
1054  *
1055  * The second variant of the spectre attack focuses on performing branch target
1056  * injection. This generally impacts indirect call instructions in the system.
1057  * There are three different ways to mitigate this issue that are commonly
1058  * described today:
1059  *
1060  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1061  *  2. Using Retpolines and RSB Stuffing
1062  *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
1063  *
1064  * IBRS uses a feature added to microcode to restrict speculation, among other
1065  * things. This form of mitigation has not been used as it has been generally
1066  * seen as too expensive and requires reactivation upon various transitions in
1067  * the system.
1068  *
1069  * As a less impactful alternative to IBRS, retpolines were developed by
1070  * Google. These basically require one to replace indirect calls with a specific
1071  * trampoline that will cause speculation to fail and break the attack.
1072  * Retpolines require compiler support. We always build with retpolines in the
1073  * external thunk mode. This means that a traditional indirect call is replaced
1074  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1075  * of this is that all indirect function calls are performed through a register.
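 *
 * Conceptually, a thunk that stands in for an indirect 'jmp *%rax' looks
 * something like the following (a sketch of the well-known construct, not a
 * copy of our assembly):
 *
 *	__x86_indirect_thunk_rax:
 *		call	2f
 *	1:	pause
 *		lfence
 *		jmp	1b
 *	2:	movq	%rax, (%rsp)
 *		ret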
1076  *
1077  * We have to use a common external location of the thunk and not inline it into
1078  * the callsite so that we have a single place to patch these functions.
1079  * As it turns out, we currently have two different forms of retpolines that
1080  * exist in the system:
1081  *
1082  *  1. A full retpoline
1083  *  2. A no-op version
1084  *
1085  * The first one is used in the general case. Historically, there was an
1086  * AMD-specific optimized retpoline variant that was based around using a
1087  * serializing lfence instruction; however, in March 2022 it was announced that
1088  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1089  * use it and it is no longer available in the system.
1090  *
1091  * The second form described above is the most curious. It turns out that the way
1092  * that retpolines are implemented is that they rely on how speculation is
1093  * performed on a 'ret' instruction. Intel has continued to optimize this
1094  * process (which is partly why we need to have return stack buffer stuffing,
1095  * but more on that in a bit) and in processors starting with Cascade Lake
1096  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1097  * mechanism has been introduced called Enhanced IBRS (EIBRS).
1098  *
1099  * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
1100  * physical core. However, if this is the case, we don't want to use retpolines
1101  * any more. Therefore if EIBRS is present, we end up turning each retpoline
1102  * function (called a thunk) into a jmp instruction. This means that we're still
1103  * paying the cost of an extra jump to the external thunk, but it gives us
1104  * flexibility and the ability to have a single kernel image that works across a
1105  * wide variety of systems and hardware features.
1106  *
1107  * Unfortunately, this alone is insufficient. First, Skylake systems have
1108  * additional speculation for the Return Stack Buffer (RSB), which is used to
1109  * return from call instructions and which retpolines take advantage of. However,
1110  * this problem is not just limited to Skylake and is actually more pernicious.
1111  * The SpectreRSB paper introduces several more problems that can arise with
1112  * dealing with this. The RSB can be poisoned just like the indirect branch
1113  * predictor. This means that one needs to clear the RSB when transitioning
1114  * between two different privilege domains. Some examples include:
1115  *
1116  *  - Switching between two different user processes
1117  *  - Going between user land and the kernel
1118  *  - Returning to the kernel from a hardware virtual machine
1119  *
1120  * Mitigating this involves combining a couple of different things. The first is
1121  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1122  * Bridge. When an RSB entry refers to a user address and we're executing in the
1123  * kernel, speculation through it will be stopped when SMEP is enabled. This
1124  * protects against a number of the different cases that we would normally be
1125  * worried about such as when we enter the kernel from user land.
1126  *
1127  * To protect against additional manipulation of the RSB from other contexts,
1128  * such as a non-root VMX context attacking the kernel, we first look to enhanced
1129  * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1130  * need to do to protect the kernel at this time.
1131  *
1132  * On CPUs without EIBRS we need to manually overwrite the contents of the
1133  * return stack buffer. We do this through the x86_rsb_stuff() function.
1134  * Currently this is employed on context switch. The x86_rsb_stuff() function is
1135  * disabled when enhanced IBRS is present because Intel claims on such systems
1136  * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1137  * to user attacks via the RSB.
1138  *
1139  * If SMEP is not present, then we would have to stuff the RSB every time we
1140  * transitioned from user mode to the kernel, which isn't very practical right
1141  * now.
1142  *
1143  * To fully mitigate user to user and vmx to vmx attacks from these classes of
1144  * issues, we would also need to allow them to opt into performing an Indirect
1145  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1146  *
1147  * By default, the system will enable RSB stuffing and the required variant of
1148  * retpolines and store that information in the x86_spectrev2_mitigation value.
1149  * This will be evaluated after a microcode update as well, though it is
1150  * expected that microcode updates will not take away features. This may mean
1151  * that a late loaded microcode may not end up in the optimal configuration
1152  * (though this should be rare).
1153  *
1154  * Currently we do not build kmdb with retpolines or perform any additional side
1155  * channel security mitigations for it. One complication with kmdb is that it
1156  * requires its own retpoline thunks and it would need to adjust itself based on
1157  * what the kernel does. The threat model of kmdb is more limited and therefore
1158  * it may make more sense to investigate using prediction barriers as the whole
1159  * system is only executing a single instruction at a time while in kmdb.
1160  *
1161  * SPECTRE v1, v4
1162  *
1163  * The v1 and v4 variants of spectre are not currently mitigated in the
1164  * system and require other classes of changes to occur in the code.
1165  *
1166  * SPECTRE v1 (SWAPGS VARIANT)
1167  *
1168  * The class of Spectre v1 vulnerabilities isn't all about bounds checks, but
1169  * can generally affect any branch-dependent code. The swapgs issue is one
1170  * variant of this. If we are coming in from userspace, we can have code like
1171  * this:
1172  *
1173  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1174  *	je	1f
1175  *	movq	$0, REGOFF_SAVFP(%rsp)
1176  *	swapgs
1177  *	1:
1178  *	movq	%gs:CPU_THREAD, %rax
1179  *
1180  * If an attacker can cause a mis-speculation of the branch here, we could skip
1181  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1182  * load. If subsequent code can act as the usual Spectre cache gadget, this
1183  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1184  * any use of the %gs override.
1185  *
1186  * The other case is also an issue: if we're coming into a trap from kernel
1187  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1188  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1189  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1190  * case, and the fix is the same in both cases (an lfence at the branch target
1191  * 1: in this example), we'll just do it unconditionally.
1192  *
1193  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1194  * harder for user-space to actually set a useful %gsbase value: although it's
1195  * not clear, it might still be feasible via lwp_setprivate(), so we
1196  * mitigate anyway.
1197  *
1198  * MELTDOWN
1199  *
1200  * Meltdown, or Spectre v3, allowed a user process to read any data mapped in
1201  * its address space regardless of whether or not the page tables in question
1202  * granted it permission to read that data. The solution to Meltdown
1203  * is kernel page table isolation. In this world, there are two page tables that
1204  * are used for a process, one in user land and one in the kernel. To implement
1205  * this we use per-CPU page tables and switch between the user and kernel
1206  * variants when entering and exiting the kernel.  For more information about
1207  * this process and how the trampolines work, please see the big theory
1208  * statements and additional comments in:
1209  *
1210  *  - uts/i86pc/ml/kpti_trampolines.s
1211  *  - uts/i86pc/vm/hat_i86.c
1212  *
1213  * While Meltdown (also known as Rogue Data Cache Load) only impacted Intel
1214  * systems, and some Intel systems have it fixed in hardware, we always have
1215  * kernel page table isolation enabled. While this may at first seem weird, an
1216  * important thing to remember is that you can't speculatively read an address
1217  * if it's never in your page table at all. Having user processes without kernel
1218  * pages present provides us with an important layer of defense in the kernel
1219  * against any other side channel attacks that exist and have yet to be
1220  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1221  * default, no matter the x86 system.
1222  *
1223  * L1 TERMINAL FAULT
1224  *
1225  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1226  * execution uses page table entries. Effectively, it is two different problems.
1227  * The first is that it ignores the not-present bit in page table entries when
1228  * performing speculative execution. This means that something can speculatively
1229  * read the physical address listed in such an entry if it's present in the L1
1230  * cache under certain conditions (see Intel's documentation for the full set of
1231  * conditions). Secondly, this can be used to bypass hardware virtualization
1232  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1233  * instructions.
1234  *
1235  * For the non-hardware virtualized case, this is relatively easy to deal with.
1236  * We must make sure that all unmapped pages have an address of zero. This means
1237  * that they could read the first 4k of physical memory; however, we never use
1238  * that first page in the operating system and always skip putting it in our
1239  * memory map, even if firmware tells us we can use it in our memory map. While
1240  * other systems try to put extra metadata in the address and reserved bits,
1241  * which led to this being problematic in those cases, we do not.
1242  *
1243  * For hardware virtual machines things are more complicated. Because they can
1244  * construct their own page tables, it isn't hard for them to perform this
1245  * attack against any physical address. The one wrinkle is that this physical
1246  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1247  * to flush the L1 data cache. We wrap this up in the function
1248  * spec_uarch_flush(). This function is also used in the mitigation of
1249  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1250  * hypervisors such as KVM or bhyve are responsible for performing this before
1251  * entering the guest.
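 *
 * Architecturally the flush itself is a single MSR write. A sketch, using
 * illustrative constant names for the IA32_FLUSH_CMD MSR (0x10b) and its
 * L1D_FLUSH bit (bit 0):
 *
 *	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);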
1252  *
1253  * Because this attack takes place in the L1 cache, there's another wrinkle
1254  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1255  * designs. This means that when a thread enters a hardware virtualized context
1256  * and flushes the L1 data cache, the other thread on the processor may then go
1257  * ahead and put new data in it that can be potentially attacked. While one
1258  * solution is to disable SMT on the system, another option that is available is
1259  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1260  * goes through and makes sure that if a HVM is being scheduled on one thread,
1261  * then the thing on the other thread is from the same hardware virtual machine.
1262  * If an interrupt comes in or the guest exits to the broader system, then the
1263  * other SMT thread will be kicked out.
1264  *
1265  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1266  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1267  * perform L1TF related mitigations.
1268  *
1269  * MICROARCHITECTURAL DATA SAMPLING
1270  *
1271  * Microarchitectural data sampling (MDS) is a combination of four discrete but
1272  * similar vulnerabilities affecting various parts of the CPU's
1273  * microarchitectural implementation around load, store, and fill buffers.
1274  * Specifically it is made up of the following subcomponents:
1275  *
1276  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1277  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1278  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1279  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1280  *
1281  * To begin addressing these, Intel has introduced another feature in microcode
1282  * called MD_CLEAR. This overloads the verw instruction so that executing it
1283  * in a particular way also flushes the state of the affected parts. The L1TF
1284  * L1D flush mechanism is also
1285  * updated when this microcode is present to flush this state.
1286  *
1287  * Primarily we need to flush this state whenever we transition from the kernel
1288  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1289  * little bit different. Here the structures are statically sized when a logical
1290  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1291  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1292  * mwait, or another ACPI method. To perform these flushes, we call
1293  * x86_md_clear() at all of these transition points.
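 *
 * With the MD_CLEAR microcode in place, the flush itself is essentially a
 * single verw executed against a valid, writable data segment selector. A
 * sketch of the idea (not our exact assembly; the kernel %ds is used here
 * purely for illustration):
 *
 *	subq	$8, %rsp
 *	movw	%ds, (%rsp)
 *	verw	(%rsp)
 *	addq	$8, %rsp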
1294  *
1295  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1296  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1297  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1298  * a no-op.
1299  *
1300  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1301  * particular, everything we've discussed above is only valid for a single
1302  * thread executing on a core. In the case where you have hyper-threading
1303  * present, this attack can be performed between threads. The theoretical fix
1304  * for this is to ensure that both threads are always in the same security
1305  * domain. This means that they are executing in the same ring and mutually
1306  * trust each other. Practically speaking, this would mean that a system call
1307  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1308  * Rather than implement this, we recommend that one disables hyper-threading
1309  * through the use of psradm -aS.
1310  *
1311  * TSX ASYNCHRONOUS ABORT
1312  *
1313  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1314  * behaves like MDS, but leverages Intel's transactional instructions as another
1315  * vector. Effectively, when a transaction hits one of these cases (unmapped
1316  * page, various cache snoop activity, etc.) then the same data can be exposed
1317  * as in the case of MDS. This means that you can attack your twin.
1318  *
1319  * Intel has described that there are two different ways that we can mitigate
1320  * this problem on affected processors:
1321  *
1322  *   1) We can use the same techniques used to deal with MDS. Flushing the
1323  *      microarchitectural buffers and disabling hyperthreading will mitigate
1324  *      this in the same way.
1325  *
1326  *   2) Using microcode to disable TSX.
1327  *
1328  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1329  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1330  * That's OK as we're already doing all such mitigations. On the other hand,
1331  * processors with MDS_NO are all supposed to receive microcode updates that
1332  * enumerate support for disabling TSX. In general, we'd rather use this method
1333  * when available as it doesn't require disabling hyperthreading to be
1334  * effective. Currently we are basically relying on microcode for processors
1335  * that enumerate MDS_NO.
1336  *
1337  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1338  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1339  * different powers. The first allows us to cause all transactions to
1340  * immediately abort. The second gives us a means of disabling TSX completely,
1341  * which includes removing it from cpuid. If we have support for this in
1342  * microcode during the first cpuid pass, then we'll disable TSX completely such
1343  * that user land never has a chance to observe the bit. However, if we are late
1344  * loading the microcode, then we must use the functionality to cause
1345  * transactions to automatically abort. This is necessary for user land's sake.
1346  * Once a program sees a cpuid bit, it must not be taken away.
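 *
 * Architecturally, both controls are bits in the IA32_TSX_CTRL MSR (0x122):
 * bit 0 forces transactions to abort (RTM_DISABLE) and bit 1 hides the TSX
 * CPUID bits (TSX_CPUID_CLEAR). A sketch of the late-load case, using
 * illustrative constant names:
 *
 *	wrmsr(MSR_IA32_TSX_CTRL, IA32_TSX_CTRL_RTM_DISABLE);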
1347  *
1348  * We track whether or not we should do this based on what cpuid pass we're in.
1349  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1350  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1351  * should happen twice: once in the normal cpuid_pass1() code and then a second
1352  * time after we do the initial microcode update.  As a result we need to be
1353  * careful in cpuid_apply_tsx() to only use the MSR if we've loaded a suitable
1354  * microcode on the current CPU (which happens prior to cpuid_pass_ucode()).
1355  *
1356  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1357  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1358  * unfortunate feature in a number of ways, and taking the opportunity to
1359  * finally be able to turn it off is likely to be of benefit in the future.
1360  *
1361  * SUMMARY
1362  *
1363  * The following table attempts to summarize the mitigations for various issues
1364  * and what's done in various places:
1365  *
1366  *  - Spectre v1: Not currently mitigated
1367  *  - swapgs: lfences after swapgs paths
1368  *  - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1369  *  - Meltdown: Kernel Page Table Isolation
1370  *  - Spectre v3a: Updated CPU microcode
1371  *  - Spectre v4: Not currently mitigated
1372  *  - SpectreRSB: SMEP and RSB Stuffing
1373  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1374  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1375  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1376  *
1377  * The following table indicates the x86 feature set bits that indicate that a
1378  * given problem has been solved or a notable feature is present:
1379  *
1380  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1381  *  - MDS_NO: All forms of MDS
1382  *  - TAA_NO: TAA
1383  */
1384 
1385 #include <sys/types.h>
1386 #include <sys/archsystm.h>
1387 #include <sys/x86_archext.h>
1388 #include <sys/kmem.h>
1389 #include <sys/systm.h>
1390 #include <sys/cmn_err.h>
1391 #include <sys/sunddi.h>
1392 #include <sys/sunndi.h>
1393 #include <sys/cpuvar.h>
1394 #include <sys/processor.h>
1395 #include <sys/sysmacros.h>
1396 #include <sys/pg.h>
1397 #include <sys/fp.h>
1398 #include <sys/controlregs.h>
1399 #include <sys/bitmap.h>
1400 #include <sys/auxv_386.h>
1401 #include <sys/memnode.h>
1402 #include <sys/pci_cfgspace.h>
1403 #include <sys/comm_page.h>
1404 #include <sys/mach_mmu.h>
1405 #include <sys/ucode.h>
1406 #include <sys/tsc.h>
1407 #include <sys/kobj.h>
1408 #include <sys/asm_misc.h>
1409 
1410 #ifdef __xpv
1411 #include <sys/hypervisor.h>
1412 #else
1413 #include <sys/ontrap.h>
1414 #endif
1415 
1416 uint_t x86_vendor = X86_VENDOR_IntelClone;
1417 uint_t x86_type = X86_TYPE_OTHER;
1418 uint_t x86_clflush_size = 0;
1419 
1420 #if defined(__xpv)
1421 int x86_use_pcid = 0;
1422 int x86_use_invpcid = 0;
1423 #else
1424 int x86_use_pcid = -1;
1425 int x86_use_invpcid = -1;
1426 #endif
1427 
1428 typedef enum {
1429 	X86_SPECTREV2_RETPOLINE,
1430 	X86_SPECTREV2_ENHANCED_IBRS,
1431 	X86_SPECTREV2_DISABLED
1432 } x86_spectrev2_mitigation_t;
1433 
1434 uint_t x86_disable_spectrev2 = 0;
1435 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1436     X86_SPECTREV2_RETPOLINE;
1437 
1438 /*
1439  * The mitigation status for TAA:
1440  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1441  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1442  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1443  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1444  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1445  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1446  */
1447 typedef enum {
1448 	X86_TAA_NOTHING,
1449 	X86_TAA_DISABLED,
1450 	X86_TAA_MD_CLEAR,
1451 	X86_TAA_TSX_FORCE_ABORT,
1452 	X86_TAA_TSX_DISABLE,
1453 	X86_TAA_HW_MITIGATED
1454 } x86_taa_mitigation_t;
1455 
1456 uint_t x86_disable_taa = 0;
1457 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1458 
1459 uint_t pentiumpro_bug4046376;
1460 
1461 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1462 
1463 static char *x86_feature_names[NUM_X86_FEATURES] = {
1464 	"lgpg",
1465 	"tsc",
1466 	"msr",
1467 	"mtrr",
1468 	"pge",
1469 	"de",
1470 	"cmov",
1471 	"mmx",
1472 	"mca",
1473 	"pae",
1474 	"cv8",
1475 	"pat",
1476 	"sep",
1477 	"sse",
1478 	"sse2",
1479 	"htt",
1480 	"asysc",
1481 	"nx",
1482 	"sse3",
1483 	"cx16",
1484 	"cmp",
1485 	"tscp",
1486 	"mwait",
1487 	"sse4a",
1488 	"cpuid",
1489 	"ssse3",
1490 	"sse4_1",
1491 	"sse4_2",
1492 	"1gpg",
1493 	"clfsh",
1494 	"64",
1495 	"aes",
1496 	"pclmulqdq",
1497 	"xsave",
1498 	"avx",
1499 	"vmx",
1500 	"svm",
1501 	"topoext",
1502 	"f16c",
1503 	"rdrand",
1504 	"x2apic",
1505 	"avx2",
1506 	"bmi1",
1507 	"bmi2",
1508 	"fma",
1509 	"smep",
1510 	"smap",
1511 	"adx",
1512 	"rdseed",
1513 	"mpx",
1514 	"avx512f",
1515 	"avx512dq",
1516 	"avx512pf",
1517 	"avx512er",
1518 	"avx512cd",
1519 	"avx512bw",
1520 	"avx512vl",
1521 	"avx512fma",
1522 	"avx512vbmi",
1523 	"avx512_vpopcntdq",
1524 	"avx512_4vnniw",
1525 	"avx512_4fmaps",
1526 	"xsaveopt",
1527 	"xsavec",
1528 	"xsaves",
1529 	"sha",
1530 	"umip",
1531 	"pku",
1532 	"ospke",
1533 	"pcid",
1534 	"invpcid",
1535 	"ibrs",
1536 	"ibpb",
1537 	"stibp",
1538 	"ssbd",
1539 	"ssbd_virt",
1540 	"rdcl_no",
1541 	"ibrs_all",
1542 	"rsba",
1543 	"ssb_no",
1544 	"stibp_all",
1545 	"flush_cmd",
1546 	"l1d_vmentry_no",
1547 	"fsgsbase",
1548 	"clflushopt",
1549 	"clwb",
1550 	"monitorx",
1551 	"clzero",
1552 	"xop",
1553 	"fma4",
1554 	"tbm",
1555 	"avx512_vnni",
1556 	"amd_pcec",
1557 	"md_clear",
1558 	"mds_no",
1559 	"core_thermal",
1560 	"pkg_thermal",
1561 	"tsx_ctrl",
1562 	"taa_no",
1563 	"ppin",
1564 	"vaes",
1565 	"vpclmulqdq",
1566 	"lfence_serializing"
1567 };
1568 
1569 boolean_t
1570 is_x86_feature(void *featureset, uint_t feature)
1571 {
1572 	ASSERT(feature < NUM_X86_FEATURES);
1573 	return (BT_TEST((ulong_t *)featureset, feature));
1574 }
1575 
1576 void
1577 add_x86_feature(void *featureset, uint_t feature)
1578 {
1579 	ASSERT(feature < NUM_X86_FEATURES);
1580 	BT_SET((ulong_t *)featureset, feature);
1581 }
1582 
1583 void
1584 remove_x86_feature(void *featureset, uint_t feature)
1585 {
1586 	ASSERT(feature < NUM_X86_FEATURES);
1587 	BT_CLEAR((ulong_t *)featureset, feature);
1588 }
1589 
1590 boolean_t
1591 compare_x86_featureset(void *setA, void *setB)
1592 {
1593 	/*
1594 	 * We assume that the unused bits of the bitmap are always zero.
1595 	 */
1596 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1597 		return (B_TRUE);
1598 	} else {
1599 		return (B_FALSE);
1600 	}
1601 }
1602 
1603 void
1604 print_x86_featureset(void *featureset)
1605 {
1606 	uint_t i;
1607 
1608 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1609 		if (is_x86_feature(featureset, i)) {
1610 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1611 			    x86_feature_names[i]);
1612 		}
1613 	}
1614 }
1615 
1616 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1617 static size_t xsave_state_size = 0;
1618 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1619 boolean_t xsave_force_disable = B_FALSE;
1620 extern int disable_smap;
1621 
1622 /*
1623  * This is set to platform type we are running on.
1624  */
1625 static int platform_type = -1;
1626 
1627 #if !defined(__xpv)
1628 /*
1629  * Variable to patch if hypervisor platform detection needs to be
1630  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1631  */
1632 int enable_platform_detection = 1;
1633 #endif
1634 
1635 /*
1636  * monitor/mwait info.
1637  *
1638  * size_actual and buf_actual are the real address and size allocated to get
1639  * proper mwait_buf alignement.  buf_actual and size_actual should be passed
1640  * proper mwait_buf alignment.  buf_actual and size_actual should be passed
1641  * processor cache-line alignment, but this is not guarantied in the furture.
1642  * processor cache-line alignment, but this is not guaranteed in the future.
1643 struct mwait_info {
1644 	size_t		mon_min;	/* min size to avoid missed wakeups */
1645 	size_t		mon_max;	/* size to avoid false wakeups */
1646 	size_t		size_actual;	/* size actually allocated */
1647 	void		*buf_actual;	/* memory actually allocated */
1648 	uint32_t	support;	/* processor support of monitor/mwait */
1649 };
1650 
1651 /*
1652  * xsave/xrestor info.
1653  *
1654  * This structure contains HW feature bits and the size of the xsave save area.
1655  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1656  * (xsave_state) to describe the xsave layout. However, at runtime the
1657  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1658  * xsave_state structure simply represents the legacy layout of the beginning
1659  * of the xsave area.
1660  */
1661 struct xsave_info {
1662 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1663 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1664 	size_t		xsav_max_size;  /* max size save area for HW features */
1665 	size_t		ymm_size;	/* AVX: size of ymm save area */
1666 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1667 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1668 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1669 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1670 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1671 	size_t		opmask_size;	/* AVX512: size of opmask save */
1672 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1673 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1674 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1675 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1676 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1677 };
1678 
1679 
1680 /*
1681  * These constants determine how many of the elements of the
1682  * cpuid we cache in the cpuid_info data structure; the
1683  * remaining elements are accessible via the cpuid instruction.
1684  */
1685 
1686 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1687 #define	NMAX_CPI_EXTD	0x1f		/* eax = 0x80000000 .. 0x8000001e */
1688 
1689 /*
1690  * See the big theory statement for a more detailed explanation of what some of
1691  * these members mean.
1692  */
1693 struct cpuid_info {
1694 	uint_t cpi_pass;		/* last pass completed */
1695 	/*
1696 	 * standard function information
1697 	 */
1698 	uint_t cpi_maxeax;		/* fn 0: %eax */
1699 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1700 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1701 
1702 	uint_t cpi_family;		/* fn 1: extended family */
1703 	uint_t cpi_model;		/* fn 1: extended model */
1704 	uint_t cpi_step;		/* fn 1: stepping */
1705 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1706 					/*		AMD: package/socket # */
1707 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1708 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1709 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1710 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1711 	uint_t cpi_ncache;		/* fn 2: number of elements */
1712 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1713 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1714 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1715 					/* Intel fn: 4, AMD fn: 8000001d */
1716 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1717 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1718 	/*
1719 	 * extended function information
1720 	 */
1721 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1722 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1723 	uint8_t cpi_pabits;		/* fn 0x80000008: %eax */
1724 	uint8_t	cpi_vabits;		/* fn 0x80000008: %eax */
1725 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1726 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1727 
1728 	id_t cpi_coreid;		/* same coreid => strands share core */
1729 	int cpi_pkgcoreid;		/* core number within single package */
1730 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1731 					/* Intel: fn 4: %eax[31-26] */
1732 
1733 	/*
1734 	 * These values represent the number of bits that are required to store
1735 	 * information about the number of cores and threads.
1736 	 */
1737 	uint_t cpi_ncore_bits;
1738 	uint_t cpi_nthread_bits;
1739 	/*
1740 	 * supported feature information
1741 	 */
1742 	uint32_t cpi_support[6];
1743 #define	STD_EDX_FEATURES	0
1744 #define	AMD_EDX_FEATURES	1
1745 #define	TM_EDX_FEATURES		2
1746 #define	STD_ECX_FEATURES	3
1747 #define	AMD_ECX_FEATURES	4
1748 #define	STD_EBX_FEATURES	5
1749 	/*
1750 	 * Synthesized information, where known.
1751 	 */
1752 	uint32_t cpi_chiprev;		/* See X86_CHIPREV_* in x86_archext.h */
1753 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1754 	uint32_t cpi_socket;		/* Chip package/socket type */
1755 
1756 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1757 	uint32_t cpi_apicid;
1758 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1759 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1760 					/* Intel: 1 */
1761 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1762 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1763 
1764 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1765 };
1766 
1767 
1768 static struct cpuid_info cpuid_info0;
1769 
1770 /*
1771  * These bit fields are defined by the Intel Application Note AP-485
1772  * "Intel Processor Identification and the CPUID Instruction"
1773  */
1774 #define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1775 #define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1776 #define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1777 #define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1778 #define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1779 #define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
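
/*
 * For reference, the displayed family and model are conventionally derived
 * from these base and extended fields roughly as follows (a sketch only; the
 * authoritative, per-vendor logic is in cpuid_pass1()):
 *
 *	family = (CPI_FAMILY(cpi) == 0xf) ?
 *	    CPI_FAMILY(cpi) + CPI_FAMILY_XTD(cpi) : CPI_FAMILY(cpi);
 *	model = (family == 0x6 || family == 0xf) ?
 *	    (CPI_MODEL_XTD(cpi) << 4) + CPI_MODEL(cpi) : CPI_MODEL(cpi);
 */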
1780 
1781 #define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1782 #define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1783 #define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1784 #define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1785 #define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1786 #define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1787 #define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1788 
1789 #define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1790 #define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1791 #define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1792 #define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1793 
1794 #define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1795 #define	CPI_XMAXEAX_MAX		0x80000100
1796 #define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1797 #define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1798 
1799 /*
1800  * Function 4 (Deterministic Cache Parameters) macros
1801  * Defined by Intel Application Note AP-485
1802  */
1803 #define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1804 #define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1805 #define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1806 #define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1807 #define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1808 #define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1809 #define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1810 
1811 #define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1812 #define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1813 #define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1814 
1815 #define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1816 
1817 #define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1818 
1819 
1820 /*
1821  * A couple of shorthand macros to identify "later" P6-family chips
1822  * like the Pentium M and Core.  First, the "older" P6-based stuff
1823  * (loosely defined as "pre-Pentium-4"):
1824  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1825  */
1826 #define	IS_LEGACY_P6(cpi) (			\
1827 	cpi->cpi_family == 6 &&			\
1828 		(cpi->cpi_model == 1 ||		\
1829 		cpi->cpi_model == 3 ||		\
1830 		cpi->cpi_model == 5 ||		\
1831 		cpi->cpi_model == 6 ||		\
1832 		cpi->cpi_model == 7 ||		\
1833 		cpi->cpi_model == 8 ||		\
1834 		cpi->cpi_model == 0xA ||	\
1835 		cpi->cpi_model == 0xB)		\
1836 )
1837 
1838 /* A "new F6" is everything with family 6 that's not the above */
1839 #define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1840 
1841 /* Extended family/model support */
1842 #define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1843 	cpi->cpi_family >= 0xf)
1844 
1845 /*
1846  * Info for monitor/mwait idle loop.
1847  *
1848  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1849  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1850  * 2006.
1851  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1852  * Documentation Updates" #33633, Rev 2.05, December 2006.
1853  */
1854 #define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
1855 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
1856 #define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1857 #define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1858 #define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1859 #define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1860 #define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1861 #define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1862 /*
1863  * Number of sub-cstates for a given c-state.
1864  */
1865 #define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1866 	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
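
/*
 * For example, MWAIT_NUM_SUBC_STATES(cpi, 4) extracts %edx[7:4], the number
 * of MWAIT sub C-states advertised for C1; each C-state occupies a 4-bit
 * field, so the second argument is the field's starting bit position.
 */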
1867 
1868 /*
1869  * XSAVE leaf 0xD enumeration
1870  */
1871 #define	CPUID_LEAFD_2_YMM_OFFSET	576
1872 #define	CPUID_LEAFD_2_YMM_SIZE		256
1873 
1874 /*
1875  * Common extended leaf names to cut down on typos.
1876  */
1877 #define	CPUID_LEAF_EXT_0		0x80000000
1878 #define	CPUID_LEAF_EXT_8		0x80000008
1879 #define	CPUID_LEAF_EXT_1d		0x8000001d
1880 #define	CPUID_LEAF_EXT_1e		0x8000001e
1881 
1882 /*
1883  * Functions we consume from cpuid_subr.c; don't publish these in a header
1884  * file to try and keep people using the expected cpuid_* interfaces.
1885  */
1886 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1887 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1888 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1889 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1890 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1891 
1892 /*
1893  * Apply various platform-dependent restrictions where the
1894  * underlying platform's constraints mean the CPU can be marked
1895  * as less capable than its cpuid instruction would imply.
1896  */
1897 #if defined(__xpv)
1898 static void
1899 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1900 {
1901 	switch (eax) {
1902 	case 1: {
1903 		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1904 		    0 : CPUID_INTC_EDX_MCA;
1905 		cp->cp_edx &=
1906 		    ~(mcamask |
1907 		    CPUID_INTC_EDX_PSE |
1908 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1909 		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1910 		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1911 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1912 		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1913 		break;
1914 	}
1915 
1916 	case 0x80000001:
1917 		cp->cp_edx &=
1918 		    ~(CPUID_AMD_EDX_PSE |
1919 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1920 		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1921 		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1922 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1923 		    CPUID_AMD_EDX_TSCP);
1924 		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1925 		break;
1926 	default:
1927 		break;
1928 	}
1929 
1930 	switch (vendor) {
1931 	case X86_VENDOR_Intel:
1932 		switch (eax) {
1933 		case 4:
1934 			/*
1935 			 * Zero out the (ncores-per-chip - 1) field
1936 			 */
1937 			cp->cp_eax &= 0x03fffffff;
1938 			break;
1939 		default:
1940 			break;
1941 		}
1942 		break;
1943 	case X86_VENDOR_AMD:
1944 	case X86_VENDOR_HYGON:
1945 		switch (eax) {
1946 
1947 		case 0x80000001:
1948 			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1949 			break;
1950 
1951 		case CPUID_LEAF_EXT_8:
1952 			/*
1953 			 * Zero out the (ncores-per-chip - 1) field
1954 			 */
1955 			cp->cp_ecx &= 0xffffff00;
1956 			break;
1957 		default:
1958 			break;
1959 		}
1960 		break;
1961 	default:
1962 		break;
1963 	}
1964 }
1965 #else
1966 #define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
1967 #endif
1968 
1969 /*
1970  *  Some undocumented ways of patching the results of the cpuid
1971  *  instruction to permit running Solaris 10 on future cpus that
1972  *  we don't currently support.  Could be set to non-zero values
1973  *  via settings in eeprom.
1974  */
1975 
1976 uint32_t cpuid_feature_ecx_include;
1977 uint32_t cpuid_feature_ecx_exclude;
1978 uint32_t cpuid_feature_edx_include;
1979 uint32_t cpuid_feature_edx_exclude;
1980 
1981 /*
1982  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1983  */
1984 void
1985 cpuid_alloc_space(cpu_t *cpu)
1986 {
1987 	/*
1988 	 * By convention, cpu0 is the boot cpu, which is set up
1989 	 * before memory allocation is available.  All other cpus get
1990 	 * their cpuid_info struct allocated here.
1991 	 */
1992 	ASSERT(cpu->cpu_id != 0);
1993 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1994 	cpu->cpu_m.mcpu_cpi =
1995 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1996 }
1997 
1998 void
1999 cpuid_free_space(cpu_t *cpu)
2000 {
2001 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2002 	int i;
2003 
2004 	ASSERT(cpi != NULL);
2005 	ASSERT(cpi != &cpuid_info0);
2006 
2007 	/*
2008 	 * Free up any cache leaf related dynamic storage. The first entry was
2009 	 * cached from the standard cpuid storage, so we should not free it.
2010 	 */
2011 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2012 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2013 	if (cpi->cpi_cache_leaf_size > 0)
2014 		kmem_free(cpi->cpi_cache_leaves,
2015 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2016 
2017 	kmem_free(cpi, sizeof (*cpi));
2018 	cpu->cpu_m.mcpu_cpi = NULL;
2019 }
2020 
2021 #if !defined(__xpv)
2022 /*
2023  * Determine the type of the underlying platform. This is used to customize
2024  * initialization of various subsystems (e.g. TSC). determine_platform() must
2025  * only ever be called once to prevent two processors from seeing different
2026  * values of platform_type. Must be called before cpuid_pass1(), the earliest
2027  * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
2028  */
2029 void
2030 determine_platform(void)
2031 {
2032 	struct cpuid_regs cp;
2033 	uint32_t base;
2034 	uint32_t regs[4];
2035 	char *hvstr = (char *)regs;
2036 
2037 	ASSERT(platform_type == -1);
2038 
2039 	platform_type = HW_NATIVE;
2040 
2041 	if (!enable_platform_detection)
2042 		return;
2043 
2044 	/*
2045 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2046 	 * vendor signature, and set platform type accordingly.
2047 	 *
2048 	 * References:
2049 	 * http://lkml.org/lkml/2008/10/1/246
2050 	 * http://kb.vmware.com/kb/1009458
2051 	 */
2052 	cp.cp_eax = 0x1;
2053 	(void) __cpuid_insn(&cp);
2054 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2055 		cp.cp_eax = 0x40000000;
2056 		(void) __cpuid_insn(&cp);
2057 		regs[0] = cp.cp_ebx;
2058 		regs[1] = cp.cp_ecx;
2059 		regs[2] = cp.cp_edx;
2060 		regs[3] = 0;
2061 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2062 			platform_type = HW_XEN_HVM;
2063 			return;
2064 		}
2065 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2066 			platform_type = HW_VMWARE;
2067 			return;
2068 		}
2069 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2070 			platform_type = HW_KVM;
2071 			return;
2072 		}
2073 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2074 			platform_type = HW_BHYVE;
2075 			return;
2076 		}
2077 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
2078 			platform_type = HW_MICROSOFT;
2079 	} else {
2080 		/*
2081 		 * Check older VMware hardware versions. VMware hypervisor is
2082 		 * detected by performing an IN operation to VMware hypervisor
2083 		 * port and checking that value returned in %ebx is VMware
2084 		 * hypervisor magic value.
2085 		 *
2086 		 * References: http://kb.vmware.com/kb/1009458
2087 		 */
2088 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2089 		if (regs[1] == VMWARE_HVMAGIC) {
2090 			platform_type = HW_VMWARE;
2091 			return;
2092 		}
2093 	}
2094 
2095 	/*
2096 	 * Check Xen hypervisor. In a fully virtualized domain,
2097 	 * Xen's pseudo-cpuid function returns a string representing the
2098 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2099 	 * supported cpuid function. We need at least a (base + 2) leaf value
2100 	 * to do what we want to do. Try different base values, since the
2101 	 * hypervisor might use a different one depending on whether Hyper-V
2102 	 * emulation is switched on by default or not.
2103 	 */
2104 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2105 		cp.cp_eax = base;
2106 		(void) __cpuid_insn(&cp);
2107 		regs[0] = cp.cp_ebx;
2108 		regs[1] = cp.cp_ecx;
2109 		regs[2] = cp.cp_edx;
2110 		regs[3] = 0;
2111 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2112 		    cp.cp_eax >= (base + 2)) {
2113 			platform_type &= ~HW_NATIVE;
2114 			platform_type |= HW_XEN_HVM;
2115 			return;
2116 		}
2117 	}
2118 }
2119 
2120 int
2121 get_hwenv(void)
2122 {
2123 	ASSERT(platform_type != -1);
2124 	return (platform_type);
2125 }
2126 
2127 int
2128 is_controldom(void)
2129 {
2130 	return (0);
2131 }
2132 
2133 #else
2134 
2135 int
2136 get_hwenv(void)
2137 {
2138 	return (HW_XEN_PV);
2139 }
2140 
2141 int
2142 is_controldom(void)
2143 {
2144 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2145 }
2146 
2147 #endif	/* __xpv */
2148 
2149 /*
2150  * Make sure that we have gathered all of the CPUID leaves that we might need to
2151  * determine topology. We assume that the standard leaf 1 has already been done
2152  * and that xmaxeax has already been calculated.
2153  */
2154 static void
2155 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2156 {
2157 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2158 
2159 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2160 		struct cpuid_regs *cp;
2161 
2162 		cp = &cpi->cpi_extd[8];
2163 		cp->cp_eax = CPUID_LEAF_EXT_8;
2164 		(void) __cpuid_insn(cp);
2165 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2166 	}
2167 
2168 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2169 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2170 		struct cpuid_regs *cp;
2171 
2172 		cp = &cpi->cpi_extd[0x1e];
2173 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2174 		(void) __cpuid_insn(cp);
2175 	}
2176 }
2177 
2178 /*
2179  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2180  * it to everything else. If not, and we're on an AMD system where 8000001e is
2181  * valid, then we use that. Otherwise, we fall back to the default value for the
2182  * APIC ID in leaf 1.
2183  */
2184 static uint32_t
2185 cpuid_gather_apicid(struct cpuid_info *cpi)
2186 {
2187 	/*
2188 	 * Leaf B changes based on the arguments to it. Because we don't cache
2189 	 * it, we need to gather it again.
2190 	 */
2191 	if (cpi->cpi_maxeax >= 0xB) {
2192 		struct cpuid_regs regs;
2193 		struct cpuid_regs *cp;
2194 
2195 		cp = &regs;
2196 		cp->cp_eax = 0xB;
2197 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2198 		(void) __cpuid_insn(cp);
2199 
2200 		if (cp->cp_ebx != 0) {
2201 			return (cp->cp_edx);
2202 		}
2203 	}
2204 
2205 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2206 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2207 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2208 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2209 		return (cpi->cpi_extd[0x1e].cp_eax);
2210 	}
2211 
2212 	return (CPI_APIC_ID(cpi));
2213 }
2214 
2215 /*
2216  * For AMD processors, attempt to calculate the number of chips and cores that
2217  * exist. The way that we do this varies based on the generation, because the
2218  * generations themselves have changed dramatically.
2219  *
2220  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2221  * However, with the advent of family 17h (Zen) it actually tells us the number
2222  * of threads, so we need to look at leaf 0x8000001e if available to determine
2223  * its value. Otherwise, for all prior families, the number of enabled cores is
2224  * the same as threads.
2225  *
2226  * If we do not have leaf 0x80000008, then we assume that this processor does
2227  * not have anything. AMD's older CPUID specification says there's no reason to
2228  * fall back to leaf 1.
2229  *
2230  * In some virtualization cases we will not have leaf 8000001e or it will be
2231  * zero. When that happens we assume the number of threads is one.
2232  */
2233 static void
2234 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2235 {
2236 	uint_t nthreads, nthread_per_core;
2237 
2238 	nthreads = nthread_per_core = 1;
2239 
2240 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2241 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2242 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2243 		nthreads = CPI_CPU_COUNT(cpi);
2244 	}
2245 
2246 	/*
2247 	 * For us to have threads, and know about it, we have to be at least at
2248 	 * family 17h and have the cpuid bit that says we have extended
2249 	 * topology.
2250 	 */
2251 	if (cpi->cpi_family >= 0x17 &&
2252 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2253 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2254 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2255 	}
2256 
2257 	*ncpus = nthreads;
2258 	*ncores = nthreads / nthread_per_core;
2259 }
2260 
2261 /*
2262  * Seed the initial values for the cores and threads for an Intel based
2263  * processor. These values will be overwritten if we detect that the processor
2264  * supports CPUID leaf 0xb.
2265  */
2266 static void
2267 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2268 {
2269 	/*
2270 	 * Only seed the number of physical cores from the first level leaf 4
2271 	 * information. The number of threads there indicates how many share the
2272 	 * L1 cache, which may or may not have anything to do with the number of
2273 	 * logical CPUs per core.
2274 	 */
2275 	if (cpi->cpi_maxeax >= 4) {
2276 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2277 	} else {
2278 		*ncores = 1;
2279 	}
2280 
2281 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2282 		*ncpus = CPI_CPU_COUNT(cpi);
2283 	} else {
2284 		*ncpus = *ncores;
2285 	}
2286 }
2287 
2288 static boolean_t
2289 cpuid_leafB_getids(cpu_t *cpu)
2290 {
2291 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2292 	struct cpuid_regs regs;
2293 	struct cpuid_regs *cp;
2294 
2295 	if (cpi->cpi_maxeax < 0xB)
2296 		return (B_FALSE);
2297 
2298 	cp = &regs;
2299 	cp->cp_eax = 0xB;
2300 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2301 
2302 	(void) __cpuid_insn(cp);
2303 
2304 	/*
2305 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2306 	 * indicates that the extended topology enumeration leaf is
2307 	 * available.
2308 	 */
2309 	if (cp->cp_ebx != 0) {
2310 		uint32_t x2apic_id = 0;
2311 		uint_t coreid_shift = 0;
2312 		uint_t ncpu_per_core = 1;
2313 		uint_t chipid_shift = 0;
2314 		uint_t ncpu_per_chip = 1;
2315 		uint_t i;
2316 		uint_t level;
2317 
2318 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2319 			cp->cp_eax = 0xB;
2320 			cp->cp_ecx = i;
2321 
2322 			(void) __cpuid_insn(cp);
2323 			level = CPI_CPU_LEVEL_TYPE(cp);
2324 
2325 			if (level == 1) {
2326 				x2apic_id = cp->cp_edx;
2327 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2328 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2329 			} else if (level == 2) {
2330 				x2apic_id = cp->cp_edx;
2331 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2332 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2333 			}
2334 		}
2335 
2336 		/*
2337 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2338 		 */
2339 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2340 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2341 		    ncpu_per_core;
2342 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2343 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2344 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2345 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2346 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2347 		cpi->cpi_compunitid = cpi->cpi_coreid;
2348 
2349 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2350 			cpi->cpi_nthread_bits = coreid_shift;
2351 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2352 		}
2353 
2354 		return (B_TRUE);
2355 	} else {
2356 		return (B_FALSE);
2357 	}
2358 }
2359 
2360 static void
2361 cpuid_intel_getids(cpu_t *cpu, void *feature)
2362 {
2363 	uint_t i;
2364 	uint_t chipid_shift = 0;
2365 	uint_t coreid_shift = 0;
2366 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2367 
2368 	/*
2369 	 * There are no compute units or processor nodes currently on Intel.
2370 	 * Always set these to one.
2371 	 */
2372 	cpi->cpi_procnodes_per_pkg = 1;
2373 	cpi->cpi_cores_per_compunit = 1;
2374 
2375 	/*
2376 	 * If cpuid Leaf B is present, use that to try and get this information.
2377 	 * It will be the most accurate for Intel CPUs.
2378 	 */
2379 	if (cpuid_leafB_getids(cpu))
2380 		return;
2381 
2382 	/*
2383 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2384 	 * and ncore_per_chip. These represent the largest power of two values
2385 	 * that we need to cover all of the IDs in the system. Therefore, we use
2386 	 * those values to seed the number of bits needed to cover information
2387 	 * in the case when leaf B is not available. These values will probably
2388 	 * be larger than required, but that's OK.
2389 	 */
2390 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2391 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2392 
2393 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2394 		chipid_shift++;
2395 
2396 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2397 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2398 
2399 	if (is_x86_feature(feature, X86FSET_CMP)) {
2400 		/*
2401 		 * Multi-core (and possibly multi-threaded)
2402 		 * processors.
2403 		 */
2404 		uint_t ncpu_per_core = 0;
2405 
2406 		if (cpi->cpi_ncore_per_chip == 1)
2407 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2408 		else if (cpi->cpi_ncore_per_chip > 1)
2409 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2410 			    cpi->cpi_ncore_per_chip;
2411 		/*
2412 		 * 8bit APIC IDs on dual core Pentiums
2413 		 * look like this:
2414 		 *
2415 		 * +-----------------------+------+------+
2416 		 * | Physical Package ID   |  MC  |  HT  |
2417 		 * +-----------------------+------+------+
2418 		 * <------- chipid -------->
2419 		 * <------- coreid --------------->
2420 		 *			   <--- clogid -->
2421 		 *			   <------>
2422 		 *			   pkgcoreid
2423 		 *
2424 		 * Where the number of bits necessary to
2425 		 * represent MC and HT fields together equals
2426 		 * to the minimum number of bits necessary to
2427 		 * store the value of cpi->cpi_ncpu_per_chip.
2428 		 * Of those bits, the MC part uses the number
2429 		 * of bits necessary to store the value of
2430 		 * cpi->cpi_ncore_per_chip.
2431 		 */
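		/*
		 * To make the diagram above concrete (illustrative numbers
		 * only): with cpi_ncpu_per_chip = 4 and cpi_ncore_per_chip = 2
		 * we get chipid_shift = 2, ncpu_per_core = 2 and
		 * coreid_shift = 1; an APIC ID of 0x5 (binary 101) then
		 * decomposes into chipid = 1, clogid = 1, coreid = 2 and
		 * pkgcoreid = 0.
		 */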
2432 		for (i = 1; i < ncpu_per_core; i <<= 1)
2433 			coreid_shift++;
2434 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2435 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2436 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2437 		/*
2438 		 * Single-core multi-threaded processors.
2439 		 */
2440 		cpi->cpi_coreid = cpi->cpi_chipid;
2441 		cpi->cpi_pkgcoreid = 0;
2442 	} else {
2443 		/*
2444 		 * Single-core single-thread processors.
2445 		 */
2446 		cpi->cpi_coreid = cpu->cpu_id;
2447 		cpi->cpi_pkgcoreid = 0;
2448 	}
2449 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2450 	cpi->cpi_compunitid = cpi->cpi_coreid;
2451 }
2452 
2453 /*
2454  * Historically, AMD has had CMP chips with only a single thread per core.
2455  * However, starting in family 17h (Zen), this has changed and they now have
2456  * multiple threads. Our internal core id needs to be a unique value.
2457  *
2458  * To determine the core id of an AMD system, if we're from a family before 17h,
2459  * then we just use the cpu id, as that gives us a good value that will be
2460  * unique for each core. If instead, we're on family 17h or later, then we need
2461  * to do something more complicated. CPUID leaf 0x8000001e can tell us how
2462  * many threads share a core. Based on that, we'll shift the APIC ID.
2463  * We can't use the normal core id in that leaf as it's only unique within the
2464  * socket, which is perfect for cpi_pkgcoreid, but not for us.
2465  */
2466 static id_t
2467 cpuid_amd_get_coreid(cpu_t *cpu)
2468 {
2469 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2470 
2471 	if (cpi->cpi_family >= 0x17 &&
2472 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2473 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2474 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2475 		if (nthreads > 1) {
2476 			VERIFY3U(nthreads, ==, 2);
2477 			return (cpi->cpi_apicid >> 1);
2478 		}
2479 	}
2480 
2481 	return (cpu->cpu_id);
2482 }
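
/*
 * For illustration (hypothetical numbers): on a family 17h part with two
 * threads per core (leaf 0x8000001e %ebx[15:8] == 1) and an APIC ID of 0x9,
 * the core id above becomes 0x9 >> 1 == 4. On an earlier family, or when the
 * leaf is unavailable, the core id is simply cpu->cpu_id.
 */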
2483 
2484 /*
2485  * Determining IDs on AMD is a more challenging task. This is notable because
2486  * of the following two facts:
2487  *
2488  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2489  *     also no way to get an actual unique core id from the system. As such, we
2490  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2491  *     however, guarantee that sibling cores of a chip will have sequential
2492  *     coreids starting at a multiple of the number of cores per chip - that is
2493  *     usually the case, but if the ACPI MADT table is presented in a different
2494  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2495  *
2496  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2497  *     called compute units. These compute units share the L1I cache, L2 cache,
2498  *     and the FPU. To deal with this, a new topology leaf was added in
2499  *     0x8000001e. However, parts of this leaf have different meanings
2500  *     once we get to family 0x17.
2501  */
2502 
2503 static void
2504 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2505 {
2506 	int i, first_half, coreidsz;
2507 	uint32_t nb_caps_reg;
2508 	uint_t node2_1;
2509 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2510 	struct cpuid_regs *cp;
2511 
2512 	/*
2513 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2514 	 * hasn't been stripped by virtualization). We always set the compute
2515 	 * unit id to the same value. Also, initialize the default number of
2516 	 * cores per compute unit and nodes per package. This will be
2517 	 * overwritten when we know information about a particular family.
2518 	 */
2519 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2520 	cpi->cpi_compunitid = cpi->cpi_coreid;
2521 	cpi->cpi_cores_per_compunit = 1;
2522 	cpi->cpi_procnodes_per_pkg = 1;
2523 
2524 	/*
2525 	 * To construct the logical ID, we need to determine how many APIC IDs
2526 	 * are dedicated to the cores and threads. This is provided for us in
2527 	 * 0x80000008. However, if it's not present (say due to virtualization),
2528 	 * then we assume it's one. This should be present on all 64-bit AMD
2529 	 * processors.  It was added in family 0xf (Hammer).
2530 	 */
2531 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2532 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2533 
2534 		/*
2535 		 * In AMD parlance a chip is really a node, while illumos
2536 		 * uses chip as the equivalent of socket/package.
2537 		 */
2538 		if (coreidsz == 0) {
2539 			/* Use legacy method */
2540 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2541 				coreidsz++;
2542 			if (coreidsz == 0)
2543 				coreidsz = 1;
2544 		}
2545 	} else {
2546 		/* Assume single-core part */
2547 		coreidsz = 1;
2548 	}
2549 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2550 
2551 	/*
2552 	 * The package core ID varies depending on the family. While it may be
2553 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2554 	 * this value is the core id in the given node. For non-virtualized
2555 	 * family 17h, we need to take the logical core id and shift off the
2556 	 * threads like we do when getting the core id.  Otherwise, we can use
2557 	 * the clogid as is. When family 17h is virtualized and we don't have
2558 	 * valid data in the leaf, then we won't think we have SMT, in which
2559 	 * case the cpi_clogid is sufficient on its own.
2561 	 */
2562 	if (cpi->cpi_family >= 0x17 &&
2563 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2564 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2565 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2566 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2567 		if (nthreads > 1) {
2568 			VERIFY3U(nthreads, ==, 2);
2569 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2570 		} else {
2571 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2572 		}
2573 	} else {
2574 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2575 	}
2576 
2577 	/*
2578 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2579 	 * (bulldozer) or newer, then we can derive all of this from leaf
2580 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2581 	 */
2582 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2583 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2584 		cp = &cpi->cpi_extd[0x1e];
2585 
2586 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2587 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2588 
2589 		/*
2590 		 * For Bulldozer-era CPUs, recalculate the compute unit
2591 		 * information.
2592 		 */
2593 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2594 			cpi->cpi_cores_per_compunit =
2595 			    BITX(cp->cp_ebx, 15, 8) + 1;
2596 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2597 			    (cpi->cpi_ncore_per_chip /
2598 			    cpi->cpi_cores_per_compunit) *
2599 			    (cpi->cpi_procnodeid /
2600 			    cpi->cpi_procnodes_per_pkg);
2601 		}
2602 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2603 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2604 	} else if (cpi->cpi_family == 0x10) {
2605 		/*
2606 		 * See if we are a multi-node processor.
2607 		 * All processors in the system have the same number of nodes
2608 		 */
2609 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2610 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2611 			/* Single-node */
2612 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2613 			    coreidsz);
2614 		} else {
2615 
2616 			/*
2617 			 * Multi-node revision D (2 nodes per package
2618 			 * are supported)
2619 			 */
2620 			cpi->cpi_procnodes_per_pkg = 2;
2621 
2622 			first_half = (cpi->cpi_pkgcoreid <=
2623 			    (cpi->cpi_ncore_per_chip/2 - 1));
2624 
2625 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2626 				/* We are BSP */
2627 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2628 			} else {
2629 
2630 				/* We are AP */
2631 				/* NodeId[2:1] bits to use for reading F3xe8 */
2632 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2633 
2634 				nb_caps_reg =
2635 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2636 
2637 				/*
2638 				 * Check IntNodeNum bit (31:30, but bit 31 is
2639 				 * always 0 on dual-node processors)
2640 				 */
2641 				if (BITX(nb_caps_reg, 30, 30) == 0)
2642 					cpi->cpi_procnodeid = node2_1 +
2643 					    !first_half;
2644 				else
2645 					cpi->cpi_procnodeid = node2_1 +
2646 					    first_half;
2647 			}
2648 		}
2649 	} else {
2650 		cpi->cpi_procnodeid = 0;
2651 	}
2652 
2653 	cpi->cpi_chipid =
2654 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2655 
2656 	cpi->cpi_ncore_bits = coreidsz;
2657 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2658 	    cpi->cpi_ncore_per_chip);
2659 }
2660 
2661 static void
2662 spec_uarch_flush_noop(void)
2663 {
2664 }
2665 
2666 /*
2667  * When microcode that mitigates MDS is present, this wrmsr will also flush
2668  * the MDS-related micro-architectural state that would otherwise be flushed
2669  * by calling x86_md_clear().
2670  */
2671 static void
2672 spec_uarch_flush_msr(void)
2673 {
2674 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2675 }
2676 
2677 /*
2678  * This function points to a function that will flush certain
2679  * micro-architectural state on the processor. This flush is used to mitigate
2680  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2681  * function can point to one of three functions:
2682  *
2683  * - A noop which is done because we either are vulnerable, but do not have
2684  *   microcode available to help deal with a fix, or because we aren't
2685  *   vulnerable.
2686  *
2687  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2688  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2689  *   however, it only flushes the MDS related micro-architectural state on the
2690  *   current hyperthread, it does not do anything for the twin.
2691  *
2692  * - x86_md_clear which will flush the MDS related state. This is done when we
2693  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2694  *   (RDCL_NO is set).
2695  */
2696 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2697 
2698 static void
2699 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2700 {
2701 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2702 
2703 	/*
2704 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2705 	 * has been fixed in hardware, it doesn't cover everything related to
2706 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2707 	 * need to mitigate this.
2708 	 */
2709 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2710 	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2711 		return;
2712 	}
2713 
2714 	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2715 		const uint8_t nop = NOP_INSTR;
2716 		uint8_t *md = (uint8_t *)x86_md_clear;
2717 
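		/*
		 * Patch the first byte of x86_md_clear with a nop. The routine
		 * is expected to begin with a one-byte return so that it is a
		 * no-op by default; replacing that byte lets the VERW-based
		 * flush sequence that follows actually execute now that we
		 * know the microcode supports it.
		 */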
2718 		*md = nop;
2719 	}
2720 
2721 	membar_producer();
2722 }
2723 
2724 static void
2725 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2726 {
2727 	boolean_t need_l1d, need_mds;
2728 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2729 
2730 	/*
2731 	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2732 	 * hardware, then there's nothing left for us to do for enabling the
2733 	 * flush. We can also go ahead and say that SMT exclusion is
2734 	 * unnecessary.
2735 	 */
2736 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2737 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2738 	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2739 		extern int smt_exclusion;
2740 		smt_exclusion = 0;
2741 		spec_uarch_flush = spec_uarch_flush_noop;
2742 		membar_producer();
2743 		return;
2744 	}
2745 
2746 	/*
2747 	 * The places where we need to perform an L1D flush are required for
2748 	 * mitigating both L1TF and MDS. When verw support is present in
2749 	 * microcode, the L1D flush will take care of the MDS flush as well.
2750 	 * However, if we have a system where RDCL_NO is present, but we don't
2751 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2752 	 * L1D flush.
2753 	 */
2754 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2755 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2756 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2757 		need_l1d = B_TRUE;
2758 	} else {
2759 		need_l1d = B_FALSE;
2760 	}
2761 
2762 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2763 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2764 		need_mds = B_TRUE;
2765 	} else {
2766 		need_mds = B_FALSE;
2767 	}
2768 
2769 	if (need_l1d) {
2770 		spec_uarch_flush = spec_uarch_flush_msr;
2771 	} else if (need_mds) {
2772 		spec_uarch_flush = x86_md_clear;
2773 	} else {
2774 		/*
2775 		 * We have no hardware mitigations available to us.
2776 		 */
2777 		spec_uarch_flush = spec_uarch_flush_noop;
2778 	}
2779 	membar_producer();
2780 }
2781 
2782 /*
2783  * We default to enabling RSB mitigations.
2784  */
2785 static void
2786 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2787 {
2788 	const uint8_t ret = RET_INSTR;
2789 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2790 
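	/*
	 * For enhanced IBRS, or when mitigations are disabled outright, we
	 * turn RSB stuffing off by patching a one-byte return over the first
	 * instruction of x86_rsb_stuff, making it an immediate return.
	 */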
2791 	switch (mit) {
2792 	case X86_SPECTREV2_ENHANCED_IBRS:
2793 	case X86_SPECTREV2_DISABLED:
2794 		*stuff = ret;
2795 		break;
2796 	default:
2797 		break;
2798 	}
2799 }
2800 
2801 static void
2802 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2803 {
2804 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2805 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2806 	    "_r14", "_r15" };
2807 	const uint_t nthunks = ARRAY_SIZE(thunks);
2808 	const char *type;
2809 	uint_t i;
2810 
2811 	if (mit == x86_spectrev2_mitigation)
2812 		return;
2813 
2814 	switch (mit) {
2815 	case X86_SPECTREV2_RETPOLINE:
2816 		type = "gen";
2817 		break;
2818 	case X86_SPECTREV2_ENHANCED_IBRS:
2819 	case X86_SPECTREV2_DISABLED:
2820 		type = "jmp";
2821 		break;
2822 	default:
2823 		panic("asked to update retpoline state with unknown state!");
2824 	}
2825 
2826 	for (i = 0; i < nthunks; i++) {
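	/*
	 * Copy the selected implementation of each register's thunk over its
	 * generic entry point. For example, with type "gen" and the "_rax"
	 * thunk, the bytes of __x86_indirect_thunk_gen_rax are copied over
	 * __x86_indirect_thunk_rax.
	 */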
2827 		uintptr_t source, dest;
2828 		int ssize, dsize;
2829 		char sourcebuf[64], destbuf[64];
2830 
2831 		(void) snprintf(destbuf, sizeof (destbuf),
2832 		    "__x86_indirect_thunk%s", thunks[i]);
2833 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
2834 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
2835 
2836 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2837 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
2838 		VERIFY3U(source, !=, 0);
2839 		VERIFY3U(dest, !=, 0);
2840 		VERIFY3S(dsize, >=, ssize);
2841 		bcopy((void *)source, (void *)dest, ssize);
2842 	}
2843 }
2844 
2845 static void
2846 cpuid_enable_enhanced_ibrs(void)
2847 {
2848 	uint64_t val;
2849 
2850 	val = rdmsr(MSR_IA32_SPEC_CTRL);
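	/*
	 * With enhanced IBRS, setting IA32_SPEC_CTRL.IBRS once on each CPU is
	 * sufficient; unlike basic IBRS, it does not need to be rewritten
	 * around each privilege transition.
	 */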
2851 	val |= IA32_SPEC_CTRL_IBRS;
2852 	wrmsr(MSR_IA32_SPEC_CTRL, val);
2853 }
2854 
2855 /*
2856  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2857  * we can disable TSX, we do so.
2858  *
2859  * This determination is done only on the boot CPU, potentially after loading
2860  * updated microcode.
2861  */
2862 static void
2863 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2864 {
2865 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2866 
2867 	VERIFY(cpu->cpu_id == 0);
2868 
2869 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2870 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2871 		return;
2872 	}
2873 
2874 	if (x86_disable_taa) {
2875 		x86_taa_mitigation = X86_TAA_DISABLED;
2876 		return;
2877 	}
2878 
2879 	/*
2880 	 * If we do not have the ability to disable TSX, then our only
2881 	 * mitigation options are the hardware itself (TAA_NO) or our existing
2882 	 * MDS mitigation as described above.  The latter relies upon us having
2883 	 * configured MDS mitigations correctly! This includes disabling SMT if
2884 	 * we want cross-CPU-thread protection.
2885 	 */
2886 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2887 		/*
2888 		 * It's not clear whether any parts will enumerate TAA_NO
2889 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
2890 		 */
2891 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2892 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2893 			return;
2894 		}
2895 
2896 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2897 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2898 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
2899 		} else {
2900 			x86_taa_mitigation = X86_TAA_NOTHING;
2901 		}
2902 		return;
2903 	}
2904 
2905 	/*
2906 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
2907 	 * enough in boot.
2908 	 *
2909 	 * Otherwise, we'll fall back to causing transactions to abort as our
2910 	 * mitigation. TSX-using code will always take the fallback path.
2911 	 */
2912 	if (cpi->cpi_pass < 4) {
2913 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2914 	} else {
2915 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2916 	}
2917 }
2918 
2919 /*
2920  * As mentioned, we should only touch the MSR when suitable microcode is
2921  * loaded on this CPU.
2922  */
2923 static void
2924 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
2925 {
2926 	uint64_t val;
2927 
2928 	switch (taa) {
2929 	case X86_TAA_TSX_DISABLE:
2930 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2931 			return;
2932 		val = rdmsr(MSR_IA32_TSX_CTRL);
2933 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
2934 		wrmsr(MSR_IA32_TSX_CTRL, val);
2935 		break;
2936 	case X86_TAA_TSX_FORCE_ABORT:
2937 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2938 			return;
2939 		val = rdmsr(MSR_IA32_TSX_CTRL);
2940 		val |= IA32_TSX_CTRL_RTM_DISABLE;
2941 		wrmsr(MSR_IA32_TSX_CTRL, val);
2942 		break;
2943 	case X86_TAA_HW_MITIGATED:
2944 	case X86_TAA_MD_CLEAR:
2945 	case X86_TAA_DISABLED:
2946 	case X86_TAA_NOTHING:
2947 		break;
2948 	}
2949 }
2950 
2951 static void
2952 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2953 {
2954 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2955 	x86_spectrev2_mitigation_t v2mit;
2956 
2957 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2958 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2959 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2960 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2961 			add_x86_feature(featureset, X86FSET_IBPB);
2962 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2963 			add_x86_feature(featureset, X86FSET_IBRS);
2964 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2965 			add_x86_feature(featureset, X86FSET_STIBP);
2966 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2967 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
2968 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2969 			add_x86_feature(featureset, X86FSET_SSBD);
2970 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2971 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2972 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2973 			add_x86_feature(featureset, X86FSET_SSB_NO);
2974 		/*
2975 		 * Don't enable enhanced IBRS unless we're told that we should
2976 		 * prefer it and it has the same semantics as Intel. This is
2977 		 * split into two bits rather than a single one.
2978 		 */
2979 		if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2980 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2981 			add_x86_feature(featureset, X86FSET_IBRS_ALL);
2982 		}
2983 
2984 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2985 	    cpi->cpi_maxeax >= 7) {
2986 		struct cpuid_regs *ecp;
2987 		ecp = &cpi->cpi_std[7];
2988 
2989 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2990 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
2991 		}
2992 
2993 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2994 			add_x86_feature(featureset, X86FSET_IBRS);
2995 			add_x86_feature(featureset, X86FSET_IBPB);
2996 		}
2997 
2998 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2999 			add_x86_feature(featureset, X86FSET_STIBP);
3000 		}
3001 
3002 		/*
3003 		 * Don't read the arch caps MSR on xpv where we lack the
3004 		 * on_trap().
3005 		 */
3006 #ifndef __xpv
3007 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3008 			on_trap_data_t otd;
3009 
3010 			/*
3011 			 * Be paranoid and assume we'll get a #GP.
3012 			 */
3013 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3014 				uint64_t reg;
3015 
3016 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3017 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3018 					add_x86_feature(featureset,
3019 					    X86FSET_RDCL_NO);
3020 				}
3021 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3022 					add_x86_feature(featureset,
3023 					    X86FSET_IBRS_ALL);
3024 				}
3025 				if (reg & IA32_ARCH_CAP_RSBA) {
3026 					add_x86_feature(featureset,
3027 					    X86FSET_RSBA);
3028 				}
3029 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3030 					add_x86_feature(featureset,
3031 					    X86FSET_L1D_VM_NO);
3032 				}
3033 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3034 					add_x86_feature(featureset,
3035 					    X86FSET_SSB_NO);
3036 				}
3037 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3038 					add_x86_feature(featureset,
3039 					    X86FSET_MDS_NO);
3040 				}
3041 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3042 					add_x86_feature(featureset,
3043 					    X86FSET_TSX_CTRL);
3044 				}
3045 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3046 					add_x86_feature(featureset,
3047 					    X86FSET_TAA_NO);
3048 				}
3049 			}
3050 			no_trap();
3051 		}
3052 #endif	/* !__xpv */
3053 
3054 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3055 			add_x86_feature(featureset, X86FSET_SSBD);
3056 
3057 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3058 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3059 	}
3060 
3061 	/*
3062 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3063 	 * will have already run this function and determined what we need to
3064 	 * do. This gives us a hook for per-HW thread mitigations such as
3065 	 * enhanced IBRS, or disabling TSX.
3066 	 */
3067 	if (cpu->cpu_id != 0) {
3068 		if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
3069 			cpuid_enable_enhanced_ibrs();
3070 		}
3071 
3072 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3073 		return;
3074 	}
3075 
3076 	/*
3077 	 * Go through and initialize various security mechanisms that we should
3078 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3079 	 * TAA.
3080 	 */
3081 
3082 	/*
3083 	 * By default we've come in with retpolines enabled. Check whether we
3084 	 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
3085 	 * by default, but disabled if we are using enhanced IBRS. Note, we do
3086 	 * not allow the use of AMD optimized retpolines as it was disclosed by
3087 	 * AMD in March 2022 that they were still vulnerable. Prior to that
3088 	 * point, we used them.
3089 	 */
3090 	if (x86_disable_spectrev2 != 0) {
3091 		v2mit = X86_SPECTREV2_DISABLED;
3092 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3093 		cpuid_enable_enhanced_ibrs();
3094 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3095 	} else {
3096 		v2mit = X86_SPECTREV2_RETPOLINE;
3097 	}
3098 
3099 	cpuid_patch_retpolines(v2mit);
3100 	cpuid_patch_rsb(v2mit);
3101 	x86_spectrev2_mitigation = v2mit;
3102 	membar_producer();
3103 
3104 	/*
3105 	 * We need to determine what changes are required for mitigating L1TF
3106 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3107 	 * is required.
3108 	 *
3109 	 * If any of these are present, then we need to flush u-arch state at
3110 	 * various points. For MDS, we need to do so whenever we change to a
3111 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3112 	 * flush the L1D cache at VM entry. When we have microcode that handles
3113 	 * MDS, the L1D flush also clears the other u-arch state that
3114 	 * md_clear does.
3115 	 */
3116 
3117 	/*
3118 	 * Update whether or not we need to be taking explicit action against
3119 	 * MDS.
3120 	 */
3121 	cpuid_update_md_clear(cpu, featureset);
3122 
3123 	/*
3124 	 * Determine whether SMT exclusion is required and whether or not we
3125 	 * need to perform an l1d flush.
3126 	 */
3127 	cpuid_update_l1d_flush(cpu, featureset);
3128 
3129 	/*
3130 	 * Determine what our mitigation strategy should be for TAA and then
3131 	 * also apply TAA mitigations.
3132 	 */
3133 	cpuid_update_tsx(cpu, featureset);
3134 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3135 }
3136 
3137 /*
3138  * Set up the XFeature_Enabled_Mask register. Required by the xsave feature.
3139  */
3140 void
3141 setup_xfem(void)
3142 {
3143 	uint64_t flags = XFEATURE_LEGACY_FP;
3144 
3145 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3146 
3147 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3148 		flags |= XFEATURE_SSE;
3149 
3150 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3151 		flags |= XFEATURE_AVX;
3152 
3153 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3154 		flags |= XFEATURE_AVX512;
3155 
3156 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3157 
3158 	xsave_bv_all = flags;
3159 }
3160 
3161 static void
3162 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
3163 {
3164 	struct cpuid_info *cpi;
3165 
3166 	cpi = cpu->cpu_m.mcpu_cpi;
3167 
3168 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3169 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3170 		cpuid_gather_amd_topology_leaves(cpu);
3171 	}
3172 
3173 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3174 
3175 	/*
3176 	 * Before we can calculate the IDs that we should assign to this
3177 	 * processor, we need to understand how many cores and threads it has.
3178 	 */
3179 	switch (cpi->cpi_vendor) {
3180 	case X86_VENDOR_Intel:
3181 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3182 		    &cpi->cpi_ncore_per_chip);
3183 		break;
3184 	case X86_VENDOR_AMD:
3185 	case X86_VENDOR_HYGON:
3186 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3187 		    &cpi->cpi_ncore_per_chip);
3188 		break;
3189 	default:
3190 		/*
3191 		 * If we have some other x86-compatible chip, it's not clear how
3192 		 * it would behave. The most common case is virtualization
3193 		 * today, though there are also 64-bit VIA chips. Assume that
3194 		 * all we can get is the basic Leaf 1 HTT information.
3195 		 */
3196 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3197 			cpi->cpi_ncore_per_chip = 1;
3198 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3199 		}
3200 		break;
3201 	}
3202 
3203 	/*
3204 	 * Based on the calculated number of threads and cores, potentially
3205 	 * assign the HTT and CMP features.
3206 	 */
3207 	if (cpi->cpi_ncore_per_chip > 1) {
3208 		add_x86_feature(featureset, X86FSET_CMP);
3209 	}
3210 
3211 	if (cpi->cpi_ncpu_per_chip > 1 &&
3212 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3213 		add_x86_feature(featureset, X86FSET_HTT);
3214 	}
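
	/*
	 * For example (illustrative values): a 4-core/8-thread package sets
	 * both CMP and HTT, a 4-core/4-thread package sets only CMP, and a
	 * 1-core/2-thread package sets only HTT.
	 */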
3215 
3216 	/*
3217 	 * Now that this has been set up, we need to go through and calculate all of
3218 	 * the rest of the parameters that exist. If we think the CPU doesn't
3219 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3220 	 * up information in some way. The most likely case for this is
3221 	 * virtualization where we have a lot of partial topology information.
3222 	 */
3223 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3224 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3225 		/*
3226 		 * This is a single core, single-threaded processor.
3227 		 */
3228 		cpi->cpi_procnodes_per_pkg = 1;
3229 		cpi->cpi_cores_per_compunit = 1;
3230 		cpi->cpi_compunitid = 0;
3231 		cpi->cpi_chipid = -1;
3232 		cpi->cpi_clogid = 0;
3233 		cpi->cpi_coreid = cpu->cpu_id;
3234 		cpi->cpi_pkgcoreid = 0;
3235 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3236 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3237 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3238 		} else {
3239 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3240 		}
3241 	} else {
3242 		switch (cpi->cpi_vendor) {
3243 		case X86_VENDOR_Intel:
3244 			cpuid_intel_getids(cpu, featureset);
3245 			break;
3246 		case X86_VENDOR_AMD:
3247 		case X86_VENDOR_HYGON:
3248 			cpuid_amd_getids(cpu, featureset);
3249 			break;
3250 		default:
3251 			/*
3252 			 * In this case, it's hard to say what we should do.
3253 			 * We're going to model them to the OS as single core
3254 			 * threads. We don't have a good identifier for them, so
3255 			 * we're just going to use the cpu id all on a single
3256 			 * chip.
3257 			 *
3258 			 * This case has historically been different from the
3259 			 * case above where we don't have HTT or CMP. While they
3260 			 * could be combined, we've opted to keep it separate to
3261 			 * minimize the risk of topology changes in weird cases.
3262 			 */
3263 			cpi->cpi_procnodes_per_pkg = 1;
3264 			cpi->cpi_cores_per_compunit = 1;
3265 			cpi->cpi_chipid = 0;
3266 			cpi->cpi_coreid = cpu->cpu_id;
3267 			cpi->cpi_clogid = cpu->cpu_id;
3268 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3269 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3270 			cpi->cpi_compunitid = cpi->cpi_coreid;
3271 			break;
3272 		}
3273 	}
3274 }
3275 
3276 /*
3277  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3278  * always gather leaf 6 if it's supported; however, we only look for features on
3279  * Intel systems as AMD does not currently define any of the features we look
3280  * for below.
3281  */
3282 static void
3283 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3284 {
3285 	struct cpuid_regs *cp;
3286 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3287 
3288 	if (cpi->cpi_maxeax < 6) {
3289 		return;
3290 	}
3291 
3292 	cp = &cpi->cpi_std[6];
3293 	cp->cp_eax = 6;
3294 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3295 	(void) __cpuid_insn(cp);
3296 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3297 
3298 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3299 		return;
3300 	}
3301 
3302 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3303 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3304 	}
3305 
3306 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3307 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3308 	}
3309 }
3310 
3311 /*
3312  * PPIN is the protected processor inventory number. On AMD this is an actual
3313  * feature bit. However, on Intel systems we need to read the platform
3314  * information MSR if we're on a specific model.
3315  */
3316 #if !defined(__xpv)
3317 static void
3318 cpuid_pass1_ppin(cpu_t *cpu, uchar_t *featureset)
3319 {
3320 	on_trap_data_t otd;
3321 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3322 
3323 	switch (cpi->cpi_vendor) {
3324 	case X86_VENDOR_AMD:
3325 		/*
3326 		 * This leaf will have already been gathered in the topology
3327 		 * functions.
3328 		 */
3329 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3330 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3331 				add_x86_feature(featureset, X86FSET_PPIN);
3332 			}
3333 		}
3334 		break;
3335 	case X86_VENDOR_Intel:
3336 		if (cpi->cpi_family != 6)
3337 			break;
3338 		switch (cpi->cpi_model) {
3339 		case INTC_MODEL_IVYBRIDGE_XEON:
3340 		case INTC_MODEL_HASWELL_XEON:
3341 		case INTC_MODEL_BROADWELL_XEON:
3342 		case INTC_MODEL_BROADWELL_XEON_D:
3343 		case INTC_MODEL_SKYLAKE_XEON:
3344 		case INTC_MODEL_ICELAKE_XEON:
3345 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3346 				uint64_t value;
3347 
3348 				value = rdmsr(MSR_PLATFORM_INFO);
3349 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3350 					add_x86_feature(featureset,
3351 					    X86FSET_PPIN);
3352 				}
3353 			}
3354 			no_trap();
3355 			break;
3356 		default:
3357 			break;
3358 		}
3359 		break;
3360 	default:
3361 		break;
3362 	}
3363 }
3364 #endif	/* ! __xpv */
3365 
3366 void
3367 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3368 {
3369 	uint32_t mask_ecx, mask_edx;
3370 	struct cpuid_info *cpi;
3371 	struct cpuid_regs *cp;
3372 	int xcpuid;
3373 #if !defined(__xpv)
3374 	extern int idle_cpu_prefer_mwait;
3375 #endif
3376 
3377 	/*
3378 	 * Space statically allocated for BSP, ensure pointer is set
3379 	 */
3380 	if (cpu->cpu_id == 0) {
3381 		if (cpu->cpu_m.mcpu_cpi == NULL)
3382 			cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3383 	}
3384 
3385 	add_x86_feature(featureset, X86FSET_CPUID);
3386 
3387 	cpi = cpu->cpu_m.mcpu_cpi;
3388 	ASSERT(cpi != NULL);
3389 	cp = &cpi->cpi_std[0];
3390 	cp->cp_eax = 0;
3391 	cpi->cpi_maxeax = __cpuid_insn(cp);
3392 	{
3393 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3394 		*iptr++ = cp->cp_ebx;
3395 		*iptr++ = cp->cp_edx;
3396 		*iptr++ = cp->cp_ecx;
3397 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3398 	}
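	/*
	 * The vendor string is assembled in %ebx, %edx, %ecx order; for
	 * example, a genuine Intel part returns "Genu", "ineI" and "ntel" in
	 * those registers, yielding "GenuineIntel".
	 */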
3399 
3400 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3401 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3402 
3403 	/*
3404 	 * Limit the range in case of weird hardware
3405 	 */
3406 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3407 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3408 	if (cpi->cpi_maxeax < 1)
3409 		goto pass1_done;
3410 
3411 	cp = &cpi->cpi_std[1];
3412 	cp->cp_eax = 1;
3413 	(void) __cpuid_insn(cp);
3414 
3415 	/*
3416 	 * Extract identifying constants for easy access.
3417 	 */
3418 	cpi->cpi_model = CPI_MODEL(cpi);
3419 	cpi->cpi_family = CPI_FAMILY(cpi);
3420 
3421 	if (cpi->cpi_family == 0xf)
3422 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3423 
3424 	/*
3425 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3426 	 * Intel, and presumably everyone else, uses model == 0xf, as
3427 	 * one would expect (max value means possible overflow).  Sigh.
3428 	 */
3429 
3430 	switch (cpi->cpi_vendor) {
3431 	case X86_VENDOR_Intel:
3432 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3433 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3434 		break;
3435 	case X86_VENDOR_AMD:
3436 		if (CPI_FAMILY(cpi) == 0xf)
3437 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3438 		break;
3439 	case X86_VENDOR_HYGON:
3440 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3441 		break;
3442 	default:
3443 		if (cpi->cpi_model == 0xf)
3444 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3445 		break;
3446 	}
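
	/*
	 * As a concrete example of the extended encoding: an AMD Zen part
	 * reports a base family of 0xf and an extended family of 0x8, so
	 * cpi_family becomes 0xf + 0x8 = 0x17; its model is likewise the base
	 * model plus the extended model shifted left by four bits.
	 */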
3447 
3448 	cpi->cpi_step = CPI_STEP(cpi);
3449 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3450 
3451 	/*
3452 	 * *default* assumptions:
3453 	 * - believe %edx feature word
3454 	 * - ignore %ecx feature word
3455 	 * - 32-bit virtual and physical addressing
3456 	 */
3457 	mask_edx = 0xffffffff;
3458 	mask_ecx = 0;
3459 
3460 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3461 
3462 	switch (cpi->cpi_vendor) {
3463 	case X86_VENDOR_Intel:
3464 		if (cpi->cpi_family == 5)
3465 			x86_type = X86_TYPE_P5;
3466 		else if (IS_LEGACY_P6(cpi)) {
3467 			x86_type = X86_TYPE_P6;
3468 			pentiumpro_bug4046376 = 1;
3469 			/*
3470 			 * Clear the SEP bit when it was set erroneously
3471 			 */
3472 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3473 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3474 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3475 			x86_type = X86_TYPE_P4;
3476 			/*
3477 			 * We don't currently depend on any of the %ecx
3478 			 * features until Prescott, so we'll only check
3479 			 * this from P4 onwards.  We might want to revisit
3480 			 * that idea later.
3481 			 */
3482 			mask_ecx = 0xffffffff;
3483 		} else if (cpi->cpi_family > 0xf)
3484 			mask_ecx = 0xffffffff;
3485 		/*
3486 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3487 		 * to obtain the monitor linesize.
3488 		 */
3489 		if (cpi->cpi_maxeax < 5)
3490 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3491 		break;
3492 	case X86_VENDOR_IntelClone:
3493 	default:
3494 		break;
3495 	case X86_VENDOR_AMD:
3496 #if defined(OPTERON_ERRATUM_108)
3497 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3498 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3499 			cpi->cpi_model = 0xc;
3500 		} else
3501 #endif
3502 		if (cpi->cpi_family == 5) {
3503 			/*
3504 			 * AMD K5 and K6
3505 			 *
3506 			 * These CPUs have an incomplete implementation
3507 			 * of MCA/MCE which we mask away.
3508 			 */
3509 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3510 
3511 			/*
3512 			 * Model 0 uses the wrong (APIC) bit
3513 			 * to indicate PGE.  Fix it here.
3514 			 */
3515 			if (cpi->cpi_model == 0) {
3516 				if (cp->cp_edx & 0x200) {
3517 					cp->cp_edx &= ~0x200;
3518 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3519 				}
3520 			}
3521 
3522 			/*
3523 			 * Early models had problems w/ MMX; disable.
3524 			 */
3525 			if (cpi->cpi_model < 6)
3526 				mask_edx &= ~CPUID_INTC_EDX_MMX;
3527 		}
3528 
3529 		/*
3530 		 * For newer families, SSE3 and CX16, at least, are valid;
3531 		 * enable all
3532 		 */
3533 		if (cpi->cpi_family >= 0xf)
3534 			mask_ecx = 0xffffffff;
3535 		/*
3536 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3537 		 * to obtain the monitor linesize.
3538 		 */
3539 		if (cpi->cpi_maxeax < 5)
3540 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3541 
3542 #if !defined(__xpv)
3543 		/*
3544 		 * AMD has not historically used MWAIT in the CPU's idle loop.
3545 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3546 		 * know for certain that in at least family 17h, per AMD, mwait
3547 		 * is preferred. Families in-between are less certain.
3548 		 */
3549 		if (cpi->cpi_family < 0x17) {
3550 			idle_cpu_prefer_mwait = 0;
3551 		}
3552 #endif
3553 
3554 		break;
3555 	case X86_VENDOR_HYGON:
3556 		/* Enable all for Hygon Dhyana CPU */
3557 		mask_ecx = 0xffffffff;
3558 		break;
3559 	case X86_VENDOR_TM:
3560 		/*
3561 		 * work around the NT workaround in CMS 4.1
3562 		 */
3563 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3564 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3565 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3566 		break;
3567 	case X86_VENDOR_Centaur:
3568 		/*
3569 		 * work around the NT workarounds again
3570 		 */
3571 		if (cpi->cpi_family == 6)
3572 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3573 		break;
3574 	case X86_VENDOR_Cyrix:
3575 		/*
3576 		 * We rely heavily on the probing in locore
3577 		 * to actually figure out what parts, if any,
3578 		 * of the Cyrix cpuid instruction to believe.
3579 		 */
3580 		switch (x86_type) {
3581 		case X86_TYPE_CYRIX_486:
3582 			mask_edx = 0;
3583 			break;
3584 		case X86_TYPE_CYRIX_6x86:
3585 			mask_edx = 0;
3586 			break;
3587 		case X86_TYPE_CYRIX_6x86L:
3588 			mask_edx =
3589 			    CPUID_INTC_EDX_DE |
3590 			    CPUID_INTC_EDX_CX8;
3591 			break;
3592 		case X86_TYPE_CYRIX_6x86MX:
3593 			mask_edx =
3594 			    CPUID_INTC_EDX_DE |
3595 			    CPUID_INTC_EDX_MSR |
3596 			    CPUID_INTC_EDX_CX8 |
3597 			    CPUID_INTC_EDX_PGE |
3598 			    CPUID_INTC_EDX_CMOV |
3599 			    CPUID_INTC_EDX_MMX;
3600 			break;
3601 		case X86_TYPE_CYRIX_GXm:
3602 			mask_edx =
3603 			    CPUID_INTC_EDX_MSR |
3604 			    CPUID_INTC_EDX_CX8 |
3605 			    CPUID_INTC_EDX_CMOV |
3606 			    CPUID_INTC_EDX_MMX;
3607 			break;
3608 		case X86_TYPE_CYRIX_MediaGX:
3609 			break;
3610 		case X86_TYPE_CYRIX_MII:
3611 		case X86_TYPE_VIA_CYRIX_III:
3612 			mask_edx =
3613 			    CPUID_INTC_EDX_DE |
3614 			    CPUID_INTC_EDX_TSC |
3615 			    CPUID_INTC_EDX_MSR |
3616 			    CPUID_INTC_EDX_CX8 |
3617 			    CPUID_INTC_EDX_PGE |
3618 			    CPUID_INTC_EDX_CMOV |
3619 			    CPUID_INTC_EDX_MMX;
3620 			break;
3621 		default:
3622 			break;
3623 		}
3624 		break;
3625 	}
3626 
3627 #if defined(__xpv)
3628 	/*
3629 	 * Do not support MONITOR/MWAIT under a hypervisor
3630 	 */
3631 	mask_ecx &= ~CPUID_INTC_ECX_MON;
3632 	/*
3633 	 * Do not support XSAVE under a hypervisor for now
3634 	 */
3635 	xsave_force_disable = B_TRUE;
3636 
3637 #endif	/* __xpv */
3638 
3639 	if (xsave_force_disable) {
3640 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3641 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
3642 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
3643 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
3644 	}
3645 
3646 	/*
3647 	 * Now we've figured out the masks that determine
3648 	 * which bits we choose to believe, apply the masks
3649 	 * to the feature words, then map the kernel's view
3650 	 * of these feature words into its feature word.
3651 	 */
3652 	cp->cp_edx &= mask_edx;
3653 	cp->cp_ecx &= mask_ecx;
3654 
3655 	/*
3656 	 * apply any platform restrictions (we don't call this
3657 	 * immediately after __cpuid_insn here, because we need the
3658 	 * workarounds applied above first)
3659 	 */
3660 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3661 
3662 	/*
3663 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3664 	 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3665 	 */
3666 	if (cpi->cpi_maxeax >= 7) {
3667 		struct cpuid_regs *ecp;
3668 		ecp = &cpi->cpi_std[7];
3669 		ecp->cp_eax = 7;
3670 		ecp->cp_ecx = 0;
3671 		(void) __cpuid_insn(ecp);
3672 
3673 		/*
3674 		 * If XSAVE has been disabled, just ignore all of the
3675 		 * extended-save-area dependent flags here.
3676 		 */
3677 		if (xsave_force_disable) {
3678 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3679 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3680 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3681 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3682 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3683 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3684 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3685 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
3686 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
3687 		}
3688 
3689 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3690 			add_x86_feature(featureset, X86FSET_SMEP);
3691 
3692 		/*
3693 		 * We check disable_smap here in addition to in startup_smap()
3694 		 * to ensure CPUs that aren't the boot CPU don't accidentally
3695 		 * include it in the feature set and thus generate a mismatched
3696 		 * x86 feature set across CPUs.
3697 		 */
3698 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3699 		    disable_smap == 0)
3700 			add_x86_feature(featureset, X86FSET_SMAP);
3701 
3702 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3703 			add_x86_feature(featureset, X86FSET_RDSEED);
3704 
3705 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3706 			add_x86_feature(featureset, X86FSET_ADX);
3707 
3708 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3709 			add_x86_feature(featureset, X86FSET_FSGSBASE);
3710 
3711 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3712 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3713 
3714 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3715 			add_x86_feature(featureset, X86FSET_INVPCID);
3716 
3717 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3718 			add_x86_feature(featureset, X86FSET_UMIP);
3719 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3720 			add_x86_feature(featureset, X86FSET_PKU);
3721 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3722 			add_x86_feature(featureset, X86FSET_OSPKE);
3723 
3724 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3725 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3726 				add_x86_feature(featureset, X86FSET_MPX);
3727 
3728 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3729 				add_x86_feature(featureset, X86FSET_CLWB);
3730 		}
3731 	}
3732 
3733 	/*
3734 	 * fold in overrides from the "eeprom" mechanism
3735 	 */
3736 	cp->cp_edx |= cpuid_feature_edx_include;
3737 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
3738 
3739 	cp->cp_ecx |= cpuid_feature_ecx_include;
3740 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3741 
3742 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3743 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
3744 	}
3745 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3746 		add_x86_feature(featureset, X86FSET_TSC);
3747 	}
3748 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3749 		add_x86_feature(featureset, X86FSET_MSR);
3750 	}
3751 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3752 		add_x86_feature(featureset, X86FSET_MTRR);
3753 	}
3754 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3755 		add_x86_feature(featureset, X86FSET_PGE);
3756 	}
3757 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3758 		add_x86_feature(featureset, X86FSET_CMOV);
3759 	}
3760 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3761 		add_x86_feature(featureset, X86FSET_MMX);
3762 	}
3763 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3764 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3765 		add_x86_feature(featureset, X86FSET_MCA);
3766 	}
3767 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3768 		add_x86_feature(featureset, X86FSET_PAE);
3769 	}
3770 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3771 		add_x86_feature(featureset, X86FSET_CX8);
3772 	}
3773 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3774 		add_x86_feature(featureset, X86FSET_CX16);
3775 	}
3776 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3777 		add_x86_feature(featureset, X86FSET_PAT);
3778 	}
3779 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3780 		add_x86_feature(featureset, X86FSET_SEP);
3781 	}
3782 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3783 		/*
3784 		 * In our implementation, fxsave/fxrstor
3785 		 * are prerequisites before we'll even
3786 		 * try and do SSE things.
3787 		 */
3788 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3789 			add_x86_feature(featureset, X86FSET_SSE);
3790 		}
3791 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3792 			add_x86_feature(featureset, X86FSET_SSE2);
3793 		}
3794 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3795 			add_x86_feature(featureset, X86FSET_SSE3);
3796 		}
3797 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3798 			add_x86_feature(featureset, X86FSET_SSSE3);
3799 		}
3800 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3801 			add_x86_feature(featureset, X86FSET_SSE4_1);
3802 		}
3803 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3804 			add_x86_feature(featureset, X86FSET_SSE4_2);
3805 		}
3806 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3807 			add_x86_feature(featureset, X86FSET_AES);
3808 		}
3809 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3810 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3811 		}
3812 
3813 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3814 			add_x86_feature(featureset, X86FSET_SHA);
3815 
3816 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3817 			add_x86_feature(featureset, X86FSET_XSAVE);
3818 
3819 			/* We only test AVX & AVX512 when there is XSAVE */
3820 
3821 			if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3822 				add_x86_feature(featureset,
3823 				    X86FSET_AVX);
3824 
3825 				/*
3826 				 * Intel says we can't check these without also
3827 				 * checking AVX.
3828 				 */
3829 				if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3830 					add_x86_feature(featureset,
3831 					    X86FSET_F16C);
3832 
3833 				if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3834 					add_x86_feature(featureset,
3835 					    X86FSET_FMA);
3836 
3837 				if (cpi->cpi_std[7].cp_ebx &
3838 				    CPUID_INTC_EBX_7_0_BMI1)
3839 					add_x86_feature(featureset,
3840 					    X86FSET_BMI1);
3841 
3842 				if (cpi->cpi_std[7].cp_ebx &
3843 				    CPUID_INTC_EBX_7_0_BMI2)
3844 					add_x86_feature(featureset,
3845 					    X86FSET_BMI2);
3846 
3847 				if (cpi->cpi_std[7].cp_ebx &
3848 				    CPUID_INTC_EBX_7_0_AVX2)
3849 					add_x86_feature(featureset,
3850 					    X86FSET_AVX2);
3851 
3852 				if (cpi->cpi_std[7].cp_ecx &
3853 				    CPUID_INTC_ECX_7_0_VAES)
3854 					add_x86_feature(featureset,
3855 					    X86FSET_VAES);
3856 
3857 				if (cpi->cpi_std[7].cp_ecx &
3858 				    CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3859 					add_x86_feature(featureset,
3860 					    X86FSET_VPCLMULQDQ);
3861 			}
3862 
3863 			if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3864 			    (cpi->cpi_std[7].cp_ebx &
3865 			    CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3866 				add_x86_feature(featureset, X86FSET_AVX512F);
3867 
3868 				if (cpi->cpi_std[7].cp_ebx &
3869 				    CPUID_INTC_EBX_7_0_AVX512DQ)
3870 					add_x86_feature(featureset,
3871 					    X86FSET_AVX512DQ);
3872 				if (cpi->cpi_std[7].cp_ebx &
3873 				    CPUID_INTC_EBX_7_0_AVX512IFMA)
3874 					add_x86_feature(featureset,
3875 					    X86FSET_AVX512FMA);
3876 				if (cpi->cpi_std[7].cp_ebx &
3877 				    CPUID_INTC_EBX_7_0_AVX512PF)
3878 					add_x86_feature(featureset,
3879 					    X86FSET_AVX512PF);
3880 				if (cpi->cpi_std[7].cp_ebx &
3881 				    CPUID_INTC_EBX_7_0_AVX512ER)
3882 					add_x86_feature(featureset,
3883 					    X86FSET_AVX512ER);
3884 				if (cpi->cpi_std[7].cp_ebx &
3885 				    CPUID_INTC_EBX_7_0_AVX512CD)
3886 					add_x86_feature(featureset,
3887 					    X86FSET_AVX512CD);
3888 				if (cpi->cpi_std[7].cp_ebx &
3889 				    CPUID_INTC_EBX_7_0_AVX512BW)
3890 					add_x86_feature(featureset,
3891 					    X86FSET_AVX512BW);
3892 				if (cpi->cpi_std[7].cp_ebx &
3893 				    CPUID_INTC_EBX_7_0_AVX512VL)
3894 					add_x86_feature(featureset,
3895 					    X86FSET_AVX512VL);
3896 
3897 				if (cpi->cpi_std[7].cp_ecx &
3898 				    CPUID_INTC_ECX_7_0_AVX512VBMI)
3899 					add_x86_feature(featureset,
3900 					    X86FSET_AVX512VBMI);
3901 				if (cpi->cpi_std[7].cp_ecx &
3902 				    CPUID_INTC_ECX_7_0_AVX512VNNI)
3903 					add_x86_feature(featureset,
3904 					    X86FSET_AVX512VNNI);
3905 				if (cpi->cpi_std[7].cp_ecx &
3906 				    CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3907 					add_x86_feature(featureset,
3908 					    X86FSET_AVX512VPOPCDQ);
3909 
3910 				if (cpi->cpi_std[7].cp_edx &
3911 				    CPUID_INTC_EDX_7_0_AVX5124NNIW)
3912 					add_x86_feature(featureset,
3913 					    X86FSET_AVX512NNIW);
3914 				if (cpi->cpi_std[7].cp_edx &
3915 				    CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3916 					add_x86_feature(featureset,
3917 					    X86FSET_AVX512FMAPS);
3918 			}
3919 		}
3920 	}
3921 
3922 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3923 		add_x86_feature(featureset, X86FSET_PCID);
3924 	}
3925 
3926 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3927 		add_x86_feature(featureset, X86FSET_X2APIC);
3928 	}
3929 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3930 		add_x86_feature(featureset, X86FSET_DE);
3931 	}
3932 #if !defined(__xpv)
3933 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3934 
3935 		/*
3936 		 * We require the CLFLUSH instruction for an erratum workaround
3937 		 * in order to use MONITOR/MWAIT.
3938 		 */
3939 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3940 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3941 			add_x86_feature(featureset, X86FSET_MWAIT);
3942 		} else {
3943 			extern int idle_cpu_assert_cflush_monitor;
3944 
3945 			/*
3946 			 * All processors we are aware of which have
3947 			 * MONITOR/MWAIT also have CLFLUSH.
3948 			 */
3949 			if (idle_cpu_assert_cflush_monitor) {
3950 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3951 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3952 			}
3953 		}
3954 	}
3955 #endif	/* __xpv */
3956 
3957 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3958 		add_x86_feature(featureset, X86FSET_VMX);
3959 	}
3960 
3961 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3962 		add_x86_feature(featureset, X86FSET_RDRAND);
3963 
3964 	/*
3965 	 * Only needed the first time; the rest of the CPUs will follow suit.
3966 	 * We only capture this for the boot CPU.
3967 	 */
3968 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3969 		add_x86_feature(featureset, X86FSET_CLFSH);
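		/*
		 * Leaf 1 %ebx[15:8] reports the CLFLUSH line size in units of
		 * 8 bytes; the common value of 8 corresponds to a 64-byte
		 * cache line.
		 */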
3970 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3971 	}
3972 	if (is_x86_feature(featureset, X86FSET_PAE))
3973 		cpi->cpi_pabits = 36;
3974 
3975 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3976 		struct cpuid_regs r, *ecp;
3977 
3978 		ecp = &r;
3979 		ecp->cp_eax = 0xD;
3980 		ecp->cp_ecx = 1;
3981 		ecp->cp_edx = ecp->cp_ebx = 0;
3982 		(void) __cpuid_insn(ecp);
3983 
3984 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3985 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
3986 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3987 			add_x86_feature(featureset, X86FSET_XSAVEC);
3988 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3989 			add_x86_feature(featureset, X86FSET_XSAVES);
3990 	}
3991 
3992 	/*
3993 	 * Work on the "extended" feature information, doing
3994 	 * some basic initialization for cpuid_pass2()
3995 	 */
3996 	xcpuid = 0;
3997 	switch (cpi->cpi_vendor) {
3998 	case X86_VENDOR_Intel:
3999 		/*
4000 		 * On KVM we know we will have proper support for extended
4001 		 * cpuid.
4002 		 */
4003 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4004 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4005 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4006 			xcpuid++;
4007 		break;
4008 	case X86_VENDOR_AMD:
4009 		if (cpi->cpi_family > 5 ||
4010 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4011 			xcpuid++;
4012 		break;
4013 	case X86_VENDOR_Cyrix:
4014 		/*
4015 		 * Only these Cyrix CPUs are -known- to support
4016 		 * extended cpuid operations.
4017 		 */
4018 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4019 		    x86_type == X86_TYPE_CYRIX_GXm)
4020 			xcpuid++;
4021 		break;
4022 	case X86_VENDOR_HYGON:
4023 	case X86_VENDOR_Centaur:
4024 	case X86_VENDOR_TM:
4025 	default:
4026 		xcpuid++;
4027 		break;
4028 	}
4029 
4030 	if (xcpuid) {
4031 		cp = &cpi->cpi_extd[0];
4032 		cp->cp_eax = CPUID_LEAF_EXT_0;
4033 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4034 	}
4035 
4036 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4037 
4038 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4039 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4040 
4041 		switch (cpi->cpi_vendor) {
4042 		case X86_VENDOR_Intel:
4043 		case X86_VENDOR_AMD:
4044 		case X86_VENDOR_HYGON:
4045 			if (cpi->cpi_xmaxeax < 0x80000001)
4046 				break;
4047 			cp = &cpi->cpi_extd[1];
4048 			cp->cp_eax = 0x80000001;
4049 			(void) __cpuid_insn(cp);
4050 
4051 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4052 			    cpi->cpi_family == 5 &&
4053 			    cpi->cpi_model == 6 &&
4054 			    cpi->cpi_step == 6) {
4055 				/*
4056 				 * K6 model 6 uses bit 10 to indicate SYSC
4057 				 * Later models use bit 11. Fix it here.
4058 				 */
4059 				if (cp->cp_edx & 0x400) {
4060 					cp->cp_edx &= ~0x400;
4061 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4062 				}
4063 			}
4064 
4065 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4066 
4067 			/*
4068 			 * Compute the additions to the kernel's feature word.
4069 			 */
4070 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4071 				add_x86_feature(featureset, X86FSET_NX);
4072 			}
4073 
4074 			/*
4075 			 * Regardless of whether or not we boot 64-bit,
4076 			 * we should have a way to identify whether
4077 			 * the CPU is capable of running 64-bit.
4078 			 */
4079 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4080 				add_x86_feature(featureset, X86FSET_64);
4081 			}
4082 
4083 			/* 1 GB large page - enable only for 64 bit kernel */
4084 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4085 				add_x86_feature(featureset, X86FSET_1GPG);
4086 			}
4087 
4088 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4089 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4090 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4091 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4092 				add_x86_feature(featureset, X86FSET_SSE4A);
4093 			}
4094 
4095 			/*
4096 			 * It's really tricky to support syscall/sysret in
4097 			 * the i386 kernel; we rely on sysenter/sysexit
4098 			 * instead.  In the amd64 kernel, things are -way-
4099 			 * better.
4100 			 */
4101 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4102 				add_x86_feature(featureset, X86FSET_ASYSC);
4103 			}
4104 
4105 			/*
4106 			 * While we're thinking about system calls, note
4107 			 * that AMD processors don't support sysenter
4108 			 * in long mode at all, so don't try to program them.
4109 			 */
4110 			if (x86_vendor == X86_VENDOR_AMD ||
4111 			    x86_vendor == X86_VENDOR_HYGON) {
4112 				remove_x86_feature(featureset, X86FSET_SEP);
4113 			}
4114 
4115 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4116 				add_x86_feature(featureset, X86FSET_TSCP);
4117 			}
4118 
4119 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4120 				add_x86_feature(featureset, X86FSET_SVM);
4121 			}
4122 
4123 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4124 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4125 			}
4126 
4127 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4128 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4129 			}
4130 
4131 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4132 				add_x86_feature(featureset, X86FSET_XOP);
4133 			}
4134 
4135 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4136 				add_x86_feature(featureset, X86FSET_FMA4);
4137 			}
4138 
4139 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4140 				add_x86_feature(featureset, X86FSET_TBM);
4141 			}
4142 
4143 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4144 				add_x86_feature(featureset, X86FSET_MONITORX);
4145 			}
4146 			break;
4147 		default:
4148 			break;
4149 		}
4150 
4151 		/*
4152 		 * Get CPUID data about processor cores and hyperthreads.
4153 		 */
4154 		switch (cpi->cpi_vendor) {
4155 		case X86_VENDOR_Intel:
4156 			if (cpi->cpi_maxeax >= 4) {
4157 				cp = &cpi->cpi_std[4];
4158 				cp->cp_eax = 4;
4159 				cp->cp_ecx = 0;
4160 				(void) __cpuid_insn(cp);
4161 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4162 			}
4163 			/*FALLTHROUGH*/
4164 		case X86_VENDOR_AMD:
4165 		case X86_VENDOR_HYGON:
4166 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4167 				break;
4168 			cp = &cpi->cpi_extd[8];
4169 			cp->cp_eax = CPUID_LEAF_EXT_8;
4170 			(void) __cpuid_insn(cp);
4171 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4172 			    cp);
4173 
4174 			/*
4175 			 * AMD uses %ebx for some extended functions.
4176 			 */
4177 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4178 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4179 				/*
4180 				 * While we're here, check for the AMD "Error
4181 				 * Pointer Zero/Restore" feature. This can be
4182 				 * used to set up the FP save handlers
4183 				 * appropriately.
4184 				 */
4185 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4186 					cpi->cpi_fp_amd_save = 0;
4187 				} else {
4188 					cpi->cpi_fp_amd_save = 1;
4189 				}
4190 
4191 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4192 					add_x86_feature(featureset,
4193 					    X86FSET_CLZERO);
4194 				}
4195 			}
4196 
4197 			/*
4198 			 * Virtual and physical address limits from
4199 			 * cpuid override previously guessed values.
4200 			 */
4201 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4202 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
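			/*
			 * (Leaf 0x80000008 %eax encodes the physical address
			 * width in bits 7:0 and the linear address width in
			 * bits 15:8; e.g. a hypothetical cp_eax of 0x3030
			 * decodes to 48 physical and 48 virtual bits.)
			 */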
4203 			break;
4204 		default:
4205 			break;
4206 		}
4207 
4208 		/*
4209 		 * Get CPUID data about TSC Invariance in Deep C-State.
4210 		 */
4211 		switch (cpi->cpi_vendor) {
4212 		case X86_VENDOR_Intel:
4213 		case X86_VENDOR_AMD:
4214 		case X86_VENDOR_HYGON:
4215 			if (cpi->cpi_maxeax >= 7) {
4216 				cp = &cpi->cpi_extd[7];
4217 				cp->cp_eax = 0x80000007;
4218 				cp->cp_ecx = 0;
4219 				(void) __cpuid_insn(cp);
4220 			}
4221 			break;
4222 		default:
4223 			break;
4224 		}
4225 	}
4226 
4227 	/*
4228 	 * cpuid_pass1_ppin assumes that cpuid_pass1_topology has already been
4229 	 * run and thus gathered some of its dependent leaves.
4230 	 */
4231 	cpuid_pass1_topology(cpu, featureset);
4232 	cpuid_pass1_thermal(cpu, featureset);
4233 #if !defined(__xpv)
4234 	cpuid_pass1_ppin(cpu, featureset);
4235 #endif
4236 
4237 	/*
4238 	 * Synthesize chip "revision" and socket type
4239 	 */
4240 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4241 	    cpi->cpi_model, cpi->cpi_step);
4242 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4243 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4244 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4245 	    cpi->cpi_model, cpi->cpi_step);
4246 
4247 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4248 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4249 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4250 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4251 			/* Special handling for AMD FP not necessary. */
4252 			cpi->cpi_fp_amd_save = 0;
4253 		} else {
4254 			cpi->cpi_fp_amd_save = 1;
4255 		}
4256 	}
4257 
4258 	/*
4259 	 * Check (and potentially set) if lfence is serializing.
4260 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4261 	 */
4262 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4263 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4264 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4265 		/*
4266 		 * The AMD white paper Software Techniques For Managing
4267 		 * Speculation on AMD Processors details the circumstances under
4268 		 * which lfence instructions are serializing.
4269 		 *
4270 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4271 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4272 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4273 		 * committed to supporting that MSR on all later CPUs.
4274 		 */
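		/*
		 * (Per that white paper, the control in question is bit 1 of
		 * MSR C001_1029; the code below refers to it only through the
		 * MSR_AMD_DE_CFG and AMD_DE_CFG_LFENCE_DISPATCH definitions.)
		 */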
4275 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4276 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4277 		} else if (cpi->cpi_family >= 0x10) {
4278 #if !defined(__xpv)
4279 			uint64_t val;
4280 
4281 			/*
4282 			 * Be careful when attempting to enable the bit, and
4283 			 * verify that it was actually set in case we are
4284 			 * running in a hypervisor which is less than faithful
4285 			 * about its emulation of this feature.
4286 			 */
4287 			on_trap_data_t otd;
4288 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4289 				val = rdmsr(MSR_AMD_DE_CFG);
4290 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4291 				wrmsr(MSR_AMD_DE_CFG, val);
4292 				val = rdmsr(MSR_AMD_DE_CFG);
4293 			} else {
4294 				val = 0;
4295 			}
4296 			no_trap();
4297 
4298 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4299 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4300 			}
4301 #endif
4302 		}
4303 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4304 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4305 		/*
4306 		 * Documentation and other OSes indicate that lfence is always
4307 		 * serializing on Intel CPUs.
4308 		 */
4309 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4310 	}
4311 
4312 
4313 	/*
4314 	 * Check the processor leaves that are used for security features.
4315 	 */
4316 	cpuid_scan_security(cpu, featureset);
4317 
4318 pass1_done:
4319 	cpi->cpi_pass = 1;
4320 }
4321 
4322 /*
4323  * Make copies of the cpuid table entries we depend on, in
4324  * part for ease of parsing now, in part so that we have only
4325  * one place to correct any of it, in part for ease of
4326  * later export to userland, and in part so we can look at
4327  * this stuff in a crash dump.
4328  */
4329 
4330 /*ARGSUSED*/
4331 void
4332 cpuid_pass2(cpu_t *cpu)
4333 {
4334 	uint_t n, nmax;
4335 	int i;
4336 	struct cpuid_regs *cp;
4337 	uint8_t *dp;
4338 	uint32_t *iptr;
4339 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4340 
4341 	ASSERT(cpi->cpi_pass == 1);
4342 
4343 	if (cpi->cpi_maxeax < 1)
4344 		goto pass2_done;
4345 
4346 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4347 		nmax = NMAX_CPI_STD;
4348 	/*
4349 	 * (We already handled n == 0 and n == 1 in pass 1)
4350 	 */
4351 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4352 		/*
4353 		 * leaves 6 and 7 were handled in pass 1
4354 		 */
4355 		if (n == 6 || n == 7)
4356 			continue;
4357 
4358 		cp->cp_eax = n;
4359 
4360 		/*
4361 		 * CPUID function 4 expects %ecx to be initialized
4362 		 * with an index which indicates which cache to return
4363 		 * information about. The OS is expected to call function 4
4364 		 * with %ecx set to 0, 1, 2, ... until it returns with
4365 		 * EAX[4:0] set to 0, which indicates there are no more
4366 		 * caches.
4367 		 *
4368 		 * Here, populate cpi_std[4] with the information returned by
4369 		 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
4370 		 * when dynamic memory allocation becomes available.
4371 		 *
4372 		 * Note: we need to explicitly initialize %ecx here, since
4373 		 * function 4 may have been previously invoked.
4374 		 */
4375 		if (n == 4)
4376 			cp->cp_ecx = 0;
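		/*
		 * An illustrative sketch (names for exposition only) of the
		 * complete sub-leaf walk that cpuid_pass3() performs later,
		 * once memory can be allocated for the results:
		 *
		 *	for (idx = 0; ; idx++) {
		 *		regs.cp_eax = 4;
		 *		regs.cp_ecx = idx;
		 *		(void) __cpuid_insn(&regs);
		 *		if (BITX(regs.cp_eax, 4, 0) == 0)
		 *			break;
		 *		record this cache's level, type and size;
		 *	}
		 */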
4377 
4378 		(void) __cpuid_insn(cp);
4379 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4380 		switch (n) {
4381 		case 2:
4382 			/*
4383 			 * "the lower 8 bits of the %eax register
4384 			 * contain a value that identifies the number
4385 			 * of times the cpuid [instruction] has to be
4386 			 * executed to obtain a complete image of the
4387 			 * processor's caching systems."
4388 			 *
4389 			 * How *do* they make this stuff up?
4390 			 */
4391 			cpi->cpi_ncache = sizeof (*cp) *
4392 			    BITX(cp->cp_eax, 7, 0);
4393 			if (cpi->cpi_ncache == 0)
4394 				break;
4395 			cpi->cpi_ncache--;	/* skip count byte */
4396 
4397 			/*
4398 			 * Well, for now, rather than attempt to implement
4399 			 * this slightly dubious algorithm, we just look
4400 			 * at the first 15 descriptor bytes.
4401 			 */
4402 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4403 				cpi->cpi_ncache = sizeof (*cp) - 1;
4404 
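			/*
			 * Each register whose bit 31 is clear holds up to four
			 * one-byte cache/TLB descriptors; byte 0 of %eax is
			 * the iteration count consumed above, not a
			 * descriptor, and zero bytes are skipped.  As a
			 * hypothetical example, an %ebx of 0x00605b66 would
			 * contribute the descriptors 0x66, 0x5b and 0x60.
			 */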
4405 			dp = cpi->cpi_cacheinfo;
4406 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4407 				uint8_t *p = (void *)&cp->cp_eax;
4408 				for (i = 1; i < 4; i++)
4409 					if (p[i] != 0)
4410 						*dp++ = p[i];
4411 			}
4412 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4413 				uint8_t *p = (void *)&cp->cp_ebx;
4414 				for (i = 0; i < 4; i++)
4415 					if (p[i] != 0)
4416 						*dp++ = p[i];
4417 			}
4418 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4419 				uint8_t *p = (void *)&cp->cp_ecx;
4420 				for (i = 0; i < 4; i++)
4421 					if (p[i] != 0)
4422 						*dp++ = p[i];
4423 			}
4424 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4425 				uint8_t *p = (void *)&cp->cp_edx;
4426 				for (i = 0; i < 4; i++)
4427 					if (p[i] != 0)
4428 						*dp++ = p[i];
4429 			}
4430 			break;
4431 
4432 		case 3:	/* Processor serial number, if PSN supported */
4433 			break;
4434 
4435 		case 4:	/* Deterministic cache parameters */
4436 			break;
4437 
4438 		case 5:	/* Monitor/Mwait parameters */
4439 		{
4440 			size_t mwait_size;
4441 
4442 			/*
4443 			 * check cpi_mwait.support which was set in cpuid_pass1
4444 			 */
4445 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4446 				break;
4447 
4448 			/*
4449 			 * Protect ourselves from an insane mwait line size.
4450 			 * Workaround for incomplete hardware emulator(s).
4451 			 */
4452 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4453 			if (mwait_size < sizeof (uint32_t) ||
4454 			    !ISP2(mwait_size)) {
4455 #if DEBUG
4456 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4457 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4458 #endif
4459 				break;
4460 			}
4461 
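			/*
			 * Architecturally, leaf 5 reports the smallest
			 * monitor-line size in %eax[15:0] and the largest in
			 * %ebx[15:0], both in bytes.
			 */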
4462 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4463 			cpi->cpi_mwait.mon_max = mwait_size;
4464 			if (MWAIT_EXTENSION(cpi)) {
4465 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4466 				if (MWAIT_INT_ENABLE(cpi))
4467 					cpi->cpi_mwait.support |=
4468 					    MWAIT_ECX_INT_ENABLE;
4469 			}
4470 			break;
4471 		}
4472 		default:
4473 			break;
4474 		}
4475 	}
4476 
4477 	/*
4478 	 * XSAVE enumeration
4479 	 */
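	/*
	 * Leaf 0xD, sub-leaf 0, reports the supported XCR0 feature bits in
	 * %eax (low 32) and %edx (high 32), the save area size required for
	 * the currently enabled features in %ebx, and the maximum size any
	 * supported feature combination can require in %ecx.
	 */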
4480 	if (cpi->cpi_maxeax >= 0xD) {
4481 		struct cpuid_regs regs;
4482 		boolean_t cpuid_d_valid = B_TRUE;
4483 
4484 		cp = &regs;
4485 		cp->cp_eax = 0xD;
4486 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4487 
4488 		(void) __cpuid_insn(cp);
4489 
4490 		/*
4491 		 * Sanity checks for debug
4492 		 */
4493 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4494 		    (cp->