xref: /illumos-gate/usr/src/uts/i86pc/os/cpuid.c (revision beed421e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2020 Oxide Computer Company
28  */
29 /*
30  * Copyright (c) 2010, Intel Corporation.
31  * All rights reserved.
32  */
33 /*
34  * Portions Copyright 2009 Advanced Micro Devices, Inc.
35  */
36 
37 /*
38  * CPU Identification logic
39  *
40  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
41  * with the identification of CPUs, their features, and their topologies. More
42  * specifically, this file helps drive the following:
43  *
44  * 1. Enumeration of features of the processor which are used by the kernel to
45  *    determine what features to enable or disable. These may be instruction set
46  *    enhancements or features that we use.
47  *
48  * 2. Enumeration of instruction set architecture (ISA) additions that userland
49  *    will be told about through the auxiliary vector.
50  *
51  * 3. Understanding the physical topology of the CPU such as the number of
52  *    caches, how many cores it has, whether or not it supports simultaneous
53  *    multi-threading (SMT), etc.
54  *
55  * ------------------------
56  * CPUID History and Basics
57  * ------------------------
58  *
59  * The cpuid instruction was added by Intel roughly around the time that the
60  * original Pentium was introduced. The purpose of cpuid was to provide, in a
61  * programmatic fashion, information about the CPU that previously had to be
62  * guessed at. For example, an important part of cpuid is that we can know what
63  * extensions to the ISA exist. If you use an invalid opcode you would get a
64  * #UD, so this method allows a program (whether a user program or the kernel)
65  * to determine what exists without crashing or getting a SIGILL. Of course,
66  * this was also during the era of the clones and the AMD Am5x86. The vendor
67  * name shows up first in cpuid for a reason.
68  *
69  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
70  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
71  * its own meaning. The different leaves are broken down into different regions:
72  *
73  *	[ 0, 7fffffff ]			This region is called the 'basic'
74  *					region. This region is generally defined
75  *					by Intel, though some of the original
76  *					portions have different meanings based
77  *					on the manufacturer. These days, Intel
78  *					adds most new features to this region.
79  *					AMD adds non-Intel compatible
80  *					information in the third, extended
81  *					region. Intel uses this for everything
82  *					including ISA extensions, CPU
83  *					features, cache information, topology,
84  *					and more.
85  *
86  *					There is a hole carved out of this
87  *					region which is reserved for
88  *					hypervisors.
89  *
90  *	[ 40000000, 4fffffff ]		This region, which is found in the
91  *					middle of the previous region, is
92  *					explicitly promised to never be used by
93  *					CPUs. Instead, it is used by hypervisors
94  *					to communicate information about
95  *					themselves to the operating system. The
96  *					values and details are unique for each
97  *					hypervisor.
98  *
99  *	[ 80000000, ffffffff ]		This region is called the 'extended'
100  *					region. Some of the low leaves mirror
101  *					parts of the basic leaves. This region
102  *					has generally been used by AMD for
103  *					various extensions. For example, AMD-
104  *					specific information about caches,
105  *					features, and topology is found in this
106  *					region.
107  *
108  * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
109  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
110  * the ranges, one of the primary things returned is the maximum valid leaf in
111  * that range. This allows for discovery of what range of CPUID is valid.
112  *
113  * The CPUs have potentially surprising behavior when using an invalid leaf or
114  * unimplemented leaf. If the requested leaf is within the valid basic or
115  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
116  * set to zero. However, if you specify a leaf that is outside of a valid range,
117  * then instead it will be filled with the last valid _basic_ leaf. For example,
118  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
119  * an invalid extended leaf will return the information for leaf 3.
120  *
121  * Some leaves are broken down into sub-leaves. This means that the value
122  * depends on both the leaf asked for in %eax and a secondary register. For
123  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
124  * additional information. Or when getting topology information in leaf 0xb, the
125  * initial value in %ecx changes which level of the topology that you are
126  * getting information about.
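 *
 * As a purely illustrative aid (this file has its own accessors and does not
 * use this helper), a minimal sketch of issuing the instruction from C with a
 * compiler that accepts GNU-style inline assembly might look like the
 * following; the name cpuid_raw() is made up for the examples in this comment:
 *
 *	static inline void
 *	cpuid_raw(uint32_t leaf, uint32_t subleaf, uint32_t regs[4])
 *	{
 *		__asm__ __volatile__("cpuid"
 *		    : "=a" (regs[0]), "=b" (regs[1]),
 *		      "=c" (regs[2]), "=d" (regs[3])
 *		    : "a" (leaf), "c" (subleaf));
 *	}
 *
 *	static uint32_t
 *	cpuid_max_basic_leaf(void)
 *	{
 *		uint32_t regs[4];
 *
 *		cpuid_raw(0, 0, regs);
 *		return (regs[0]);
 *	}
 *
 * After such a call to leaf 0, %eax holds the maximum valid basic leaf;
 * the same trick with leaf 0x80000000 yields the maximum valid extended leaf.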
127  *
128  * cpuid values are always kept to 32 bits regardless of whether or not the
129  * program is in 64-bit mode. When executing in 64-bit mode, the upper
130  * 32 bits of the register are always set to zero so that the values are the
131  * same regardless of execution mode.
132  *
133  * ----------------------
134  * Identifying Processors
135  * ----------------------
136  *
137  * We can identify a processor in two steps. The first step looks at cpuid leaf
138  * 0. Leaf 0 contains the processor's vendor information. This is done by
139  * putting a 12 character string in %ebx, %edx, and %ecx (in that order). On
140  * AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
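 *
 * As a hedged sketch (reusing the hypothetical cpuid_raw() helper from above),
 * assembling the vendor string might look like:
 *
 *	char vendor[13];
 *	uint32_t regs[4];
 *
 *	cpuid_raw(0, 0, regs);
 *	bcopy(&regs[1], vendor, 4);
 *	bcopy(&regs[3], vendor + 4, 4);
 *	bcopy(&regs[2], vendor + 8, 4);
 *	vendor[12] = '\0';
 *
 * For 'GenuineIntel', %ebx holds "Genu", %edx holds "ineI", and %ecx holds
 * "ntel".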
141  *
142  * From there, a processor is identified by a combination of three different
143  * values:
144  *
145  *  1. Family
146  *  2. Model
147  *  3. Stepping
148  *
149  * Each vendor uses the family and model to uniquely identify a processor. The
150  * way that family and model are changed depends on the vendor. For example,
151  * Intel has been using family 0x6 for almost all of their processors since the
152  * Pentium Pro/Pentium II era, often called the P6. The model is used to
153  * identify the exact processor. Different models are often used for the client
154  * (consumer) and server parts. Even though each processor often has major
155  * architectural differences, they still are considered the same family by
156  * Intel.
157  *
158  * On the other hand, each major AMD architecture generally has its own family.
159  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within a
160  * family, the model number is used to help identify specific processors.
161  *
162  * The stepping is used to refer to a revision of a specific microprocessor. The
163  * term comes from equipment used to produce masks that are used to create
164  * integrated circuits.
165  *
166  * The information is present in leaf 1, %eax. In technical documentation you
167  * will see the terms extended model and extended family. The original family,
168  * model, and stepping fields were each 4 bits wide. If the family field is
169  * 0xf, one must also consult the extended family and extended model fields,
170  * which take previously reserved bits: the extended family is added to the
171  * base family, and the extended model occupies the bits above the base model.
172  *
173  * When we process this information, we store the full family, model, and
174  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
175  * cpi_step, respectively. Whenever you are performing comparisons with the
176  * family, model, and stepping, you should use these members and not the raw
177  * values from cpuid. If you must use the raw values from cpuid directly, you
178  * must make sure that you add the extended model and family to the base model
179  * and family.
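 *
 * As a hedged sketch of the common derivation from leaf 1 (the exact rules
 * differ slightly by vendor; for example, Intel also applies the extended
 * model when the base family is 0x6, while AMD only does so for 0xf):
 *
 *	uint32_t regs[4], eax, family, model, stepping;
 *
 *	cpuid_raw(1, 0, regs);
 *	eax = regs[0];
 *	stepping = eax & 0xf;
 *	model = (eax >> 4) & 0xf;
 *	family = (eax >> 8) & 0xf;
 *	if (family == 0xf) {
 *		family += (eax >> 20) & 0xff;
 *		model += ((eax >> 16) & 0xf) << 4;
 *	}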
180  *
181  * In general, we do not use information about the family, model, and stepping
182  * to determine whether or not a feature is present; that is generally driven by
183  * specific leaves. However, when something we care about on the processor is
184  * not considered 'architectural', meaning that it is specific to a set of
185  * processors and not promised in the architecture model to be consistent from
186  * generation to generation, then we will fall back on this information. The
187  * most common cases where this comes up are when we have to work around errata
188  * in the processor, are dealing with processor-specific features such as CPU
189  * performance counters, or we want to provide additional information for things
190  * such as fault management.
191  *
192  * While processors also do have a brand string, which is the name that people
193  * are familiar with when buying the processor, it is not meant for
194  * programmatic consumption. That is what the family, model, and stepping are
195  * for.
196  *
197  * ------------
198  * CPUID Passes
199  * ------------
200  *
201  * As part of performing feature detection, we break this into several different
202  * passes. The passes are as follows:
203  *
204  *	Pass 0		This is a primordial pass done in locore.s to deal with
205  *			Cyrix CPUs that don't support cpuid. The reality is that
206  *			we likely don't run on them any more, but there is still
207  *			logic for handling them.
208  *
209  *	Pass 1		This is the primary pass and is responsible for doing a
210  *			large number of different things:
211  *
212  *			1. Determining which vendor manufactured the CPU and
213  *			the family, model, and stepping information.
214  *
215  *			2. Gathering a large number of feature flags to
216  *			determine which features the CPU supports and which
217  *			require additional work in the OS to enable. Features
218  *			detected this way are added to the x86_featureset,
219  *			which can be queried to determine what we should do
220  *			(a brief sketch of this follows the list of passes).
221  *			This includes processing all of the basic and extended
222  *			CPU features that we care about.
223  *
224  *			3. Determining the CPU's topology. This includes
225  *			information about how many cores and threads are present
226  *			in the package. It also is responsible for figuring out
227  *			which logical CPUs are potentially part of the same core
228  *			and what other resources they might share. For more
229  *			information see the 'Topology' section.
230  *
231  *			4. Determining the set of CPU security-specific features
232  *			that we need to worry about and determining the
233  *			appropriate set of workarounds.
234  *
235  *			Pass 1 on the boot CPU occurs before KMDB is started.
236  *
237  *	Pass 2		The second pass is done after startup(). Here, we check
238  *			other miscellaneous features. Most of this is gathering
239  *			additional basic and extended features that we'll use in
240  *			later passes or for debugging support.
241  *
242  *	Pass 3		The third pass occurs after the kernel memory allocator
243  *			has been fully initialized. This gathers information
244  *			where we might need dynamic memory available for our
245  *			uses. This includes several varying width leaves that
246  *			have cache information and the processor's brand string.
247  *
248  *	Pass 4		The fourth and final normal pass is performed after the
249  *			kernel has brought almost everything online. This is
250  *			invoked from post_startup(). In this pass, we go through
251  *			the set of features that we have enabled and turn that
252  *			into the hardware auxiliary vector features that
253  *			userland receives. This is used by userland, primarily
254  *			by the run-time link-editor (RTLD), though userland
255  *			software could also refer to it directly.
256  *
257  *	Microcode	After a microcode update, we do a selective rescan of
258  *			the cpuid leaves to determine what features have
259  *			changed. Microcode updates can provide more details
260  *			about security related features to deal with issues like
261  *			Spectre and L1TF. On occasion, vendors have violated
262  *			their contract and removed bits. However, we don't try
263  *			to detect that because that puts us in a situation that
264  *			we really can't deal with. As such, the only things we
265  *			rescan today are security-related features. See
266  *			cpuid_pass_ucode().
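 *
 * As a hedged illustration of what the feature gathering in pass 1 boils down
 * to (the real code caches the leaves and records its results in the
 * x86_featureset; the bit positions below are the architecturally documented
 * ones for SSE2, AVX, and AVX2):
 *
 *	uint32_t regs[4];
 *	boolean_t has_sse2, has_avx, has_avx2;
 *
 *	cpuid_raw(1, 0, regs);
 *	has_sse2 = (regs[3] & (1U << 26)) != 0;
 *	has_avx = (regs[2] & (1U << 28)) != 0;
 *
 *	cpuid_raw(7, 0, regs);
 *	has_avx2 = (regs[1] & (1U << 5)) != 0;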
267  *
268  * All of the passes (except pass 0) are run on all CPUs. However, for the most
269  * part we only care about what the boot CPU says about this information and use
270  * the other CPUs as a rough guide to sanity check that we have the same feature
271  * set.
272  *
273  * We do not support running multiple logical CPUs with different, let alone
274  * disjoint, feature sets.
275  *
276  * ------------------
277  * Processor Topology
278  * ------------------
279  *
280  * One of the important things that we need to do is to understand the topology
281  * of the underlying processor. When we say topology in this case, we're trying
282  * to understand the relationship between the logical CPUs that the operating
283  * system sees and the underlying physical layout. Different logical CPUs may
284  * share different resources which can have important consequences for the
285  * performance of the system. For example, they may share caches, execution
286  * units, and more.
287  *
288  * The topology of the processor changes from generation to generation and
289  * vendor to vendor.  Along with that, different vendors use different
290  * terminology, and the operating system itself uses occasionally overlapping
291  * terminology. It's important to understand what this topology looks like so
292  * one can understand the different things that we try to calculate and
293  * determine.
294  *
295  * To get started, let's talk about a little bit of terminology that we've used
296  * so far, is used throughout this file, and is fairly generic across multiple
297  * vendors:
298  *
299  * CPU
300  *	A central processing unit (CPU) refers to a logical and/or virtual
301  *	entity that the operating system can execute instructions on. The
302  *	underlying resources for this CPU may be shared between multiple
303  *	entities; however, to the operating system it is a discrete unit.
304  *
305  * PROCESSOR and PACKAGE
306  *
307  *	Generally, when we use the term 'processor' on its own, we are referring
308  *	to the physical entity that one buys and plugs into a board. However,
309  *	because processor has been overloaded and one might see it used to mean
310  *	multiple different levels, we will instead use the term 'package' for
311  *	the rest of this file. The term package comes from the electrical
312  *	engineering side and refers to the physical entity that encloses the
313  *	electronics inside. Strictly speaking the package can contain more than
314  *	just the CPU, for example, on many processors it may also have what's
315  *	called an 'integrated graphical processing unit (GPU)'. Because the
316  *	package can encapsulate multiple units, it is the largest physical unit
317  *	that we refer to.
318  *
319  * SOCKET
320  *
321  *	A socket refers to a unit on a system board (generally the motherboard)
322  *	that can receive a package. A single package, or processor, is plugged
323  *	into a single socket. A system may have multiple sockets. Often times,
324  *	the term socket is used interchangeably with package and refers to the
325  *	electrical component that is plugged in, and not the receptacle itself.
326  *
327  * CORE
328  *
329  *	A core refers to the physical instantiation of a CPU, generally, with a
330  *	full set of hardware resources available to it. A package may contain
331  *	multiple cores inside of it or it may just have a single one. A
332  *	processor with more than one core is often referred to as 'multi-core'.
333  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
334  *	that has 'multi-core' processors.
335  *
336  *	A core may expose a single logical CPU to the operating system, or it
337  *	may expose multiple CPUs, which we call threads, defined below.
338  *
339  *	Some resources may still be shared by cores in the same package. For
340  *	example, many processors will share the level 3 cache between cores.
341  *	Some AMD generations share hardware resources between cores. For more
342  *	information on that see the section 'AMD Topology'.
343  *
344  * THREAD and STRAND
345  *
346  *	In this file, generally a thread refers to a hardware resource and not
347  *	the operating system's logical abstraction. A thread is always exposed
348  *	as an independent logical CPU to the operating system. A thread belongs
349  *	to a specific core. A core may have more than one thread. When that is
350  *	the case, the threads that are part of the same core are often referred
351  *	to as 'siblings'.
352  *
353  *	When multiple threads exist, this is generally referred to as
354  *	simultaneous multi-threading (SMT). When Intel introduced this in their
355  *	processors they called it hyper-threading (HT). When multiple threads
356  *	are active in a core, they split the resources of the core. For example,
357  *	two threads may share the same set of hardware execution units.
358  *
359  *	The operating system often uses the term 'strand' to refer to a thread.
360  *	This helps disambiguate it from the software concept.
361  *
362  * CHIP
363  *
364  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
365  *	base meaning, it is used to refer to a single integrated circuit, which
366  *	may or may not be the only thing in the package. In illumos, when you
367  *	see the term 'chip' it is almost always referring to the same thing as
368  *	the 'package'. However, many vendors may use chip to refer to one of
369  *	many integrated circuits that have been placed in the package. As an
370  *	example, see the subsequent definition.
371  *
372  *	To try and keep things consistent, we will only use chip when referring
373  *	to the entire integrated circuit package, with the exception of the
374  *	definition of multi-chip module (because it is in the name) and use the
375  *	term 'die' when we want the more general, potential sub-component
376  *	definition.
377  *
378  * DIE
379  *
380  *	A die refers to an integrated circuit. Inside of the package there may
381  *	be a single die or multiple dies. This is sometimes called a 'chip' in
382  *	vendor's parlance, but in this file, we use the term die to refer to a
383  *	subcomponent.
384  *
385  * MULTI-CHIP MODULE
386  *
387  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
388  *	are connected together in the same package. When a multi-chip design is
389  *	used, generally each chip is manufactured independently and then joined
390  *	together in the package. For example, on AMD's Zen microarchitecture
391  *	(family 0x17), the package contains several dies (the second meaning of
392  *	chip from above) that are connected together.
393  *
394  * CACHE
395  *
396  *	A cache is a part of the processor that maintains copies of recently
397  *	accessed memory. Caches are split into levels and then into types.
398  *	Commonly there are one to three levels, called level one, two, and
399  *	three. The lower the level, the smaller it is, the closer it is to the
400  *	execution units of the CPU, and the faster it is to access. The layout
401  *	and design of the cache come in many different flavors, consult other
402  *	resources for a discussion of those.
403  *
404  *	Caches are generally split into two types, the instruction and data
405  *	cache. The caches contain what their names suggest, the instruction
406  *	cache has executable program text, while the data cache has all other
407  *	memory that the processor accesses. As of this writing, data is kept
408  *	coherent between all of the caches on x86, so if one modifies program
409  *	text before it is executed, that will be in the data cache, and the
410  *	instruction cache will be synchronized with that change when the
411  *	processor actually executes those instructions. This coherency also
412  *	covers the fact that data could show up in multiple caches.
413  *
414  *	Generally, the lowest level caches are specific to a core. However, the
415  *	last level cache is shared between some number of cores. The number of
416  *	CPUs sharing this last level cache is important. This has implications
417  *	for the choices that the scheduler makes, as accessing memory that might
418  *	be in a remote cache after thread migration can be quite expensive.
419  *
420  *	Sometimes, the word cache is abbreviated with a '$', because in US
421  *	English the word cache is pronounced the same as cash. So L1D$ refers to
422  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
423  *	in the rest of this theory statement for clarity.
424  *
425  * MEMORY CONTROLLER
426  *
427  *	The memory controller is a component that provides access to DRAM. Each
428  *	memory controller can access a set number of DRAM channels. Each channel
429  *	can have a number of DIMMs (sticks of memory) associated with it. A
430  *	given package may have more than one memory controller. The association
431  *	of the memory controller to a group of cores is important as it is
432  *	cheaper to access memory on the controller that you are associated with.
433  *
434  * NUMA
435  *
436  *	NUMA or non-uniform memory access, describes a way that systems are
437  *	built. On x86, any processor core can address all of the memory in the
438  *	system. However, when using multiple sockets or possibly within a
439  *	multi-chip module, some of that memory is physically closer and some of
440  *	it is further. Memory that is further away is more expensive to access.
441  *	Consider the following image of multiple sockets with memory:
442  *
443  *	+--------+                                                +--------+
444  *	| DIMM A |         +----------+      +----------+         | DIMM D |
445  *	+--------+-+       |          |      |          |       +-+------+-+
446  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
447  *	  +--------+-+     |          |      |          |     +-+------+-+
448  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
449  *	    +--------+                                        +--------+
450  *
451  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
452  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
453  *	access DIMMs A-C and more expensive to access D-F as it has to go
454  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
455  *	D-F are cheaper than A-C. While the socket form is the most common, when
456  *	using multi-chip modules, this can also sometimes occur. For another
457  *	example of this that's more involved, see the AMD topology section.
458  *
459  *
460  * Intel Topology
461  * --------------
462  *
463  * Most Intel processors since Nehalem (as of this writing the current gen
464  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
465  * the package is a single monolithic die. MCMs currently aren't used. Most
466  * parts have three levels of caches, with the L3 cache being shared between
467  * all of the cores on the package. The L1/L2 cache is generally specific to
468  * an individual core. The following image shows at a simplified level what
469  * this looks like. The memory controller is commonly part of something called
470  * the 'Uncore', which used to be separate physical chips that were not a part of
471  * the package, but are now part of the same chip.
472  *
473  *  +-----------------------------------------------------------------------+
474  *  | Package                                                               |
475  *  |  +-------------------+  +-------------------+  +-------------------+  |
476  *  |  | Core              |  | Core              |  | Core              |  |
477  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
478  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
479  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
480  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
481  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
482  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
483  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
484  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
485  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
486  *  |  +-------------------+  +-------------------+  +-------------------+  |
487  *  | +-------------------------------------------------------------------+ |
488  *  | |                         Shared L3 Cache                           | |
489  *  | +-------------------------------------------------------------------+ |
490  *  | +-------------------------------------------------------------------+ |
491  *  | |                        Memory Controller                          | |
492  *  | +-------------------------------------------------------------------+ |
493  *  +-----------------------------------------------------------------------+
494  *
495  * A side effect of this current architecture is that what we care about from a
496  * scheduling and topology perspective is simplified. In general we care about
497  * understanding which logical CPUs are part of the same core and socket.
498  *
499  * To determine the relationship between threads and cores, Intel initially used
500  * the identifier in the advanced programmable interrupt controller (APIC). They
501  * also added cpuid leaf 4 to give additional information about the number of
502  * threads and CPUs in the processor. With the addition of x2apic (which
503  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
504  * additional cpuid topology leaf 0xB was added.
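 *
 * As a hedged sketch (not the code used in this file) of walking leaf 0xB to
 * recover the shift widths that relate the x2APIC ID to threads and cores:
 *
 *	uint32_t regs[4], level, type, smt_shift = 0, core_shift = 0;
 *
 *	for (level = 0; ; level++) {
 *		cpuid_raw(0xb, level, regs);
 *		type = (regs[2] >> 8) & 0xff;
 *		if (type == 0)
 *			break;
 *		if (type == 1)
 *			smt_shift = regs[0] & 0x1f;
 *		else if (type == 2)
 *			core_shift = regs[0] & 0x1f;
 *	}
 *
 * Shifting the x2APIC ID (returned in %edx of each sub-leaf) right by
 * core_shift yields an identifier for the package, while the bits below
 * smt_shift identify the thread within its core.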
505  *
506  * AMD Topology
507  * ------------
508  *
509  * When discussing AMD topology, we want to break this into three distinct
510  * generations of topology. There's the basic topology that has been used in
511  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
512  * with family 0x15 (Bulldozer), and there's the topology that was introduced
513  * with family 0x17 (Zen). AMD also has some additional terminology that's worth
514  * talking about.
515  *
516  * Until the introduction of family 0x17 (Zen), AMD did not implement something
517  * that they considered SMT. Whether or not the AMD processors have SMT
518  * influences many things including scheduling and reliability, availability,
519  * and serviceability (RAS) features.
520  *
521  * NODE
522  *
523  *	AMD uses the term node to refer to a die that contains a number of cores
524  *	and I/O resources. Depending on the processor family and model, more
525  *	than one node can be present in the package. When there is more than one
526  *	node this indicates a multi-chip module. Usually each node has its own
527  *	access to memory and I/O devices. This is important and generally
528  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
529  *	result, we track this relationship in the operating system.
530  *
531  *	In processors with an L3 cache, the L3 cache is generally shared across
532  *	the entire node, though the way this is carved up varies from generation
533  *	to generation.
534  *
535  * BULLDOZER
536  *
537  *	Starting with the Bulldozer family (0x15) and continuing until the
538  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
539  *	compute unit. In a compute unit, two traditional cores share a number of
540  *	hardware resources. Critically, they share the FPU, L1 instruction
541  *	cache, and the L2 cache. Several compute units were then combined inside
542  *	of a single node.  Because the integer execution units, L1 data cache,
543  *	and some other resources were not shared between the cores, AMD never
544  *	considered this to be SMT.
545  *
546  * ZEN
547  *
548  *	The Zen family (0x17) uses a multi-chip module (MCM) design; the module
549  *	is called Zeppelin. These modules are similar to the idea of nodes used
550  *	previously. Each of these nodes has two DRAM channels which all of the
551  *	cores in the node can access uniformly. These nodes are linked together
552  *	in the package, creating a NUMA environment.
553  *
554  *	The Zeppelin die itself contains two different 'core complexes'. Each
555  *	core complex consists of four cores which each have two threads, for a
556  *	total of 8 logical CPUs per complex. Unlike other generations,
557  *	where all the logical CPUs in a given node share the L3 cache, here each
558  *	core complex has its own shared L3 cache.
559  *
560  *	A further thing that we need to consider is that in some configurations,
561  *	particularly with the Threadripper line of processors, not every die
562  *	actually has its memory controllers wired up to actual memory channels.
563  *	This means that some cores have memory attached to them and others
564  *	don't.
565  *
566  *	To put Zen in perspective, consider the following images:
567  *
568  *      +--------------------------------------------------------+
569  *      | Core Complex                                           |
570  *      | +-------------------+    +-------------------+  +---+  |
571  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
572  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
573  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
574  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
575  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
576  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
577  *      | +-------------------+    +-------------------+  | C |  |
578  *      | +-------------------+    +-------------------+  | a |  |
579  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
580  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
581  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
582  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
583  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
584  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
585  *      | +-------------------+    +-------------------+  +---+  |
586  *      |                                                        |
587  *	+--------------------------------------------------------+
588  *
589  *  This first image represents a single Zen core complex that consists of four
590  *  cores.
591  *
592  *
593  *	+--------------------------------------------------------+
594  *	| Zeppelin Die                                           |
595  *	|  +--------------------------------------------------+  |
596  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
597  *	|  +--------------------------------------------------+  |
598  *      |                           HH                           |
599  *	|          +-----------+    HH    +-----------+          |
600  *	|          |           |    HH    |           |          |
601  *	|          |    Core   |==========|    Core   |          |
602  *	|          |  Complex  |==========|  Complex  |          |
603  *	|          |           |    HH    |           |          |
604  *	|          +-----------+    HH    +-----------+          |
605  *      |                           HH                           |
606  *	|  +--------------------------------------------------+  |
607  *	|  |                Memory Controller                 |  |
608  *	|  +--------------------------------------------------+  |
609  *      |                                                        |
610  *	+--------------------------------------------------------+
611  *
612  *  This image represents a single Zeppelin Die. Note how both core complexes
613  *  are connected to the same memory controller and I/O units. While each core
614  *  complex has its own L3 cache as seen in the first image, they both have
615  *  uniform access to memory.
616  *
617  *
618  *                      PP                     PP
619  *                      PP                     PP
620  *           +----------PP---------------------PP---------+
621  *           |          PP                     PP         |
622  *           |    +-----------+          +-----------+    |
623  *           |    |           |          |           |    |
624  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
625  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
626  *           |    |           |          |           |    |
627  *           |    +-----------+ooo    ...+-----------+    |
628  *           |          HH      ooo  ...       HH         |
629  *           |          HH        oo..         HH         |
630  *           |          HH        ..oo         HH         |
631  *           |          HH      ...  ooo       HH         |
632  *           |    +-----------+...    ooo+-----------+    |
633  *           |    |           |          |           |    |
634  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
635  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
636  *           |    |           |          |           |    |
637  *           |    +-----------+          +-----------+    |
638  *           |          PP                     PP         |
639  *           +----------PP---------------------PP---------+
640  *                      PP                     PP
641  *                      PP                     PP
642  *
643  *  This image represents a single Zen package. In this example, it has four
644  *  Zeppelin dies, though some configurations only have a single one. In this
645  *  example, each die is directly connected to the next. Also, each die is
646  *  represented as being connected to memory by the 'M' character and connected
647  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
648  *  die is made up of two core complexes, we have multiple different NUMA
649  *  domains that we care about for these systems.
650  *
651  * CPUID LEAVES
652  *
653  * There are a few different CPUID leaves that we can use to try and understand
654  * the actual state of the world. As part of the introduction of family 0xf, AMD
655  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
656  * processors that are in the package. Because families before Zen didn't have
657  * SMT, this was always the number of cores that were in the package. However, it
658  * should always be thought of as the number of logical threads to be consistent
659  * between generations. In addition we also get the size of the APIC ID that is
660  * used to represent the number of logical processors. This is important for
661  * deriving topology information.
662  *
663  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
664  * bit between Bulldozer and later families, but it is quite useful in
665  * determining the topology information. Because this information has changed
666  * across family generations, it's worth calling out what these mean
667  * explicitly. The registers have the following meanings:
668  *
669  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
670  *		APIC ID, even though on systems without x2apic support, it will
671  *		be limited to 8 bits.
672  *
673  *	%ebx	On Bulldozer-era systems this contains information about the
674  *		number of cores that are in a compute unit (cores that share
675  *		resources). It also contains a per-package compute unit ID that
676  *		identifies which compute unit the logical CPU is a part of.
677  *
678  *		On Zen-era systems this instead contains the number of threads
679  *		per core and the ID of the core that the logical CPU is a part
680  *		of. Note, this ID is unique only to the package, it is not
681  *		globally unique across the entire system.
682  *
683  *	%ecx	This contains the number of nodes that exist in the package. It
684  *		also contains an ID that identifies which node the logical CPU
685  *		is a part of.
686  *
687  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
688  * cache layout to determine which logical CPUs are sharing which caches.
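 *
 * To make those register layouts concrete, a hedged sketch of decoding them on
 * a family 0x17 (Zen) part follows; the field positions are the documented
 * ones and the variable names are purely illustrative:
 *
 *	uint32_t regs[4], nthreads, apicid_size;
 *	uint32_t coreid, threads_per_core, nodeid, nodes_per_pkg;
 *
 *	cpuid_raw(0x80000008, 0, regs);
 *	nthreads = (regs[2] & 0xff) + 1;
 *	apicid_size = (regs[2] >> 12) & 0xf;
 *
 *	cpuid_raw(0x8000001e, 0, regs);
 *	coreid = regs[1] & 0xff;
 *	threads_per_core = ((regs[1] >> 8) & 0xff) + 1;
 *	nodeid = regs[2] & 0xff;
 *	nodes_per_pkg = ((regs[2] >> 8) & 0x7) + 1;
 *
 * On a Bulldozer-era part the %ebx fields would instead describe the compute
 * unit, as described above.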
689  *
690  * illumos Topology
691  * ----------------
692  *
693  * Based on the above we synthesize the information into several different
694  * variables that we store in the 'struct cpuid_info'. We'll go into the details
695  * of what each member is supposed to represent and their uniqueness. In
696  * general, there are two levels of uniqueness that we care about. We care about
697  * an ID that is globally unique. That means that it will be unique across all
698  * entities in the system. For example, the default logical CPU ID is globally
699  * unique. On the other hand, there is some information that we only care about
700  * being unique within the context of a single package / socket. Here are the
701  * variables that we keep track of and their meaning.
702  *
703  * Several of the values that represent identifiers, with the exception
704  * of cpi_apicid, are allowed to be synthetic.
705  *
706  *
707  * cpi_apicid
708  *
709  *	This is the value of the CPU's APIC id. This should be the full 32-bit
710  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
711  *	APIC ID. This value is globally unique between all logical CPUs across
712  *	all packages. This is usually required by the APIC.
713  *
714  * cpi_chipid
715  *
716  *	This value indicates the ID of the package that the logical CPU is a
717  *	part of. This value is allowed to be synthetic. It is usually derived by
718  *	taking the CPU's APIC ID and determining how many bits are used to
719  *	represent CPU cores in the package. All logical CPUs that are part of
720  *	the same package must have the same value.
721  *
722  * cpi_coreid
723  *
724  *	This represents the ID of a CPU core. Two logical CPUs should only have
725  *	the same cpi_coreid value if they are part of the same core. These
726  *	values may be synthetic. On systems that support SMT, this value is
727  *	usually derived from the APIC ID, otherwise it is often synthetic and
728  *	just set to the value of the cpu_id in the cpu_t.
729  *
730  * cpi_pkgcoreid
731  *
732  *	This is similar to the cpi_coreid in that logical CPUs that are part of
733  *	the same core should have the same ID. The main difference is that these
734  *	values are only required to be unique to a given socket.
735  *
736  * cpi_clogid
737  *
738  *	This represents the logical ID of a logical CPU. This value should be
739  *	unique within a given socket for each logical CPU. This is allowed to be
740  *	synthetic, though it is usually based off of the CPU's apic ID. The
741  *	broader system expects that logical CPUs that are part of the same
742  *	core have contiguous numbers. For example, if there were two threads per
743  *	core, then the IDs divided by two should be the same, the first ID modulo
744  *	two should be zero, and the second should be one. IDs 4 and 5
745  *	indicate two logical CPUs that are part of the same core. But IDs 5 and
746  *	6 represent two logical CPUs that are part of different cores.
747  *
748  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
749  *	from the same source, strictly speaking, they don't have to be and the
750  *	two values should be considered logically independent. One should not
751  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
752  *	some kind of relationship. While this is tempting, we've seen cases on
753  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
754  *
755  * cpi_ncpu_per_chip
756  *
757  *	This value indicates the total number of logical CPUs that exist in the
758  *	physical package. Critically, this is not the number of logical CPUs
759  *	that exist for just the single core.
760  *
761  *	This value should be the same for all logical CPUs in the same package.
762  *
763  * cpi_ncore_per_chip
764  *
765  *	This value indicates the total number of physical CPU cores that exist
766  *	in the package. The system compares this value with cpi_ncpu_per_chip to
767  *	determine if simultaneous multi-threading (SMT) is enabled. When
768  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
769  *	the X86FSET_HTT feature is not set. If this value is greater than one,
770  *	then we consider the processor to have the feature X86FSET_CMP, to
771  *	indicate that there is support for more than one core.
772  *
773  *	This value should be the same for all logical CPUs in the same package.
774  *
775  * cpi_procnodes_per_pkg
776  *
777  *	This value indicates the number of 'nodes' that exist in the package.
778  *	When processors are actually a multi-chip module, this represents the
779  *	number of such modules that exist in the package. Currently, on Intel
780  *	based systems this member is always set to 1.
781  *
782  *	This value should be the same for all logical CPUs in the same package.
783  *
784  * cpi_procnodeid
785  *
786  *	This value indicates the ID of the node that the logical CPU is a part
787  *	of. All logical CPUs that are in the same node must have the same value
788  *	here. This value must be unique across all of the packages in the
789  *	system.  On Intel based systems, this is currently set to the value in
790  *	cpi_chipid because there is only one node.
791  *
792  * cpi_cores_per_compunit
793  *
794  *	This value indicates the number of cores that are part of a compute
795  *	unit. See the AMD topology section for this. This member only has real
796  *	meaning currently for AMD Bulldozer family processors. For all other
797  *	processors, this should currently be set to 1.
798  *
799  * cpi_compunitid
800  *
801  *	This indicates the compute unit that the logical CPU belongs to. For
802  *	processors without AMD Bulldozer-style compute units this should be set
803  *	to the value of cpi_coreid.
804  *
805  * cpi_ncpu_shr_last_cache
806  *
807  *	This indicates the number of logical CPUs that are sharing the same last
808  *	level cache. This value should be the same for all CPUs that are sharing
809  *	that cache. The last cache refers to the cache that is closest to memory
810  *	and furthest away from the CPU.
811  *
812  * cpi_last_lvl_cacheid
813  *
814  *	This indicates the ID of the last cache that the logical CPU uses. This
815  *	cache is often shared between multiple logical CPUs and is the cache
816  *	that is closest to memory and furthest away from the CPU. This value
817  *	should be the same for a group of logical CPUs only if they actually
818  *	share the same last level cache. IDs should not overlap between
819  *	packages.
820  *
821  * cpi_ncore_bits
822  *
823  *	This indicates the number of bits that are required to represent all of
824  *	the cores in the system. As cores are derived based on their APIC IDs,
825  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
826  *	this value to be larger than the actual number of IDs that are present
827  *	in the system. This is used to size tables by the CMI framework. It is
828  *	only filled in for Intel and AMD CPUs.
829  *
830  * cpi_nthread_bits
831  *
832  *	This indicates the number of bits required to represent all of the IDs
833  *	that cover the logical CPUs that exist on a given core. It's OK for this
834  *	value to be larger than the actual number of IDs that are present in the
835  *	system.  This is used to size tables by the CMI framework. It is
836  *	only filled in for Intel and AMD CPUs.
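 *
 * To tie several of these together, the following is a rough, hedged sketch of
 * how an APIC ID can be decomposed once cpi_ncore_bits and cpi_nthread_bits
 * are known ('cpi' here is an illustrative pointer to a struct cpuid_info; the
 * actual derivation in this file varies by vendor and generation):
 *
 *	uint32_t apicid = cpi->cpi_apicid;
 *	uint32_t strand = apicid & ((1U << cpi->cpi_nthread_bits) - 1);
 *	uint32_t pkgcore = (apicid >> cpi->cpi_nthread_bits) &
 *	    ((1U << cpi->cpi_ncore_bits) - 1);
 *	uint32_t chip = apicid >>
 *	    (cpi->cpi_nthread_bits + cpi->cpi_ncore_bits);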
837  *
838  * -----------
839  * Hypervisors
840  * -----------
841  *
842  * If trying to manage the differences between vendors wasn't bad enough, it can
843  * get worse thanks to our friend hardware virtualization. Hypervisors are given
844  * the ability to interpose on all cpuid instructions and change them to suit
845  * their purposes. In general, this is necessary as the hypervisor wants to be
846  * able to present a more uniform set of features or not necessarily give the
847  * guest operating system kernel knowledge of all features so it can be
848  * more easily migrated between systems.
849  *
850  * When it comes to trying to determine topology information, this can be a
851  * double edged sword. When a hypervisor doesn't actually implement a cpuid
852  * leaf, it'll often return all zeros. Because of that, you'll often see various
853  * checks scattered about, verifying that fields are non-zero, before we assume
854  * we can use them.
855  *
856  * When it comes to topology information, the hypervisor is often incentivized
857  * to lie to you about topology. This is because it doesn't always actually
858  * guarantee that topology at all. The topology path we take in the system
859  * depends on how the CPU advertises itself. If it advertises itself as an Intel
860  * or AMD CPU, then we basically do our normal path. However, when they don't
861  * use an actual vendor, then that usually turns into multiple one-core CPUs
862  * that we enumerate, often on different sockets. The actual behavior
863  * depends greatly on what the hypervisor actually exposes to us.
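 *
 * For reference, bit 31 of leaf 1 %ecx is defined as the 'hypervisor present'
 * bit, and the first hypervisor leaf returns the maximum hypervisor leaf in
 * %eax along with a vendor signature (for example "KVMKVMKVM" or
 * "Microsoft Hv") in %ebx, %ecx, and %edx. A hedged detection sketch, again
 * using the hypothetical cpuid_raw() helper:
 *
 *	char hvsig[13];
 *	uint32_t regs[4];
 *
 *	cpuid_raw(1, 0, regs);
 *	if ((regs[2] & (1U << 31)) != 0) {
 *		cpuid_raw(0x40000000, 0, regs);
 *		bcopy(&regs[1], hvsig, 4);
 *		bcopy(&regs[2], hvsig + 4, 4);
 *		bcopy(&regs[3], hvsig + 8, 4);
 *		hvsig[12] = '\0';
 *	}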
864  *
865  * --------------------
866  * Exposing Information
867  * --------------------
868  *
869  * We expose CPUID information in three different forms in the system.
870  *
871  * The first is through the x86_featureset variable. This is used in conjunction
872  * with the is_x86_feature() function. This is queried by x86-specific functions
873  * to determine which features are or aren't present in the system and to make
874  * decisions based upon them. For example, users of this include everything from
875  * parts of the system dedicated to reliability, availability, and
876  * serviceability (RAS), to making decisions about how to handle security
877  * mitigations, to various x86-specific drivers. General purpose or
878  * architecture-independent drivers should never be calling this function.
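 *
 * As a usage sketch (see the definitions in x86_archext.h for the actual
 * interface), x86-specific kernel code typically does something like the
 * following, where use_sse2_path() simply stands in for whatever the caller
 * does with the answer:
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_SSE2))
 *		use_sse2_path();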
879  *
880  * The second means is through the auxiliary vector. The auxiliary vector is a
881  * series of tagged data that the kernel passes down to a user program when it
882  * begins executing. This information is used to indicate to programs what
883  * instruction set extensions are present. For example, information about the
884  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
885  * since user programs cannot make use of it. However, things like the AVX
886  * instruction sets are. Programs use this information to make run-time
887  * decisions about what features they should use. As an example, the run-time
888  * link-editor (rtld) can relocate different functions depending on the hardware
889  * support available.
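 *
 * As a hedged userland sketch, a program can query these capability bits with
 * getisax(3C) rather than executing cpuid itself (AV_386_AVX is one of the
 * AV_386_* hardware capability flags):
 *
 *	#include <sys/auxv.h>
 *
 *	uint32_t hwcap[2] = { 0, 0 };
 *	int has_avx;
 *
 *	(void) getisax(hwcap, 2);
 *	has_avx = (hwcap[0] & AV_386_AVX) != 0;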
890  *
891  * The final form is through a series of accessor functions that all have the
892  * form cpuid_get*. This is used by a number of different subsystems in the
893  * kernel to determine more detailed information about what we're running on,
894  * topology information, etc. Some of these subsystems include processor groups
895  * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
896  * microcode, and performance monitoring. These functions all ASSERT that the
897  * CPU they're being called on has reached a certain cpuid pass. If the passes
898  * are rearranged, then this needs to be adjusted.
899  *
900  * -----------------------------------------------
901  * Speculative Execution CPU Side Channel Security
902  * -----------------------------------------------
903  *
904  * With the advent of the Spectre and Meltdown attacks, which exploit speculative
905  * execution in the CPU to create side channels, there have been a number of
906  * different attacks and corresponding issues that the operating system needs to
907  * mitigate against. The following list is some of the common, but not
908  * exhaustive, set of issues that we know about and for which we have done some
909  * mitigation work in the system, or still need to do more:
910  *
911  *   - Spectre v1
912  *   - swapgs (Spectre v1 variant)
913  *   - Spectre v2
914  *   - Meltdown (Spectre v3)
915  *   - Rogue Register Read (Spectre v3a)
916  *   - Speculative Store Bypass (Spectre v4)
917  *   - ret2spec, SpectreRSB
918  *   - L1 Terminal Fault (L1TF)
919  *   - Microarchitectural Data Sampling (MDS)
920  *
921  * Each of these requires different sets of mitigations and has different attack
922  * surfaces. For the most part, this discussion is about protecting the kernel
923  * from non-kernel executing environments such as user processes and hardware
924  * virtual machines. Unfortunately, there are a number of user vs. user
925  * scenarios that exist with these. The rest of this section will describe the
926  * overall approach that the system has taken to address these as well as their
927  * shortcomings. Unfortunately, not all of the above have been handled today.
928  *
929  * SPECTRE v2, ret2spec, SpectreRSB
930  *
931  * The second variant of the spectre attack focuses on performing branch target
932  * injection. This generally impacts indirect call instructions in the system.
933  * There are three different ways to mitigate this issue that are commonly
934  * described today:
935  *
936  *  1. Using Indirect Branch Restricted Speculation (IBRS).
937  *  2. Using Retpolines and RSB Stuffing
938  *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
939  *
940  * IBRS uses a feature added to microcode to restrict speculation, among other
941  * things. This form of mitigation has not been used as it has been generally
942  * seen as too expensive and requires reactivation upon various transitions in
943  * the system.
944  *
945  * As a less impactful alternative to IBRS, retpolines were developed by
946  * Google. These basically require one to replace indirect calls with a specific
947  * trampoline that will cause speculation to fail and break the attack.
948  * Retpolines require compiler support. We always build with retpolines in the
949  * external thunk mode. This means that a traditional indirect call is replaced
950  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
951  * of this is that all indirect function calls are performed through a register.
952  *
953  * We have to use a common external location of the thunk and not inline it into
954  * the callsite so that we can have a single place to patch these functions.
955  * As it turns out, we actually have three different forms of retpolines that
956  * exist in the system:
957  *
958  *  1. A full retpoline
959  *  2. An AMD-specific optimized retpoline
960  *  3. A no-op version
961  *
962  * The first one is used in the general case. The second one is used if we can
963  * determine that we're on an AMD system and we can successfully toggle the
964  * lfence serializing MSR that exists on the platform. Basically with this
965  * present, an lfence is sufficient and we don't need to do anywhere near as
966  * complicated a dance to successfully use retpolines.
967  *
968  * The third form described above is the most curious. It turns out that the way
969  * that retpolines are implemented is that they rely on how speculation is
970  * performed on a 'ret' instruction. Intel has continued to optimize this
971  * process (which is partly why we need to have return stack buffer stuffing,
972  * but more on that in a bit) and in processors starting with Cascade Lake
973  * on the server side, it's dangerous to rely on retpolines. Instead, a new
974  * mechanism has been introduced called Enhanced IBRS (EIBRS).
975  *
976  * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
977  * physical core. However, if this is the case, we don't want to use retpolines
978  * any more. Therefore if EIBRS is present, we end up turning each retpoline
979  * function (called a thunk) into a jmp instruction. This means that we're still
980  * paying the cost of an extra jump to the external thunk, but it gives us
981  * flexibility and the ability to have a single kernel image that works across a
982  * wide variety of systems and hardware features.
983  *
984  * Unfortunately, this alone is insufficient. First, Skylake systems have
985  * additional speculation for the Return Stack Buffer (RSB), which is used to
986  * return from call instructions and which retpolines take advantage of. However,
987  * this problem is not just limited to Skylake and is actually more pernicious.
988  * The SpectreRSB paper introduces several more problems that can arise with
989  * dealing with this. The RSB can be poisoned just like the indirect branch
990  * predictor. This means that one needs to clear the RSB when transitioning
991  * between two different privilege domains. Some examples include:
992  *
993  *  - Switching between two different user processes
994  *  - Going between user land and the kernel
995  *  - Returning to the kernel from a hardware virtual machine
996  *
997  * Mitigating this involves combining a couple of different things. The first is
998  * SMEP (supervisor mode execution protection) which was introduced in Ivy
999  * Bridge. When an RSB entry refers to a user address and we're executing in the
1000  * kernel, speculation through it will be stopped when SMEP is enabled. This
1001  * protects against a number of the different cases that we would normally be
1002  * worried about such as when we enter the kernel from user land.
1003  *
1004  * To prevent against additional manipulation of the RSB from other contexts
1005  * such as a non-root VMX context attacking the kernel we first look to enhanced
1006  * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1007  * need to do to protect the kernel at this time.
1008  *
1009  * On CPUs without EIBRS we need to manually overwrite the contents of the
1010  * return stack buffer. We do this through the x86_rsb_stuff() function.
1011  * Currently this is employed on context switch. The x86_rsb_stuff() function is
1012  * disabled when enhanced IBRS is present because Intel claims on such systems
1013  * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1014  * to user attacks via the RSB.
1015  *
1016  * If SMEP is not present, then we would have to stuff the RSB every time we
1017  * transitioned from user mode to the kernel, which isn't very practical right
1018  * now.
1019  *
1020  * To fully protect user to user and vmx to vmx attacks from these classes of
1021  * issues, we would also need to allow them to opt into performing an Indirect
1022  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1023  *
1024  * By default, the system will enable RSB stuffing and the required variant of
1025  * retpolines and store that information in the x86_spectrev2_mitigation value.
1026  * This will be evaluated after a microcode update as well, though it is
1027  * expected that microcode updates will not take away features. This may mean
1028  * that a late loaded microcode may not end up in the optimal configuration
1029  * (though this should be rare).
1030  *
1031  * Currently we do not build kmdb with retpolines or perform any additional side
1032  * channel security mitigations for it. One complication with kmdb is that it
1033  * requires its own retpoline thunks and it would need to adjust itself based on
1034  * what the kernel does. The threat model of kmdb is more limited and therefore
1035  * it may make more sense to investigate using prediction barriers as the whole
1036  * system is only executing a single instruction at a time while in kmdb.
1037  *
1038  * SPECTRE v1, v4
1039  *
1040  * The v1 and v4 variants of spectre are not currently mitigated in the
1041  * system and require other classes of changes to occur in the code.
1042  *
1043  * SPECTRE v1 (SWAPGS VARIANT)
1044  *
1045  * The class of Spectre v1 vulnerabilities isn't limited to bounds checks; it
1046  * can generally affect any branch-dependent code. The swapgs issue is one
1047  * variant of this. If we are coming in from userspace, we can have code like
1048  * this:
1049  *
1050  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1051  *	je	1f
1052  *	movq	$0, REGOFF_SAVFP(%rsp)
1053  *	swapgs
1054  *	1:
1055  *	movq	%gs:CPU_THREAD, %rax
1056  *
1057  * If an attacker can cause a mis-speculation of the branch here, we could skip
1058  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1059  * load. If subsequent code can act as the usual Spectre cache gadget, this
1060  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1061  * any use of the %gs override.
1062  *
1063  * The other case is also an issue: if we're coming into a trap from kernel
1064  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1065  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1066  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1067  * case, and the fix is the same in both cases (an lfence at the branch target
1068  * 1: in this example), we'll just do it unconditionally.
1069  *
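 * Putting that together with the snippet above, the mitigated sequence is,
 * schematically (the real entry code differs in detail):
 *
 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
 *	je	1f
 *	movq	$0, REGOFF_SAVFP(%rsp)
 *	swapgs
 *	1:
 *	lfence
 *	movq	%gs:CPU_THREAD, %rax
 *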
1070  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1071  * harder for user-space to actually set a useful %gsbase value: although it's
1072  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1073  * mitigate anyway.
1074  *
1075  * MELTDOWN
1076  *
1077  * Meltdown, or spectre v3, allowed a user process to read any data in their
1078  * address space regardless of whether or not the page tables in question
1079  * allowed the user to have the ability to read them. The solution to meltdown
1080  * is kernel page table isolation. In this world, there are two page tables that
1081  * are used for a process, one in user land and one in the kernel. To implement
1082  * this we use per-CPU page tables and switch between the user and kernel
1083  * variants when entering and exiting the kernel.  For more information about
1084  * this process and how the trampolines work, please see the big theory
1085  * statements and additional comments in:
1086  *
1087  *  - uts/i86pc/ml/kpti_trampolines.s
1088  *  - uts/i86pc/vm/hat_i86.c
1089  *
1090  * While Meltdown only impacted Intel systems and there are also Intel systems
1091  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1092  * kernel page table isolation enabled. While this may at first seem weird, an
1093  * important thing to remember is that you can't speculatively read an address
1094  * if it's never in your page table at all. Having user processes without kernel
1095  * pages present provides us with an important layer of defense in the kernel
1096  * against any other side channel attacks that exist and have yet to be
1097  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1098  * default, no matter the x86 system.
1099  *
1100  * L1 TERMINAL FAULT
1101  *
1102  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1103  * execution uses page table entries. Effectively, it is two different problems.
1104  * The first is that it ignores the not present bit in the page table entries
1105  * when performing speculative execution. This means that something can
1106  * speculatively read the listed physical address if it's present in the L1
1107  * cache under certain conditions (see Intel's documentation for the full set of
1108  * conditions). Secondly, this can be used to bypass hardware virtualization
1109  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1110  * instructions.
1111  *
1112  * For the non-hardware virtualized case, this is relatively easy to deal with.
1113  * We must make sure that all unmapped pages have an address of zero. This means
1114  * that they could read the first 4k of physical memory; however, we never use
1115  * that first page in the operating system and always skip putting it in our
1116  * memory map, even if firmware tells us we can use it in our memory map. While
1117  * other systems try to put extra metadata in the address and reserved bits,
1118  * which led to this being problematic in those cases, we do not.
1119  *
1120  * For hardware virtual machines things are more complicated. Because they can
1121  * construct their own page tables, it isn't hard for them to perform this
1122  * attack against any physical address. The one wrinkle is that this physical
1123  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1124  * to flush the L1 data cache. We wrap this up in the function
1125  * spec_uarch_flush(). This function is also used in the mitigation of
1126  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1127  * hypervisors such as KVM or bhyve are responsible for performing this before
1128  * entering the guest.
1129  *
1130  * Because this attack takes place in the L1 cache, there's another wrinkle
1131  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1132  * designs. This means that when a thread enters a hardware virtualized context
1133  * and flushes the L1 data cache, the other thread on the processor may then go
1134  * ahead and put new data in it that can be potentially attacked. While one
1135  * solution is to disable SMT on the system, another option that is available is
1136  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1137  * goes through and makes sure that if a HVM is being scheduled on one thread,
1138  * then the thing on the other thread is from the same hardware virtual machine.
1139  * If an interrupt comes in or the guest exits to the broader system, then the
1140  * other SMT thread will be kicked out.
1141  *
1142  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1143  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1144  * perform L1TF related mitigations.
1145  *
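 * In terms of the feature bits tracked in this file, that check boils down
 * to something like the following sketch (simplified; the real decision is
 * made alongside the MDS logic further below):
 *
 *	if (is_x86_feature(featureset, X86FSET_RDCL_NO)) {
 *		do not install the L1D-flush based mitigation
 *	}
 *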
1146  * MICROARCHITECTURAL DATA SAMPLING
1147  *
1148  * Microarchitectural data sampling (MDS) is a combination of four discrete
1149  * vulnerabilities that are similar issues affecting various parts of the CPU's
1150  * microarchitectural implementation around load, store, and fill buffers.
1151  * Specifically it is made up of the following subcomponents:
1152  *
1153  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1154  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1155  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1156  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1157  *
1158  * To begin addressing these, Intel has introduced another feature in microcode
1159  * called MD_CLEAR. This changes the verw instruction to operate in a different
1160  * way. This allows us to execute the verw instruction in a particular way to
1161  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1162  * updated when this microcode is present to flush this state.
1163  *
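 * As a hedged sketch only (the authoritative sequence is the x86_md_clear()
 * assembly itself, not this comment), the flush amounts to executing verw
 * against a writable data segment selector, e.g.:
 *
 *	subq	$8, %rsp
 *	movw	%ds, (%rsp)
 *	verw	(%rsp)
 *	addq	$8, %rsp
 *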
1164  * Primarily we need to flush this state whenever we transition from the kernel
1165  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1166  * little bit different. Here the structures are statically sized when a logical
1167  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1168  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1169  * mwait, or another ACPI method. To perform these flushes, we call
1170  * x86_md_clear() at all of these transition points.
1171  *
1172  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1173  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1174  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1175  * a no-op.
1176  *
1177  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1178  * particular, everything we've discussed above is only valid for a single
1179  * thread executing on a core. In the case where you have hyper-threading
1180  * present, this attack can be performed between threads. The theoretical fix
1181  * for this is to ensure that both threads are always in the same security
1182  * domain. This means that they are executing in the same ring and mutually
1183  * trust each other. Practically speaking, this would mean that a system call
1184  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1185  * Rather than implement this, we recommend that one disables hyper-threading
1186  * through the use of psradm -aS.
1187  *
1188  * TSX ASYNCHRONOUS ABORT
1189  *
1190  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1191  * behaves like MDS, but leverages Intel's transactional instructions as another
1192  * vector. Effectively, when a transaction hits one of these cases (unmapped
1193  * page, various cache snoop activity, etc.) then the same data can be exposed
1194  * as in the case of MDS. This means that you can attack your twin.
1195  *
1196  * Intel has described that there are two different ways that we can mitigate
1197  * this problem on affected processors:
1198  *
1199  *   1) We can use the same techniques used to deal with MDS. Flushing the
1200  *      microarchitectural buffers and disabling hyperthreading will mitigate
1201  *      this in the same way.
1202  *
1203  *   2) Using microcode to disable TSX.
1204  *
1205  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1206  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1207  * That's OK as we're already doing all such mitigations. On the other hand,
1208  * processors with MDS_NO are all supposed to receive microcode updates that
1209  * enumerate support for disabling TSX. In general, we'd rather use this method
1210  * when available as it doesn't require disabling hyperthreading to be
1211  * effective. Currently we basically are relying on microcode for processors
1212  * that enumerate MDS_NO.
1213  *
1214  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1215  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1216  * different powers. The first allows us to cause all transactions to
1217  * immediately abort. The second gives us a means of disabling TSX completely,
1218  * which includes removing it from cpuid. If we have support for this in
1219  * microcode during the first cpuid pass, then we'll disable TSX completely such
1220  * that user land never has a chance to observe the bit. However, if we are late
1221  * loading the microcode, then we must use the functionality to cause
1222  * transactions to automatically abort. This is necessary for user land's sake.
1223  * Once a program sees a cpuid bit, it must not be taken away.
1224  *
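 * As a sketch of how those two powers are exercised (the MSR and bit names
 * follow Intel's documentation; treat the exact constants here as
 * illustrative):
 *
 *	wrmsr(MSR_IA32_TSX_CTRL,
 *	    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_CPUID_CLEAR);
 *
 * disables TSX and hides it from cpuid, while writing only
 * IA32_TSX_CTRL_RTM_DISABLE forces transactions to abort but leaves the
 * cpuid bit visible, which is what the late microcode load case described
 * above requires.
 *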
1225  * We track whether or not we should do this based on what cpuid pass we're in.
1226  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1227  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1228  * should happen twice. Once in the normal cpuid_pass1() code and then a second
1229  * time after we do the initial microcode update.  As a result we need to be
1230  * careful in cpuid_apply_tsx() to only use the MSR if we've loaded a suitable
1231  * microcode on the current CPU (which happens prior to cpuid_pass_ucode()).
1232  *
1233  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1234  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1235  * unfortunate feature in a number of ways, and taking the opportunity to
1236  * finally be able to turn it off is likely to be of benefit in the future.
1237  *
1238  * SUMMARY
1239  *
1240  * The following table attempts to summarize the mitigations for various issues
1241  * and what's done in various places:
1242  *
1243  *  - Spectre v1: Not currently mitigated
1244  *  - swapgs: lfences after swapgs paths
1245  *  - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1246  *  - Meltdown: Kernel Page Table Isolation
1247  *  - Spectre v3a: Updated CPU microcode
1248  *  - Spectre v4: Not currently mitigated
1249  *  - SpectreRSB: SMEP and RSB Stuffing
1250  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1251  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1252  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1253  *
1254  * The following table indicates the x86 feature set bits that indicate that a
1255  * given problem has been solved or a notable feature is present:
1256  *
1257  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1258  *  - MDS_NO: All forms of MDS
1259  *  - TAA_NO: TAA
1260  */
1261 
1262 #include <sys/types.h>
1263 #include <sys/archsystm.h>
1264 #include <sys/x86_archext.h>
1265 #include <sys/kmem.h>
1266 #include <sys/systm.h>
1267 #include <sys/cmn_err.h>
1268 #include <sys/sunddi.h>
1269 #include <sys/sunndi.h>
1270 #include <sys/cpuvar.h>
1271 #include <sys/processor.h>
1272 #include <sys/sysmacros.h>
1273 #include <sys/pg.h>
1274 #include <sys/fp.h>
1275 #include <sys/controlregs.h>
1276 #include <sys/bitmap.h>
1277 #include <sys/auxv_386.h>
1278 #include <sys/memnode.h>
1279 #include <sys/pci_cfgspace.h>
1280 #include <sys/comm_page.h>
1281 #include <sys/mach_mmu.h>
1282 #include <sys/ucode.h>
1283 #include <sys/tsc.h>
1284 #include <sys/kobj.h>
1285 #include <sys/asm_misc.h>
1286 
1287 #ifdef __xpv
1288 #include <sys/hypervisor.h>
1289 #else
1290 #include <sys/ontrap.h>
1291 #endif
1292 
1293 uint_t x86_vendor = X86_VENDOR_IntelClone;
1294 uint_t x86_type = X86_TYPE_OTHER;
1295 uint_t x86_clflush_size = 0;
1296 
1297 #if defined(__xpv)
1298 int x86_use_pcid = 0;
1299 int x86_use_invpcid = 0;
1300 #else
1301 int x86_use_pcid = -1;
1302 int x86_use_invpcid = -1;
1303 #endif
1304 
1305 typedef enum {
1306 	X86_SPECTREV2_RETPOLINE,
1307 	X86_SPECTREV2_RETPOLINE_AMD,
1308 	X86_SPECTREV2_ENHANCED_IBRS,
1309 	X86_SPECTREV2_DISABLED
1310 } x86_spectrev2_mitigation_t;
1311 
1312 uint_t x86_disable_spectrev2 = 0;
1313 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1314     X86_SPECTREV2_RETPOLINE;
1315 
1316 /*
1317  * The mitigation status for TAA:
1318  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1319  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1320  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1321  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1322  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1323  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1324  */
1325 typedef enum {
1326 	X86_TAA_NOTHING,
1327 	X86_TAA_DISABLED,
1328 	X86_TAA_MD_CLEAR,
1329 	X86_TAA_TSX_FORCE_ABORT,
1330 	X86_TAA_TSX_DISABLE,
1331 	X86_TAA_HW_MITIGATED
1332 } x86_taa_mitigation_t;
1333 
1334 uint_t x86_disable_taa = 0;
1335 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1336 
1337 uint_t pentiumpro_bug4046376;
1338 
1339 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1340 
1341 static char *x86_feature_names[NUM_X86_FEATURES] = {
1342 	"lgpg",
1343 	"tsc",
1344 	"msr",
1345 	"mtrr",
1346 	"pge",
1347 	"de",
1348 	"cmov",
1349 	"mmx",
1350 	"mca",
1351 	"pae",
1352 	"cv8",
1353 	"pat",
1354 	"sep",
1355 	"sse",
1356 	"sse2",
1357 	"htt",
1358 	"asysc",
1359 	"nx",
1360 	"sse3",
1361 	"cx16",
1362 	"cmp",
1363 	"tscp",
1364 	"mwait",
1365 	"sse4a",
1366 	"cpuid",
1367 	"ssse3",
1368 	"sse4_1",
1369 	"sse4_2",
1370 	"1gpg",
1371 	"clfsh",
1372 	"64",
1373 	"aes",
1374 	"pclmulqdq",
1375 	"xsave",
1376 	"avx",
1377 	"vmx",
1378 	"svm",
1379 	"topoext",
1380 	"f16c",
1381 	"rdrand",
1382 	"x2apic",
1383 	"avx2",
1384 	"bmi1",
1385 	"bmi2",
1386 	"fma",
1387 	"smep",
1388 	"smap",
1389 	"adx",
1390 	"rdseed",
1391 	"mpx",
1392 	"avx512f",
1393 	"avx512dq",
1394 	"avx512pf",
1395 	"avx512er",
1396 	"avx512cd",
1397 	"avx512bw",
1398 	"avx512vl",
1399 	"avx512fma",
1400 	"avx512vbmi",
1401 	"avx512_vpopcntdq",
1402 	"avx512_4vnniw",
1403 	"avx512_4fmaps",
1404 	"xsaveopt",
1405 	"xsavec",
1406 	"xsaves",
1407 	"sha",
1408 	"umip",
1409 	"pku",
1410 	"ospke",
1411 	"pcid",
1412 	"invpcid",
1413 	"ibrs",
1414 	"ibpb",
1415 	"stibp",
1416 	"ssbd",
1417 	"ssbd_virt",
1418 	"rdcl_no",
1419 	"ibrs_all",
1420 	"rsba",
1421 	"ssb_no",
1422 	"stibp_all",
1423 	"flush_cmd",
1424 	"l1d_vmentry_no",
1425 	"fsgsbase",
1426 	"clflushopt",
1427 	"clwb",
1428 	"monitorx",
1429 	"clzero",
1430 	"xop",
1431 	"fma4",
1432 	"tbm",
1433 	"avx512_vnni",
1434 	"amd_pcec",
1435 	"md_clear",
1436 	"mds_no",
1437 	"core_thermal",
1438 	"pkg_thermal",
1439 	"tsx_ctrl",
1440 	"taa_no",
1441 	"ppin",
1442 	"vaes",
1443 	"vpclmulqdq",
1444 	"lfence_serializing"
1445 };
1446 
1447 boolean_t
1448 is_x86_feature(void *featureset, uint_t feature)
1449 {
1450 	ASSERT(feature < NUM_X86_FEATURES);
1451 	return (BT_TEST((ulong_t *)featureset, feature));
1452 }
1453 
1454 void
1455 add_x86_feature(void *featureset, uint_t feature)
1456 {
1457 	ASSERT(feature < NUM_X86_FEATURES);
1458 	BT_SET((ulong_t *)featureset, feature);
1459 }
1460 
1461 void
1462 remove_x86_feature(void *featureset, uint_t feature)
1463 {
1464 	ASSERT(feature < NUM_X86_FEATURES);
1465 	BT_CLEAR((ulong_t *)featureset, feature);
1466 }
1467 
1468 boolean_t
1469 compare_x86_featureset(void *setA, void *setB)
1470 {
1471 	/*
1472 	 * We assume that the unused bits of the bitmap are always zero.
1473 	 */
1474 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1475 		return (B_TRUE);
1476 	} else {
1477 		return (B_FALSE);
1478 	}
1479 }
1480 
1481 void
1482 print_x86_featureset(void *featureset)
1483 {
1484 	uint_t i;
1485 
1486 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1487 		if (is_x86_feature(featureset, i)) {
1488 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1489 			    x86_feature_names[i]);
1490 		}
1491 	}
1492 }
1493 
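/*
 * Illustrative usage of the feature set helpers above: most consumers test
 * bits in the global x86_featureset that is built up during the cpuid
 * passes, e.g.
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_SSE2))
 *		...take an SSE2-specific code path...
 */
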
1494 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1495 static size_t xsave_state_size = 0;
1496 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1497 boolean_t xsave_force_disable = B_FALSE;
1498 extern int disable_smap;
1499 
1500 /*
1501  * This is set to platform type we are running on.
1502  */
1503 static int platform_type = -1;
1504 
1505 #if !defined(__xpv)
1506 /*
1507  * Variable to patch if hypervisor platform detection needs to be
1508  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1509  */
1510 int enable_platform_detection = 1;
1511 #endif
1512 
1513 /*
1514  * monitor/mwait info.
1515  *
1516  * size_actual and buf_actual are the real address and size allocated to get
1517  * proper mwait_buf alignement.  buf_actual and size_actual should be passed
1518  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1519  * processor cache-line alignment, but this is not guarantied in the furture.
1520  */
1521 struct mwait_info {
1522 	size_t		mon_min;	/* min size to avoid missed wakeups */
1523 	size_t		mon_max;	/* size to avoid false wakeups */
1524 	size_t		size_actual;	/* size actually allocated */
1525 	void		*buf_actual;	/* memory actually allocated */
1526 	uint32_t	support;	/* processor support of monitor/mwait */
1527 };
1528 
1529 /*
1530  * xsave/xrestor info.
1531  *
1532  * This structure contains HW feature bits and the size of the xsave save area.
1533  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1534  * (xsave_state) to describe the xsave layout. However, at runtime the
1535  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1536  * xsave_state structure simply represents the legacy layout of the beginning
1537  * of the xsave area.
1538  */
1539 struct xsave_info {
1540 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1541 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1542 	size_t		xsav_max_size;  /* max size save area for HW features */
1543 	size_t		ymm_size;	/* AVX: size of ymm save area */
1544 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1545 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1546 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1547 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1548 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1549 	size_t		opmask_size;	/* AVX512: size of opmask save */
1550 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1551 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1552 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1553 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1554 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1555 };
1556 
1557 
1558 /*
1559  * These constants determine how many of the elements of the
1560  * cpuid we cache in the cpuid_info data structure; the
1561  * remaining elements are accessible via the cpuid instruction.
1562  */
1563 
1564 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1565 #define	NMAX_CPI_EXTD	0x1f		/* eax = 0x80000000 .. 0x8000001e */
1566 
1567 /*
1568  * See the big theory statement for a more detailed explanation of what some of
1569  * these members mean.
1570  */
1571 struct cpuid_info {
1572 	uint_t cpi_pass;		/* last pass completed */
1573 	/*
1574 	 * standard function information
1575 	 */
1576 	uint_t cpi_maxeax;		/* fn 0: %eax */
1577 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1578 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1579 
1580 	uint_t cpi_family;		/* fn 1: extended family */
1581 	uint_t cpi_model;		/* fn 1: extended model */
1582 	uint_t cpi_step;		/* fn 1: stepping */
1583 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1584 					/*		AMD: package/socket # */
1585 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1586 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1587 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1588 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1589 	uint_t cpi_ncache;		/* fn 2: number of elements */
1590 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1591 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1592 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1593 					/* Intel fn: 4, AMD fn: 8000001d */
1594 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1595 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1596 	/*
1597 	 * extended function information
1598 	 */
1599 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1600 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1601 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1602 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1603 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1604 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1605 
1606 	id_t cpi_coreid;		/* same coreid => strands share core */
1607 	int cpi_pkgcoreid;		/* core number within single package */
1608 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1609 					/* Intel: fn 4: %eax[31-26] */
1610 
1611 	/*
1612 	 * These values represent the number of bits that are required to store
1613 	 * information about the number of cores and threads.
1614 	 */
1615 	uint_t cpi_ncore_bits;
1616 	uint_t cpi_nthread_bits;
1617 	/*
1618 	 * supported feature information
1619 	 */
1620 	uint32_t cpi_support[6];
1621 #define	STD_EDX_FEATURES	0
1622 #define	AMD_EDX_FEATURES	1
1623 #define	TM_EDX_FEATURES		2
1624 #define	STD_ECX_FEATURES	3
1625 #define	AMD_ECX_FEATURES	4
1626 #define	STD_EBX_FEATURES	5
1627 	/*
1628 	 * Synthesized information, where known.
1629 	 */
1630 	uint32_t cpi_chiprev;		/* See X86_CHIPREV_* in x86_archext.h */
1631 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1632 	uint32_t cpi_socket;		/* Chip package/socket type */
1633 
1634 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1635 	uint32_t cpi_apicid;
1636 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1637 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1638 					/* Intel: 1 */
1639 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1640 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1641 
1642 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1643 };
1644 
1645 
1646 static struct cpuid_info cpuid_info0;
1647 
1648 /*
1649  * These bit fields are defined by the Intel Application Note AP-485
1650  * "Intel Processor Identification and the CPUID Instruction"
1651  */
1652 #define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1653 #define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1654 #define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1655 #define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1656 #define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1657 #define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1658 
1659 #define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1660 #define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1661 #define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1662 #define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1663 #define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1664 #define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1665 #define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1666 
1667 #define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1668 #define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1669 #define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1670 #define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1671 
1672 #define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1673 #define	CPI_XMAXEAX_MAX		0x80000100
1674 #define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1675 #define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1676 
1677 /*
1678  * Function 4 (Deterministic Cache Parameters) macros
1679  * Defined by Intel Application Note AP-485
1680  */
1681 #define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1682 #define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1683 #define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1684 #define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1685 #define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1686 #define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1687 #define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1688 
1689 #define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1690 #define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1691 #define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1692 
1693 #define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1694 
1695 #define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1696 
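/*
 * Note (illustrative): the leaf 4 fields above are reported minus one, so
 * the total size in bytes of the cache described by a given sub-leaf is
 *
 *	(CPI_CACHE_WAYS(regs) + 1) * (CPI_CACHE_PARTS(regs) + 1) *
 *	    (CPI_CACHE_COH_LN_SZ(regs) + 1) * (CPI_CACHE_SETS(regs) + 1)
 */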
1697 
1698 /*
1699  * A couple of shorthand macros to identify "later" P6-family chips
1700  * like the Pentium M and Core.  First, the "older" P6-based stuff
1701  * (loosely defined as "pre-Pentium-4"):
1702  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1703  */
1704 #define	IS_LEGACY_P6(cpi) (			\
1705 	cpi->cpi_family == 6 &&			\
1706 		(cpi->cpi_model == 1 ||		\
1707 		cpi->cpi_model == 3 ||		\
1708 		cpi->cpi_model == 5 ||		\
1709 		cpi->cpi_model == 6 ||		\
1710 		cpi->cpi_model == 7 ||		\
1711 		cpi->cpi_model == 8 ||		\
1712 		cpi->cpi_model == 0xA ||	\
1713 		cpi->cpi_model == 0xB)		\
1714 )
1715 
1716 /* A "new F6" is everything with family 6 that's not the above */
1717 #define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1718 
1719 /* Extended family/model support */
1720 #define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1721 	cpi->cpi_family >= 0xf)
1722 
1723 /*
1724  * Info for monitor/mwait idle loop.
1725  *
1726  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1727  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1728  * 2006.
1729  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1730  * Documentation Updates" #33633, Rev 2.05, December 2006.
1731  */
1732 #define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
1733 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
1734 #define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1735 #define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1736 #define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1737 #define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1738 #define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1739 #define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1740 /*
1741  * Number of sub-cstates for a given c-state.
1742  */
1743 #define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1744 	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1745 
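/*
 * Illustrative use of the monitor/mwait macros above, with a hypothetical
 * struct mwait_info mi (the kernel's own setup code fills in
 * cpi->cpi_mwait):
 *
 *	mi.mon_min = MWAIT_SIZE_MIN(cpi);
 *	mi.mon_max = MWAIT_SIZE_MAX(cpi);
 *	if (MWAIT_SUPPORTED(cpi))
 *		mi.support |= MWAIT_SUPPORT;
 */
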
1746 /*
1747  * XSAVE leaf 0xD enumeration
1748  */
1749 #define	CPUID_LEAFD_2_YMM_OFFSET	576
1750 #define	CPUID_LEAFD_2_YMM_SIZE		256
1751 
1752 /*
1753  * Common extended leaf names to cut down on typos.
1754  */
1755 #define	CPUID_LEAF_EXT_0		0x80000000
1756 #define	CPUID_LEAF_EXT_8		0x80000008
1757 #define	CPUID_LEAF_EXT_1d		0x8000001d
1758 #define	CPUID_LEAF_EXT_1e		0x8000001e
1759 
1760 /*
1761  * Functions we consume from cpuid_subr.c;  don't publish these in a header
1762  * file to try and keep people using the expected cpuid_* interfaces.
1763  */
1764 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1765 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1766 extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1767 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1768 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1769 
1770 /*
1771  * Apply various platform-dependent restrictions where the
1772  * underlying platform restrictions mean the CPU can be marked
1773  * as less capable than its cpuid instruction would imply.
1774  */
1775 #if defined(__xpv)
1776 static void
1777 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1778 {
1779 	switch (eax) {
1780 	case 1: {
1781 		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1782 		    0 : CPUID_INTC_EDX_MCA;
1783 		cp->cp_edx &=
1784 		    ~(mcamask |
1785 		    CPUID_INTC_EDX_PSE |
1786 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1787 		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1788 		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1789 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1790 		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1791 		break;
1792 	}
1793 
1794 	case 0x80000001:
1795 		cp->cp_edx &=
1796 		    ~(CPUID_AMD_EDX_PSE |
1797 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1798 		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1799 		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1800 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1801 		    CPUID_AMD_EDX_TSCP);
1802 		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1803 		break;
1804 	default:
1805 		break;
1806 	}
1807 
1808 	switch (vendor) {
1809 	case X86_VENDOR_Intel:
1810 		switch (eax) {
1811 		case 4:
1812 			/*
1813 			 * Zero out the (ncores-per-chip - 1) field
1814 			 */
1815 			cp->cp_eax &= 0x03fffffff;
1816 			break;
1817 		default:
1818 			break;
1819 		}
1820 		break;
1821 	case X86_VENDOR_AMD:
1822 	case X86_VENDOR_HYGON:
1823 		switch (eax) {
1824 
1825 		case 0x80000001:
1826 			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1827 			break;
1828 
1829 		case CPUID_LEAF_EXT_8:
1830 			/*
1831 			 * Zero out the (ncores-per-chip - 1) field
1832 			 */
1833 			cp->cp_ecx &= 0xffffff00;
1834 			break;
1835 		default:
1836 			break;
1837 		}
1838 		break;
1839 	default:
1840 		break;
1841 	}
1842 }
1843 #else
1844 #define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
1845 #endif
1846 
1847 /*
1848  *  Some undocumented ways of patching the results of the cpuid
1849  *  instruction to permit running Solaris 10 on future cpus that
1850  *  we don't currently support.  Could be set to non-zero values
1851  *  via settings in eeprom.
1852  */
1853 
1854 uint32_t cpuid_feature_ecx_include;
1855 uint32_t cpuid_feature_ecx_exclude;
1856 uint32_t cpuid_feature_edx_include;
1857 uint32_t cpuid_feature_edx_exclude;
1858 
1859 /*
1860  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1861  */
1862 void
1863 cpuid_alloc_space(cpu_t *cpu)
1864 {
1865 	/*
1866 	 * By convention, cpu0 is the boot cpu, which is set up
1867 	 * before memory allocation is available.  All other cpus get
1868 	 * their cpuid_info struct allocated here.
1869 	 */
1870 	ASSERT(cpu->cpu_id != 0);
1871 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1872 	cpu->cpu_m.mcpu_cpi =
1873 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1874 }
1875 
1876 void
1877 cpuid_free_space(cpu_t *cpu)
1878 {
1879 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1880 	int i;
1881 
1882 	ASSERT(cpi != NULL);
1883 	ASSERT(cpi != &cpuid_info0);
1884 
1885 	/*
1886 	 * Free up any cache leaf related dynamic storage. The first entry was
1887 	 * cached from the standard cpuid storage, so we should not free it.
1888 	 */
1889 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1890 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1891 	if (cpi->cpi_cache_leaf_size > 0)
1892 		kmem_free(cpi->cpi_cache_leaves,
1893 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1894 
1895 	kmem_free(cpi, sizeof (*cpi));
1896 	cpu->cpu_m.mcpu_cpi = NULL;
1897 }
1898 
1899 #if !defined(__xpv)
1900 /*
1901  * Determine the type of the underlying platform. This is used to customize
1902  * initialization of various subsystems (e.g. TSC). determine_platform() must
1903  * only ever be called once to prevent two processors from seeing different
1904  * values of platform_type. Must be called before cpuid_pass1(), the earliest
1905  * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1906  */
1907 void
1908 determine_platform(void)
1909 {
1910 	struct cpuid_regs cp;
1911 	uint32_t base;
1912 	uint32_t regs[4];
1913 	char *hvstr = (char *)regs;
1914 
1915 	ASSERT(platform_type == -1);
1916 
1917 	platform_type = HW_NATIVE;
1918 
1919 	if (!enable_platform_detection)
1920 		return;
1921 
1922 	/*
1923 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
1924 	 * vendor signature, and set platform type accordingly.
1925 	 *
1926 	 * References:
1927 	 * http://lkml.org/lkml/2008/10/1/246
1928 	 * http://kb.vmware.com/kb/1009458
1929 	 */
1930 	cp.cp_eax = 0x1;
1931 	(void) __cpuid_insn(&cp);
1932 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1933 		cp.cp_eax = 0x40000000;
1934 		(void) __cpuid_insn(&cp);
1935 		regs[0] = cp.cp_ebx;
1936 		regs[1] = cp.cp_ecx;
1937 		regs[2] = cp.cp_edx;
1938 		regs[3] = 0;
1939 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1940 			platform_type = HW_XEN_HVM;
1941 			return;
1942 		}
1943 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1944 			platform_type = HW_VMWARE;
1945 			return;
1946 		}
1947 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
1948 			platform_type = HW_KVM;
1949 			return;
1950 		}
1951 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1952 			platform_type = HW_BHYVE;
1953 			return;
1954 		}
1955 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1956 			platform_type = HW_MICROSOFT;
1957 	} else {
1958 		/*
1959 		 * Check older VMware hardware versions. VMware hypervisor is
1960 		 * detected by performing an IN operation on the VMware hypervisor
1961 		 * port and checking that the value returned in %ebx is the VMware
1962 		 * hypervisor magic value.
1963 		 *
1964 		 * References: http://kb.vmware.com/kb/1009458
1965 		 */
1966 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1967 		if (regs[1] == VMWARE_HVMAGIC) {
1968 			platform_type = HW_VMWARE;
1969 			return;
1970 		}
1971 	}
1972 
1973 	/*
1974 	 * Check Xen hypervisor. In a fully virtualized domain,
1975 	 * Xen's pseudo-cpuid function returns a string representing the
1976 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1977 	 * supported cpuid function. We need at least a (base + 2) leaf value
1978 	 * to do what we want to do. Try different base values, since the
1979 	 * hypervisor might use a different one depending on whether Hyper-V
1980 	 * emulation is switched on by default or not.
1981 	 */
1982 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1983 		cp.cp_eax = base;
1984 		(void) __cpuid_insn(&cp);
1985 		regs[0] = cp.cp_ebx;
1986 		regs[1] = cp.cp_ecx;
1987 		regs[2] = cp.cp_edx;
1988 		regs[3] = 0;
1989 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1990 		    cp.cp_eax >= (base + 2)) {
1991 			platform_type &= ~HW_NATIVE;
1992 			platform_type |= HW_XEN_HVM;
1993 			return;
1994 		}
1995 	}
1996 }
1997 
1998 int
1999 get_hwenv(void)
2000 {
2001 	ASSERT(platform_type != -1);
2002 	return (platform_type);
2003 }
2004 
2005 int
2006 is_controldom(void)
2007 {
2008 	return (0);
2009 }
2010 
2011 #else
2012 
2013 int
2014 get_hwenv(void)
2015 {
2016 	return (HW_XEN_PV);
2017 }
2018 
2019 int
2020 is_controldom(void)
2021 {
2022 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2023 }
2024 
2025 #endif	/* __xpv */
2026 
2027 /*
2028  * Make sure that we have gathered all of the CPUID leaves that we might need to
2029  * determine topology. We assume that the standard leaf 1 has already been done
2030  * and that xmaxeax has already been calculated.
2031  */
2032 static void
2033 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2034 {
2035 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2036 
2037 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2038 		struct cpuid_regs *cp;
2039 
2040 		cp = &cpi->cpi_extd[8];
2041 		cp->cp_eax = CPUID_LEAF_EXT_8;
2042 		(void) __cpuid_insn(cp);
2043 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2044 	}
2045 
2046 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2047 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2048 		struct cpuid_regs *cp;
2049 
2050 		cp = &cpi->cpi_extd[0x1e];
2051 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2052 		(void) __cpuid_insn(cp);
2053 	}
2054 }
2055 
2056 /*
2057  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2058  * it to everything else. If not, and we're on an AMD system where 8000001e is
2059  * valid, then we use that. Otherwise, we fall back to the default value for the
2060  * APIC ID in leaf 1.
2061  */
2062 static uint32_t
2063 cpuid_gather_apicid(struct cpuid_info *cpi)
2064 {
2065 	/*
2066 	 * Leaf B changes based on the arguments to it. Because we don't cache
2067 	 * it, we need to gather it again.
2068 	 */
2069 	if (cpi->cpi_maxeax >= 0xB) {
2070 		struct cpuid_regs regs;
2071 		struct cpuid_regs *cp;
2072 
2073 		cp = &regs;
2074 		cp->cp_eax = 0xB;
2075 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2076 		(void) __cpuid_insn(cp);
2077 
2078 		if (cp->cp_ebx != 0) {
2079 			return (cp->cp_edx);
2080 		}
2081 	}
2082 
2083 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2084 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2085 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2086 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2087 		return (cpi->cpi_extd[0x1e].cp_eax);
2088 	}
2089 
2090 	return (CPI_APIC_ID(cpi));
2091 }
2092 
2093 /*
2094  * For AMD processors, attempt to calculate the number of chips and cores that
2095  * exist. The way that we do this varies based on the generation, because the
2096  * generations themselves have changed dramatically.
2097  *
2098  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2099  * However, with the advent of family 17h (Zen) it actually tells us the number
2100  * of threads, so we need to look at leaf 0x8000001e if available to determine
2101  * its value. Otherwise, for all prior families, the number of enabled cores is
2102  * the same as threads.
2103  *
2104  * If we do not have leaf 0x80000008, then we assume that this processor does
2105  * not have anything. AMD's older CPUID specification says there's no reason to
2106  * fall back to leaf 1.
2107  *
2108  * In some virtualization cases we will not have leaf 8000001e or it will be
2109  * zero. When that happens we assume the number of threads is one.
2110  */
2111 static void
2112 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2113 {
2114 	uint_t nthreads, nthread_per_core;
2115 
2116 	nthreads = nthread_per_core = 1;
2117 
2118 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2119 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2120 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2121 		nthreads = CPI_CPU_COUNT(cpi);
2122 	}
2123 
2124 	/*
2125 	 * For us to have threads, and know about it, we have to be at least at
2126 	 * family 17h and have the cpuid bit that says we have extended
2127 	 * topology.
2128 	 */
2129 	if (cpi->cpi_family >= 0x17 &&
2130 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2131 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2132 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2133 	}
2134 
2135 	*ncpus = nthreads;
2136 	*ncores = nthreads / nthread_per_core;
2137 }
2138 
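/*
 * Worked example for cpuid_amd_ncores() above (hypothetical values): a
 * family 17h part with the topology leaf present that reports
 * leaf 0x80000008 %ecx[7:0] = 15 and leaf 0x8000001e %ebx[15:8] = 1 yields
 * nthreads = 16 and nthread_per_core = 2, so *ncpus = 16 and *ncores = 8.
 */
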
2139 /*
2140  * Seed the initial values for the cores and threads for an Intel based
2141  * processor. These values will be overwritten if we detect that the processor
2142  * supports CPUID leaf 0xb.
2143  */
2144 static void
2145 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2146 {
2147 	/*
2148 	 * Only seed the number of physical cores from the first level leaf 4
2149 	 * information. The number of threads there indicate how many share the
2150 	 * L1 cache, which may or may not have anything to do with the number of
2151 	 * logical CPUs per core.
2152 	 */
2153 	if (cpi->cpi_maxeax >= 4) {
2154 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2155 	} else {
2156 		*ncores = 1;
2157 	}
2158 
2159 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2160 		*ncpus = CPI_CPU_COUNT(cpi);
2161 	} else {
2162 		*ncpus = *ncores;
2163 	}
2164 }
2165 
2166 static boolean_t
2167 cpuid_leafB_getids(cpu_t *cpu)
2168 {
2169 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2170 	struct cpuid_regs regs;
2171 	struct cpuid_regs *cp;
2172 
2173 	if (cpi->cpi_maxeax < 0xB)
2174 		return (B_FALSE);
2175 
2176 	cp = &regs;
2177 	cp->cp_eax = 0xB;
2178 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2179 
2180 	(void) __cpuid_insn(cp);
2181 
2182 	/*
2183 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2184 	 * indicates that the extended topology enumeration leaf is
2185 	 * available.
2186 	 */
2187 	if (cp->cp_ebx != 0) {
2188 		uint32_t x2apic_id = 0;
2189 		uint_t coreid_shift = 0;
2190 		uint_t ncpu_per_core = 1;
2191 		uint_t chipid_shift = 0;
2192 		uint_t ncpu_per_chip = 1;
2193 		uint_t i;
2194 		uint_t level;
2195 
2196 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2197 			cp->cp_eax = 0xB;
2198 			cp->cp_ecx = i;
2199 
2200 			(void) __cpuid_insn(cp);
2201 			level = CPI_CPU_LEVEL_TYPE(cp);
2202 
2203 			if (level == 1) {
2204 				x2apic_id = cp->cp_edx;
2205 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2206 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2207 			} else if (level == 2) {
2208 				x2apic_id = cp->cp_edx;
2209 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2210 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2211 			}
2212 		}
2213 
2214 		/*
2215 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2216 		 */
2217 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2218 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2219 		    ncpu_per_core;
2220 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2221 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2222 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2223 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2224 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2225 		cpi->cpi_compunitid = cpi->cpi_coreid;
2226 
2227 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2228 			cpi->cpi_nthread_bits = coreid_shift;
2229 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2230 		}
2231 
2232 		return (B_TRUE);
2233 	} else {
2234 		return (B_FALSE);
2235 	}
2236 }
2237 
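/*
 * Worked example for the leaf B decomposition above (hypothetical values):
 * with chipid_shift = 6, coreid_shift = 1, and x2apic_id = 0x1a, we end up
 * with cpi_chipid = 0, cpi_clogid = 0x1a, cpi_coreid = 0xd, and
 * cpi_pkgcoreid = 0xd.
 */
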
2238 static void
2239 cpuid_intel_getids(cpu_t *cpu, void *feature)
2240 {
2241 	uint_t i;
2242 	uint_t chipid_shift = 0;
2243 	uint_t coreid_shift = 0;
2244 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2245 
2246 	/*
2247 	 * There are no compute units or processor nodes currently on Intel.
2248 	 * Always set these to one.
2249 	 */
2250 	cpi->cpi_procnodes_per_pkg = 1;
2251 	cpi->cpi_cores_per_compunit = 1;
2252 
2253 	/*
2254 	 * If cpuid Leaf B is present, use that to try and get this information.
2255 	 * It will be the most accurate for Intel CPUs.
2256 	 */
2257 	if (cpuid_leafB_getids(cpu))
2258 		return;
2259 
2260 	/*
2261 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2262 	 * and ncore_per_chip. These represent the largest power of two values
2263 	 * that we need to cover all of the IDs in the system. Therefore, we use
2264 	 * those values to seed the number of bits needed to cover information
2265 	 * in the case when leaf B is not available. These values will probably
2266 	 * be larger than required, but that's OK.
2267 	 */
2268 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2269 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2270 
2271 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2272 		chipid_shift++;
2273 
2274 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2275 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2276 
2277 	if (is_x86_feature(feature, X86FSET_CMP)) {
2278 		/*
2279 		 * Multi-core (and possibly multi-threaded)
2280 		 * processors.
2281 		 */
2282 		uint_t ncpu_per_core = 0;
2283 
2284 		if (cpi->cpi_ncore_per_chip == 1)
2285 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2286 		else if (cpi->cpi_ncore_per_chip > 1)
2287 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2288 			    cpi->cpi_ncore_per_chip;
2289 		/*
2290 		 * 8bit APIC IDs on dual core Pentiums
2291 		 * look like this:
2292 		 *
2293 		 * +-----------------------+------+------+
2294 		 * | Physical Package ID   |  MC  |  HT  |
2295 		 * +-----------------------+------+------+
2296 		 * <------- chipid -------->
2297 		 * <------- coreid --------------->
2298 		 *			   <--- clogid -->
2299 		 *			   <------>
2300 		 *			   pkgcoreid
2301 		 *
2302 		 * Where the number of bits necessary to
2303 		 * represent MC and HT fields together equals
2304 		 * to the minimum number of bits necessary to
2305 		 * store the value of cpi->cpi_ncpu_per_chip.
2306 		 * Of those bits, the MC part uses the number
2307 		 * of bits necessary to store the value of
2308 		 * cpi->cpi_ncore_per_chip.
2309 		 */
2310 		for (i = 1; i < ncpu_per_core; i <<= 1)
2311 			coreid_shift++;
2312 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2313 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2314 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2315 		/*
2316 		 * Single-core multi-threaded processors.
2317 		 */
2318 		cpi->cpi_coreid = cpi->cpi_chipid;
2319 		cpi->cpi_pkgcoreid = 0;
2320 	} else {
2321 		/*
2322 		 * Single-core single-thread processors.
2323 		 */
2324 		cpi->cpi_coreid = cpu->cpu_id;
2325 		cpi->cpi_pkgcoreid = 0;
2326 	}
2327 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2328 	cpi->cpi_compunitid = cpi->cpi_coreid;
2329 }
2330 
2331 /*
2332  * Historically, AMD has had CMP chips with only a single thread per core.
2333  * However, starting in family 17h (Zen), this has changed and they now have
2334  * multiple threads. Our internal core id needs to be a unique value.
2335  *
2336  * To determine the core id of an AMD system, if we're from a family before 17h,
2337  * then we just use the cpu id, as that gives us a good value that will be
2338  * unique for each core. If instead, we're on family 17h or later, then we need
2339  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2340  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2341  * We can't use the normal core id in that leaf as it's only unique within the
2342  * socket, which is perfect for cpi_pkgcoreid, but not us.
2343  */
2344 static id_t
2345 cpuid_amd_get_coreid(cpu_t *cpu)
2346 {
2347 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2348 
2349 	if (cpi->cpi_family >= 0x17 &&
2350 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2351 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2352 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2353 		if (nthreads > 1) {
2354 			VERIFY3U(nthreads, ==, 2);
2355 			return (cpi->cpi_apicid >> 1);
2356 		}
2357 	}
2358 
2359 	return (cpu->cpu_id);
2360 }
2361 
2362 /*
2363  * Determining IDs on AMD is a more challenging task. This is notable because of the
2364  * following two facts:
2365  *
2366  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2367  *     also no way to get an actual unique core id from the system. As such, we
2368  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2369  *     however, guarantee that sibling cores of a chip will have sequential
2370  *     coreids starting at a multiple of the number of cores per chip - that is
2371  *     usually the case, but if the ACPI MADT table is presented in a different
2372  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2373  *
2374  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2375  *     called compute units. These compute units share the L1I cache, L2 cache,
2376  *     and the FPU. To deal with this, a new topology leaf was added in
2377  *     0x8000001e. However, parts of this leaf have different meanings
2378  *     once we get to family 0x17.
2379  */
2380 
2381 static void
2382 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2383 {
2384 	int i, first_half, coreidsz;
2385 	uint32_t nb_caps_reg;
2386 	uint_t node2_1;
2387 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2388 	struct cpuid_regs *cp;
2389 
2390 	/*
2391 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2392 	 * hasn't been stripped by virtualization). We always set the compute
2393 	 * unit id to the same value. Also, initialize the default number of
2394 	 * cores per compute unit and nodes per package. This will be
2395 	 * overwritten when we know information about a particular family.
2396 	 */
2397 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2398 	cpi->cpi_compunitid = cpi->cpi_coreid;
2399 	cpi->cpi_cores_per_compunit = 1;
2400 	cpi->cpi_procnodes_per_pkg = 1;
2401 
2402 	/*
2403 	 * To construct the logical ID, we need to determine how many APIC IDs
2404 	 * are dedicated to the cores and threads. This is provided for us in
2405 	 * 0x80000008. However, if it's not present (say due to virtualization),
2406 	 * then we assume it's one. This should be present on all 64-bit AMD
2407 	 * processors.  It was added in family 0xf (Hammer).
2408 	 */
2409 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2410 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2411 
2412 		/*
2413 		 * In AMD parlance chip is really a node while illumos
2414 		 * uses chip as equivalent to socket/package.
2415 		 */
2416 		if (coreidsz == 0) {
2417 			/* Use legacy method */
2418 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2419 				coreidsz++;
2420 			if (coreidsz == 0)
2421 				coreidsz = 1;
2422 		}
2423 	} else {
2424 		/* Assume single-core part */
2425 		coreidsz = 1;
2426 	}
2427 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2428 
2429 	/*
2430 	 * The package core ID varies depending on the family. While it may be
2431 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2432 	 * this value is the core id in the given node. For non-virtualized
2433 	 * family 17h, we need to take the logical core id and shift off the
2434 	 * threads like we do when getting the core id.  Otherwise, we can use
2435 	 * the clogid as is. When family 17h is virtualized, the clogid should
2436 	 * be sufficient as if we don't have valid data in the leaf, then we
2437 	 * won't think we have SMT, in which case the cpi_clogid should be
2438 	 * sufficient.
2439 	 */
2440 	if (cpi->cpi_family >= 0x17 &&
2441 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2442 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2443 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2444 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2445 		if (nthreads > 1) {
2446 			VERIFY3U(nthreads, ==, 2);
2447 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2448 		} else {
2449 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2450 		}
2451 	} else {
2452 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2453 	}
2454 
2455 	/*
2456 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2457 	 * (bulldozer) or newer, then we can derive all of this from leaf
2458 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2459 	 */
2460 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2461 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2462 		cp = &cpi->cpi_extd[0x1e];
2463 
2464 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2465 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2466 
2467 		/*
2468 		 * For Bulldozer-era CPUs, recalculate the compute unit
2469 		 * information.
2470 		 */
2471 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2472 			cpi->cpi_cores_per_compunit =
2473 			    BITX(cp->cp_ebx, 15, 8) + 1;
2474 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2475 			    (cpi->cpi_ncore_per_chip /
2476 			    cpi->cpi_cores_per_compunit) *
2477 			    (cpi->cpi_procnodeid /
2478 			    cpi->cpi_procnodes_per_pkg);
2479 		}
2480 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2481 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2482 	} else if (cpi->cpi_family == 0x10) {
2483 		/*
2484 		 * See if we are a multi-node processor.
2485 		 * All processors in the system have the same number of nodes
2486 		 */
2487 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2488 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2489 			/* Single-node */
2490 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2491 			    coreidsz);
2492 		} else {
2493 
2494 			/*
2495 			 * Multi-node revision D (2 nodes per package
2496 			 * are supported)
2497 			 */
2498 			cpi->cpi_procnodes_per_pkg = 2;
2499 
2500 			first_half = (cpi->cpi_pkgcoreid <=
2501 			    (cpi->cpi_ncore_per_chip/2 - 1));
2502 
2503 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2504 				/* We are BSP */
2505 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2506 			} else {
2507 
2508 				/* We are AP */
2509 				/* NodeId[2:1] bits to use for reading F3xe8 */
2510 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2511 
2512 				nb_caps_reg =
2513 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2514 
2515 				/*
2516 				 * Check IntNodeNum bit (31:30, but bit 31 is
2517 				 * always 0 on dual-node processors)
2518 				 */
2519 				if (BITX(nb_caps_reg, 30, 30) == 0)
2520 					cpi->cpi_procnodeid = node2_1 +
2521 					    !first_half;
2522 				else
2523 					cpi->cpi_procnodeid = node2_1 +
2524 					    first_half;
2525 			}
2526 		}
2527 	} else {
2528 		cpi->cpi_procnodeid = 0;
2529 	}
2530 
2531 	cpi->cpi_chipid =
2532 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2533 
2534 	cpi->cpi_ncore_bits = coreidsz;
2535 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2536 	    cpi->cpi_ncore_per_chip);
2537 }
2538 
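/*
 * Illustrative sketch, not used by the code above: how the AMD ID
 * derivation works out for a hypothetical part with coreidsz = 3 and two
 * threads per core. For apicid = 0x15:
 *
 *	clogid    = 0x15 & ((1 << 3) - 1) = 5	(logical CPU within the node)
 *	pkgcoreid = 5 >> 1                = 2	(core within the package)
 *
 * The helper below restates only the clogid mask; its name and the example
 * values are assumptions for illustration, not part of the original logic.
 */
static uint_t
example_amd_clogid(uint_t apicid, uint_t coreidsz)
{
	/* The low coreidsz bits of the APIC ID select the logical CPU. */
	return (apicid & ((1 << coreidsz) - 1));
}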
2539 static void
2540 spec_uarch_flush_noop(void)
2541 {
2542 }
2543 
2544 /*
2545  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2546  * MDS-related micro-architectural state that would normally happen by calling
2547  * x86_md_clear().
2548  */
2549 static void
2550 spec_uarch_flush_msr(void)
2551 {
2552 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2553 }
2554 
2555 /*
2556  * This function pointer points to a function that will flush certain
2557  * micro-architectural state on the processor. This flush is used to mitigate
2558  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2559  * function can point to one of three functions:
2560  *
2561  * - A noop, used either because we are vulnerable but do not have microcode
2562  *   available to help deal with a fix, or because we aren't vulnerable in
2563  *   the first place.
2564  *
2565  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2566  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2567  *   however, it only flushes the MDS related micro-architectural state on the
2568  *   current hyperthread, it does not do anything for the twin.
2569  *
2570  * - x86_md_clear which will flush the MDS related state. This is done when we
2571  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2572  *   (RDCL_NO is set).
2573  */
2574 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2575 
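/*
 * Illustrative sketch, not used by the code above: a hypothetical call site
 * for the pointer. Callers do not care which mitigation was selected; they
 * simply invoke whatever spec_uarch_flush currently points at before
 * crossing the boundary that needs protecting. The function name here is an
 * assumption for illustration only.
 */
static void
example_spec_uarch_flush_caller(void)
{
	(*spec_uarch_flush)();
}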
2576 static void
2577 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2578 {
2579 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2580 
2581 	/*
2582 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2583 	 * has been fixed in hardware, it doesn't cover everything related to
2584 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2585 	 * need to mitigate this.
2586 	 */
2587 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2588 	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2589 		return;
2590 	}
2591 
2592 	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2593 		const uint8_t nop = NOP_INSTR;
2594 		uint8_t *md = (uint8_t *)x86_md_clear;
2595 
2596 		*md = nop;
2597 	}
2598 
2599 	membar_producer();
2600 }
2601 
2602 static void
2603 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2604 {
2605 	boolean_t need_l1d, need_mds;
2606 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2607 
2608 	/*
2609 	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2610 	 * hardware, then there's nothing left for us to do for enabling the
2611 	 * flush. We can also go ahead and say that SMT exclusion is
2612 	 * unnecessary.
2613 	 */
2614 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2615 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2616 	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2617 		extern int smt_exclusion;
2618 		smt_exclusion = 0;
2619 		spec_uarch_flush = spec_uarch_flush_noop;
2620 		membar_producer();
2621 		return;
2622 	}
2623 
2624 	/*
2625 	 * The locations where we need to perform an L1D flush are required both
2626 	 * for mitigating L1TF and MDS. When verw support is present in
2627 	 * microcode, then the L1D flush will take care of doing that as well.
2628 	 * However, if we have a system where RDCL_NO is present, but we don't
2629 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2630 	 * L1D flush.
2631 	 */
2632 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2633 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2634 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2635 		need_l1d = B_TRUE;
2636 	} else {
2637 		need_l1d = B_FALSE;
2638 	}
2639 
2640 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2641 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2642 		need_mds = B_TRUE;
2643 	} else {
2644 		need_mds = B_FALSE;
2645 	}
2646 
2647 	if (need_l1d) {
2648 		spec_uarch_flush = spec_uarch_flush_msr;
2649 	} else if (need_mds) {
2650 		spec_uarch_flush = x86_md_clear;
2651 	} else {
2652 		/*
2653 		 * We have no hardware mitigations available to us.
2654 		 */
2655 		spec_uarch_flush = spec_uarch_flush_noop;
2656 	}
2657 	membar_producer();
2658 }
2659 
2660 /*
2661  * We default to enabling RSB mitigations.
2662  */
2663 static void
2664 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2665 {
2666 	const uint8_t ret = RET_INSTR;
2667 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2668 
2669 	switch (mit) {
2670 	case X86_SPECTREV2_ENHANCED_IBRS:
2671 	case X86_SPECTREV2_DISABLED:
2672 		*stuff = ret;
2673 		break;
2674 	default:
2675 		break;
2676 	}
2677 }
2678 
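/*
 * Note on the patching technique above (an editorial gloss derived from the
 * code, not new behavior): x86_rsb_stuff is treated as patchable text, and
 * writing a ret (RET_INSTR, the one-byte 0xc3 return) over its first byte
 * turns the stub into an immediate return. That is how RSB stuffing is
 * switched off when it is unnecessary (enhanced IBRS) or when Spectre v2
 * mitigations are disabled entirely.
 */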
2679 static void
2680 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2681 {
2682 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2683 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2684 	    "_r14", "_r15" };
2685 	const uint_t nthunks = ARRAY_SIZE(thunks);
2686 	const char *type;
2687 	uint_t i;
2688 
2689 	if (mit == x86_spectrev2_mitigation)
2690 		return;
2691 
2692 	switch (mit) {
2693 	case X86_SPECTREV2_RETPOLINE:
2694 		type = "gen";
2695 		break;
2696 	case X86_SPECTREV2_RETPOLINE_AMD:
2697 		type = "amd";
2698 		break;
2699 	case X86_SPECTREV2_ENHANCED_IBRS:
2700 	case X86_SPECTREV2_DISABLED:
2701 		type = "jmp";
2702 		break;
2703 	default:
2704 		panic("asked to update retpoline state with unknown state!");
2705 	}
2706 
2707 	for (i = 0; i < nthunks; i++) {
2708 		uintptr_t source, dest;
2709 		int ssize, dsize;
2710 		char sourcebuf[64], destbuf[64];
2711 		size_t len;
2712 
2713 		(void) snprintf(destbuf, sizeof (destbuf),
2714 		    "__x86_indirect_thunk%s", thunks[i]);
2715 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
2716 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
2717 
2718 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2719 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
2720 		VERIFY3U(source, !=, 0);
2721 		VERIFY3U(dest, !=, 0);
2722 		VERIFY3S(dsize, >=, ssize);
2723 		bcopy((void *)source, (void *)dest, ssize);
2724 	}
2725 }
2726 
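/*
 * Worked example of the copy performed above: with type = "gen" and the
 * "_rax" thunk, sourcebuf and destbuf come out as
 *
 *	__x86_indirect_thunk_gen_rax  ->  __x86_indirect_thunk_rax
 *
 * i.e. the selected retpoline body is copied over the dispatch symbol that
 * compiled code actually jumps through.
 */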
2727 static void
2728 cpuid_enable_enhanced_ibrs(void)
2729 {
2730 	uint64_t val;
2731 
2732 	val = rdmsr(MSR_IA32_SPEC_CTRL);
2733 	val |= IA32_SPEC_CTRL_IBRS;
2734 	wrmsr(MSR_IA32_SPEC_CTRL, val);
2735 }
2736 
2737 /*
2738  * Determine whether or not we can use the AMD optimized retpoline
2739  * functionality. We use this when we know we're on an AMD system and we can
2740  * successfully verify that lfence is dispatch serializing.
2741  */
2742 static boolean_t
2743 cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2744 {
2745 	if (cpi->cpi_vendor != X86_VENDOR_AMD &&
2746 	    cpi->cpi_vendor != X86_VENDOR_HYGON)
2747 		return (B_FALSE);
2748 
2749 	return (is_x86_feature(x86_featureset, X86FSET_LFENCE_SER));
2750 }
2751 
2752 /*
2753  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2754  * we can disable TSX, we do so.
2755  *
2756  * This determination is done only on the boot CPU, potentially after loading
2757  * updated microcode.
2758  */
2759 static void
2760 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2761 {
2762 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2763 
2764 	VERIFY(cpu->cpu_id == 0);
2765 
2766 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2767 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2768 		return;
2769 	}
2770 
2771 	if (x86_disable_taa) {
2772 		x86_taa_mitigation = X86_TAA_DISABLED;
2773 		return;
2774 	}
2775 
2776 	/*
2777 	 * If we do not have the ability to disable TSX, then our only
2778 	 * mitigation options are in hardware (TAA_NO), or by using our existing
2779 	 * MDS mitigation as described above.  The latter relies upon us having
2780 	 * configured MDS mitigations correctly! This includes disabling SMT if
2781 	 * we want to cross-CPU-thread protection.
2782 	 */
2783 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2784 		/*
2785 		 * It's not clear whether any parts will enumerate TAA_NO
2786 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
2787 		 */
2788 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2789 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2790 			return;
2791 		}
2792 
2793 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2794 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2795 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
2796 		} else {
2797 			x86_taa_mitigation = X86_TAA_NOTHING;
2798 		}
2799 		return;
2800 	}
2801 
2802 	/*
2803 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
2804 	 * enough in boot.
2805 	 *
2806 	 * Otherwise, we'll fall back to causing transactions to abort as our
2807 	 * mitigation. TSX-using code will always take the fallback path.
2808 	 */
2809 	if (cpi->cpi_pass < 4) {
2810 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2811 	} else {
2812 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2813 	}
2814 }
2815 
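/*
 * Summary of the selection above, for reference (derived directly from the
 * code, not additional policy):
 *
 *	not Intel			-> X86_TAA_HW_MITIGATED
 *	x86_disable_taa set		-> X86_TAA_DISABLED
 *	no TSX_CTRL, TAA_NO		-> X86_TAA_HW_MITIGATED
 *	no TSX_CTRL, MD_CLEAR, !MDS_NO	-> X86_TAA_MD_CLEAR
 *	no TSX_CTRL, otherwise		-> X86_TAA_NOTHING
 *	TSX_CTRL, cpi_pass < 4		-> X86_TAA_TSX_DISABLE
 *	TSX_CTRL, cpi_pass >= 4		-> X86_TAA_TSX_FORCE_ABORT
 */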
2816 /*
2817  * As mentioned, we should only touch the MSR when we've got suitable
2818  * microcode loaded on this CPU.
2819  */
2820 static void
2821 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
2822 {
2823 	uint64_t val;
2824 
2825 	switch (taa) {
2826 	case X86_TAA_TSX_DISABLE:
2827 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2828 			return;
2829 		val = rdmsr(MSR_IA32_TSX_CTRL);
2830 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
2831 		wrmsr(MSR_IA32_TSX_CTRL, val);
2832 		break;
2833 	case X86_TAA_TSX_FORCE_ABORT:
2834 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2835 			return;
2836 		val = rdmsr(MSR_IA32_TSX_CTRL);
2837 		val |= IA32_TSX_CTRL_RTM_DISABLE;
2838 		wrmsr(MSR_IA32_TSX_CTRL, val);
2839 		break;
2840 	case X86_TAA_HW_MITIGATED:
2841 	case X86_TAA_MD_CLEAR:
2842 	case X86_TAA_DISABLED:
2843 	case X86_TAA_NOTHING:
2844 		break;
2845 	}
2846 }
2847 
2848 static void
2849 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2850 {
2851 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2852 	x86_spectrev2_mitigation_t v2mit;
2853 
2854 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2855 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2856 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2857 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2858 			add_x86_feature(featureset, X86FSET_IBPB);
2859 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2860 			add_x86_feature(featureset, X86FSET_IBRS);
2861 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2862 			add_x86_feature(featureset, X86FSET_STIBP);
2863 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2864 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
2865 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2866 			add_x86_feature(featureset, X86FSET_SSBD);
2867 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2868 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2869 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2870 			add_x86_feature(featureset, X86FSET_SSB_NO);
2871 		/*
2872 		 * Don't enable enhanced IBRS unless we're told that we should
2873 		 * prefer it and it has the same semantics as Intel. This is
2874 		 * split into two bits rather than a single one.
2875 		 */
2876 		if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2877 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2878 			add_x86_feature(featureset, X86FSET_IBRS_ALL);
2879 		}
2880 
2881 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2882 	    cpi->cpi_maxeax >= 7) {
2883 		struct cpuid_regs *ecp;
2884 		ecp = &cpi->cpi_std[7];
2885 
2886 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2887 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
2888 		}
2889 
2890 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2891 			add_x86_feature(featureset, X86FSET_IBRS);
2892 			add_x86_feature(featureset, X86FSET_IBPB);
2893 		}
2894 
2895 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2896 			add_x86_feature(featureset, X86FSET_STIBP);
2897 		}
2898 
2899 		/*
2900 		 * Don't read the arch caps MSR on xpv where we lack the
2901 		 * on_trap().
2902 		 */
2903 #ifndef __xpv
2904 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2905 			on_trap_data_t otd;
2906 
2907 			/*
2908 			 * Be paranoid and assume we'll get a #GP.
2909 			 */
2910 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
2911 				uint64_t reg;
2912 
2913 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2914 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
2915 					add_x86_feature(featureset,
2916 					    X86FSET_RDCL_NO);
2917 				}
2918 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2919 					add_x86_feature(featureset,
2920 					    X86FSET_IBRS_ALL);
2921 				}
2922 				if (reg & IA32_ARCH_CAP_RSBA) {
2923 					add_x86_feature(featureset,
2924 					    X86FSET_RSBA);
2925 				}
2926 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2927 					add_x86_feature(featureset,
2928 					    X86FSET_L1D_VM_NO);
2929 				}
2930 				if (reg & IA32_ARCH_CAP_SSB_NO) {
2931 					add_x86_feature(featureset,
2932 					    X86FSET_SSB_NO);
2933 				}
2934 				if (reg & IA32_ARCH_CAP_MDS_NO) {
2935 					add_x86_feature(featureset,
2936 					    X86FSET_MDS_NO);
2937 				}
2938 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
2939 					add_x86_feature(featureset,
2940 					    X86FSET_TSX_CTRL);
2941 				}
2942 				if (reg & IA32_ARCH_CAP_TAA_NO) {
2943 					add_x86_feature(featureset,
2944 					    X86FSET_TAA_NO);
2945 				}
2946 			}
2947 			no_trap();
2948 		}
2949 #endif	/* !__xpv */
2950 
2951 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2952 			add_x86_feature(featureset, X86FSET_SSBD);
2953 
2954 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2955 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2956 	}
2957 
2958 	/*
2959 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
2960 	 * will have already run this function and determined what we need to
2961 	 * do. This gives us a hook for per-HW thread mitigations such as
2962 	 * enhanced IBRS, or disabling TSX.
2963 	 */
2964 	if (cpu->cpu_id != 0) {
2965 		if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2966 			cpuid_enable_enhanced_ibrs();
2967 		}
2968 
2969 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
2970 		return;
2971 	}
2972 
2973 	/*
2974 	 * Go through and initialize various security mechanisms that we should
2975 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
2976 	 * TAA.
2977 	 */
2978 
2979 	/*
2980 	 * By default we've come in with retpolines enabled. Check whether we
2981 	 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2982 	 * by default, but disabled if we are using enhanced IBRS.
2983 	 */
2984 	if (x86_disable_spectrev2 != 0) {
2985 		v2mit = X86_SPECTREV2_DISABLED;
2986 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2987 		cpuid_enable_enhanced_ibrs();
2988 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2989 	} else if (cpuid_use_amd_retpoline(cpi)) {
2990 		v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2991 	} else {
2992 		v2mit = X86_SPECTREV2_RETPOLINE;
2993 	}
2994 
2995 	cpuid_patch_retpolines(v2mit);
2996 	cpuid_patch_rsb(v2mit);
2997 	x86_spectrev2_mitigation = v2mit;
2998 	membar_producer();
2999 
3000 	/*
3001 	 * We need to determine what changes are required for mitigating L1TF
3002 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3003 	 * is required.
3004 	 *
3005 	 * If any of these are present, then we need to flush u-arch state at
3006 	 * various points. For MDS, we need to do so whenever we change to a
3007 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3008 	 * flush the L1D cache at VM entry. When we have microcode that handles
3009 	 * MDS, the L1D flush also clears the other u-arch state that the
3010 	 * md_clear does.
3011 	 */
3012 
3013 	/*
3014 	 * Update whether or not we need to be taking explicit action against
3015 	 * MDS.
3016 	 */
3017 	cpuid_update_md_clear(cpu, featureset);
3018 
3019 	/*
3020 	 * Determine whether SMT exclusion is required and whether or not we
3021 	 * need to perform an l1d flush.
3022 	 */
3023 	cpuid_update_l1d_flush(cpu, featureset);
3024 
3025 	/*
3026 	 * Determine what our mitigation strategy should be for TAA and then
3027 	 * also apply TAA mitigations.
3028 	 */
3029 	cpuid_update_tsx(cpu, featureset);
3030 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3031 }
3032 
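/*
 * Illustrative sketch, not used by the code above: the guarded-MSR-read
 * pattern that cpuid_scan_security() and cpuid_pass1_ppin() rely on, pulled
 * out on its own. Reading an unimplemented MSR raises #GP; on_trap()
 * converts that into a plain failure so a default can be returned instead.
 * The function name and the defval parameter are assumptions for
 * illustration only.
 */
static uint64_t
example_rdmsr_safe(uint_t msr, uint64_t defval)
{
	on_trap_data_t otd;
	uint64_t val = defval;

	if (!on_trap(&otd, OT_DATA_ACCESS))
		val = rdmsr(msr);
	no_trap();

	return (val);
}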
3033 /*
3034  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3035  */
3036 void
3037 setup_xfem(void)
3038 {
3039 	uint64_t flags = XFEATURE_LEGACY_FP;
3040 
3041 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3042 
3043 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3044 		flags |= XFEATURE_SSE;
3045 
3046 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3047 		flags |= XFEATURE_AVX;
3048 
3049 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3050 		flags |= XFEATURE_AVX512;
3051 
3052 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3053 
3054 	xsave_bv_all = flags;
3055 }
3056 
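/*
 * Worked example, assuming the usual XCR0 bit assignments: on an
 * AVX-capable CPU without AVX-512, flags above ends up as
 * XFEATURE_LEGACY_FP | XFEATURE_SSE | XFEATURE_AVX, i.e. the low three
 * XCR0 bits (x87, SSE, and AVX state) are enabled for xsave.
 */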
3057 static void
3058 cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
3059 {
3060 	struct cpuid_info *cpi;
3061 
3062 	cpi = cpu->cpu_m.mcpu_cpi;
3063 
3064 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3065 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3066 		cpuid_gather_amd_topology_leaves(cpu);
3067 	}
3068 
3069 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3070 
3071 	/*
3072 	 * Before we can calculate the IDs that we should assign to this
3073 	 * processor, we need to understand how many cores and threads it has.
3074 	 */
3075 	switch (cpi->cpi_vendor) {
3076 	case X86_VENDOR_Intel:
3077 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3078 		    &cpi->cpi_ncore_per_chip);
3079 		break;
3080 	case X86_VENDOR_AMD:
3081 	case X86_VENDOR_HYGON:
3082 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3083 		    &cpi->cpi_ncore_per_chip);
3084 		break;
3085 	default:
3086 		/*
3087 		 * If we have some other x86-compatible chip, it's not clear how
3088 		 * it would behave. The most common case is virtualization
3089 		 * today, though there are also 64-bit VIA chips. Assume that
3090 		 * all we can get is the basic Leaf 1 HTT information.
3091 		 */
3092 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3093 			cpi->cpi_ncore_per_chip = 1;
3094 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3095 		}
3096 		break;
3097 	}
3098 
3099 	/*
3100 	 * Based on the calculated number of threads and cores, potentially
3101 	 * assign the HTT and CMT features.
3102 	 */
3103 	if (cpi->cpi_ncore_per_chip > 1) {
3104 		add_x86_feature(featureset, X86FSET_CMP);
3105 	}
3106 
3107 	if (cpi->cpi_ncpu_per_chip > 1 &&
3108 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3109 		add_x86_feature(featureset, X86FSET_HTT);
3110 	}
3111 
3112 	/*
3113 	 * Now that this has been set up, we go through and calculate all of
3114 	 * the rest of the parameters that exist. If we think the CPU doesn't
3115 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3116 	 * up information in some way. The most likely case for this is
3117 	 * virtualization where we have a lot of partial topology information.
3118 	 */
3119 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3120 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3121 		/*
3122 		 * This is a single core, single-threaded processor.
3123 		 */
3124 		cpi->cpi_procnodes_per_pkg = 1;
3125 		cpi->cpi_cores_per_compunit = 1;
3126 		cpi->cpi_compunitid = 0;
3127 		cpi->cpi_chipid = -1;
3128 		cpi->cpi_clogid = 0;
3129 		cpi->cpi_coreid = cpu->cpu_id;
3130 		cpi->cpi_pkgcoreid = 0;
3131 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3132 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3133 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3134 		} else {
3135 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3136 		}
3137 	} else {
3138 		switch (cpi->cpi_vendor) {
3139 		case X86_VENDOR_Intel:
3140 			cpuid_intel_getids(cpu, featureset);
3141 			break;
3142 		case X86_VENDOR_AMD:
3143 		case X86_VENDOR_HYGON:
3144 			cpuid_amd_getids(cpu, featureset);
3145 			break;
3146 		default:
3147 			/*
3148 			 * In this case, it's hard to say what we should do.
3149 			 * We're going to model them to the OS as single core
3150 			 * threads. We don't have a good identifier for them, so
3151 			 * we're just going to use the cpu id all on a single
3152 			 * chip.
3153 			 *
3154 			 * This case has historically been different from the
3155 			 * case above where we don't have HTT or CMP. While they
3156 			 * could be combined, we've opted to keep it separate to
3157 			 * minimize the risk of topology changes in weird cases.
3158 			 */
3159 			cpi->cpi_procnodes_per_pkg = 1;
3160 			cpi->cpi_cores_per_compunit = 1;
3161 			cpi->cpi_chipid = 0;
3162 			cpi->cpi_coreid = cpu->cpu_id;
3163 			cpi->cpi_clogid = cpu->cpu_id;
3164 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3165 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3166 			cpi->cpi_compunitid = cpi->cpi_coreid;
3167 			break;
3168 		}
3169 	}
3170 }
3171 
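/*
 * Illustrative sketch, not used by the code above: how the per-chip counts
 * translate into the topology features. A chip reporting ncpu_per_chip = 8
 * and ncore_per_chip = 4 gets both X86FSET_CMP (more than one core) and
 * X86FSET_HTT (more threads than cores); an 8-core, 8-thread chip gets only
 * X86FSET_CMP. The helper name is an assumption for illustration.
 */
static boolean_t
example_chip_has_smt(uint_t ncpu_per_chip, uint_t ncore_per_chip)
{
	return ((ncpu_per_chip > 1 &&
	    ncpu_per_chip != ncore_per_chip) ? B_TRUE : B_FALSE);
}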
3172 /*
3173  * Gather relevant CPU features from leaf 6, which covers thermal information. We
3174  * always gather leaf 6 if it's supported; however, we only look for features on
3175  * Intel systems as AMD does not currently define any of the features we look
3176  * for below.
3177  */
3178 static void
3179 cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3180 {
3181 	struct cpuid_regs *cp;
3182 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3183 
3184 	if (cpi->cpi_maxeax < 6) {
3185 		return;
3186 	}
3187 
3188 	cp = &cpi->cpi_std[6];
3189 	cp->cp_eax = 6;
3190 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3191 	(void) __cpuid_insn(cp);
3192 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3193 
3194 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3195 		return;
3196 	}
3197 
3198 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3199 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3200 	}
3201 
3202 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3203 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3204 	}
3205 }
3206 
3207 /*
3208  * PPIN is the protected processor inventory number. On AMD this is an actual
3209  * feature bit. However, on Intel systems we need to read the platform
3210  * information MSR if we're on a specific model.
3211  */
3212 #if !defined(__xpv)
3213 static void
3214 cpuid_pass1_ppin(cpu_t *cpu, uchar_t *featureset)
3215 {
3216 	on_trap_data_t otd;
3217 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3218 
3219 	switch (cpi->cpi_vendor) {
3220 	case X86_VENDOR_AMD:
3221 		/*
3222 		 * This leaf will have already been gathered in the topology
3223 		 * functions.
3224 		 */
3225 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3226 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3227 				add_x86_feature(featureset, X86FSET_PPIN);
3228 			}
3229 		}
3230 		break;
3231 	case X86_VENDOR_Intel:
3232 		if (cpi->cpi_family != 6)
3233 			break;
3234 		switch (cpi->cpi_model) {
3235 		case INTC_MODEL_IVYBRIDGE_XEON:
3236 		case INTC_MODEL_HASWELL_XEON:
3237 		case INTC_MODEL_BROADWELL_XEON:
3238 		case INTC_MODEL_BROADWELL_XEON_D:
3239 		case INTC_MODEL_SKYLAKE_XEON:
3240 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3241 				uint64_t value;
3242 
3243 				value = rdmsr(MSR_PLATFORM_INFO);
3244 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3245 					add_x86_feature(featureset,
3246 					    X86FSET_PPIN);
3247 				}
3248 			}
3249 			no_trap();
3250 			break;
3251 		default:
3252 			break;
3253 		}
3254 		break;
3255 	default:
3256 		break;
3257 	}
3258 }
3259 #endif	/* ! __xpv */
3260 
3261 void
3262 cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3263 {
3264 	uint32_t mask_ecx, mask_edx;
3265 	struct cpuid_info *cpi;
3266 	struct cpuid_regs *cp;
3267 	int xcpuid;
3268 #if !defined(__xpv)
3269 	extern int idle_cpu_prefer_mwait;
3270 #endif
3271 
3272 	/*
3273 	 * Space statically allocated for BSP, ensure pointer is set
3274 	 */
3275 	if (cpu->cpu_id == 0) {
3276 		if (cpu->cpu_m.mcpu_cpi == NULL)
3277 			cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3278 	}
3279 
3280 	add_x86_feature(featureset, X86FSET_CPUID);
3281 
3282 	cpi = cpu->cpu_m.mcpu_cpi;
3283 	ASSERT(cpi != NULL);
3284 	cp = &cpi->cpi_std[0];
3285 	cp->cp_eax = 0;
3286 	cpi->cpi_maxeax = __cpuid_insn(cp);
3287 	{
3288 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3289 		*iptr++ = cp->cp_ebx;
3290 		*iptr++ = cp->cp_edx;
3291 		*iptr++ = cp->cp_ecx;
3292 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3293 	}
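	/*
	 * Worked example: leaf 0 returns the vendor string in %ebx, %edx,
	 * %ecx order, four characters per register, which is why the words
	 * are stored in that order above. On an Intel part they spell
	 * "Genu", "ineI", "ntel" -- "GenuineIntel"; on AMD, "AuthenticAMD".
	 */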
3294 
3295 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3296 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3297 
3298 	/*
3299 	 * Limit the range in case of weird hardware
3300 	 */
3301 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3302 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3303 	if (cpi->cpi_maxeax < 1)
3304 		goto pass1_done;
3305 
3306 	cp = &cpi->cpi_std[1];
3307 	cp->cp_eax = 1;
3308 	(void) __cpuid_insn(cp);
3309 
3310 	/*
3311 	 * Extract identifying constants for easy access.
3312 	 */
3313 	cpi->cpi_model = CPI_MODEL(cpi);
3314 	cpi->cpi_family = CPI_FAMILY(cpi);
3315 
3316 	if (cpi->cpi_family == 0xf)
3317 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3318 
3319 	/*
3320 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3321 	 * Intel, and presumably everyone else, uses model == 0xf, as
3322 	 * one would expect (max value means possible overflow).  Sigh.
3323 	 */
3324 
3325 	switch (cpi->cpi_vendor) {
3326 	case X86_VENDOR_Intel:
3327 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3328 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3329 		break;
3330 	case X86_VENDOR_AMD:
3331 		if (CPI_FAMILY(cpi) == 0xf)
3332 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3333 		break;
3334 	case X86_VENDOR_HYGON:
3335 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3336 		break;
3337 	default:
3338 		if (cpi->cpi_model == 0xf)
3339 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3340 		break;
3341 	}
3342 
3343 	cpi->cpi_step = CPI_STEP(cpi);
3344 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3345 
3346 	/*
3347 	 * *default* assumptions:
3348 	 * - believe %edx feature word
3349 	 * - ignore %ecx feature word
3350 	 * - 32-bit virtual and physical addressing
3351 	 */
3352 	mask_edx = 0xffffffff;
3353 	mask_ecx = 0;
3354 
3355 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3356 
3357 	switch (cpi->cpi_vendor) {
3358 	case X86_VENDOR_Intel:
3359 		if (cpi->cpi_family == 5)
3360 			x86_type = X86_TYPE_P5;
3361 		else if (IS_LEGACY_P6(cpi)) {
3362 			x86_type = X86_TYPE_P6;
3363 			pentiumpro_bug4046376 = 1;
3364 			/*
3365 			 * Clear the SEP bit when it was set erroneously
3366 			 */
3367 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3368 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3369 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3370 			x86_type = X86_TYPE_P4;
3371 			/*
3372 			 * We don't currently depend on any of the %ecx
3373 			 * features until Prescott, so we'll only check
3374 			 * this from P4 onwards.  We might want to revisit
3375 			 * that idea later.
3376 			 */
3377 			mask_ecx = 0xffffffff;
3378 		} else if (cpi->cpi_family > 0xf)
3379 			mask_ecx = 0xffffffff;
3380 		/*
3381 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3382 		 * to obtain the monitor linesize.
3383 		 */
3384 		if (cpi->cpi_maxeax < 5)
3385 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3386 		break;
3387 	case X86_VENDOR_IntelClone:
3388 	default:
3389 		break;
3390 	case X86_VENDOR_AMD:
3391 #if defined(OPTERON_ERRATUM_108)
3392 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3393 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3394 			cpi->cpi_model = 0xc;
3395 		} else
3396 #endif
3397 		if (cpi->cpi_family == 5) {
3398 			/*
3399 			 * AMD K5 and K6
3400 			 *
3401 			 * These CPUs have an incomplete implementation
3402 			 * of MCA/MCE which we mask away.
3403 			 */
3404 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3405 
3406 			/*
3407 			 * Model 0 uses the wrong (APIC) bit
3408 			 * to indicate PGE.  Fix it here.
3409 			 */
3410 			if (cpi->cpi_model == 0) {
3411 				if (cp->cp_edx & 0x200) {
3412 					cp->cp_edx &= ~0x200;
3413 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3414 				}
3415 			}
3416 
3417 			/*
3418 			 * Early models had problems w/ MMX; disable.
3419 			 */
3420 			if (cpi->cpi_model < 6)
3421 				mask_edx &= ~CPUID_INTC_EDX_MMX;
3422 		}
3423 
3424 		/*
3425 		 * For newer families, SSE3 and CX16, at least, are valid;
3426 		 * enable all
3427 		 */
3428 		if (cpi->cpi_family >= 0xf)
3429 			mask_ecx = 0xffffffff;
3430 		/*
3431 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3432 		 * to obtain the monitor linesize.
3433 		 */
3434 		if (cpi->cpi_maxeax < 5)
3435 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3436 
3437 #if !defined(__xpv)
3438 		/*
3439 		 * AMD has not historically used MWAIT in the CPU's idle loop.
3440 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3441 		 * know for certain that in at least family 17h, per AMD, mwait
3442 		 * is preferred. Families in-between are less certain.
3443 		 */
3444 		if (cpi->cpi_family < 0x17) {
3445 			idle_cpu_prefer_mwait = 0;
3446 		}
3447 #endif
3448 
3449 		break;
3450 	case X86_VENDOR_HYGON:
3451 		/* Enable all for Hygon Dhyana CPU */
3452 		mask_ecx = 0xffffffff;
3453 		break;
3454 	case X86_VENDOR_TM:
3455 		/*
3456 		 * workaround the NT workaround in CMS 4.1
3457 		 */
3458 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3459 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3460 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3461 		break;
3462 	case X86_VENDOR_Centaur:
3463 		/*
3464 		 * workaround the NT workarounds again
3465 		 */
3466 		if (cpi->cpi_family == 6)
3467 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3468 		break;
3469 	case X86_VENDOR_Cyrix:
3470 		/*
3471 		 * We rely heavily on the probing in locore
3472 		 * to actually figure out what parts, if any,
3473 		 * of the Cyrix cpuid instruction to believe.
3474 		 */
3475 		switch (x86_type) {
3476 		case X86_TYPE_CYRIX_486:
3477 			mask_edx = 0;
3478 			break;
3479 		case X86_TYPE_CYRIX_6x86:
3480 			mask_edx = 0;
3481 			break;
3482 		case X86_TYPE_CYRIX_6x86L:
3483 			mask_edx =
3484 			    CPUID_INTC_EDX_DE |
3485 			    CPUID_INTC_EDX_CX8;
3486 			break;
3487 		case X86_TYPE_CYRIX_6x86MX:
3488 			mask_edx =
3489 			    CPUID_INTC_EDX_DE |
3490 			    CPUID_INTC_EDX_MSR |
3491 			    CPUID_INTC_EDX_CX8 |
3492 			    CPUID_INTC_EDX_PGE |
3493 			    CPUID_INTC_EDX_CMOV |
3494 			    CPUID_INTC_EDX_MMX;
3495 			break;
3496 		case X86_TYPE_CYRIX_GXm:
3497 			mask_edx =
3498 			    CPUID_INTC_EDX_MSR |
3499 			    CPUID_INTC_EDX_CX8 |
3500 			    CPUID_INTC_EDX_CMOV |
3501 			    CPUID_INTC_EDX_MMX;
3502 			break;
3503 		case X86_TYPE_CYRIX_MediaGX:
3504 			break;
3505 		case X86_TYPE_CYRIX_MII:
3506 		case X86_TYPE_VIA_CYRIX_III:
3507 			mask_edx =
3508 			    CPUID_INTC_EDX_DE |
3509 			    CPUID_INTC_EDX_TSC |
3510 			    CPUID_INTC_EDX_MSR |
3511 			    CPUID_INTC_EDX_CX8 |
3512 			    CPUID_INTC_EDX_PGE |
3513 			    CPUID_INTC_EDX_CMOV |
3514 			    CPUID_INTC_EDX_MMX;
3515 			break;
3516 		default:
3517 			break;
3518 		}
3519 		break;
3520 	}
3521 
3522 #if defined(__xpv)
3523 	/*
3524 	 * Do not support MONITOR/MWAIT under a hypervisor
3525 	 */
3526 	mask_ecx &= ~CPUID_INTC_ECX_MON;
3527 	/*
3528 	 * Do not support XSAVE under a hypervisor for now
3529 	 */
3530 	xsave_force_disable = B_TRUE;
3531 
3532 #endif	/* __xpv */
3533 
3534 	if (xsave_force_disable) {
3535 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3536 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
3537 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
3538 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
3539 	}
3540 
3541 	/*
3542 	 * Now we've figured out the masks that determine
3543 	 * which bits we choose to believe, apply the masks
3544 	 * to the feature words, then map the kernel's view
3545 	 * of these feature words into its feature word.
3546 	 */
3547 	cp->cp_edx &= mask_edx;
3548 	cp->cp_ecx &= mask_ecx;
3549 
3550 	/*
3551 	 * apply any platform restrictions (we don't call this
3552 	 * immediately after __cpuid_insn here, because we need the
3553 	 * workarounds applied above first)
3554 	 */
3555 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3556 
3557 	/*
3558 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3559 	 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3560 	 */
3561 	if (cpi->cpi_maxeax >= 7) {
3562 		struct cpuid_regs *ecp;
3563 		ecp = &cpi->cpi_std[7];
3564 		ecp->cp_eax = 7;
3565 		ecp->cp_ecx = 0;
3566 		(void) __cpuid_insn(ecp);
3567 
3568 		/*
3569 		 * If XSAVE has been disabled, just ignore all of the
3570 		 * extended-save-area dependent flags here.
3571 		 */
3572 		if (xsave_force_disable) {
3573 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3574 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3575 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3576 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3577 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3578 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3579 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3580 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
3581 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
3582 		}
3583 
3584 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3585 			add_x86_feature(featureset, X86FSET_SMEP);
3586 
3587 		/*
3588 		 * We check disable_smap here in addition to in startup_smap()
3589 		 * to ensure CPUs that aren't the boot CPU don't accidentally
3590 		 * include it in the feature set and thus generate a mismatched
3591 		 * x86 feature set across CPUs.
3592 		 */
3593 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3594 		    disable_smap == 0)
3595 			add_x86_feature(featureset, X86FSET_SMAP);
3596 
3597 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3598 			add_x86_feature(featureset, X86FSET_RDSEED);
3599 
3600 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3601 			add_x86_feature(featureset, X86FSET_ADX);
3602 
3603 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3604 			add_x86_feature(featureset, X86FSET_FSGSBASE);
3605 
3606 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3607 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3608 
3609 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3610 			add_x86_feature(featureset, X86FSET_INVPCID);
3611 
3612 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3613 			add_x86_feature(featureset, X86FSET_UMIP);
3614 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3615 			add_x86_feature(featureset, X86FSET_PKU);
3616 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3617 			add_x86_feature(featureset, X86FSET_OSPKE);
3618 
3619 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3620 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3621 				add_x86_feature(featureset, X86FSET_MPX);
3622 
3623 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3624 				add_x86_feature(featureset, X86FSET_CLWB);
3625 		}
3626 	}
3627 
3628 	/*
3629 	 * fold in overrides from the "eeprom" mechanism
3630 	 */
3631 	cp->cp_edx |= cpuid_feature_edx_include;
3632 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
3633 
3634 	cp->cp_ecx |= cpuid_feature_ecx_include;
3635 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3636 
3637 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3638 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
3639 	}
3640 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3641 		add_x86_feature(featureset, X86FSET_TSC);
3642 	}
3643 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3644 		add_x86_feature(featureset, X86FSET_MSR);
3645 	}
3646 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3647 		add_x86_feature(featureset, X86FSET_MTRR);
3648 	}
3649 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3650 		add_x86_feature(featureset, X86FSET_PGE);
3651 	}
3652 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3653 		add_x86_feature(featureset, X86FSET_CMOV);
3654 	}
3655 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3656 		add_x86_feature(featureset, X86FSET_MMX);
3657 	}
3658 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3659 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3660 		add_x86_feature(featureset, X86FSET_MCA);
3661 	}
3662 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3663 		add_x86_feature(featureset, X86FSET_PAE);
3664 	}
3665 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3666 		add_x86_feature(featureset, X86FSET_CX8);
3667 	}
3668 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3669 		add_x86_feature(featureset, X86FSET_CX16);
3670 	}
3671 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3672 		add_x86_feature(featureset, X86FSET_PAT);
3673 	}
3674 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3675 		add_x86_feature(featureset, X86FSET_SEP);
3676 	}
3677 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3678 		/*
3679 		 * In our implementation, fxsave/fxrstor
3680 		 * are prerequisites before we'll even
3681 		 * try and do SSE things.
3682 		 */
3683 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3684 			add_x86_feature(featureset, X86FSET_SSE);
3685 		}
3686 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3687 			add_x86_feature(featureset, X86FSET_SSE2);
3688 		}
3689 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3690 			add_x86_feature(featureset, X86FSET_SSE3);
3691 		}
3692 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3693 			add_x86_feature(featureset, X86FSET_SSSE3);
3694 		}
3695 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3696 			add_x86_feature(featureset, X86FSET_SSE4_1);
3697 		}
3698 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3699 			add_x86_feature(featureset, X86FSET_SSE4_2);
3700 		}
3701 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3702 			add_x86_feature(featureset, X86FSET_AES);
3703 		}
3704 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3705 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3706 		}
3707 
3708 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3709 			add_x86_feature(featureset, X86FSET_SHA);
3710 
3711 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3712 			add_x86_feature(featureset, X86FSET_XSAVE);
3713 
3714 			/* We only test AVX & AVX512 when there is XSAVE */
3715 
3716 			if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3717 				add_x86_feature(featureset,
3718 				    X86FSET_AVX);
3719 
3720 				/*
3721 				 * Intel says we can't check these without also
3722 				 * checking AVX.
3723 				 */
3724 				if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3725 					add_x86_feature(featureset,
3726 					    X86FSET_F16C);
3727 
3728 				if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3729 					add_x86_feature(featureset,
3730 					    X86FSET_FMA);
3731 
3732 				if (cpi->cpi_std[7].cp_ebx &
3733 				    CPUID_INTC_EBX_7_0_BMI1)
3734 					add_x86_feature(featureset,
3735 					    X86FSET_BMI1);
3736 
3737 				if (cpi->cpi_std[7].cp_ebx &
3738 				    CPUID_INTC_EBX_7_0_BMI2)
3739 					add_x86_feature(featureset,
3740 					    X86FSET_BMI2);
3741 
3742 				if (cpi->cpi_std[7].cp_ebx &
3743 				    CPUID_INTC_EBX_7_0_AVX2)
3744 					add_x86_feature(featureset,
3745 					    X86FSET_AVX2);
3746 
3747 				if (cpi->cpi_std[7].cp_ecx &
3748 				    CPUID_INTC_ECX_7_0_VAES)
3749 					add_x86_feature(featureset,
3750 					    X86FSET_VAES);
3751 
3752 				if (cpi->cpi_std[7].cp_ecx &
3753 				    CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3754 					add_x86_feature(featureset,
3755 					    X86FSET_VPCLMULQDQ);
3756 			}
3757 
3758 			if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3759 			    (cpi->cpi_std[7].cp_ebx &
3760 			    CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3761 				add_x86_feature(featureset, X86FSET_AVX512F);
3762 
3763 				if (cpi->cpi_std[7].cp_ebx &
3764 				    CPUID_INTC_EBX_7_0_AVX512DQ)
3765 					add_x86_feature(featureset,
3766 					    X86FSET_AVX512DQ);
3767 				if (cpi->cpi_std[7].cp_ebx &
3768 				    CPUID_INTC_EBX_7_0_AVX512IFMA)
3769 					add_x86_feature(featureset,
3770 					    X86FSET_AVX512FMA);
3771 				if (cpi->cpi_std[7].cp_ebx &
3772 				    CPUID_INTC_EBX_7_0_AVX512PF)
3773 					add_x86_feature(featureset,
3774 					    X86FSET_AVX512PF);
3775 				if (cpi->cpi_std[7].cp_ebx &
3776 				    CPUID_INTC_EBX_7_0_AVX512ER)
3777 					add_x86_feature(featureset,
3778 					    X86FSET_AVX512ER);
3779 				if (cpi->cpi_std[7].cp_ebx &
3780 				    CPUID_INTC_EBX_7_0_AVX512CD)
3781 					add_x86_feature(featureset,
3782 					    X86FSET_AVX512CD);
3783 				if (cpi->cpi_std[7].cp_ebx &
3784 				    CPUID_INTC_EBX_7_0_AVX512BW)
3785 					add_x86_feature(featureset,
3786 					    X86FSET_AVX512BW);
3787 				if (cpi->cpi_std[7].cp_ebx &
3788 				    CPUID_INTC_EBX_7_0_AVX512VL)
3789 					add_x86_feature(featureset,
3790 					    X86FSET_AVX512VL);
3791 
3792 				if (cpi->cpi_std[7].cp_ecx &
3793 				    CPUID_INTC_ECX_7_0_AVX512VBMI)
3794 					add_x86_feature(featureset,
3795 					    X86FSET_AVX512VBMI);
3796 				if (cpi->cpi_std[7].cp_ecx &
3797 				    CPUID_INTC_ECX_7_0_AVX512VNNI)
3798 					add_x86_feature(featureset,
3799 					    X86FSET_AVX512VNNI);
3800 				if (cpi->cpi_std[7].cp_ecx &
3801 				    CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3802 					add_x86_feature(featureset,
3803 					    X86FSET_AVX512VPOPCDQ);
3804 
3805 				if (cpi->cpi_std[7].cp_edx &
3806 				    CPUID_INTC_EDX_7_0_AVX5124NNIW)
3807 					add_x86_feature(featureset,
3808 					    X86FSET_AVX512NNIW);
3809 				if (cpi->cpi_std[7].cp_edx &
3810 				    CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3811 					add_x86_feature(featureset,
3812 					    X86FSET_AVX512FMAPS);
3813 			}
3814 		}
3815 	}
3816 
3817 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3818 		add_x86_feature(featureset, X86FSET_PCID);
3819 	}
3820 
3821 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3822 		add_x86_feature(featureset, X86FSET_X2APIC);
3823 	}
3824 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3825 		add_x86_feature(featureset, X86FSET_DE);
3826 	}
3827 #if !defined(__xpv)
3828 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3829 
3830 		/*
3831 		 * We require the CLFLUSH instruction for erratum workaround
3832 		 * to use MONITOR/MWAIT.
3833 		 */
3834 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3835 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3836 			add_x86_feature(featureset, X86FSET_MWAIT);
3837 		} else {
3838 			extern int idle_cpu_assert_cflush_monitor;
3839 
3840 			/*
3841 			 * All processors we are aware of which have
3842 			 * MONITOR/MWAIT also have CLFLUSH.
3843 			 */
3844 			if (idle_cpu_assert_cflush_monitor) {
3845 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3846 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3847 			}
3848 		}
3849 	}
3850 #endif	/* __xpv */
3851 
3852 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3853 		add_x86_feature(featureset, X86FSET_VMX);
3854 	}
3855 
3856 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3857 		add_x86_feature(featureset, X86FSET_RDRAND);
3858 
3859 	/*
3860 	 * We only need this the first time; the rest of the CPUs will follow
3861 	 * suit. We only capture this for the boot CPU.
3862 	 */
3863 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3864 		add_x86_feature(featureset, X86FSET_CLFSH);
3865 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3866 	}
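	/*
	 * Worked example for the computation above: bits 15:8 of leaf 1
	 * %ebx report the CLFLUSH line size in units of 8 bytes, so the
	 * commonly reported value of 8 yields an x86_clflush_size of 64.
	 */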
3867 	if (is_x86_feature(featureset, X86FSET_PAE))
3868 		cpi->cpi_pabits = 36;
3869 
3870 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3871 		struct cpuid_regs r, *ecp;
3872 
3873 		ecp = &r;
3874 		ecp->cp_eax = 0xD;
3875 		ecp->cp_ecx = 1;
3876 		ecp->cp_edx = ecp->cp_ebx = 0;
3877 		(void) __cpuid_insn(ecp);
3878 
3879 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3880 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
3881 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3882 			add_x86_feature(featureset, X86FSET_XSAVEC);
3883 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3884 			add_x86_feature(featureset, X86FSET_XSAVES);
3885 	}
3886 
3887 	/*
3888 	 * Work on the "extended" feature information, doing
3889 	 * some basic initialization for cpuid_pass2()
3890 	 */
3891 	xcpuid = 0;
3892 	switch (cpi->cpi_vendor) {
3893 	case X86_VENDOR_Intel:
3894 		/*
3895 		 * On KVM we know we will have proper support for extended
3896 		 * cpuid.
3897 		 */
3898 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3899 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3900 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3901 			xcpuid++;
3902 		break;
3903 	case X86_VENDOR_AMD:
3904 		if (cpi->cpi_family > 5 ||
3905 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3906 			xcpuid++;
3907 		break;
3908 	case X86_VENDOR_Cyrix:
3909 		/*
3910 		 * Only these Cyrix CPUs are -known- to support
3911 		 * extended cpuid operations.
3912 		 */
3913 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3914 		    x86_type == X86_TYPE_CYRIX_GXm)
3915 			xcpuid++;
3916 		break;
3917 	case X86_VENDOR_HYGON:
3918 	case X86_VENDOR_Centaur:
3919 	case X86_VENDOR_TM:
3920 	default:
3921 		xcpuid++;
3922 		break;
3923 	}
3924 
3925 	if (xcpuid) {
3926 		cp = &cpi->cpi_extd[0];
3927 		cp->cp_eax = CPUID_LEAF_EXT_0;
3928 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
3929 	}
3930 
3931 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3932 
3933 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3934 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3935 
3936 		switch (cpi->cpi_vendor) {
3937 		case X86_VENDOR_Intel:
3938 		case X86_VENDOR_AMD:
3939 		case X86_VENDOR_HYGON:
3940 			if (cpi->cpi_xmaxeax < 0x80000001)
3941 				break;
3942 			cp = &cpi->cpi_extd[1];
3943 			cp->cp_eax = 0x80000001;
3944 			(void) __cpuid_insn(cp);
3945 
3946 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3947 			    cpi->cpi_family == 5 &&
3948 			    cpi->cpi_model == 6 &&
3949 			    cpi->cpi_step == 6) {
3950 				/*
3951 				 * K6 model 6 uses bit 10 to indicate SYSC;
3952 				 * Later models use bit 11. Fix it here.
3953 				 */
3954 				if (cp->cp_edx & 0x400) {
3955 					cp->cp_edx &= ~0x400;
3956 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3957 				}
3958 			}
3959 
3960 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3961 
3962 			/*
3963 			 * Compute the additions to the kernel's feature word.
3964 			 */
3965 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3966 				add_x86_feature(featureset, X86FSET_NX);
3967 			}
3968 
3969 			/*
3970 			 * Regardless of whether or not we boot 64-bit,
3971 			 * we should have a way to identify whether
3972 			 * the CPU is capable of running 64-bit.
3973 			 */
3974 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3975 				add_x86_feature(featureset, X86FSET_64);
3976 			}
3977 
3978 			/* 1 GB large page - enable only for 64 bit kernel */
3979 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3980 				add_x86_feature(featureset, X86FSET_1GPG);
3981 			}
3982 
3983 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3984 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3985 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3986 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3987 				add_x86_feature(featureset, X86FSET_SSE4A);
3988 			}
3989 
3990 			/*
3991 			 * It's really tricky to support syscall/sysret in
3992 			 * the i386 kernel; we rely on sysenter/sysexit
3993 			 * instead.  In the amd64 kernel, things are -way-
3994 			 * better.
3995 			 */
3996 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3997 				add_x86_feature(featureset, X86FSET_ASYSC);
3998 			}
3999 
4000 			/*
4001 			 * While we're thinking about system calls, note
4002 			 * that AMD processors don't support sysenter
4003 			 * in long mode at all, so don't try to program them.
4004 			 */
4005 			if (x86_vendor == X86_VENDOR_AMD ||
4006 			    x86_vendor == X86_VENDOR_HYGON) {
4007 				remove_x86_feature(featureset, X86FSET_SEP);
4008 			}
4009 
4010 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4011 				add_x86_feature(featureset, X86FSET_TSCP);
4012 			}
4013 
4014 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4015 				add_x86_feature(featureset, X86FSET_SVM);
4016 			}
4017 
4018 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4019 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4020 			}
4021 
4022 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4023 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4024 			}
4025 
4026 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4027 				add_x86_feature(featureset, X86FSET_XOP);
4028 			}
4029 
4030 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4031 				add_x86_feature(featureset, X86FSET_FMA4);
4032 			}
4033 
4034 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4035 				add_x86_feature(featureset, X86FSET_TBM);
4036 			}
4037 
4038 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4039 				add_x86_feature(featureset, X86FSET_MONITORX);
4040 			}
4041 			break;
4042 		default:
4043 			break;
4044 		}
4045 
4046 		/*
4047 		 * Get CPUID data about processor cores and hyperthreads.
4048 		 */
4049 		switch (cpi->cpi_vendor) {
4050 		case X86_VENDOR_Intel:
4051 			if (cpi->cpi_maxeax >= 4) {
4052 				cp = &cpi->cpi_std[4];
4053 				cp->cp_eax = 4;
4054 				cp->cp_ecx = 0;
4055 				(void) __cpuid_insn(cp);
4056 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4057 			}
4058 			/*FALLTHROUGH*/
4059 		case X86_VENDOR_AMD:
4060 		case X86_VENDOR_HYGON:
4061 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4062 				break;
4063 			cp = &cpi->cpi_extd[8];
4064 			cp->cp_eax = CPUID_LEAF_EXT_8;
4065 			(void) __cpuid_insn(cp);
4066 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4067 			    cp);
4068 
4069 			/*
4070 			 * AMD uses ebx for some extended functions.
4071 			 */
4072 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4073 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4074 				/*
4075 				 * While we're here, check for the AMD "Error
4076 				 * Pointer Zero/Restore" feature. This can be
4077 				 * used to set up the FP save handlers
4078 				 * appropriately.
4079 				 */
4080 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4081 					cpi->cpi_fp_amd_save = 0;
4082 				} else {
4083 					cpi->cpi_fp_amd_save = 1;
4084 				}
4085 
4086 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4087 					add_x86_feature(featureset,
4088 					    X86FSET_CLZERO);
4089 				}
4090 			}
4091 
4092 			/*
4093 			 * Virtual and physical address limits from
4094 			 * cpuid override previously guessed values.
4095 			 */
4096 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4097 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4098 			break;
4099 		default:
4100 			break;
4101 		}
4102 
4103 		/*
4104 		 * Get CPUID data about TSC Invariance in Deep C-State.
4105 		 */
4106 		switch (cpi->cpi_vendor) {
4107 		case X86_VENDOR_Intel:
4108 		case X86_VENDOR_AMD:
4109 		case X86_VENDOR_HYGON:
4110 			if (cpi->cpi_maxeax >= 7) {
4111 				cp = &cpi->cpi_extd[7];
4112 				cp->cp_eax = 0x80000007;
4113 				cp->cp_ecx = 0;
4114 				(void) __cpuid_insn(cp);
4115 			}
4116 			break;
4117 		default:
4118 			break;
4119 		}
4120 	}
4121 
4122 	/*
4123 	 * cpuid_pass1_ppin assumes that cpuid_pass1_topology has already been
4124 	 * run and thus gathered some of its dependent leaves.
4125 	 */
4126 	cpuid_pass1_topology(cpu, featureset);
4127 	cpuid_pass1_thermal(cpu, featureset);
4128 #if !defined(__xpv)
4129 	cpuid_pass1_ppin(cpu, featureset);
4130 #endif
4131 
4132 	/*
4133 	 * Synthesize chip "revision" and socket type
4134 	 */
4135 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4136 	    cpi->cpi_model, cpi->cpi_step);
4137 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4138 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4139 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4140 	    cpi->cpi_model, cpi->cpi_step);
4141 
4142 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4143 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4144 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4145 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4146 			/* Special handling for AMD FP not necessary. */
4147 			cpi->cpi_fp_amd_save = 0;
4148 		} else {
4149 			cpi->cpi_fp_amd_save = 1;
4150 		}
4151 	}
4152 
4153 	/*
4154 	 * Check (and potentially set) if lfence is serializing.
4155 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4156 	 */
4157 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4158 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4159 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4160 		/*
4161 		 * The AMD white paper Software Techniques For Managing
4162 		 * Speculation on AMD Processors details circumstances for when
4163 		 * lfence instructions are serializing.
4164 		 *
4165 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4166 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4167 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4168 		 * committed to supporting that MSR on all later CPUs.
4169 		 */
4170 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4171 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4172 		} else if (cpi->cpi_family >= 0x10) {
4173 			uint64_t val = 0;
4174 
4175 #if !defined(__xpv)
4176 			/*
4177 			 * Be careful when attempting to enable the bit, and
4178 			 * verify that it was actually set in case we are
4179 			 * running in a hypervisor which is less than faithful
4180 			 * about its emulation of this feature.
4181 			 */
4182 			on_trap_data_t otd;
4183 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4184 				val = rdmsr(MSR_AMD_DE_CFG);
4185 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4186 				wrmsr(MSR_AMD_DE_CFG, val);
4187 				val = rdmsr(MSR_AMD_DE_CFG);
4188 			}
4189 			no_trap();
4190 #endif
4191 
4192 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4193 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4194 			}
4195 		}
4196 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4197 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4198 		/*
4199 		 * Documentation and other OSes indicate that lfence is always
4200 		 * serializing on Intel CPUs.
4201 		 */
4202 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4203 	}
4204 
4205 
4206 	/*
4207 	 * Check the processor leaves that are used for security features.
4208 	 */
4209 	cpuid_scan_security(cpu, featureset);
4210 
4211 pass1_done:
4212 	cpi->cpi_pass = 1;
4213 }
4214 
4215 /*
4216  * Make copies of the cpuid table entries we depend on, in
4217  * part for ease of parsing now, in part so that we have only
4218  * one place to correct any of it, in part for ease of
4219  * later export to userland, and in part so we can look at
4220  * this stuff in a crash dump.
4221  */
4222 
4223 /*ARGSUSED*/
4224 void
4225 cpuid_pass2(cpu_t *cpu)
4226 {
4227 	uint_t n, nmax;
4228 	int i;
4229 	struct cpuid_regs *cp;
4230 	uint8_t *dp;
4231 	uint32_t *iptr;
4232 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4233 
4234 	ASSERT(cpi->cpi_pass == 1);
4235 
4236 	if (cpi->cpi_maxeax < 1)
4237 		goto pass2_done;
4238 
4239 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4240 		nmax = NMAX_CPI_STD;
4241 	/*
4242 	 * (We already handled n == 0 and n == 1 in pass 1)
4243 	 */
4244 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4245 		/*
4246 		 * leaves 6 and 7 were handled in pass 1
4247 		 */
4248 		if (n == 6 || n == 7)
4249 			continue;
4250 
4251 		cp->cp_eax = n;
4252 
4253 		/*
4254 		 * CPUID function 4 expects %ecx to be initialized
4255 		 * with an index which indicates which cache to return
4256 		 * information about. The OS is expected to call function 4
4257 		 * with %ecx set to 0, 1, 2, ... until it returns with
4258 		 * EAX[4:0] set to 0, which indicates there are no more
4259 		 * caches.
4260 		 *
4261 		 * Here, populate cpi_std[4] with the information returned by
4262 		 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
4263 		 * when dynamic memory allocation becomes available.
4264 		 *
4265 		 * Note: we need to explicitly initialize %ecx here, since
4266 		 * function 4 may have been previously invoked.
4267 		 */
4268 		if (n == 4)
4269 			cp->cp_ecx = 0;
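		/*
		 * A minimal sketch of that full walk (it is deferred to
		 * cpuid_pass3(); this is shown only to illustrate the
		 * sub-leaf protocol described above, given a local
		 * struct cpuid_regs regs):
		 *
		 *	for (uint32_t idx = 0; ; idx++) {
		 *		regs.cp_eax = 4;
		 *		regs.cp_ecx = idx;
		 *		(void) __cpuid_insn(&regs);
		 *		if (BITX(regs.cp_eax, 4, 0) == 0)
		 *			break;
		 *		... record cache number idx ...
		 *	}
		 */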
4270 
4271 		(void) __cpuid_insn(cp);
4272 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4273 		switch (n) {
4274 		case 2:
4275 			/*
4276 			 * "the lower 8 bits of the %eax register
4277 			 * contain a value that identifies the number
4278 			 * of times the cpuid [instruction] has to be
4279 			 * executed to obtain a complete image of the
4280 			 * processor's caching systems."
4281 			 *
4282 			 * How *do* they make this stuff up?
4283 			 */
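			/*
			 * By way of example (descriptor values taken from
			 * Intel's leaf 2 tables; treat them as illustrative
			 * rather than authoritative here): a returned byte
			 * of 0x2c denotes a 32 KB, 8-way, 64-byte-line L1
			 * data cache, while 0x00 is a null descriptor and
			 * is skipped by the loops below.
			 */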
4284 			cpi->cpi_ncache = sizeof (*cp) *
4285 			    BITX(cp->cp_eax, 7, 0);
4286 			if (cpi->cpi_ncache == 0)
4287 				break;
4288 			cpi->cpi_ncache--;	/* skip count byte */
4289 
4290 			/*
4291 			 * Well, for now, rather than attempt to implement
4292 			 * this slightly dubious algorithm, we just look
4293 			 * at the first 15 ..
4294 			 */
4295 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4296 				cpi->cpi_ncache = sizeof (*cp) - 1;
4297 
4298 			dp = cpi->cpi_cacheinfo;
4299 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4300 				uint8_t *p = (void *)&cp->cp_eax;
4301 				for (i = 1; i < 4; i++)
4302 					if (p[i] != 0)
4303 						*dp++ = p[i];
4304 			}
4305 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4306 				uint8_t *p = (void *)&cp->cp_ebx;
4307 				for (i = 0; i < 4; i++)
4308 					if (p[i] != 0)
4309 						*dp++ = p[i];
4310 			}
4311 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4312 				uint8_t *p = (void *)&cp->cp_ecx;
4313 				for (i = 0; i < 4; i++)
4314 					if (p[i] != 0)
4315 						*dp++ = p[i];
4316 			}
4317 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4318 				uint8_t *p = (void *)&cp->cp_edx;
4319 				for (i = 0; i < 4; i++)
4320 					if (p[i] != 0)
4321 						*dp++ = p[i];
4322 			}
4323 			break;
4324 
4325 		case 3:	/* Processor serial number, if PSN supported */
4326 			break;
4327 
4328 		case 4:	/* Deterministic cache parameters */
4329 			break;
4330 
4331 		case 5:	/* Monitor/Mwait parameters */
4332 		{
4333 			size_t mwait_size;
4334 
4335 			/*
4336 			 * check cpi_mwait.support which was set in cpuid_pass1
4337 			 */
4338 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4339 				break;
4340 
4341 			/*
4342 			 * Protect ourselves from an insane mwait line size.
4343 			 * Workaround for incomplete hardware emulator(s).
4344 			 */
4345 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4346 			if (mwait_size < sizeof (uint32_t) ||
4347 			    !ISP2(mwait_size)) {
4348 #if DEBUG
4349 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4350 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4351 #endif
4352 				break;
4353 			}
4354 
4355 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4356 			cpi->cpi_mwait.mon_max = mwait_size;
4357 			if (MWAIT_EXTENSION(cpi)) {
4358 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4359 				if (MWAIT_INT_ENABLE(cpi))
4360 					cpi->cpi_mwait.support |=
4361 					    MWAIT_ECX_INT_ENABLE;
4362 			}
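			/*
			 * For reference: leaf 5 reports the smallest and
			 * largest monitor-line sizes in %eax and %ebx (both
			 * typically 64 bytes on current hardware), while
			 * %ecx bit 0 enumerates the MONITOR/MWAIT
			 * extensions and bit 1 indicates that an interrupt
			 * can break an mwait even when interrupts are
			 * masked, matching the MWAIT_EXTENSION() and
			 * MWAIT_INT_ENABLE() checks above.
			 */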
4363 			break;
4364 		}
4365 		default:
4366 			break;
4367 		}
4368 	}
4369 
4370 	/*
4371 	 * XSAVE enumeration
4372 	 */
4373 	if (cpi->cpi_maxeax >= 0xD) {
4374 		struct cpuid_regs regs;
4375 		boolean_t cpuid_d_valid = B_TRUE;
4376 
4377 		cp = &regs;
4378 		cp->cp_eax = 0xD;
4379 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4380 
4381 		(void) __cpuid_insn(cp);
4382 
4383 		/*
4384 		 * Sanity checks for debug
4385 		 */
4386 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4387 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4388 			cpuid_d_valid = B_FALSE;
4389 		}
4390 
4391 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4392 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4393 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
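		/*
		 * Per the architectural definition of leaf 0xD, sub-leaf 0:
		 * %eax and %edx together form the bitmap of state
		 * components the hardware can save and restore via xsave,
		 * and %ecx is the size the save area would need if every
		 * supported component were enabled in XCR0.
		 */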
4394 
4395 		/*
4396 		 * If the hw supports AVX, get the size and offset in the save
4397 		 * area for the ymm state.
4398 		 */
4399 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4400 			cp->cp_eax = 0xD;
4401 			cp->cp_ecx = 2;
4402 			cp->cp_edx = cp->cp_ebx = 0;
4403 
4404 			(void) __cpuid_insn(cp);
4405 
4406 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4407 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4408 				cpuid_d_valid = B_FALSE;
4409 			}
4410 
4411 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4412 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4413 		}
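		/*
		 * For reference (an assumption about the constants based
		 * on the architectural layout, not verified here): the ymm
		 * state occupies 256 bytes at offset 576 in the xsave area
		 * (512 bytes of legacy region plus a 64 byte header), which
		 * is what CPUID_LEAFD_2_YMM_SIZE and
		 * CPUID_LEAFD_2_YMM_OFFSET are expected to encode.
		 */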
4414 
4415 		/*
4416 		 * If the hw supports MPX, get the size and offset in the
4417 		 * save area for BNDREGS and BNDCSR.
4418 		 */
4419 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4420 			cp->cp_eax = 0xD;
4421 			cp->cp_ecx = 3;
4422 			cp->cp_edx = cp->cp_ebx = 0;
4423 
4424 			(void) __cpuid_insn(cp);
4425 
4426 			cpi->cpi_xsave.bndregs_size = cp->