xref: /illumos-gate/usr/src/uts/intel/os/cpuid.c (revision 5a9c36de)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2023 Oxide Computer Company
28  * Copyright 2024 MNX Cloud, Inc.
29  */
30 /*
31  * Copyright (c) 2010, Intel Corporation.
32  * All rights reserved.
33  */
34 /*
35  * Portions Copyright 2009 Advanced Micro Devices, Inc.
36  */
37 
38 /*
39  * CPU Identification logic
40  *
41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42  * with the identification of CPUs, their features, and their topologies. More
43  * specifically, this file helps drive the following:
44  *
45  * 1. Enumeration of features of the processor which are used by the kernel to
46  *    determine what features to enable or disable. These may be instruction set
47  *    enhancements or features that we use.
48  *
49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
50  *    will be told about through the auxiliary vector.
51  *
52  * 3. Understanding the physical topology of the CPU such as the number of
53  *    caches, how many cores it has, whether or not it supports simultaneous
54  *    multi-threading (SMT), etc.
55  *
56  * ------------------------
57  * CPUID History and Basics
58  * ------------------------
59  *
60  * The cpuid instruction was added by Intel roughly around the time that the
61  * original Pentium was introduced. The purpose of cpuid was to provide, in a
62  * programmatic fashion, information about the CPU that previously was guessed
63  * at. For example, an important part of cpuid is that we can know what
64  * extensions to the ISA exist. If you use an invalid opcode you would get a
65  * #UD, so this method allows a program (whether a user program or the kernel)
66  * to determine what exists without crashing or getting a SIGILL. Of course,
67  * this was also during the era of the clones and the AMD Am5x86. The vendor
68  * name shows up first in cpuid for a reason.
69  *
70  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72  * its own meaning. The different leaves are broken down into different regions:
73  *
74  *	[ 0, 7fffffff ]			This region is called the 'basic'
75  *					region. This region is generally defined
76  *					by Intel, though some of the original
77  *					portions have different meanings based
78  *					on the manufacturer. These days, Intel
79  *					adds most new features to this region.
80  *					AMD adds non-Intel compatible
81  *					information in the third, extended
82  *					region. Intel uses this for everything
83  *					including ISA extensions, CPU
84  *					features, cache information, topology,
85  *					and more.
86  *
87  *					There is a hole carved out of this
88  *					region which is reserved for
89  *					hypervisors.
90  *
91  *	[ 40000000, 4fffffff ]		This region, which is found in the
92  *					middle of the previous region, is
93  *					explicitly promised to never be used by
94  *					CPUs. Instead, it is used by hypervisors
95  *					to communicate information about
96  *					themselves to the operating system. The
97  *					values and details are unique for each
98  *					hypervisor.
99  *
100  *	[ 80000000, ffffffff ]		This region is called the 'extended'
101  *					region. Some of the low leaves mirror
102  *					parts of the basic leaves. This region
103  *					has generally been used by AMD for
104  *					various extensions. For example, AMD-
105  *					specific information about caches,
106  *					features, and topology are found in this
107  *					region.
108  *
109  * To query a leaf, you place the desired leaf into %eax, zero %ebx, %ecx,
110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111  * the ranges, one of the primary things returned is the maximum valid leaf in
112  * that range. This allows for discovery of what range of CPUID is valid.
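 *
 * As a brief illustration of the mechanics, here is a sketch of discovering
 * the maximum valid basic and extended leaves. It assumes the kernel's
 * struct cpuid_regs and __cpuid_insn() helpers and uses arbitrary local
 * variable names:
 *
 *	struct cpuid_regs cp = { 0 };
 *	uint32_t max_basic, max_ext;
 *
 *	cp.cp_eax = 0;
 *	(void) __cpuid_insn(&cp);
 *	max_basic = cp.cp_eax;
 *
 *	cp.cp_eax = 0x80000000;
 *	cp.cp_ebx = cp.cp_ecx = cp.cp_edx = 0;
 *	(void) __cpuid_insn(&cp);
 *	max_ext = cp.cp_eax;
 *
 * A caller would then refuse to ask about any leaf beyond max_basic or
 * max_ext, which matters because of the surprising out-of-range behavior
 * described next.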
113  *
114  * The CPUs have potentially surprising behavior when using an invalid leaf or
115  * unimplemented leaf. If the requested leaf is within the valid basic or
116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117  * set to zero. However, if you specify a leaf that is outside of a valid range,
118  * then instead it will be filled with the last valid _basic_ leaf. For example,
119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120  * an invalid extended leaf will return the information for leaf 3.
121  *
122  * Some leaves are broken down into sub-leaves. This means that the value
123  * depends on both the leaf asked for in %eax and a secondary register. For
124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125  * additional information. Or when getting topology information in leaf 0xb, the
126  * initial value in %ecx changes which level of the topology that you are
127  * getting information about.
128  *
129  * cpuid values are always kept to 32 bits regardless of whether or not the
130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
131  * 32 bits of each register are always set to zero so that the values are the
132  * same regardless of execution mode.
133  *
134  * ----------------------
135  * Identifying Processors
136  * ----------------------
137  *
138  * We can identify a processor in two steps. The first step looks at cpuid leaf
139  * 0. Leaf 0 contains the processor's vendor information. This is done by
140  * putting a 12 character string in %ebx, %edx, and %ecx (in that order). On
141  * AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
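 *
 * For example, a sketch of assembling the vendor string, again assuming the
 * struct cpuid_regs and __cpuid_insn() helpers (the local buffer is purely
 * illustrative):
 *
 *	struct cpuid_regs cp = { 0 };
 *	char vendor[13];
 *
 *	cp.cp_eax = 0;
 *	(void) __cpuid_insn(&cp);
 *	bcopy(&cp.cp_ebx, &vendor[0], sizeof (uint32_t));
 *	bcopy(&cp.cp_edx, &vendor[4], sizeof (uint32_t));
 *	bcopy(&cp.cp_ecx, &vendor[8], sizeof (uint32_t));
 *	vendor[12] = '\0';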
142  *
143  * From there, a processor is identified by a combination of three different
144  * values:
145  *
146  *  1. Family
147  *  2. Model
148  *  3. Stepping
149  *
150  * Each vendor uses the family and model to uniquely identify a processor. The
151  * way that family and model are changed depends on the vendor. For example,
152  * Intel has been using family 0x6 for almost all of their processors since the
153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
154  * identify the exact processor. Different models are often used for the client
155  * (consumer) and server parts. Even though each processor often has major
156  * architectural differences, they still are considered the same family by
157  * Intel.
158  *
159  * On the other hand, each major AMD architecture generally has its own family.
160  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within a
161  * family, the model is used to help identify specific processors.  As AMD's
162  * product lines have expanded, they have started putting a mixed bag of
163  * processors into the same family, with each processor under a single
164  * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
165  * refer to each such collection as a processor family, distinct from cpuid
166  * family.  Importantly, each processor family has a BIOS and Kernel Developer's
167  * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168  * defines the processor family's non-architectural features.  In general, we'll
169  * use "family" here to mean the family number reported by the cpuid instruction
170  * and distinguish the processor family from it where appropriate.
171  *
172  * The stepping is used to refer to a revision of a specific microprocessor. The
173  * term comes from equipment used to produce masks that are used to create
174  * integrated circuits.
175  *
176  * The information is present in leaf 1, %eax. In technical documentation you
177  * will see the terms extended model and extended family. The original family,
178  * model, and stepping fields were each 4 bits wide. If the base family is
179  * 0xf, one must also consult the extended family and extended model fields,
180  * which occupy previously reserved bits; the extended family is added to the
181  * base family, and the extended model is prepended to the base model.
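 *
 * As a concrete sketch, given the leaf 1 %eax value in a local variable eax,
 * the fields and combination rules look roughly like the following. Vendors
 * differ slightly in when the extended model applies (Intel, for instance,
 * also applies it to family 0x6), so treat this purely as an illustration:
 *
 *	uint_t step    = eax & 0xf;
 *	uint_t model   = (eax >> 4) & 0xf;
 *	uint_t family  = (eax >> 8) & 0xf;
 *	uint_t xmodel  = (eax >> 16) & 0xf;
 *	uint_t xfamily = (eax >> 20) & 0xff;
 *
 *	if (family == 0xf) {
 *		family += xfamily;
 *		model += xmodel << 4;
 *	}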
182  *
183  * When we process this information, we store the full family, model, and
184  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185  * cpi_step, respectively. Whenever you are performing comparisons with the
186  * family, model, and stepping, you should use these members and not the raw
187  * values from cpuid. If you must use the raw values from cpuid directly, you
188  * must make sure that you add the extended model and family to the base model
189  * and family.
190  *
191  * In general, we do not use information about the family, model, and stepping
192  * to determine whether or not a feature is present; that is generally driven by
193  * specific leaves. However, when something we care about on the processor is
194  * not considered 'architectural' meaning that it is specific to a set of
195  * processors and not promised in the architecture model to be consistent from
196  * generation to generation, then we will fall back on this information. The
197  * most common cases where this arises are when we have to work around errata in
198  * the processor, are dealing with processor-specific features such as CPU
199  * performance counters, or we want to provide additional information for things
200  * such as fault management.
201  *
202  * While processors also do have a brand string, which is the name that people
203  * are familiar with when buying the processor, it is not meant for
204  * programmatic consumption. That is what the family, model, and stepping are
205  * for.
206  *
207  * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208  * and stepping(s) that refer to a single or very closely related set of silicon
209  * implementations; while there are sometimes more specific ways to learn of the
210  * presence or absence of a particular erratum or workaround, one may generally
211  * assume that all processors of the same chiprev have the same errata and we
212  * have chosen to represent them this way precisely because that is how AMD
213  * groups them in their revision guides (errata documentation).  The processor
214  * family (x86_processor_family_t) may be extracted from the chiprev if that
215  * level of detail is not needed.  Processor families are considered unordered
216  * but revisions within a family may be compared for either an exact match or at
217  * least as recent as a reference revision.  See the chiprev_xxx() functions
218  * below.
219  *
220  * Similarly, each processor family implements a particular microarchitecture,
221  * which itself may have multiple revisions.  In general, non-architectural
222  * features are specific to a processor family, but some may exist across
223  * families containing cores that implement the same microarchitectural revision
224  * (and, such cores share common bugs, too).  We provide utility routines
225  * analogous to those for extracting and comparing chiprevs for
226  * microarchitectures as well; see the uarch_xxx() functions.
227  *
228  * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229  * present used and available only for AMD and AMD-like processors.
230  *
231  * ------------
232  * CPUID Passes
233  * ------------
234  *
235  * As part of performing feature detection, we break this into several different
236  * passes. There used to be a pass 0 that was done from assembly in locore.s to
237  * support processors that have a missing or broken cpuid instruction (notably
238  * certain Cyrix processors) but those were all 32-bit processors which are no
239  * longer supported. Passes are no longer numbered explicitly to make it easier
240  * to break them up or move them around as needed; however, they still have a
241  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242  * x86_archext.h. The external interface to execute a cpuid pass or determine
243  * whether a pass has been completed consists of cpuid_execpass() and
244  * cpuid_checkpass() respectively.  The passes now, in that execution order,
245  * are as follows:
246  *
247  *	PRELUDE		This pass does not have any dependencies on system
248  *			setup; in particular, unlike all subsequent passes it is
249  *			guaranteed not to require PCI config space access.  It
250  *			sets the flag indicating that the processor we are
251  *			running on supports the cpuid instruction, which all
252  *			64-bit processors do.  This would also be the place to
253  *			add any other basic state that is required later on and
254  *			can be learned without dependencies.
255  *
256  *	IDENT		Determine which vendor manufactured the CPU, the family,
257  *			model, and stepping information, and compute basic
258  *			identifying tags from those values.  This is done first
259  *			so that machine-dependent code can control the features
260  *			the cpuid instruction will report during subsequent
261  *			passes if needed, and so that any intervening
262  *			machine-dependent code that needs basic identity will
263  *			have it available.  This includes synthesised
264  *			identifiers such as chiprev and uarchrev as well as the
265  *			values obtained directly from cpuid.  Prior to executing
266  *			this pass, machine-dependent boot code is responsible for
267  *			ensuring that the PCI configuration space access
268  *			functions have been set up and, if necessary, that
269  *			determine_platform() has been called.
270  *
271  *	BASIC		This is the primary pass and is responsible for doing a
272  *			large number of different things:
273  *
274  *			1. Gathering a large number of feature flags to
275  *			determine which features the CPU supports and which
276  *			indicate that we need to do other work in the OS to
277  *			enable them. Features detected this way are added to the
278  *			x86_featureset which can be queried to
279  *			determine what we should do. This includes processing
280  *			all of the basic and extended CPU features that we care
281  *			about.
282  *
283  *			2. Determining the CPU's topology. This includes
284  *			information about how many cores and threads are present
285  *			in the package. It also is responsible for figuring out
286  *			which logical CPUs are potentially part of the same core
287  *			and what other resources they might share. For more
288  *			information see the 'Topology' section.
289  *
290  *			3. Determining the set of CPU security-specific features
291  *			that we need to worry about and determine the
292  *			appropriate set of workarounds.
293  *
294  *			On the boot CPU, this pass runs before KMDB is started.
295  *
296  *	EXTENDED	This pass is done after startup(). Here, we check
297  *			other miscellaneous features. Most of this is gathering
298  *			additional basic and extended features that we'll use in
299  *			later passes or for debugging support.
300  *
301  *	DYNAMIC		This pass occurs after the kernel memory allocator
302  *			has been fully initialized. This gathers information
303  *			where we might need dynamic memory available for our
304  *			uses. This includes several varying width leaves that
305  *			have cache information and the processor's brand string.
306  *
307  *	RESOLVE		The final normal pass is performed after the kernel
308  *			has brought nearly everything online. This is
309  *			invoked from post_startup(). In this pass, we go through
310  *			the set of features that we have enabled and turn that
311  *			into the hardware auxiliary vector features that
312  *			userland receives. This is used by userland, primarily
313  *			by the run-time link-editor (RTLD), though userland
314  *			software could also refer to it directly.
315  *
316  * The function that performs a pass is currently assumed to be infallible, and
317  * all existing implementations are.  This simplifies callers by allowing
318  * cpuid_execpass() to return void. Similarly, implementers do not need to check
319  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320  * Both of these assumptions can be relaxed if needed by future developments.
321  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322  * error to attempt to execute a pass before all previous passes have been
323  * completed on the specified CPU, or to request cpuid information before the
324  * pass that captures it has been executed.  These conditions can be tested
325  * using cpuid_checkpass().
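 *
 * As a purely illustrative sketch of the intended usage (the CPUID_PASS_*
 * enumerator name is assumed to follow the pass names above, and the
 * use_sse2 variable is hypothetical), a consumer of data captured by the
 * BASIC pass might guard itself like this:
 *
 *	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
 *	if (is_x86_feature(x86_featureset, X86FSET_SSE2))
 *		use_sse2 = B_TRUE;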
326  *
327  * The Microcode Pass
328  *
329  * After a microcode update, we do a selective rescan of the cpuid leaves to
330  * determine what features have changed. Microcode updates can provide more
331  * details about security related features to deal with issues like Spectre and
332  * L1TF. On occasion, vendors have violated their contract and removed bits.
333  * However, we don't try to detect that because it puts us in a situation that
334  * we really can't deal with. As such, the only things we rescan are security-
335  * related features today. See cpuid_pass_ucode().  This pass may be run in a
336  * different sequence on APs and therefore is not part of the sequential order;
337  * it is invoked directly instead of by cpuid_execpass() and its completion
338  * status cannot be checked by cpuid_checkpass().  This could be integrated with
339  * a more complex dependency mechanism if warranted by future developments.
340  *
341  * All of the passes are run on all CPUs. However, for the most part we only
342  * care about what the boot CPU says about this information and use the other
343  * CPUs as a rough guide to sanity check that we have the same feature set.
344  *
345  * We do not support running multiple logical CPUs with different, let alone
346  * disjoint, feature sets.
347  *
348  * ------------------
349  * Processor Topology
350  * ------------------
351  *
352  * One of the important things that we need to do is to understand the topology
353  * of the underlying processor. When we say topology in this case, we're trying
354  * to understand the relationship between the logical CPUs that the operating
355  * system sees and the underlying physical layout. Different logical CPUs may
356  * share different resources which can have important consequences for the
357  * performance of the system. For example, they may share caches, execution
358  * units, and more.
359  *
360  * The topology of the processor changes from generation to generation and
361  * vendor to vendor.  Along with that, different vendors use different
362  * terminology, and the operating system itself uses occasionally overlapping
363  * terminology. It's important to understand what this topology looks like so
364  * one can understand the different things that we try to calculate and
365  * determine.
366  *
367  * To get started, let's talk about a little bit of terminology that we've used
368  * so far, that is used throughout this file, and that is fairly generic across
369  * multiple vendors:
370  *
371  * CPU
372  *	A central processing unit (CPU) refers to a logical and/or virtual
373  *	entity that the operating system can execute instructions on. The
374  *	underlying resources for this CPU may be shared between multiple
375  *	entities; however, to the operating system it is a discrete unit.
376  *
377  * PROCESSOR and PACKAGE
378  *
379  *	Generally, when we use the term 'processor' on its own, we are referring
380  *	to the physical entity that one buys and plugs into a board. However,
381  *	because processor has been overloaded and one might see it used to mean
382  *	multiple different levels, we will instead use the term 'package' for
383  *	the rest of this file. The term package comes from the electrical
384  *	engineering side and refers to the physical entity that encloses the
385  *	electronics inside. Strictly speaking the package can contain more than
386  *	just the CPU, for example, on many processors it may also have what's
387  *	called an 'integrated graphics processing unit (GPU)'. Because the
388  *	package can encapsulate multiple units, it is the largest physical unit
389  *	that we refer to.
390  *
391  * SOCKET
392  *
393  *	A socket refers to a unit on a system board (generally the motherboard)
394  *	that can receive a package. A single package, or processor, is plugged
395  *	into a single socket. A system may have multiple sockets. Oftentimes,
396  *	the term socket is used interchangeably with package and refers to the
397  *	electrical component that is plugged in, and not the receptacle itself.
398  *
399  * CORE
400  *
401  *	A core refers to the physical instantiation of a CPU, generally, with a
402  *	full set of hardware resources available to it. A package may contain
403  *	multiple cores inside of it or it may just have a single one. A
404  *	processor with more than one core is often referred to as 'multi-core'.
405  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
406  *	that has 'multi-core' processors.
407  *
408  *	A core may expose a single logical CPU to the operating system, or it
409  *	may expose multiple CPUs, which we call threads, defined below.
410  *
411  *	Some resources may still be shared by cores in the same package. For
412  *	example, many processors will share the level 3 cache between cores.
413  *	Some AMD generations share hardware resources between cores. For more
414  *	information on that see the section 'AMD Topology'.
415  *
416  * THREAD and STRAND
417  *
418  *	In this file, generally a thread refers to a hardware resource and not
419  *	the operating system's logical abstraction. A thread is always exposed
420  *	as an independent logical CPU to the operating system. A thread belongs
421  *	to a specific core. A core may have more than one thread. When that is
422  *	the case, the threads that are part of the same core are often referred
423  *	to as 'siblings'.
424  *
425  *	When multiple threads exist, this is generally referred to as
426  *	simultaneous multi-threading (SMT). When Intel introduced this in their
427  *	processors they called it hyper-threading (HT). When multiple threads
428  *	are active in a core, they split the resources of the core. For example,
429  *	two threads may share the same set of hardware execution units.
430  *
431  *	The operating system often uses the term 'strand' to refer to a thread.
432  *	This helps disambiguate it from the software concept.
433  *
434  * CHIP
435  *
436  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
437  *	base meaning, it is used to refer to a single integrated circuit, which
438  *	may or may not be the only thing in the package. In illumos, when you
439  *	see the term 'chip' it is almost always referring to the same thing as
440  *	the 'package'. However, many vendors may use chip to refer to one of
441  *	many integrated circuits that have been placed in the package. As an
442  *	example, see the subsequent definition.
443  *
444  *	To try and keep things consistent, we will only use chip when referring
445  *	to the entire integrated circuit package, with the exception of the
446  *	definition of multi-chip module (because it is in the name) and use the
447  *	term 'die' when we want the more general, potential sub-component
448  *	definition.
449  *
450  * DIE
451  *
452  *	A die refers to an integrated circuit. Inside of the package there may
453  *	be a single die or multiple dies. This is sometimes called a 'chip' in
454  *	vendor's parlance, but in this file, we use the term die to refer to a
455  *	subcomponent.
456  *
457  * MULTI-CHIP MODULE
458  *
459  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
460  *	are connected together in the same package. When a multi-chip design is
461  *	used, generally each chip is manufactured independently and then joined
462  *	together in the package. For example, on AMD's Zen microarchitecture
463  *	(family 0x17), the package contains several dies (the second meaning of
464  *	chip from above) that are connected together.
465  *
466  * CACHE
467  *
468  *	A cache is a part of the processor that maintains copies of recently
469  *	accessed memory. Caches are split into levels and then into types.
470  *	Commonly there are one to three levels, called level one, two, and
471  *	three. The lower the level, the smaller it is, the closer it is to the
472  *	execution units of the CPU, and the faster it is to access. The layout
473  *	and design of the cache come in many different flavors, consult other
474  *	resources for a discussion of those.
475  *
476  *	Caches are generally split into two types, the instruction and data
477  *	cache. The caches contain what their names suggest, the instruction
478  *	cache has executable program text, while the data cache has all other
479  *	memory that the processor accesses. As of this writing, data is kept
480  *	coherent between all of the caches on x86, so if one modifies program
481  *	text before it is executed, that will be in the data cache, and the
482  *	instruction cache will be synchronized with that change when the
483  *	processor actually executes those instructions. This coherency also
484  *	covers the fact that data could show up in multiple caches.
485  *
486  *	Generally, the lowest level caches are specific to a core. However, the
487  *	last level cache is shared between some number of cores. The number of
488  *	CPUs sharing this last level cache is important. This has implications
489  *	for the choices that the scheduler makes, as accessing memory that might
490  *	be in a remote cache after thread migration can be quite expensive.
491  *
492  *	Sometimes, the word cache is abbreviated with a '$', because in US
493  *	English the word cache is pronounced the same as cash. So L1D$ refers to
494  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
495  *	in the rest of this theory statement for clarity.
496  *
497  * MEMORY CONTROLLER
498  *
499  *	The memory controller is a component that provides access to DRAM. Each
500  *	memory controller can access a set number of DRAM channels. Each channel
501  *	can have a number of DIMMs (sticks of memory) associated with it. A
502  *	given package may have more than one memory controller. The association
503  *	of the memory controller to a group of cores is important as it is
504  *	cheaper to access memory on the controller that you are associated with.
505  *
506  * NUMA
507  *
508  *	NUMA or non-uniform memory access, describes a way that systems are
509  *	built. On x86, any processor core can address all of the memory in the
510  *	system. However, when using multiple sockets or possibly within a
511  *	multi-chip module, some of that memory is physically closer and some of
512  *	it is further. Memory that is further away is more expensive to access.
513  *	Consider the following image of multiple sockets with memory:
514  *
515  *	+--------+                                                +--------+
516  *	| DIMM A |         +----------+      +----------+         | DIMM D |
517  *	+--------+-+       |          |      |          |       +-+------+-+
518  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
519  *	  +--------+-+     |          |      |          |     +-+------+-+
520  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
521  *	    +--------+                                        +--------+
522  *
523  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
524  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
525  *	access DIMMs A-C and more expensive to access D-F as it has to go
526  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
527  *	D-F are cheaper than A-C. While the socket form is the most common, when
528  *	using multi-chip modules, this can also sometimes occur. For another
529  *	example of this that's more involved, see the AMD topology section.
530  *
531  *
532  * Intel Topology
533  * --------------
534  *
535  * Most Intel processors since Nehalem (as of this writing the current gen
536  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
537  * the package is a single monolithic die. MCMs currently aren't used. Most
538  * parts have three levels of caches, with the L3 cache being shared between
539  * all of the cores on the package. The L1/L2 cache is generally specific to
540  * an individual core. The following image shows at a simplified level what
541  * this looks like. The memory controller is commonly part of something called
542  * the 'Uncore'; it used to be a separate physical chip that was not part of
543  * the package, but is now part of the same chip.
544  *
545  *  +-----------------------------------------------------------------------+
546  *  | Package                                                               |
547  *  |  +-------------------+  +-------------------+  +-------------------+  |
548  *  |  | Core              |  | Core              |  | Core              |  |
549  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
550  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
551  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
552  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
553  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
554  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
555  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
556  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
557  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
558  *  |  +-------------------+  +-------------------+  +-------------------+  |
559  *  | +-------------------------------------------------------------------+ |
560  *  | |                         Shared L3 Cache                           | |
561  *  | +-------------------------------------------------------------------+ |
562  *  | +-------------------------------------------------------------------+ |
563  *  | |                        Memory Controller                          | |
564  *  | +-------------------------------------------------------------------+ |
565  *  +-----------------------------------------------------------------------+
566  *
567  * A side effect of this current architecture is that what we care about from a
568  * scheduling and topology perspective is simplified. In general we care about
569  * understanding which logical CPUs are part of the same core and socket.
570  *
571  * To determine the relationship between threads and cores, Intel initially used
572  * the identifier in the advanced programmable interrupt controller (APIC). They
573  * also added cpuid leaf 4 to give additional information about the number of
574  * threads and CPUs in the processor. With the addition of x2apic (which
575  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
576  * additional cpuid topology leaf 0xB was added.
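 *
 * To sketch how leaf 0xB is typically consumed (bit positions here follow the
 * Intel SDM; cp is a struct cpuid_regs as in the earlier sketches and the
 * locals are illustrative), each sub-leaf describes one level of the
 * topology, terminated by a level type of zero:
 *
 *	uint32_t level, shift, type, x2apic_id;
 *
 *	for (level = 0; ; level++) {
 *		cp.cp_eax = 0xb;
 *		cp.cp_ecx = level;
 *		cp.cp_ebx = cp.cp_edx = 0;
 *		(void) __cpuid_insn(&cp);
 *		type = (cp.cp_ecx >> 8) & 0xff;
 *		if (type == 0)
 *			break;
 *		shift = cp.cp_eax & 0x1f;
 *		x2apic_id = cp.cp_edx;
 *	}
 *
 * A type of 1 indicates the SMT (thread) level and 2 the core level; shifting
 * the x2APIC ID right by 'shift' yields the ID at the next level up, which a
 * real consumer would record for each level.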
577  *
578  * AMD Topology
579  * ------------
580  *
581  * When discussing AMD topology, we want to break this into three distinct
582  * generations of topology. There's the basic topology that has been used in
583  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
584  * with family 0x15 (Bulldozer), and there's the topology that was introduced
585  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
586  * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
587  * additional terminology that's worth talking about.
588  *
589  * Until the introduction of family 0x17 (Zen), AMD did not implement something
590  * that they considered SMT. Whether or not the AMD processors have SMT
591  * influences many things including scheduling and reliability, availability,
592  * and serviceability (RAS) features.
593  *
594  * NODE
595  *
596  *	AMD uses the term node to refer to a die that contains a number of cores
597  *	and I/O resources. Depending on the processor family and model, more
598  *	than one node can be present in the package. When there is more than one
599  *	node this indicates a multi-chip module. Usually each node has its own
600  *	access to memory and I/O devices. This is important and generally
601  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
602  *	result, we track this relationship in the operating system.
603  *
604  *	In processors with an L3 cache, the L3 cache is generally shared across
605  *	the entire node, though the way this is carved up varies from generation
606  *	to generation.
607  *
608  * BULLDOZER
609  *
610  *	Starting with the Bulldozer family (0x15) and continuing until the
611  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
612  *	compute unit. In a compute unit, two traditional cores share a number of
613  *	hardware resources. Critically, they share the FPU, L1 instruction
614  *	cache, and the L2 cache. Several compute units were then combined inside
615  *	of a single node.  Because the integer execution units, L1 data cache,
616  *	and some other resources were not shared between the cores, AMD never
617  *	considered this to be SMT.
618  *
619  * ZEN
620  *
621  *	The Zen family (0x17) uses a multi-chip module (MCM) design; each die
622  *	is called a Zeppelin. These dies are similar to the idea of nodes used
623  *	previously. Each of these nodes has two DRAM channels which all of the
624  *	cores in the node can access uniformly. These nodes are linked together
625  *	in the package, creating a NUMA environment.
626  *
627  *	The Zeppelin die itself contains two different 'core complexes'. Each
628  *	core complex consists of four cores which each have two threads, for a
629  *	total of 8 logical CPUs per complex. Unlike other generations,
630  *	where all the logical CPUs in a given node share the L3 cache, here each
631  *	core complex has its own shared L3 cache.
632  *
633  *	A further thing that we need to consider is that in some configurations,
634  *	particularly with the Threadripper line of processors, not every die
635  *	actually has its memory controllers wired up to actual memory channels.
636  *	This means that some cores have memory attached to them and others
637  *	don't.
638  *
639  *	To put Zen in perspective, consider the following images:
640  *
641  *      +--------------------------------------------------------+
642  *      | Core Complex                                           |
643  *      | +-------------------+    +-------------------+  +---+  |
644  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
645  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
646  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
647  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
648  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
649  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
650  *      | +-------------------+    +-------------------+  | C |  |
651  *      | +-------------------+    +-------------------+  | a |  |
652  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
653  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
654  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
655  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
656  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
657  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
658  *      | +-------------------+    +-------------------+  +---+  |
659  *      |                                                        |
660  *	+--------------------------------------------------------+
661  *
662  *  This first image represents a single Zen core complex that consists of four
663  *  cores.
664  *
665  *
666  *	+--------------------------------------------------------+
667  *	| Zeppelin Die                                           |
668  *	|  +--------------------------------------------------+  |
669  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
670  *	|  +--------------------------------------------------+  |
671  *      |                           HH                           |
672  *	|          +-----------+    HH    +-----------+          |
673  *	|          |           |    HH    |           |          |
674  *	|          |    Core   |==========|    Core   |          |
675  *	|          |  Complex  |==========|  Complex  |          |
676  *	|          |           |    HH    |           |          |
677  *	|          +-----------+    HH    +-----------+          |
678  *      |                           HH                           |
679  *	|  +--------------------------------------------------+  |
680  *	|  |                Memory Controller                 |  |
681  *	|  +--------------------------------------------------+  |
682  *      |                                                        |
683  *	+--------------------------------------------------------+
684  *
685  *  This image represents a single Zeppelin Die. Note how both cores are
686  *  connected to the same memory controller and I/O units. While each core
687  *  complex has its own L3 cache as seen in the first image, they both have
688  *  uniform access to memory.
689  *
690  *
691  *                      PP                     PP
692  *                      PP                     PP
693  *           +----------PP---------------------PP---------+
694  *           |          PP                     PP         |
695  *           |    +-----------+          +-----------+    |
696  *           |    |           |          |           |    |
697  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
698  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
699  *           |    |           |          |           |    |
700  *           |    +-----------+ooo    ...+-----------+    |
701  *           |          HH      ooo  ...       HH         |
702  *           |          HH        oo..         HH         |
703  *           |          HH        ..oo         HH         |
704  *           |          HH      ...  ooo       HH         |
705  *           |    +-----------+...    ooo+-----------+    |
706  *           |    |           |          |           |    |
707  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
708  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
709  *           |    |           |          |           |    |
710  *           |    +-----------+          +-----------+    |
711  *           |          PP                     PP         |
712  *           +----------PP---------------------PP---------+
713  *                      PP                     PP
714  *                      PP                     PP
715  *
716  *  This image represents a single Zen package. In this example, it has four
717  *  Zeppelin dies, though some configurations only have a single one. In this
718  *  example, each die is directly connected to the next. Also, each die is
719  *  represented as being connected to memory by the 'M' character and connected
720  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
721  *  die is made up of two core complexes, we have multiple different NUMA
722  *  domains that we care about for these systems.
723  *
724  * ZEN 2
725  *
726  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
727  *	each Zeppelin die contained its own memory controller and I/O logic,
728  *	in Zen 2 that has been moved out onto a separate central I/O die. The
729  *	core complex looks pretty similar, but the die is now much simpler:
730  *
731  *      +--------------------------------------------------------+
732  *      | Zen 2 Core Complex Die    HH                           |
733  *      |                           HH                           |
734  *      |          +-----------+    HH    +-----------+          |
735  *      |          |           |    HH    |           |          |
736  *      |          |    Core   |==========|    Core   |          |
737  *      |          |  Complex  |==========|  Complex  |          |
738  *      |          |           |    HH    |           |          |
739  *      |          +-----------+    HH    +-----------+          |
740  *      |                           HH                           |
741  *      |                           HH                           |
742  *      +--------------------------------------------------------+
743  *
744  *	From here, when we add the central I/O die, this changes things a bit.
745  *	Each die is connected to the I/O die, rather than trying to interconnect
746  *	them directly. The following image takes the same Zen 1 image that we
747  *	had earlier and shows what it looks like with the I/O die instead:
748  *
749  *                                 PP    PP
750  *                                 PP    PP
751  *           +---------------------PP----PP---------------------+
752  *           |                     PP    PP                     |
753  *           |  +-----------+      PP    PP      +-----------+  |
754  *           |  |           |      PP    PP      |           |  |
755  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
756  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
757  *           |  |         |o|oooo|          |oooo|o|         |  |
758  *           |  +-----------+    |          |    +-----------+  |
759  *           |                   |   I/O    |                   |
760  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
761  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
762  *           |                   |          |                   |
763  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
764  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
765  *           |                   |          |                   |
766  *           |  +-----------+    |          |    +-----------+  |
767  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
768  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
769  *           |  |    Die    |      PP    PP      |    Die    |  |
770  *           |  |           |      PP    PP      |           |  |
771  *           |  +-----------+      PP    PP      +-----------+  |
772  *           |                     PP    PP                     |
773  *           +---------------------PP----PP---------------------+
774  *                                 PP    PP
775  *                                 PP    PP
776  *
777  *	The above has four core complex dies installed, though the Zen 2 EPYC
778  *	and ThreadRipper parts allow for up to eight, while the Ryzen parts
779  *	generally only have one to two. The more notable difference here is how
780  *	everything communicates. Note that memory and PCIe come out of the
781  *	central die. This changes the way that one die accesses a resource. It
782  *	basically always has to go to the I/O die, whereas in Zen 1 it may have
783  *	satisfied it locally. In general, this ends up being a better strategy
784  *	for most things, though it is possible to still treat everything as four
785  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
786  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
787  *	now there is only one 'node' present.
788  *
789  * ZEN 3
790  *
791  *	From an architectural perspective, Zen 3 is a much smaller change from
792  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
793  *	its microarchitectural changes. The biggest thing for us is how the die
794  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
795  *	cache. However, in Zen 3, the L3 is now shared between the entire core
796  *	complex die and is no longer partitioned between each core complex. This
797  *	means that all cores on the die can share the same L3 cache. Otherwise,
798  *	the general layout of the overall package with various core complexes
799  *	and an I/O die stays the same. Here's what the Core Complex Die looks
800  *	like in a bit more detail:
801  *
802  *               +-------------------------------------------------+
803  *               | Zen 3 Core Complex Die                          |
804  *               | +-------------------+    +-------------------+  |
805  *               | | Core       +----+ |    | Core       +----+ |  |
806  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
807  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
808  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
809  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
810  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
811  *               | +-------------------+    +-------------------+  |
812  *               | +-------------------+    +-------------------+  |
813  *               | | Core       +----+ |    | Core       +----+ |  |
814  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
815  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
816  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
817  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
818  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
819  *               | +-------------------+    +-------------------+  |
820  *               |                                                 |
821  *               | +--------------------------------------------+  |
822  *               | |                 L3 Cache                   |  |
823  *               | +--------------------------------------------+  |
824  *               |                                                 |
825  *               | +-------------------+    +-------------------+  |
826  *               | | Core       +----+ |    | Core       +----+ |  |
827  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
828  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
829  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
830  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
831  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
832  *               | +-------------------+    +-------------------+  |
833  *               | +-------------------+    +-------------------+  |
834  *               | | Core       +----+ |    | Core       +----+ |  |
835  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
836  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
837  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
838  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
839  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
840  *               | +-------------------+    +-------------------+  |
841  *               +-------------------------------------------------+
842  *
843  *	While it is not pictured, there are connections from the die to the
844  *	broader data fabric and additional functional blocks to support that
845  *	communication and coherency.
846  *
847  * CPUID LEAVES
848  *
849  * There are a few different CPUID leaves that we can use to try and understand
850  * the actual state of the world. As part of the introduction of family 0xf, AMD
851  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
852  * processors that are in the package. Because families before Zen didn't have
853  * SMT, this was always the number of cores in the package. However, it
854  * should always be thought of as the number of logical threads to be consistent
855  * between generations. In addition we also get the size of the APIC ID that is
856  * used to represent the number of logical processors. This is important for
857  * deriving topology information.
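 *
 * A sketch of extracting those two values (bit positions per AMD's
 * documentation; the locals are illustrative, and an APIC ID size of zero
 * means a legacy fallback applies):
 *
 *	cp.cp_eax = 0x80000008;
 *	cp.cp_ebx = cp.cp_ecx = cp.cp_edx = 0;
 *	(void) __cpuid_insn(&cp);
 *	nthreads = (cp.cp_ecx & 0xff) + 1;
 *	apic_id_size = (cp.cp_ecx >> 12) & 0xf;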
858  *
859  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
860  * bit between Bulldozer and later families, but it is quite useful in
861  * determining the topology information. Because this information has changed
862  * across family generations, it's worth calling out what these mean
863  * explicitly. The registers have the following meanings:
864  *
865  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
866  *		APIC ID, even though on systems without x2apic support, it will
867  *		be limited to 8 bits.
868  *
869  *	%ebx	On Bulldozer-era systems this contains information about the
870  *		number of cores that are in a compute unit (cores that share
871  *		resources). It also contains a per-package compute unit ID that
872  *		identifies which compute unit the logical CPU is a part of.
873  *
874  *		On Zen-era systems this instead contains the number of threads
875  *		per core and the ID of the core that the logical CPU is a part
876  *		of. Note, this ID is unique only to the package, it is not
877  *		globally unique across the entire system.
878  *
879  *	%ecx	This contains the number of nodes that exist in the package. It
880  *		also contains an ID that identifies which node the logical CPU
881  *		is a part of.
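 *
 * For instance, on a Zen-era part the fields above are typically extracted as
 * follows (bit positions are taken from AMD's PPRs and should be verified
 * against the specific processor family; the locals are illustrative):
 *
 *	cp.cp_eax = 0x8000001e;
 *	cp.cp_ebx = cp.cp_ecx = cp.cp_edx = 0;
 *	(void) __cpuid_insn(&cp);
 *	apicid = cp.cp_eax;
 *	coreid = cp.cp_ebx & 0xff;
 *	threads_per_core = ((cp.cp_ebx >> 8) & 0xff) + 1;
 *	nodeid = cp.cp_ecx & 0xff;
 *	nodes_per_pkg = ((cp.cp_ecx >> 8) & 0x7) + 1;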
882  *
883  * Finally, we also use cpuid leaf 0x8000001D to gather information about the
884  * cache layout and determine which logical CPUs are sharing which caches.
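 *
 * Each sub-leaf of leaf 0x8000001D describes one cache, and the number of
 * logical CPUs sharing it is encoded in %eax. A sketch (bit positions per
 * AMD's documentation; cache_index and the other locals are illustrative):
 *
 *	cp.cp_eax = 0x8000001d;
 *	cp.cp_ecx = cache_index;
 *	cp.cp_ebx = cp.cp_edx = 0;
 *	(void) __cpuid_insn(&cp);
 *	cache_level = (cp.cp_eax >> 5) & 0x7;
 *	nsharing = ((cp.cp_eax >> 14) & 0xfff) + 1;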
885  *
886  * illumos Topology
887  * ----------------
888  *
889  * Based on the above we synthesize the information into several different
890  * variables that we store in the 'struct cpuid_info'. We'll go into the details
891  * of what each member is supposed to represent and their uniqueness. In
892  * general, there are two levels of uniqueness that we care about. We care about
893  * an ID that is globally unique. That means that it will be unique across all
894  * entities in the system. For example, the default logical CPU ID is globally
895  * unique. On the other hand, there is some information that we only care about
896  * being unique within the context of a single package / socket. Here are the
897  * variables that we keep track of and their meaning.
898  *
899  * Several of the values that represent an identifier, with the exception
900  * of cpi_apicid, are allowed to be synthetic.
901  *
902  *
903  * cpi_apicid
904  *
905  *	This is the value of the CPU's APIC id. This should be the full 32-bit
906  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
907  *	APIC ID. This value is globally unique between all logical CPUs across
908  *	all packages. This is usually required by the APIC.
909  *
910  * cpi_chipid
911  *
912  *	This value indicates the ID of the package that the logical CPU is a
913  *	part of. This value is allowed to be synthetic. It is usually derived by
914  *	taking the CPU's APIC ID and determining how many bits are used to
915  *	represent CPU cores in the package. All logical CPUs that are part of
916  *	the same package must have the same value.
917  *
918  * cpi_coreid
919  *
920  *	This represents the ID of a CPU core. Two logical CPUs should only have
921  *	the same cpi_coreid value if they are part of the same core. These
922  *	values may be synthetic. On systems that support SMT, this value is
923  *	usually derived from the APIC ID, otherwise it is often synthetic and
924  *	just set to the value of the cpu_id in the cpu_t.
925  *
926  * cpi_pkgcoreid
927  *
928  *	This is similar to the cpi_coreid in that logical CPUs that are part of
929  *	the same core should have the same ID. The main difference is that these
930  *	values are only required to be unique to a given socket.
931  *
932  * cpi_clogid
933  *
934  *	This represents the logical ID of a logical CPU. This value should be
935  *	unique within a given socket for each logical CPU. This is allowed to be
936  *	synthetic, though it is usually based off of the CPU's apic ID. The
937  *	broader system expects logical CPUs that are part of the same core to
938  *	have contiguous numbers. For example, with two threads per core, the
939  *	two siblings' IDs divided by two should be equal, with the first being
940  *	even and the second odd. Thus IDs 4 and 5 indicate two logical CPUs
941  *	that are part of the same core, while IDs 5 and 6 belong to different
942  *	cores. (A sketch tying these IDs together follows these definitions.)
943  *
944  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
945  *	from the same source, strictly speaking, they don't have to be and the
946  *	two values should be considered logically independent. One should not
947  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
948  *	some kind of relationship. While this is tempting, we've seen cases on
949  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
950  *
951  * cpi_ncpu_per_chip
952  *
953  *	This value indicates the total number of logical CPUs that exist in the
954  *	physical package. Critically, this is not the number of logical CPUs
955  *	that exist for just the single core.
956  *
957  *	This value should be the same for all logical CPUs in the same package.
958  *
959  * cpi_ncore_per_chip
960  *
961  *	This value indicates the total number of physical CPU cores that exist
962  *	in the package. The system compares this value with cpi_ncpu_per_chip to
963  *	determine if simultaneous multi-threading (SMT) is enabled. When
964  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
965  *	the X86FSET_HTT feature is not set. If this value is greater than one,
966  *	then we consider the processor to have the feature X86FSET_CMP, to
967  *	indicate that there is support for more than one core.
968  *
969  *	This value should be the same for all logical CPUs in the same package.
970  *
971  * cpi_procnodes_per_pkg
972  *
973  *	This value indicates the number of 'nodes' that exist in the package.
974  *	When the processor is actually a multi-chip module, this represents the
975  *	number of such modules that exist in the package. Currently, on Intel
976  *	based systems this member is always set to 1.
977  *
978  *	This value should be the same for all logical CPUs in the same package.
979  *
980  * cpi_procnodeid
981  *
982  *	This value indicates the ID of the node that the logical CPU is a part
983  *	of. All logical CPUs that are in the same node must have the same value
984  *	here. This value must be unique across all of the packages in the
985  *	system.  On Intel based systems, this is currently set to the value in
986  *	cpi_chipid because there is only one node.
987  *
988  * cpi_cores_per_compunit
989  *
990  *	This value indicates the number of cores that are part of a compute
991  *	unit. See the AMD topology section for this. This member only has real
992  *	meaning currently for AMD Bulldozer family processors. For all other
993  *	processors, this should currently be set to 1.
994  *
995  * cpi_compunitid
996  *
997  *	This indicates the compute unit that the logical CPU belongs to. For
998  *	processors without AMD Bulldozer-style compute units this should be set
999  *	to the value of cpi_coreid.
1000  *
1001  * cpi_ncpu_shr_last_cache
1002  *
1003  *	This indicates the number of logical CPUs that are sharing the same last
1004  *	level cache. This value should be the same for all CPUs that are sharing
1005  *	that cache. The last cache refers to the cache that is closest to memory
1006  *	and furthest away from the CPU.
1007  *
1008  * cpi_last_lvl_cacheid
1009  *
1010  *	This indicates the ID of the last cache that the logical CPU uses. This
1011  *	cache is often shared between multiple logical CPUs and is the cache
1012  *	that is closest to memory and furthest away from the CPU. This value
1013  *	should be the same for a group of logical CPUs only if they actually
1014  *	share the same last level cache. IDs should not overlap between
1015  *	packages.
1016  *
1017  * cpi_ncore_bits
1018  *
1019  *	This indicates the number of bits that are required to represent all of
1020  *	the cores in the system. As cores are derived based on their APIC IDs,
1021  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1022  *	this value to be larger than the actual number of IDs that are present
1023  *	in the system. This is used to size tables by the CMI framework. It is
1024  *	only filled in for Intel and AMD CPUs.
1025  *
1026  * cpi_nthread_bits
1027  *
1028  *	This indicates the number of bits required to represent all of the IDs
1029  *	that cover the logical CPUs that exist on a given core. It's OK for this
1030  *	value to be larger than the actual number of IDs that are present in the
1031  *	system.  This is used to size tables by the CMI framework. It is
1032  *	only filled in for Intel and AMD CPUs.
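 *
 *	As an illustration of why a bit count, rather than a simple count of
 *	CPUs, is what matters here: a consumer that indexes a table by these
 *	IDs has to cover the entire ID space even if parts of it are sparse.
 *	Roughly (this is a sketch, not the actual CMI code):
 *
 *		nents = 1 << (cpi->cpi_ncore_bits + cpi->cpi_nthread_bits);
 *		tbl = kmem_zalloc(nents * sizeof (*tbl), KM_SLEEP);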
1033  *
1034  * -----------
1035  * Hypervisors
1036  * -----------
1037  *
1038  * If trying to manage the differences between vendors wasn't bad enough, it can
1039  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1040  * the ability to interpose on all cpuid instructions and change them to suit
1041  * their purposes. In general, this is necessary as the hypervisor wants to be
1042  * able to present a more uniform set of features or not necessarily give the
1043  * guest operating system kernel knowledge of all features so it can be
1044  * more easily migrated between systems.
1045  *
1046  * When it comes to trying to determine topology information, this can be a
1047  * double edged sword. When a hypervisor doesn't actually implement a cpuid
1048  * leaf, it'll often return all zeros. Because of that, you'll often see
1049  * various checks scattered about that verify a field is non-zero before we
1050  * assume we can use it.
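 *
 * For example, cpuid_gather_apicid() later in this file only trusts leaf B if
 * the %ebx value it returns is non-zero, along the lines of:
 *
 *	(void) __cpuid_insn(cp);
 *	if (cp->cp_ebx != 0)
 *		return (cp->cp_edx);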
1051  *
1052  * When it comes to topology information, the hypervisor is often incentivized
1053  * to lie to you about topology. This is because it doesn't always actually
1054  * guarantee that topology at all. The topology path we take in the system
1055  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1056  * or AMD CPU, then we basically do our normal path. However, when they don't
1057  * use an actual vendor, then we usually end up enumerating multiple one-core
1058  * CPUs that are often on different sockets. The actual behavior
1059  * depends greatly on what the hypervisor actually exposes to us.
1060  *
1061  * --------------------
1062  * Exposing Information
1063  * --------------------
1064  *
1065  * We expose CPUID information in three different forms in the system.
1066  *
1067  * The first is through the x86_featureset variable. This is used in conjunction
1068  * with the is_x86_feature() function. This is queried by x86-specific functions
1069  * to determine which features are or aren't present in the system and to make
1070  * decisions based upon them. For example, users of this include everything from
1071  * parts of the system dedicated to reliability, availability, and
1072  * serviceability (RAS), to making decisions about how to handle security
1073  * mitigations, to various x86-specific drivers. General purpose or
1074  * architecture independent drivers should never be calling this function.
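 *
 * As a usage sketch (the particular feature checked here is arbitrary), a
 * kernel consumer of this interface typically looks like:
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_SMEP)) {
 *		... take advantage of SMEP ...
 *	}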
1075  *
1076  * The second means is through the auxiliary vector. The auxiliary vector is a
1077  * series of tagged data that the kernel passes down to a user program when it
1078  * begins executing. This information is used to indicate to programs what
1079  * instruction set extensions are present. For example, information about the
1080  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1081  * since user programs cannot make use of it. However, things like the AVX
1082  * instruction sets are. Programs use this information to make run-time
1083  * decisions about what features they should use. As an example, the run-time
1084  * link-editor (rtld) can relocate different functions depending on the hardware
1085  * support available.
1086  *
1087  * The final form is through a series of accessor functions that all have the
1088  * form cpuid_get*. This is used by a number of different subsystems in the
1089  * kernel to determine more detailed information about what we're running on,
1090  * topology information, etc. Some of these subsystems include processor groups
1091  * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1092  * microcode, and performance monitoring. These functions all ASSERT that the
1093  * CPU they're being called on has reached a certain cpuid pass. If the passes
1094  * are rearranged, then this needs to be adjusted.
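 *
 * For example (illustrative), a subsystem that wants to group logical CPUs by
 * physical package might call something like:
 *
 *	chipid_t chip = cpuid_get_chipid(CPU);
 *
 * subject to the pass requirements mentioned above.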
1095  *
1096  * -----------------------------------------------
1097  * Speculative Execution CPU Side Channel Security
1098  * -----------------------------------------------
1099  *
1100  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1101  * execution in the CPU to create side channels there have been a number of
1102  * different attacks and corresponding issues that the operating system needs to
1103  * mitigate against. The following list is some of the common, but not
1104  * exhaustive, set of issues that we know about and have done some or need to do
1105  * more work in the system to mitigate against:
1106  *
1107  *   - Spectre v1
1108  *   - swapgs (Spectre v1 variant)
1109  *   - Spectre v2
1110  *     - Branch History Injection (BHI).
1111  *   - Meltdown (Spectre v3)
1112  *   - Rogue Register Read (Spectre v3a)
1113  *   - Speculative Store Bypass (Spectre v4)
1114  *   - ret2spec, SpectreRSB
1115  *   - L1 Terminal Fault (L1TF)
1116  *   - Microarchitectural Data Sampling (MDS)
1117  *   - Register File Data Sampling (RFDS)
1118  *
1119  * Each of these requires different sets of mitigations and has different attack
1120  * surfaces. For the most part, this discussion is about protecting the kernel
1121  * from non-kernel executing environments such as user processes and hardware
1122  * virtual machines. Unfortunately, there are a number of user vs. user
1123  * scenarios that exist with these. The rest of this section will describe the
1124  * overall approach that the system has taken to address these as well as their
1125  * shortcomings. Unfortunately, not all of the above have been handled today.
1126  *
1127  * SPECTRE v2, ret2spec, SpectreRSB
1128  *
1129  * The second variant of the spectre attack focuses on performing branch target
1130  * injection. This generally impacts indirect call instructions in the system.
1131  * There are four different ways to mitigate this issue that are commonly
1132  * described today:
1133  *
1134  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1135  *  2. Using Retpolines and RSB Stuffing
1136  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1137  *  4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1138  *
1139  * IBRS uses a feature added to microcode to restrict speculation, among other
1140  * things. This form of mitigation has not been used as it has been generally
1141  * seen as too expensive and requires reactivation upon various transitions in
1142  * the system.
1143  *
1144  * As a less impactful alternative to IBRS, retpolines were developed by
1145  * Google. These basically require one to replace indirect calls with a specific
1146  * trampoline that will cause speculation to fail and break the attack.
1147  * Retpolines require compiler support. We always build with retpolines in the
1148  * external thunk mode. This means that a traditional indirect call is replaced
1149  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1150  * of this is that all indirect function calls are performed through a register.
1151  *
1152  * We have to use a common external location of the thunk and not inline it into
1153  * the callsite so that we have a single place to patch these functions.
1154  * As it turns out, we currently have two different forms of retpolines that
1155  * exist in the system:
1156  *
1157  *  1. A full retpoline
1158  *  2. A no-op version
1159  *
1160  * The first one is used in the general case. Historically, there was an
1161  * AMD-specific optimized retpoline variant that was based around using a
1162  * serializing lfence instruction; however, in March 2022 it was announced that
1163  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1164  * use it and it is no longer available in the system.
1165  *
1166  * The third mitigation listed above, eIBRS, is the most curious. The way
1167  * that retpolines are implemented is that they rely on how speculation is
1168  * performed on a 'ret' instruction. Intel has continued to optimize this
1169  * process (which is partly why we need to have return stack buffer stuffing,
1170  * but more on that in a bit) and in processors starting with Cascade Lake
1171  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1172  * mechanism has been introduced called Enhanced IBRS (eIBRS).
1173  *
1174  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1175  * physical core. However, if this is the case, we don't want to use retpolines
1176  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1177  * function (called a thunk) into a jmp instruction. This means that we're still
1178  * paying the cost of an extra jump to the external thunk, but it gives us
1179  * flexibility and the ability to have a single kernel image that works across a
1180  * wide variety of systems and hardware features.
1181  *
1182  * Unfortunately, this alone is insufficient. First, Skylake systems have
1183  * additional speculation for the Return Stack Buffer (RSB) which is used to
1184  * return from call instructions which retpolines take advantage of. However,
1185  * this problem is not just limited to Skylake and is actually more pernicious.
1186  * The SpectreRSB paper introduces several more problems that can arise with
1187  * dealing with this. The RSB can be poisoned just like the indirect branch
1188  * predictor. This means that one needs to clear the RSB when transitioning
1189  * between two different privilege domains. Some examples include:
1190  *
1191  *  - Switching between two different user processes
1192  *  - Going between user land and the kernel
1193  *  - Returning to the kernel from a hardware virtual machine
1194  *
1195  * Mitigating this involves combining a couple of different things. The first is
1196  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1197  * Bridge. When an RSB entry refers to a user address and we're executing in the
1198  * kernel, speculation through it will be stopped when SMEP is enabled. This
1199  * protects against a number of the different cases that we would normally be
1200  * worried about such as when we enter the kernel from user land.
1201  *
1202  * To prevent against additional manipulation of the RSB from other contexts
1203  * such as a non-root VMX context attacking the kernel we first look to
1204  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1205  * nothing else that we need to do to protect the kernel at this time.
1206  *
1207  * Unfortunately, not all eIBRS implementations are sufficient to guard
1208  * against RSB manipulations, so we still need to manually overwrite the
1209  * contents of the return stack buffer unless the hardware specifies we are
1210  * covered. We do this through the x86_rsb_stuff() function.  Currently this
1211  * is employed on context switch and vmx_exit. The x86_rsb_stuff() function is
1212  * disabled only when mitigations in general are, or if we have hardware
1213  * indicating no need for post-barrier RSB protections, either in one place
1214  * (old hardware), or on both (newer hardware).
1215  *
1216  * If SMEP is not present, then we would have to stuff the RSB every time we
1217  * transitioned from user mode to the kernel, which isn't very practical right
1218  * now.
1219  *
1220  * To fully protect user to user and vmx to vmx attacks from these classes of
1221  * issues, we would also need to allow them to opt into performing an Indirect
1222  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1223  *
1224  * The fourth form of mitigation here is specific to AMD and is called Automated
1225  * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1226  * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1227  * (extended feature enable register) MSR. This bit basically says that IBRS
1228  * acts as though it is always active when executing at CPL0 and when executing
1229  * in the 'host' context when SEV-SNP is enabled.
1230  *
1231  * When this is active, AMD states that the RSB is cleared on VMEXIT and
1232  * therefore stuffing it there is unnecessary. While this handles RSB attacks from SVM
1233  * to the kernel, we must still consider the remaining cases that exist, just
1234  * like above. While traditionally AMD employed a 32 entry RSB allowing the
1235  * traditional technique to work, this is not true on all CPUs. While a write to
1236  * IBRS would clear the RSB if the processor supports more than 32 entries (but
1237  * not otherwise), AMD states that as long as at least a single 4 KiB unmapped
1238  * guard page is present between user and kernel address spaces and SMEP is
1239  * enabled, then there is no need to clear the RSB at all.
1240  *
1241  * By default, the system will enable RSB stuffing and the required variant of
1242  * retpolines and store that information in the x86_spectrev2_mitigation value.
1243  * This will be evaluated after a microcode update as well, though it is
1244  * expected that microcode updates will not take away features. This may mean
1245  * that a late loaded microcode may not end up in the optimal configuration
1246  * (though this should be rare).
1247  *
1248  * Currently we do not build kmdb with retpolines or perform any additional side
1249  * channel security mitigations for it. One complication with kmdb is that it
1250  * requires its own retpoline thunks and it would need to adjust itself based on
1251  * what the kernel does. The threat model of kmdb is more limited and therefore
1252  * it may make more sense to investigate using prediction barriers as the whole
1253  * system is only executing a single instruction at a time while in kmdb.
1254  *
1255  * Branch History Injection (BHI)
1256  *
1257  * BHI is a specific form of SPECTREv2 where an attacker may manipulate branch
1258  * history before transitioning from user to supervisor mode (or from VMX
1259  * non-root/guest to root mode). The attacker can then exploit certain
1260  * compiler-generated code-sequences ("gadgets") to disclose information from
1261  * other contexts or domains.  Recent (late-2023/early-2024) research in
1262  * object code analysis discovered many more potential gadgets than what was
1263  * initially reported (which previously was confined to Linux use of
1264  * unprivileged eBPF).
1265  *
1266  * The BHI threat doesn't exist in processors that predate eIBRS, or in AMD
1267  * ones. Some eIBRS processors have the ability to disable branch history in
1268  * certain (but not all) cases using an MSR write. eIBRS processors that don't
1269  * have the ability to disable must use a software sequence to scrub the
1270  * branch history buffer.
1271  *
1272  * BHI_DIS_S (the aforementioned MSR) protects ring 0 from ring 3 (VMX guest
1273  * or VMX root). It does not protect different user processes from each other,
1274  * or ring 3 VMX guest from ring 3 VMX root or vice versa.
1275  *
1276  * The BHI clearing sequence prevents user code from exploiting kernel gadgets,
1277  * and user A from exploiting user B's gadgets.
1278  *
1279  * SMEP and eIBRS are a continuing defense-in-depth measure protecting the
1280  * kernel.
1281  *
1282  * SPECTRE v1, v4
1283  *
1284  * The v1 and v4 variants of spectre are not currently mitigated in the
1285  * system and require other classes of changes to occur in the code.
1286  *
1287  * SPECTRE v1 (SWAPGS VARIANT)
1288  *
1289  * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1290  * can generally affect any branch-dependent code. The swapgs issue is one
1291  * variant of this. If we are coming in from userspace, we can have code like
1292  * this:
1293  *
1294  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1295  *	je	1f
1296  *	movq	$0, REGOFF_SAVFP(%rsp)
1297  *	swapgs
1298  *	1:
1299  *	movq	%gs:CPU_THREAD, %rax
1300  *
1301  * If an attacker can cause a mis-speculation of the branch here, we could skip
1302  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1303  * load. If subsequent code can act as the usual Spectre cache gadget, this
1304  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1305  * any use of the %gs override.
1306  *
1307  * The other case is also an issue: if we're coming into a trap from kernel
1308  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1309  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1310  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1311  * case, and the fix is the same in both cases (an lfence at the branch target
1312  * 1: in this example), we'll just do it unconditionally.
1313  *
1314  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1315  * harder for user-space to actually set a useful %gsbase value: although it's
1316  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1317  * mitigate anyway.
1318  *
1319  * MELTDOWN
1320  *
1321  * Meltdown, or Spectre v3, allowed a user process to read any data mapped in
1322  * its address space regardless of whether or not the page tables in question
1323  * permitted the user to read it. The solution to Meltdown
1324  * is kernel page table isolation. In this world, there are two page tables that
1325  * are used for a process, one in user land and one in the kernel. To implement
1326  * this we use per-CPU page tables and switch between the user and kernel
1327  * variants when entering and exiting the kernel.  For more information about
1328  * this process and how the trampolines work, please see the big theory
1329  * statements and additional comments in:
1330  *
1331  *  - uts/i86pc/ml/kpti_trampolines.s
1332  *  - uts/i86pc/vm/hat_i86.c
1333  *
1334  * While Meltdown only impacted Intel systems and there are also Intel systems
1335  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1336  * kernel page table isolation enabled. While this may at first seem weird, an
1337  * important thing to remember is that you can't speculatively read an address
1338  * if it's never in your page table at all. Having user processes without kernel
1339  * pages present provides us with an important layer of defense in the kernel
1340  * against any other side channel attacks that exist and have yet to be
1341  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1342  * default, no matter the x86 system.
1343  *
1344  * L1 TERMINAL FAULT
1345  *
1346  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1347  * execution uses page table entries. Effectively, it is two different problems.
1348  * The first is that it ignores the not present bit in the page table entries
1349  * when performing speculative execution. This means that something can
1350  * speculatively read the listed physical address if it's present in the L1
1351  * cache under certain conditions (see Intel's documentation for the full set of
1352  * conditions). Secondly, this can be used to bypass hardware virtualization
1353  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1354  * instructions.
1355  *
1356  * For the non-hardware virtualized case, this is relatively easy to deal with.
1357  * We must make sure that all unmapped pages have an address of zero. This means
1358  * that they could read the first 4k of physical memory; however, we never use
1359  * that first page in the operating system and always skip putting it in our
1360  * memory map, even if firmware tells us we can use it in our memory map. While
1361  * other systems try to put extra metadata in the address and reserved bits,
1362  * which led to this being problematic in those cases, we do not.
1363  *
1364  * For hardware virtual machines things are more complicated. Because they can
1365  * construct their own page tables, it isn't hard for them to perform this
1366  * attack against any physical address. The one wrinkle is that this physical
1367  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1368  * to flush the L1 data cache. We wrap this up in the function
1369  * spec_uarch_flush(). This function is also used in the mitigation of
1370  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1371  * hypervisors such as KVM or bhyve are responsible for performing this before
1372  * entering the guest.
1373  *
1374  * Because this attack takes place in the L1 cache, there's another wrinkle
1375  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1376  * designs. This means that when a thread enters a hardware virtualized context
1377  * and flushes the L1 data cache, the other thread on the processor may then go
1378  * ahead and put new data in it that can be potentially attacked. While one
1379  * solution is to disable SMT on the system, another option that is available is
1380  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1381  * goes through and makes sure that if a HVM is being scheduled on one thread,
1382  * then the thing on the other thread is from the same hardware virtual machine.
1383  * If an interrupt comes in or the guest exits to the broader system, then the
1384  * other SMT thread will be kicked out.
1385  *
1386  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1387  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1388  * perform L1TF related mitigations.
1389  *
1390  * MICROARCHITECTURAL DATA SAMPLING
1391  *
1392  * Microarchitectural data sampling (MDS) is a combination of four discrete
1393  * vulnerabilities that are similar issues affecting various parts of the CPU's
1394  * microarchitectural implementation around load, store, and fill buffers.
1395  * Specifically it is made up of the following subcomponents:
1396  *
1397  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1398  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1399  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1400  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1401  *
1402  * To begin addressing these, Intel has introduced another feature in microcode
1403  * called MD_CLEAR. This changes the verw instruction to operate in a different
1404  * way. This allows us to execute the verw instruction in a particular way to
1405  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1406  * updated when this microcode is present to flush this state.
1407  *
1408  * Primarily we need to flush this state whenever we transition from the kernel
1409  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1410  * little bit different. Here the structures are statically sized when a logical
1411  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1412  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1413  * mwait, or another ACPI method. To perform these flushes, we call
1414  * x86_md_clear() at all of these transition points.
1415  *
1416  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1417  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1418  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1419  * a no-op.
1420  *
1421  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1422  * particular, everything we've discussed above is only valid for a single
1423  * thread executing on a core. In the case where you have hyper-threading
1424  * present, this attack can be performed between threads. The theoretical fix
1425  * for this is to ensure that both threads are always in the same security
1426  * domain. This means that they are executing in the same ring and mutually
1427  * trust each other. Practically speaking, this would mean that a system call
1428  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1429  * Rather than implement this, we recommend that one disables hyper-threading
1430  * through the use of psradm -aS.
1431  *
1432  * TSX ASYNCHRONOUS ABORT
1433  *
1434  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1435  * behaves like MDS, but leverages Intel's transactional instructions as another
1436  * vector. Effectively, when a transaction hits one of these cases (unmapped
1437  * page, various cache snoop activity, etc.) then the same data can be exposed
1438  * as in the case of MDS. This means that you can attack your twin.
1439  *
1440  * Intel has described that there are two different ways that we can mitigate
1441  * this problem on affected processors:
1442  *
1443  *   1) We can use the same techniques used to deal with MDS. Flushing the
1444  *      microarchitectural buffers and disabling hyperthreading will mitigate
1445  *      this in the same way.
1446  *
1447  *   2) Using microcode to disable TSX.
1448  *
1449  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1450  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1451  * That's OK as we're already doing all such mitigations. On the other hand,
1452  * processors with MDS_NO are all supposed to receive microcode updates that
1453  * enumerate support for disabling TSX. In general, we'd rather use this method
1454  * when available as it doesn't require disabling hyperthreading to be
1455  * effective. Currently we basically are relying on microcode for processors
1456  * that enumerate MDS_NO.
1457  *
1458  * Another MDS-variant in a few select Intel Atom CPUs is Register File Data
1459  * Sampling: RFDS. This allows an attacker to sample values that were in any
1460  * of integer, floating point, or vector registers. This was discovered by
1461  * Intel during internal validation work.  The existence of the RFDS_NO
1462  * capability, or the LACK of a RFDS_CLEAR capability, means we do not have to
1463  * act. Intel has said some CPU models immune to RFDS MAY NOT enumerate
1464  * RFDS_NO. If RFDS_NO is not set, but RFDS_CLEAR is, we must set x86_md_clear,
1465  * and make sure it's using VERW. Unlike MDS, RFDS can't be helped by the
1466  * MSR that L1D uses.
1467  *
1468  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1469  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1470  * different powers. The first allows us to cause all transactions to
1471  * immediately abort. The second gives us a means of disabling TSX completely,
1472  * which includes removing it from cpuid. If we have support for this in
1473  * microcode during the first cpuid pass, then we'll disable TSX completely such
1474  * that user land never has a chance to observe the bit. However, if we are late
1475  * loading the microcode, then we must use the functionality to cause
1476  * transactions to automatically abort. This is necessary for user land's sake.
1477  * Once a program sees a cpuid bit, it must not be taken away.
1478  *
1479  * We track whether or not we should do this based on what cpuid pass we're in.
1480  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1481  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1482  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1483  * second time after we do the initial microcode update.  As a result we need to
1484  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1485  * suitable microcode on the current CPU (which happens prior to
1486  * cpuid_pass_ucode()).
1487  *
1488  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1489  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1490  * unfortunate feature in a number of ways, and taking the opportunity to
1491  * finally be able to turn it off is likely to be of benefit in the future.
1492  *
1493  * SUMMARY
1494  *
1495  * The following table attempts to summarize the mitigations for various issues
1496  * and what's done in various places:
1497  *
1498  *  - Spectre v1: Not currently mitigated
1499  *  - swapgs: lfences after swapgs paths
1500  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1501  *  - Meltdown: Kernel Page Table Isolation
1502  *  - Spectre v3a: Updated CPU microcode
1503  *  - Spectre v4: Not currently mitigated
1504  *  - SpectreRSB: SMEP and RSB Stuffing
1505  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1506  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1507  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1508  *  - RFDS: microcode with x86_md_clear if RFDS_CLEAR set and RFDS_NO not.
1509  *  - BHI: software sequence, and use of BHI_DIS_S if microcode has it.
1510  *
1511  * The following table indicates the x86 feature set bits that indicate that a
1512  * given problem has been solved or a notable feature is present:
1513  *
1514  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1515  *  - MDS_NO: All forms of MDS
1516  *  - TAA_NO: TAA
1517  *  - RFDS_NO: RFDS
1518  *  - BHI_NO: BHI
1519  */
1520 
1521 #include <sys/types.h>
1522 #include <sys/archsystm.h>
1523 #include <sys/x86_archext.h>
1524 #include <sys/kmem.h>
1525 #include <sys/systm.h>
1526 #include <sys/cmn_err.h>
1527 #include <sys/sunddi.h>
1528 #include <sys/sunndi.h>
1529 #include <sys/cpuvar.h>
1530 #include <sys/processor.h>
1531 #include <sys/sysmacros.h>
1532 #include <sys/pg.h>
1533 #include <sys/fp.h>
1534 #include <sys/controlregs.h>
1535 #include <sys/bitmap.h>
1536 #include <sys/auxv_386.h>
1537 #include <sys/memnode.h>
1538 #include <sys/pci_cfgspace.h>
1539 #include <sys/comm_page.h>
1540 #include <sys/mach_mmu.h>
1541 #include <sys/ucode.h>
1542 #include <sys/tsc.h>
1543 #include <sys/kobj.h>
1544 #include <sys/asm_misc.h>
1545 #include <sys/bitmap.h>
1546 
1547 #ifdef __xpv
1548 #include <sys/hypervisor.h>
1549 #else
1550 #include <sys/ontrap.h>
1551 #endif
1552 
1553 uint_t x86_vendor = X86_VENDOR_IntelClone;
1554 uint_t x86_type = X86_TYPE_OTHER;
1555 uint_t x86_clflush_size = 0;
1556 
1557 #if defined(__xpv)
1558 int x86_use_pcid = 0;
1559 int x86_use_invpcid = 0;
1560 #else
1561 int x86_use_pcid = -1;
1562 int x86_use_invpcid = -1;
1563 #endif
1564 
1565 typedef enum {
1566 	X86_SPECTREV2_RETPOLINE,
1567 	X86_SPECTREV2_ENHANCED_IBRS,
1568 	X86_SPECTREV2_AUTO_IBRS,
1569 	X86_SPECTREV2_DISABLED
1570 } x86_spectrev2_mitigation_t;
1571 
1572 uint_t x86_disable_spectrev2 = 0;
1573 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1574     X86_SPECTREV2_RETPOLINE;
1575 
1576 /*
1577  * The mitigation status for TAA:
1578  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1579  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1580  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1581  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1582  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1583  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1584  */
1585 typedef enum {
1586 	X86_TAA_NOTHING,
1587 	X86_TAA_DISABLED,
1588 	X86_TAA_MD_CLEAR,
1589 	X86_TAA_TSX_FORCE_ABORT,
1590 	X86_TAA_TSX_DISABLE,
1591 	X86_TAA_HW_MITIGATED
1592 } x86_taa_mitigation_t;
1593 
1594 uint_t x86_disable_taa = 0;
1595 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1596 
1597 uint_t pentiumpro_bug4046376;
1598 
1599 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1600 
1601 static char *x86_feature_names[NUM_X86_FEATURES] = {
1602 	"lgpg",
1603 	"tsc",
1604 	"msr",
1605 	"mtrr",
1606 	"pge",
1607 	"de",
1608 	"cmov",
1609 	"mmx",
1610 	"mca",
1611 	"pae",
1612 	"cv8",
1613 	"pat",
1614 	"sep",
1615 	"sse",
1616 	"sse2",
1617 	"htt",
1618 	"asysc",
1619 	"nx",
1620 	"sse3",
1621 	"cx16",
1622 	"cmp",
1623 	"tscp",
1624 	"mwait",
1625 	"sse4a",
1626 	"cpuid",
1627 	"ssse3",
1628 	"sse4_1",
1629 	"sse4_2",
1630 	"1gpg",
1631 	"clfsh",
1632 	"64",
1633 	"aes",
1634 	"pclmulqdq",
1635 	"xsave",
1636 	"avx",
1637 	"vmx",
1638 	"svm",
1639 	"topoext",
1640 	"f16c",
1641 	"rdrand",
1642 	"x2apic",
1643 	"avx2",
1644 	"bmi1",
1645 	"bmi2",
1646 	"fma",
1647 	"smep",
1648 	"smap",
1649 	"adx",
1650 	"rdseed",
1651 	"mpx",
1652 	"avx512f",
1653 	"avx512dq",
1654 	"avx512pf",
1655 	"avx512er",
1656 	"avx512cd",
1657 	"avx512bw",
1658 	"avx512vl",
1659 	"avx512fma",
1660 	"avx512vbmi",
1661 	"avx512_vpopcntdq",
1662 	"avx512_4vnniw",
1663 	"avx512_4fmaps",
1664 	"xsaveopt",
1665 	"xsavec",
1666 	"xsaves",
1667 	"sha",
1668 	"umip",
1669 	"pku",
1670 	"ospke",
1671 	"pcid",
1672 	"invpcid",
1673 	"ibrs",
1674 	"ibpb",
1675 	"stibp",
1676 	"ssbd",
1677 	"ssbd_virt",
1678 	"rdcl_no",
1679 	"ibrs_all",
1680 	"rsba",
1681 	"ssb_no",
1682 	"stibp_all",
1683 	"flush_cmd",
1684 	"l1d_vmentry_no",
1685 	"fsgsbase",
1686 	"clflushopt",
1687 	"clwb",
1688 	"monitorx",
1689 	"clzero",
1690 	"xop",
1691 	"fma4",
1692 	"tbm",
1693 	"avx512_vnni",
1694 	"amd_pcec",
1695 	"md_clear",
1696 	"mds_no",
1697 	"core_thermal",
1698 	"pkg_thermal",
1699 	"tsx_ctrl",
1700 	"taa_no",
1701 	"ppin",
1702 	"vaes",
1703 	"vpclmulqdq",
1704 	"lfence_serializing",
1705 	"gfni",
1706 	"avx512_vp2intersect",
1707 	"avx512_bitalg",
1708 	"avx512_vbmi2",
1709 	"avx512_bf16",
1710 	"auto_ibrs",
1711 	"rfds_no",
1712 	"rfds_clear",
1713 	"pbrsb_no",
1714 	"bhi_no",
1715 	"bhi_clear"
1716 };
1717 
1718 boolean_t
1719 is_x86_feature(void *featureset, uint_t feature)
1720 {
1721 	ASSERT(feature < NUM_X86_FEATURES);
1722 	return (BT_TEST((ulong_t *)featureset, feature));
1723 }
1724 
1725 void
1726 add_x86_feature(void *featureset, uint_t feature)
1727 {
1728 	ASSERT(feature < NUM_X86_FEATURES);
1729 	BT_SET((ulong_t *)featureset, feature);
1730 }
1731 
1732 void
1733 remove_x86_feature(void *featureset, uint_t feature)
1734 {
1735 	ASSERT(feature < NUM_X86_FEATURES);
1736 	BT_CLEAR((ulong_t *)featureset, feature);
1737 }
1738 
1739 boolean_t
1740 compare_x86_featureset(void *setA, void *setB)
1741 {
1742 	/*
1743 	 * We assume that the unused bits of the bitmap are always zero.
1744 	 */
1745 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1746 		return (B_TRUE);
1747 	} else {
1748 		return (B_FALSE);
1749 	}
1750 }
1751 
1752 void
1753 print_x86_featureset(void *featureset)
1754 {
1755 	uint_t i;
1756 
1757 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1758 		if (is_x86_feature(featureset, i)) {
1759 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1760 			    x86_feature_names[i]);
1761 		}
1762 	}
1763 }
1764 
1765 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1766 static size_t xsave_state_size = 0;
1767 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1768 boolean_t xsave_force_disable = B_FALSE;
1769 extern int disable_smap;
1770 
1771 /*
1772  * This is set to platform type we are running on.
1773  */
1774 static int platform_type = -1;
1775 
1776 #if !defined(__xpv)
1777 /*
1778  * Variable to patch if hypervisor platform detection needs to be
1779  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1780  */
1781 int enable_platform_detection = 1;
1782 #endif
1783 
1784 /*
1785  * monitor/mwait info.
1786  *
1787  * size_actual and buf_actual are the real address and size allocated to get
1788  * proper mwait_buf alignment.  buf_actual and size_actual should be passed
1789  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1790  * processor cache-line alignment, but this is not guaranteed in the future.
1791  */
1792 struct mwait_info {
1793 	size_t		mon_min;	/* min size to avoid missed wakeups */
1794 	size_t		mon_max;	/* size to avoid false wakeups */
1795 	size_t		size_actual;	/* size actually allocated */
1796 	void		*buf_actual;	/* memory actually allocated */
1797 	uint32_t	support;	/* processor support of monitor/mwait */
1798 };
1799 
1800 /*
1801  * xsave/xrestor info.
1802  *
1803  * This structure contains HW feature bits and the size of the xsave save area.
1804  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1805  * (xsave_state) to describe the xsave layout. However, at runtime the
1806  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1807  * xsave_state structure simply represents the legacy layout of the beginning
1808  * of the xsave area.
1809  */
1810 struct xsave_info {
1811 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1812 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1813 	size_t		xsav_max_size;  /* max size save area for HW features */
1814 	size_t		ymm_size;	/* AVX: size of ymm save area */
1815 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1816 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1817 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1818 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1819 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1820 	size_t		opmask_size;	/* AVX512: size of opmask save */
1821 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1822 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1823 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1824 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1825 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1826 };
1827 
1828 
1829 /*
1830  * These constants determine how many of the elements of the
1831  * cpuid we cache in the cpuid_info data structure; the
1832  * remaining elements are accessible via the cpuid instruction.
1833  */
1834 
1835 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1836 #define	NMAX_CPI_EXTD	0x22		/* eax = 0x80000000 .. 0x80000021 */
1837 #define	NMAX_CPI_TOPO	0x10		/* Sanity check on leaf 8X26, 1F */
1838 
1839 /*
1840  * See the big theory statement for a more detailed explanation of what some of
1841  * these members mean.
1842  */
1843 struct cpuid_info {
1844 	uint_t cpi_pass;		/* last pass completed */
1845 	/*
1846 	 * standard function information
1847 	 */
1848 	uint_t cpi_maxeax;		/* fn 0: %eax */
1849 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1850 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1851 
1852 	uint_t cpi_family;		/* fn 1: extended family */
1853 	uint_t cpi_model;		/* fn 1: extended model */
1854 	uint_t cpi_step;		/* fn 1: stepping */
1855 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1856 					/*		AMD: package/socket # */
1857 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1858 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1859 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1860 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1861 	uint_t cpi_ncache;		/* fn 2: number of elements */
1862 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1863 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1864 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1865 					/* Intel fn: 4, AMD fn: 8000001d */
1866 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1867 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1868 	struct cpuid_regs cpi_sub7[2];	/* Leaf 7, sub-leaves 1-2 */
1869 	/*
1870 	 * extended function information
1871 	 */
1872 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1873 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1874 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1875 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1876 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1877 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1878 
1879 	id_t cpi_coreid;		/* same coreid => strands share core */
1880 	int cpi_pkgcoreid;		/* core number within single package */
1881 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1882 					/* Intel: fn 4: %eax[31-26] */
1883 
1884 	/*
1885 	 * These values represent the number of bits that are required to store
1886 	 * information about the number of cores and threads.
1887 	 */
1888 	uint_t cpi_ncore_bits;
1889 	uint_t cpi_nthread_bits;
1890 	/*
1891 	 * supported feature information
1892 	 */
1893 	uint32_t cpi_support[6];
1894 #define	STD_EDX_FEATURES	0
1895 #define	AMD_EDX_FEATURES	1
1896 #define	TM_EDX_FEATURES		2
1897 #define	STD_ECX_FEATURES	3
1898 #define	AMD_ECX_FEATURES	4
1899 #define	STD_EBX_FEATURES	5
1900 	/*
1901 	 * Synthesized information, where known.
1902 	 */
1903 	x86_chiprev_t cpi_chiprev;	/* See X86_CHIPREV_* in x86_archext.h */
1904 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1905 	uint32_t cpi_socket;		/* Chip package/socket type */
1906 	x86_uarchrev_t cpi_uarchrev;	/* Microarchitecture and revision */
1907 
1908 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1909 	uint32_t cpi_apicid;
1910 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1911 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1912 					/* Intel: 1 */
1913 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1914 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1915 
1916 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1917 
1918 	/*
1919 	 * AMD and Intel extended topology information. Leaf 8X26 (AMD) and
1920 	 * eventually leaf 0x1F (Intel).
1921 	 */
1922 	uint_t cpi_topo_nleaves;
1923 	struct cpuid_regs cpi_topo[NMAX_CPI_TOPO];
1924 };
1925 
1926 
1927 static struct cpuid_info cpuid_info0;
1928 
1929 /*
1930  * These bit fields are defined by the Intel Application Note AP-485
1931  * "Intel Processor Identification and the CPUID Instruction"
1932  */
1933 #define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1934 #define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1935 #define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1936 #define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1937 #define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1938 #define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
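
/*
 * A reader's note on how the extended fields combine (this is a sketch of the
 * convention from the vendor manuals, not the authoritative derivation, which
 * happens during the identification pass later in this file): a base family
 * value of 0xf means "add in the extended family", i.e. roughly
 *
 *	family = CPI_FAMILY(cpi);
 *	if (family == 0xf)
 *		family += CPI_FAMILY_XTD(cpi);
 *
 * The extended model is similarly folded in as the high nibble of the model
 * (CPI_MODEL_XTD() << 4 | CPI_MODEL()), though exactly when that applies
 * differs slightly between Intel and AMD.
 */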
1939 
1940 #define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1941 #define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1942 #define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1943 #define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1944 #define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1945 #define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1946 #define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1947 #define	CPI_FEATURES_7_1_EAX(cpi)	((cpi)->cpi_sub7[0].cp_eax)
1948 #define	CPI_FEATURES_7_2_EDX(cpi)	((cpi)->cpi_sub7[1].cp_edx)
1949 
1950 #define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1951 #define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1952 #define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1953 #define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1954 
1955 #define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1956 #define	CPI_XMAXEAX_MAX		0x80000100
1957 #define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1958 #define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1959 
1960 /*
1961  * Function 4 (Deterministic Cache Parameters) macros
1962  * Defined by Intel Application Note AP-485
1963  */
1964 #define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1965 #define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1966 #define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1967 #define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1968 #define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1969 #define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1970 #define	CPI_CACHE_TYPE_DONE	0
1971 #define	CPI_CACHE_TYPE_DATA	1
1972 #define	CPI_CACHE_TYPE_INSTR	2
1973 #define	CPI_CACHE_TYPE_UNIFIED	3
1974 #define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1975 
1976 #define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1977 #define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1978 #define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1979 
1980 #define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1981 
1982 #define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1983 
1984 
1985 /*
1986  * A couple of shorthand macros to identify "later" P6-family chips
1987  * like the Pentium M and Core.  First, the "older" P6-based stuff
1988  * (loosely defined as "pre-Pentium-4"):
1989  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1990  */
1991 #define	IS_LEGACY_P6(cpi) (			\
1992 	cpi->cpi_family == 6 &&			\
1993 		(cpi->cpi_model == 1 ||		\
1994 		cpi->cpi_model == 3 ||		\
1995 		cpi->cpi_model == 5 ||		\
1996 		cpi->cpi_model == 6 ||		\
1997 		cpi->cpi_model == 7 ||		\
1998 		cpi->cpi_model == 8 ||		\
1999 		cpi->cpi_model == 0xA ||	\
2000 		cpi->cpi_model == 0xB)		\
2001 )
2002 
2003 /* A "new F6" is everything with family 6 that's not the above */
2004 #define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
2005 
2006 /* Extended family/model support */
2007 #define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
2008 	cpi->cpi_family >= 0xf)
2009 
2010 /*
2011  * Info for monitor/mwait idle loop.
2012  *
2013  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
2014  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
2015  * 2006.
2016  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
2017  * Documentation Updates" #33633, Rev 2.05, December 2006.
2018  */
2019 #define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
2020 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
2021 #define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
2022 #define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
2023 #define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
2024 #define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
2025 #define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
2026 #define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
2027 /*
2028  * Number of sub-cstates for a given c-state.
2029  */
2030 #define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
2031 	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
2032 
2033 /*
2034  * XSAVE leaf 0xD enumeration
2035  */
2036 #define	CPUID_LEAFD_2_YMM_OFFSET	576
2037 #define	CPUID_LEAFD_2_YMM_SIZE		256
2038 
2039 /*
2040  * Common extended leaf names to cut down on typos.
2041  */
2042 #define	CPUID_LEAF_EXT_0		0x80000000
2043 #define	CPUID_LEAF_EXT_8		0x80000008
2044 #define	CPUID_LEAF_EXT_1d		0x8000001d
2045 #define	CPUID_LEAF_EXT_1e		0x8000001e
2046 #define	CPUID_LEAF_EXT_21		0x80000021
2047 #define	CPUID_LEAF_EXT_26		0x80000026
2048 
2049 /*
2050  * Functions we consume from cpuid_subr.c;  don't publish these in a header
2051  * file to try and keep people using the expected cpuid_* interfaces.
2052  */
2053 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
2054 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
2055 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
2056 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
2057 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
2058 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
2059 
2060 /*
2061  * Apply various platform-dependent restrictions where the
2062  * underlying platform restrictions mean the CPU can be marked
2063  * as less capable than its cpuid instruction would imply.
2064  */
2065 #if defined(__xpv)
2066 static void
2067 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
2068 {
2069 	switch (eax) {
2070 	case 1: {
2071 		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
2072 		    0 : CPUID_INTC_EDX_MCA;
2073 		cp->cp_edx &=
2074 		    ~(mcamask |
2075 		    CPUID_INTC_EDX_PSE |
2076 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2077 		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
2078 		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
2079 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2080 		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
2081 		break;
2082 	}
2083 
2084 	case 0x80000001:
2085 		cp->cp_edx &=
2086 		    ~(CPUID_AMD_EDX_PSE |
2087 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2088 		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
2089 		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
2090 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2091 		    CPUID_AMD_EDX_TSCP);
2092 		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
2093 		break;
2094 	default:
2095 		break;
2096 	}
2097 
2098 	switch (vendor) {
2099 	case X86_VENDOR_Intel:
2100 		switch (eax) {
2101 		case 4:
2102 			/*
2103 			 * Zero out the (ncores-per-chip - 1) field
2104 			 */
2105 			cp->cp_eax &= 0x03fffffff;
2106 			break;
2107 		default:
2108 			break;
2109 		}
2110 		break;
2111 	case X86_VENDOR_AMD:
2112 	case X86_VENDOR_HYGON:
2113 		switch (eax) {
2114 
2115 		case 0x80000001:
2116 			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2117 			break;
2118 
2119 		case CPUID_LEAF_EXT_8:
2120 			/*
2121 			 * Zero out the (ncores-per-chip - 1) field
2122 			 */
2123 			cp->cp_ecx &= 0xffffff00;
2124 			break;
2125 		default:
2126 			break;
2127 		}
2128 		break;
2129 	default:
2130 		break;
2131 	}
2132 }
2133 #else
2134 #define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
2135 #endif
2136 
2137 /*
2138  *  Some undocumented ways of patching the results of the cpuid
2139  *  instruction to permit running Solaris 10 on future cpus that
2140  *  we don't currently support.  Could be set to non-zero values
2141  *  via settings in eeprom.
2142  */
2143 
2144 uint32_t cpuid_feature_ecx_include;
2145 uint32_t cpuid_feature_ecx_exclude;
2146 uint32_t cpuid_feature_edx_include;
2147 uint32_t cpuid_feature_edx_exclude;
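
/*
 * The masks above are not consumed in this part of the file, but conceptually
 * (a sketch rather than the exact code) each pair is applied to the matching
 * leaf 1 feature register as:
 *
 *	cp->cp_ecx |= cpuid_feature_ecx_include;
 *	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
 */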
2148 
2149 /*
2150  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2151  */
2152 void
2153 cpuid_alloc_space(cpu_t *cpu)
2154 {
2155 	/*
2156 	 * By convention, cpu0 is the boot cpu, which is set up
2157 	 * before memory allocation is available.  All other cpus get
2158 	 * their cpuid_info struct allocated here.
2159 	 */
2160 	ASSERT(cpu->cpu_id != 0);
2161 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2162 	cpu->cpu_m.mcpu_cpi =
2163 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2164 }
2165 
2166 void
2167 cpuid_free_space(cpu_t *cpu)
2168 {
2169 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2170 	int i;
2171 
2172 	ASSERT(cpi != NULL);
2173 	ASSERT(cpi != &cpuid_info0);
2174 
2175 	/*
2176 	 * Free up any cache leaf related dynamic storage. The first entry was
2177 	 * cached from the standard cpuid storage, so we should not free it.
2178 	 */
2179 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2180 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2181 	if (cpi->cpi_cache_leaf_size > 0)
2182 		kmem_free(cpi->cpi_cache_leaves,
2183 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2184 
2185 	kmem_free(cpi, sizeof (*cpi));
2186 	cpu->cpu_m.mcpu_cpi = NULL;
2187 }
2188 
2189 #if !defined(__xpv)
2190 /*
2191  * Determine the type of the underlying platform. This is used to customize
2192  * initialization of various subsystems (e.g. TSC). determine_platform() must
2193  * only ever be called once to prevent two processors from seeing different
2194  * values of platform_type. Must be called before cpuid_pass_ident(), the
2195  * earliest consumer to execute; the identification pass will call
2196  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2197  */
2198 void
2199 determine_platform(void)
2200 {
2201 	struct cpuid_regs cp;
2202 	uint32_t base;
2203 	uint32_t regs[4];
2204 	char *hvstr = (char *)regs;
2205 
2206 	ASSERT(platform_type == -1);
2207 
2208 	platform_type = HW_NATIVE;
2209 
2210 	if (!enable_platform_detection)
2211 		return;
2212 
2213 	/*
2214 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2215 	 * vendor signature, and set platform type accordingly.
2216 	 *
2217 	 * References:
2218 	 * http://lkml.org/lkml/2008/10/1/246
2219 	 * http://kb.vmware.com/kb/1009458
2220 	 */
2221 	cp.cp_eax = 0x1;
2222 	(void) __cpuid_insn(&cp);
2223 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2224 		cp.cp_eax = 0x40000000;
2225 		(void) __cpuid_insn(&cp);
2226 		regs[0] = cp.cp_ebx;
2227 		regs[1] = cp.cp_ecx;
2228 		regs[2] = cp.cp_edx;
2229 		regs[3] = 0;
2230 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2231 			platform_type = HW_XEN_HVM;
2232 			return;
2233 		}
2234 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2235 			platform_type = HW_VMWARE;
2236 			return;
2237 		}
2238 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2239 			platform_type = HW_KVM;
2240 			return;
2241 		}
2242 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2243 			platform_type = HW_BHYVE;
2244 			return;
2245 		}
2246 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) {
2247 			platform_type = HW_MICROSOFT;
2248 			return;
2249 		}
2250 		if (strcmp(hvstr, HVSIG_QEMU_TCG) == 0) {
2251 			platform_type = HW_QEMU_TCG;
2252 			return;
2253 		}
2254 	} else {
2255 		/*
2256 		 * Check older VMware hardware versions. VMware hypervisor is
2257 		 * detected by performing an IN operation to VMware hypervisor
2258 		 * port and checking that value returned in %ebx is VMware
2259 		 * hypervisor magic value.
2260 		 *
2261 		 * References: http://kb.vmware.com/kb/1009458
2262 		 */
2263 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2264 		if (regs[1] == VMWARE_HVMAGIC) {
2265 			platform_type = HW_VMWARE;
2266 			return;
2267 		}
2268 	}
2269 
2270 	/*
2271 	 * Check Xen hypervisor. In a fully virtualized domain,
2272 	 * Xen's pseudo-cpuid function returns a string representing the
2273 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2274 	 * supported cpuid function. We need at least a (base + 2) leaf value
2275 	 * to do what we want to do. Try different base values, since the
2276 	 * hypervisor might use a different one depending on whether Hyper-V
2277 	 * emulation is switched on by default or not.
2278 	 */
2279 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2280 		cp.cp_eax = base;
2281 		(void) __cpuid_insn(&cp);
2282 		regs[0] = cp.cp_ebx;
2283 		regs[1] = cp.cp_ecx;
2284 		regs[2] = cp.cp_edx;
2285 		regs[3] = 0;
2286 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2287 		    cp.cp_eax >= (base + 2)) {
2288 			platform_type &= ~HW_NATIVE;
2289 			platform_type |= HW_XEN_HVM;
2290 			return;
2291 		}
2292 	}
2293 }
2294 
2295 int
2296 get_hwenv(void)
2297 {
2298 	ASSERT(platform_type != -1);
2299 	return (platform_type);
2300 }
2301 
2302 int
2303 is_controldom(void)
2304 {
2305 	return (0);
2306 }
2307 
2308 #else
2309 
2310 int
2311 get_hwenv(void)
2312 {
2313 	return (HW_XEN_PV);
2314 }
2315 
2316 int
2317 is_controldom(void)
2318 {
2319 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2320 }
2321 
2322 #endif	/* __xpv */
2323 
2324 /*
2325  * Gather the extended topology information. This should be the same for both
2326  * AMD leaf 8X26 and Intel leaf 0x1F (though the data interpretation varies).
2327  */
2328 static void
2329 cpuid_gather_ext_topo_leaf(struct cpuid_info *cpi, uint32_t leaf)
2330 {
2331 	uint_t i;
2332 
2333 	for (i = 0; i < ARRAY_SIZE(cpi->cpi_topo); i++) {
2334 		struct cpuid_regs *regs = &cpi->cpi_topo[i];
2335 
2336 		bzero(regs, sizeof (struct cpuid_regs));
2337 		regs->cp_eax = leaf;
2338 		regs->cp_ecx = i;
2339 
2340 		(void) __cpuid_insn(regs);
2341 		if (CPUID_AMD_8X26_ECX_TYPE(regs->cp_ecx) ==
2342 		    CPUID_AMD_8X26_TYPE_DONE) {
2343 			break;
2344 		}
2345 	}
2346 
2347 	cpi->cpi_topo_nleaves = i;
2348 }
2349 
2350 /*
2351  * Make sure that we have gathered all of the CPUID leaves that we might need to
2352  * determine topology. We assume that the standard leaf 1 has already been done
2353  * and that xmaxeax has already been calculated.
2354  */
2355 static void
2356 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2357 {
2358 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2359 
2360 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2361 		struct cpuid_regs *cp;
2362 
2363 		cp = &cpi->cpi_extd[8];
2364 		cp->cp_eax = CPUID_LEAF_EXT_8;
2365 		(void) __cpuid_insn(cp);
2366 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2367 	}
2368 
2369 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2370 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2371 		struct cpuid_regs *cp;
2372 
2373 		cp = &cpi->cpi_extd[0x1e];
2374 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2375 		(void) __cpuid_insn(cp);
2376 	}
2377 
2378 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_26) {
2379 		cpuid_gather_ext_topo_leaf(cpi, CPUID_LEAF_EXT_26);
2380 	}
2381 }
2382 
2383 /*
2384  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2385  * it to everything else. If not, and we're on an AMD system where 8000001e is
2386  * valid, then we use that. Otherwise, we fall back to the default value for the
2387  * APIC ID in leaf 1.
2388  */
2389 static uint32_t
2390 cpuid_gather_apicid(struct cpuid_info *cpi)
2391 {
2392 	/*
2393 	 * Leaf B changes based on the arguments to it. Because we don't cache
2394 	 * it, we need to gather it again.
2395 	 */
2396 	if (cpi->cpi_maxeax >= 0xB) {
2397 		struct cpuid_regs regs;
2398 		struct cpuid_regs *cp;
2399 
2400 		cp = &regs;
2401 		cp->cp_eax = 0xB;
2402 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2403 		(void) __cpuid_insn(cp);
2404 
2405 		if (cp->cp_ebx != 0) {
2406 			return (cp->cp_edx);
2407 		}
2408 	}
2409 
2410 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2411 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2412 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2413 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2414 		return (cpi->cpi_extd[0x1e].cp_eax);
2415 	}
2416 
2417 	return (CPI_APIC_ID(cpi));
2418 }
2419 
2420 /*
2421  * For AMD processors, attempt to calculate the number of chips and cores that
2422  * exist. The way that we do this varies based on the generation, because the
2423  * generations themselves have changed dramatically.
2424  *
2425  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2426  * However, with the advent of family 17h (Zen) it actually tells us the number
2427  * of threads, so we need to look at leaf 0x8000001e if available to determine
2428  * its value. Otherwise, for all prior families, the number of enabled cores is
2429  * the same as threads.
2430  *
2431  * If we do not have leaf 0x80000008, then we assume that this processor does
2432  * not have anything. AMD's older CPUID specification says there's no reason to
2433  * fall back to leaf 1.
2434  *
2435  * In some virtualization cases we will not have leaf 8000001e or it will be
2436  * zero. When that happens we assume the number of threads is one.
2437  */
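/*
 * Illustrative example with hypothetical register values: a Zen-era part
 * reporting 15 in leaf 0x80000008 %ecx[7:0] yields nthreads == 16 below. If
 * leaf 0x8000001e then reports 1 in %ebx[15:8], nthread_per_core becomes 2
 * and ncores works out to 16 / 2 == 8. On a pre-Zen family, or when the leaf
 * is missing or zeroed, nthread_per_core stays 1 and ncores == nthreads.
 */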
2438 static void
2439 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2440 {
2441 	uint_t nthreads, nthread_per_core;
2442 
2443 	nthreads = nthread_per_core = 1;
2444 
2445 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2446 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2447 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2448 		nthreads = CPI_CPU_COUNT(cpi);
2449 	}
2450 
2451 	/*
2452 	 * For us to have threads, and know about it, we have to be at least at
2453 	 * family 17h and have the cpuid bit that says we have extended
2454 	 * topology.
2455 	 */
2456 	if (cpi->cpi_family >= 0x17 &&
2457 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2458 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2459 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2460 	}
2461 
2462 	*ncpus = nthreads;
2463 	*ncores = nthreads / nthread_per_core;
2464 }
2465 
2466 /*
2467  * Seed the initial values for the cores and threads for an Intel based
2468  * processor. These values will be overwritten if we detect that the processor
2469  * supports CPUID leaf 0xb.
2470  */
2471 static void
2472 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2473 {
2474 	/*
2475 	 * Only seed the number of physical cores from the first level leaf 4
2476 	 * information. The number of threads there indicates how many share the
2477 	 * L1 cache, which may or may not have anything to do with the number of
2478 	 * logical CPUs per core.
2479 	 */
2480 	if (cpi->cpi_maxeax >= 4) {
2481 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2482 	} else {
2483 		*ncores = 1;
2484 	}
2485 
2486 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2487 		*ncpus = CPI_CPU_COUNT(cpi);
2488 	} else {
2489 		*ncpus = *ncores;
2490 	}
2491 }
2492 
2493 static boolean_t
2494 cpuid_leafB_getids(cpu_t *cpu)
2495 {
2496 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2497 	struct cpuid_regs regs;
2498 	struct cpuid_regs *cp;
2499 
2500 	if (cpi->cpi_maxeax < 0xB)
2501 		return (B_FALSE);
2502 
2503 	cp = &regs;
2504 	cp->cp_eax = 0xB;
2505 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2506 
2507 	(void) __cpuid_insn(cp);
2508 
2509 	/*
2510 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2511 	 * indicates that the extended topology enumeration leaf is
2512 	 * available.
2513 	 */
2514 	if (cp->cp_ebx != 0) {
2515 		uint32_t x2apic_id = 0;
2516 		uint_t coreid_shift = 0;
2517 		uint_t ncpu_per_core = 1;
2518 		uint_t chipid_shift = 0;
2519 		uint_t ncpu_per_chip = 1;
2520 		uint_t i;
2521 		uint_t level;
2522 
2523 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2524 			cp->cp_eax = 0xB;
2525 			cp->cp_ecx = i;
2526 
2527 			(void) __cpuid_insn(cp);
2528 			level = CPI_CPU_LEVEL_TYPE(cp);
2529 
2530 			if (level == 1) {
2531 				x2apic_id = cp->cp_edx;
2532 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2533 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2534 			} else if (level == 2) {
2535 				x2apic_id = cp->cp_edx;
2536 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2537 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2538 			}
2539 		}
2540 
2541 		/*
2542 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2543 		 */
2544 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2545 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2546 		    ncpu_per_core;
2547 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2548 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2549 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2550 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2551 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2552 		cpi->cpi_compunitid = cpi->cpi_coreid;
2553 
2554 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2555 			cpi->cpi_nthread_bits = coreid_shift;
2556 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2557 		}
2558 
2559 		return (B_TRUE);
2560 	} else {
2561 		return (B_FALSE);
2562 	}
2563 }
2564 
2565 static void
2566 cpuid_intel_getids(cpu_t *cpu, void *feature)
2567 {
2568 	uint_t i;
2569 	uint_t chipid_shift = 0;
2570 	uint_t coreid_shift = 0;
2571 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2572 
2573 	/*
2574 	 * There are no compute units or processor nodes currently on Intel.
2575 	 * Always set these to one.
2576 	 */
2577 	cpi->cpi_procnodes_per_pkg = 1;
2578 	cpi->cpi_cores_per_compunit = 1;
2579 
2580 	/*
2581 	 * If cpuid Leaf B is present, use that to try and get this information.
2582 	 * It will be the most accurate for Intel CPUs.
2583 	 */
2584 	if (cpuid_leafB_getids(cpu))
2585 		return;
2586 
2587 	/*
2588 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2589 	 * and ncore_per_chip. These represent the largest power of two values
2590 	 * that we need to cover all of the IDs in the system. Therefore, we use
2591 	 * those values to seed the number of bits needed to cover information
2592 	 * in the case when leaf B is not available. These values will probably
2593 	 * be larger than required, but that's OK.
2594 	 */
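	/*
	 * Hypothetical example: with cpi_ncpu_per_chip == 12, ddi_fls(12)
	 * returns 4 (bit 4 is the highest bit set in 0b1100), so four bits
	 * are set aside for thread IDs -- enough for the twelve that are in
	 * use, and possibly more than strictly required.
	 */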
2595 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2596 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2597 
2598 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2599 		chipid_shift++;
2600 
2601 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2602 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2603 
2604 	if (is_x86_feature(feature, X86FSET_CMP)) {
2605 		/*
2606 		 * Multi-core (and possibly multi-threaded)
2607 		 * processors.
2608 		 */
2609 		uint_t ncpu_per_core = 0;
2610 
2611 		if (cpi->cpi_ncore_per_chip == 1)
2612 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2613 		else if (cpi->cpi_ncore_per_chip > 1)
2614 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2615 			    cpi->cpi_ncore_per_chip;
2616 		/*
2617 		 * 8bit APIC IDs on dual core Pentiums
2618 		 * look like this:
2619 		 *
2620 		 * +-----------------------+------+------+
2621 		 * | Physical Package ID   |  MC  |  HT  |
2622 		 * +-----------------------+------+------+
2623 		 * <------- chipid -------->
2624 		 * <------- coreid --------------->
2625 		 *			   <--- clogid -->
2626 		 *			   <------>
2627 		 *			   pkgcoreid
2628 		 *
2629 		 * Where the number of bits necessary to
2630 		 * represent MC and HT fields together equals
2631 		 * the minimum number of bits necessary to
2632 		 * store the value of cpi->cpi_ncpu_per_chip.
2633 		 * Of those bits, the MC part uses the number
2634 		 * of bits necessary to store the value of
2635 		 * cpi->cpi_ncore_per_chip.
2636 		 */
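		/*
		 * Worked example with hypothetical values: with
		 * cpi_ncpu_per_chip == 4 and ncpu_per_core == 2, we get
		 * chipid_shift == 2 and coreid_shift == 1. An APIC ID of
		 * 0b0111 then decomposes into chipid == 1, clogid == 3,
		 * coreid == 3, and pkgcoreid == 1: the second thread of the
		 * second core on the second package.
		 */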
2637 		for (i = 1; i < ncpu_per_core; i <<= 1)
2638 			coreid_shift++;
2639 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2640 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2641 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2642 		/*
2643 		 * Single-core multi-threaded processors.
2644 		 */
2645 		cpi->cpi_coreid = cpi->cpi_chipid;
2646 		cpi->cpi_pkgcoreid = 0;
2647 	} else {
2648 		/*
2649 		 * Single-core single-thread processors.
2650 		 */
2651 		cpi->cpi_coreid = cpu->cpu_id;
2652 		cpi->cpi_pkgcoreid = 0;
2653 	}
2654 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2655 	cpi->cpi_compunitid = cpi->cpi_coreid;
2656 }
2657 
2658 /*
2659  * Historically, AMD has had CMP chips with only a single thread per core.
2660  * However, starting in family 17h (Zen), this has changed and they now have
2661  * multiple threads. Our internal core id needs to be a unique value.
2662  *
2663  * To determine the core id of an AMD system, if we're from a family before 17h,
2664  * then we just use the cpu id, as that gives us a good value that will be
2665  * unique for each core. If instead, we're on family 17h or later, then we need
2666  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2667  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2668  * We can't use the normal core id in that leaf as it's only unique within the
2669  * socket, which is perfect for cpi_pkgcoreid, but not us.
2670  */
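/*
 * For example (hypothetical APIC IDs): on a family 0x17 part with two threads
 * per core, APIC IDs 6 and 7 belong to the two threads of a single core and
 * both yield core id 3 after the shift below; on an earlier family the cpu id
 * is used directly instead.
 */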
2671 static id_t
2672 cpuid_amd_get_coreid(cpu_t *cpu)
2673 {
2674 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2675 
2676 	if (cpi->cpi_family >= 0x17 &&
2677 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2678 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2679 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2680 		if (nthreads > 1) {
2681 			VERIFY3U(nthreads, ==, 2);
2682 			return (cpi->cpi_apicid >> 1);
2683 		}
2684 	}
2685 
2686 	return (cpu->cpu_id);
2687 }
2688 
2689 /*
2690  * Determining IDs on AMD is a more challenging task. This is notable because of the
2691  * following two facts:
2692  *
2693  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2694  *     also no way to get an actual unique core id from the system. As such, we
2695  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2696  *     however, guarantee that sibling cores of a chip will have sequential
2697  *     coreids starting at a multiple of the number of cores per chip - that is
2698  *     usually the case, but if the APIC IDs have been set up in a different
2699  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2700  *
2701  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2702  *     called compute units. These compute units share the L1I cache, L2 cache,
2703  *     and the FPU. To deal with this, a new topology leaf was added in
2704  *     0x8000001e. However, parts of this leaf have different meanings
2705  *     once we get to family 0x17.
2706  */
2707 
2708 static void
2709 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2710 {
2711 	int i, first_half, coreidsz;
2712 	uint32_t nb_caps_reg;
2713 	uint_t node2_1;
2714 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2715 	struct cpuid_regs *cp;
2716 
2717 	/*
2718 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2719 	 * hasn't been stripped by virtualization). We always set the compute
2720 	 * unit id to the same value. Also, initialize the default number of
2721 	 * cores per compute unit and nodes per package. This will be
2722 	 * overwritten when we know information about a particular family.
2723 	 */
2724 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2725 	cpi->cpi_compunitid = cpi->cpi_coreid;
2726 	cpi->cpi_cores_per_compunit = 1;
2727 	cpi->cpi_procnodes_per_pkg = 1;
2728 
2729 	/*
2730 	 * To construct the logical ID, we need to determine how many APIC IDs
2731 	 * are dedicated to the cores and threads. This is provided for us in
2732 	 * 0x80000008. However, if it's not present (say due to virtualization),
2733 	 * then we assume it's one. This should be present on all 64-bit AMD
2734 	 * processors.  It was added in family 0xf (Hammer).
2735 	 */
2736 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2737 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2738 
2739 		/*
2740 		 * In AMD parlance chip is really a node while illumos
2741 		 * uses chip as equivalent to socket/package.
2742 		 */
2743 		if (coreidsz == 0) {
2744 			/* Use legacy method */
2745 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2746 				coreidsz++;
2747 			if (coreidsz == 0)
2748 				coreidsz = 1;
2749 		}
2750 	} else {
2751 		/* Assume single-core part */
2752 		coreidsz = 1;
2753 	}
2754 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2755 
2756 	/*
2757 	 * The package core ID varies depending on the family. While it may be
2758 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2759 	 * this value is the core id in the given node. For non-virtualized
2760 	 * family 17h, we need to take the logical core id and shift off the
2761 	 * threads like we do when getting the core id.  Otherwise, we can use
2762 	 * the clogid as is. When family 17h is virtualized, the clogid should
2763 	 * be sufficient: if we don't have valid data in the leaf, then we
2764 	 * won't think we have SMT, in which case the cpi_clogid is all we
2765 	 * need.
2766 	 */
2767 	if (cpi->cpi_family >= 0x17 &&
2768 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2769 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2770 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2771 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2772 		if (nthreads > 1) {
2773 			VERIFY3U(nthreads, ==, 2);
2774 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2775 		} else {
2776 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2777 		}
2778 	} else {
2779 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2780 	}
2781 
2782 	/*
2783 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2784 	 * (bulldozer) or newer, then we can derive all of this from leaf
2785 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2786 	 */
2787 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2788 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2789 		cp = &cpi->cpi_extd[0x1e];
2790 
2791 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2792 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2793 
2794 		/*
2795 		 * For Bulldozer-era CPUs, recalculate the compute unit
2796 		 * information.
2797 		 */
2798 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2799 			cpi->cpi_cores_per_compunit =
2800 			    BITX(cp->cp_ebx, 15, 8) + 1;
2801 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2802 			    (cpi->cpi_ncore_per_chip /
2803 			    cpi->cpi_cores_per_compunit) *
2804 			    (cpi->cpi_procnodeid /
2805 			    cpi->cpi_procnodes_per_pkg);
2806 		}
2807 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2808 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2809 	} else if (cpi->cpi_family == 0x10) {
2810 		/*
2811 		 * See if we are a multi-node processor.
2812 		 * All processors in the system have the same number of nodes
2813 		 * All processors in the system have the same number of nodes.
2814 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2815 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2816 			/* Single-node */
2817 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2818 			    coreidsz);
2819 		} else {
2820 
2821 			/*
2822 			 * Multi-node revision D (2 nodes per package
2823 			 * are supported)
2824 			 */
2825 			cpi->cpi_procnodes_per_pkg = 2;
2826 
2827 			first_half = (cpi->cpi_pkgcoreid <=
2828 			    (cpi->cpi_ncore_per_chip/2 - 1));
2829 
2830 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2831 				/* We are BSP */
2832 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2833 			} else {
2834 
2835 				/* We are AP */
2836 				/* NodeId[2:1] bits to use for reading F3xe8 */
2837 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2838 
2839 				nb_caps_reg =
2840 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2841 
2842 				/*
2843 				 * Check IntNodeNum bit (31:30, but bit 31 is
2844 				 * always 0 on dual-node processors)
2845 				 */
2846 				if (BITX(nb_caps_reg, 30, 30) == 0)
2847 					cpi->cpi_procnodeid = node2_1 +
2848 					    !first_half;
2849 				else
2850 					cpi->cpi_procnodeid = node2_1 +
2851 					    first_half;
2852 			}
2853 		}
2854 	} else {
2855 		cpi->cpi_procnodeid = 0;
2856 	}
2857 
2858 	cpi->cpi_chipid =
2859 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2860 
2861 	cpi->cpi_ncore_bits = coreidsz;
2862 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2863 	    cpi->cpi_ncore_per_chip);
2864 }
2865 
2866 static void
2867 spec_uarch_flush_noop(void)
2868 {
2869 }
2870 
2871 /*
2872  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2873  * MDS-related micro-architectural state that would normally happen by calling
2874  * x86_md_clear().
2875  */
2876 static void
2877 spec_uarch_flush_msr(void)
2878 {
2879 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2880 }
2881 
2882 /*
2883  * This function points to a function that will flush certain
2884  * micro-architectural state on the processor. This flush is used to mitigate
2885  * three different classes of Intel CPU vulnerabilities: L1TF, MDS, and RFDS.
2886  * This function can point to one of three functions:
2887  *
2888  * - A noop, which is used either because we are vulnerable but do not have
2889  *   microcode available to help deal with a fix, or because we aren't
2890  *   vulnerable at all.
2891  *
2892  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2893  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2894  *   however, it only flushes the MDS related micro-architectural state on the
2895  *   current hyperthread, it does not do anything for the twin.
2896  *
2897  * - x86_md_clear which will flush the MDS related state. This is done when we
2898  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2899  *   (RDCL_NO is set); or if the CPU is vulnerable to RFDS and indicates VERW
2900  *   can clear it (RFDS_CLEAR is set).
2901  */
2902 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
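/*
 * Callers pick up whichever implementation was selected at boot simply by
 * invoking the pointer; an illustrative (hypothetical) call site before
 * dropping to a less privileged context would just be:
 *
 *	spec_uarch_flush();
 */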
2903 
2904 static void
2905 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2906 {
2907 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2908 
2909 	/* Non-Intel doesn't concern us here. */
2910 	if (cpi->cpi_vendor != X86_VENDOR_Intel)
2911 		return;
2912 
2913 	/*
2914 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2915 	 * has been fixed in hardware, it doesn't cover everything related to
2916 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2917 	 * need to mitigate this.
2918 	 *
2919 	 * We must ALSO check whether RFDS_NO is absent and RFDS_CLEAR is set,
2920 	 * to cover the smaller set of RFDS cases.
2921 	 */
2922 
2923 	if ((!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2924 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) ||
2925 	    (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2926 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR))) {
2927 		const uint8_t nop = NOP_INSTR;
2928 		uint8_t *md = (uint8_t *)x86_md_clear;
2929 
2930 		*md = nop;
2931 	}
2932 
2933 	membar_producer();
2934 }
2935 
2936 static void
2937 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2938 {
2939 	boolean_t need_l1d, need_mds, need_rfds;
2940 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2941 
2942 	/*
2943 	 * If we're not on Intel or we've mitigated all of RDCL, MDS, and RFDS
2944 	 * in hardware, then there's nothing left for us to do for enabling
2945 	 * the flush. We can also go ahead and say that SMT exclusion is
2946 	 * unnecessary.
2947 	 */
2948 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2949 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2950 	    is_x86_feature(featureset, X86FSET_MDS_NO) &&
2951 	    is_x86_feature(featureset, X86FSET_RFDS_NO))) {
2952 		extern int smt_exclusion;
2953 		smt_exclusion = 0;
2954 		spec_uarch_flush = spec_uarch_flush_noop;
2955 		membar_producer();
2956 		return;
2957 	}
2958 
2959 	/*
2960 	 * The locations where we need to perform an L1D flush are required for
2961 	 * mitigating both L1TF and MDS. When verw support is present in
2962 	 * microcode, then the L1D flush will take care of doing that as well.
2963 	 * However, if we have a system where RDCL_NO is present, but we don't
2964 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2965 	 * L1D flush.
2966 	 */
2967 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2968 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2969 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2970 		need_l1d = B_TRUE;
2971 	} else {
2972 		need_l1d = B_FALSE;
2973 	}
2974 
2975 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2976 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2977 		need_mds = B_TRUE;
2978 	} else {
2979 		need_mds = B_FALSE;
2980 	}
2981 
2982 	if (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2983 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR)) {
2984 		need_rfds = B_TRUE;
2985 	} else {
2986 		need_rfds = B_FALSE;
2987 	}
2988 
2989 	if (need_l1d) {
2990 		/*
2991 		 * As of Feb, 2024, no CPU needs L1D *and* RFDS mitigation
2992 		 * together. If the following VERIFY trips, we need to add
2993 		 * further fixes here.
2994 		 */
2995 		VERIFY(!need_rfds);
2996 		spec_uarch_flush = spec_uarch_flush_msr;
2997 	} else if (need_mds || need_rfds) {
2998 		spec_uarch_flush = x86_md_clear;
2999 	} else {
3000 		/*
3001 		 * We have no hardware mitigations available to us.
3002 		 */
3003 		spec_uarch_flush = spec_uarch_flush_noop;
3004 	}
3005 	membar_producer();
3006 }
3007 
3008 /*
3009  * Branch History Injection (BHI) mitigations.
3010  *
3011  * Intel has provided a software sequence that will scrub the BHB. Like RSB
3012  * (below) we can scribble a return at the beginning to avoid it if the CPU
3013  * is modern enough. We can also scribble a return if the CPU is old enough
3014  * to not have an RSB (pre-eIBRS).
3015  */
3016 typedef enum {
3017 	X86_BHI_TOO_OLD_OR_DISABLED,	/* Pre-eIBRS or disabled */
3018 	X86_BHI_NEW_ENOUGH,		/* AMD, or Intel with BHI_NO set */
3019 	X86_BHI_DIS_S,			/* BHI_NO == 0, but BHI_DIS_S avail. */
3020 	/* NOTE: BHI_DIS_S above will still need the software sequence. */
3021 	X86_BHI_SOFTWARE_SEQUENCE,	/* Use software sequence */
3022 } x86_native_bhi_mitigation_t;
3023 
3024 x86_native_bhi_mitigation_t x86_bhi_mitigation = X86_BHI_SOFTWARE_SEQUENCE;
3025 
3026 static void
3027 cpuid_enable_bhi_dis_s(void)
3028 {
3029 	uint64_t val;
3030 
3031 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3032 	val |= IA32_SPEC_CTRL_BHI_DIS_S;
3033 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3034 }
3035 
3036 /*
3037  * This function scribbles RET into the first instruction of x86_bhb_clear()
3038  * if SPECTREV2 mitigations are disabled, the CPU is too old, the CPU is new
3039  * enough to have the issue fixed (which includes non-Intel CPUs), or the CPU
3040  * has an explicit disable-Branch-History control.
3041  */
3042 static x86_native_bhi_mitigation_t
3043 cpuid_learn_and_patch_bhi(x86_spectrev2_mitigation_t v2mit, cpu_t *cpu,
3044     uchar_t *featureset)
3045 {
3046 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3047 	const uint8_t ret = RET_INSTR;
3048 	uint8_t *bhb_clear = (uint8_t *)x86_bhb_clear;
3049 
3050 	ASSERT0(cpu->cpu_id);
3051 
3052 	/* First check for explicitly disabled... */
3053 	if (v2mit == X86_SPECTREV2_DISABLED) {
3054 		*bhb_clear = ret;
3055 		return (X86_BHI_TOO_OLD_OR_DISABLED);
3056 	}
3057 
3058 	/*
3059 	 * Then check for BHI_NO, which means the CPU doesn't have this bug,
3060 	 * or if it's non-Intel, in which case this mitigation mechanism
3061 	 * doesn't apply.
3062 	 */
3063 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
3064 	    is_x86_feature(featureset, X86FSET_BHI_NO)) {
3065 		*bhb_clear = ret;
3066 		return (X86_BHI_NEW_ENOUGH);
3067 	}
3068 
3069 	/*
3070 	 * Now check for the BHI_CTRL MSR, and then set it if available.
3071 	 * We will still need to use the software sequence, however.
3072 	 */
3073 	if (is_x86_feature(featureset, X86FSET_BHI_CTRL)) {
3074 		cpuid_enable_bhi_dis_s();
3075 		return (X86_BHI_DIS_S);
3076 	}
3077 
3078 	/*
3079 	 * Finally, check if we are too old to bother with RSB:
3080 	 */
3081 	if (v2mit == X86_SPECTREV2_RETPOLINE) {
3082 		*bhb_clear = ret;
3083 		return (X86_BHI_TOO_OLD_OR_DISABLED);
3084 	}
3085 
3086 	ASSERT(*bhb_clear != ret);
3087 	return (X86_BHI_SOFTWARE_SEQUENCE);
3088 }
3089 
3090 /*
3091  * We default to enabling Return Stack Buffer (RSB) mitigations.
3092  *
3093  * We used to skip RSB mitigations with Intel eIBRS, but developments around
3094  * post-barrier RSB (PBRSB) guessing suggest we should enable Intel RSB
3095  * mitigations always unless explicitly bypassed, or unless hardware indicates
3096  * the bug has been fixed.
3097  *
3098  * The current decisions for using, or ignoring, a RSB software stuffing
3099  * sequence are expressed by the following table:
3100  *
3101  * +-------+------------+-----------------+--------+
3102  * | eIBRS |  PBRSB_NO  |  context switch | vmexit |
3103  * +-------+------------+-----------------+--------+
3104  * |   Yes |     No     |  stuff          | stuff  |
3105  * |   Yes |     Yes    |  ignore         | ignore |
3106  * |   No  |     No     |  stuff          | ignore |
3107  * +-------+------------+-----------------+--------+
3108  *
3109  * Note that if an Intel CPU has no eIBRS, it will never enumerate PBRSB_NO,
3110  * because machines with no eIBRS do not have a problem with PBRSB overflow.
3111  * See the Intel document cited below for details.
3112  *
3113  * Also note that AMD AUTO_IBRS has no PBRSB problem, so it is not included in
3114  * the table above, and that there is no situation where vmexit stuffing is
3115  * needed, but context-switch stuffing isn't.
3116  */
3117 
3118 /* BEGIN CSTYLED */
3119 /*
3120  * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/post-barrier-return-stack-buffer-predictions.html
3121  */
3122 /* END CSTYLED */
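/*
 * A minimal sketch of the table above expressed as code (illustrative only;
 * the function below patches x86_rsb_stuff and x86_rsb_stuff_vmexit directly
 * rather than computing booleans like this):
 *
 *	static void
 *	rsb_policy(bool eibrs, bool pbrsb_no, bool *cswitch, bool *vmexit)
 *	{
 *		*cswitch = !(eibrs && pbrsb_no);
 *		*vmexit = eibrs && !pbrsb_no;
 *	}
 */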
3123 
3124 /*
3125  * AMD indicates that when Automatic IBRS is enabled we do not need to implement
3126  * return stack buffer clearing for VMEXIT as it takes care of it. The manual
3127  * also states that, as long as SMEP is enabled and we maintain at least one
3128  * page between the kernel and user space (we have much more of a red zone),
3129  * then we do not need to clear the RSB. We constrain this to only when
3130  * Automatic IBRS is present.
3131  */
3132 static void
3133 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit, bool intel_pbrsb_no)
3134 {
3135 	const uint8_t ret = RET_INSTR;
3136 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
3137 	uint8_t *vmx_stuff = (uint8_t *)x86_rsb_stuff_vmexit;
3138 
3139 	switch (mit) {
3140 	case X86_SPECTREV2_AUTO_IBRS:
3141 	case X86_SPECTREV2_DISABLED:
3142 		/* Don't bother with any RSB stuffing! */
3143 		*stuff = ret;
3144 		*vmx_stuff = ret;
3145 		break;
3146 	case X86_SPECTREV2_RETPOLINE:
3147 		/*
3148 		 * The Intel document on Post-Barrier RSB says that processors
3149 		 * without eIBRS do not have PBRSB problems upon VMEXIT.
3150 		 */
3151 		VERIFY(!intel_pbrsb_no);
3152 		VERIFY3U(*stuff, !=, ret);
3153 		*vmx_stuff = ret;
3154 		break;
3155 	default:
3156 		/*
3157 		 * eIBRS is all that's left.  If CPU claims PBRSB is fixed,
3158 		 * don't use the RSB mitigation in either case.  Otherwise
3159 		 * both vmexit and context-switching require the software
3160 		 * mitigation.
3161 		 */
3162 		if (intel_pbrsb_no) {
3163 			/* CPU claims PBRSB problems are fixed. */
3164 			*stuff = ret;
3165 			*vmx_stuff = ret;
3166 		}
3167 		VERIFY3U(*stuff, ==, *vmx_stuff);
3168 		break;
3169 	}
3170 }
3171 
3172 static void
3173 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
3174 {
3175 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
3176 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
3177 	    "_r14", "_r15" };
3178 	const uint_t nthunks = ARRAY_SIZE(thunks);
3179 	const char *type;
3180 	uint_t i;
3181 
3182 	if (mit == x86_spectrev2_mitigation)
3183 		return;
3184 
3185 	switch (mit) {
3186 	case X86_SPECTREV2_RETPOLINE:
3187 		type = "gen";
3188 		break;
3189 	case X86_SPECTREV2_AUTO_IBRS:
3190 	case X86_SPECTREV2_ENHANCED_IBRS:
3191 	case X86_SPECTREV2_DISABLED:
3192 		type = "jmp";
3193 		break;
3194 	default:
3195 		panic("asked to update retpoline state with unknown state!");
3196 	}
3197 
3198 	for (i = 0; i < nthunks; i++) {
3199 		uintptr_t source, dest;
3200 		int ssize, dsize;
3201 		char sourcebuf[64], destbuf[64];
3202 
3203 		(void) snprintf(destbuf, sizeof (destbuf),
3204 		    "__x86_indirect_thunk%s", thunks[i]);
3205 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
3206 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
3207 
3208 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
3209 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
3210 		VERIFY3U(source, !=, 0);
3211 		VERIFY3U(dest, !=, 0);
3212 		VERIFY3S(dsize, >=, ssize);
3213 		bcopy((void *)source, (void *)dest, ssize);
3214 	}
3215 }
3216 
3217 static void
3218 cpuid_enable_enhanced_ibrs(void)
3219 {
3220 	uint64_t val;
3221 
3222 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3223 	val |= IA32_SPEC_CTRL_IBRS;
3224 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3225 }
3226 
3227 static void
3228 cpuid_enable_auto_ibrs(void)
3229 {
3230 	uint64_t val;
3231 
3232 	val = rdmsr(MSR_AMD_EFER);
3233 	val |= AMD_EFER_AIBRSE;
3234 	wrmsr(MSR_AMD_EFER, val);
3235 }
3236 
3237 /*
3238  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
3239  * we can disable TSX, we do so.
3240  *
3241  * This determination is done only on the boot CPU, potentially after loading
3242  * updated microcode.
3243  */
3244 static void
3245 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
3246 {
3247 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3248 
3249 	VERIFY(cpu->cpu_id == 0);
3250 
3251 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3252 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3253 		return;
3254 	}
3255 
3256 	if (x86_disable_taa) {
3257 		x86_taa_mitigation = X86_TAA_DISABLED;
3258 		return;
3259 	}
3260 
3261 	/*
3262 	 * If we do not have the ability to disable TSX, then our only
3263 	 * mitigation options are in hardware (TAA_NO), or by using our existing
3264 	 * MDS mitigation as described above.  The latter relies upon us having
3265 	 * configured MDS mitigations correctly! This includes disabling SMT if
3266 	 * we want cross-CPU-thread protection.
3267 	 */
3268 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3269 		/*
3270 		 * It's not clear whether any parts will enumerate TAA_NO
3271 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
3272 		 */
3273 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3274 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3275 			return;
3276 		}
3277 
3278 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3279 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3280 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
3281 		} else {
3282 			x86_taa_mitigation = X86_TAA_NOTHING;
3283 		}
3284 		return;
3285 	}
3286 
3287 	/*
3288 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3289 	 * enough in boot.
3290 	 *
3291 	 * Otherwise, we'll fall back to causing transactions to abort as our
3292 	 * mitigation. TSX-using code will always take the fallback path.
3293 	 */
3294 	if (cpi->cpi_pass < 4) {
3295 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3296 	} else {
3297 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3298 	}
3299 }
3300 
3301 /*
3302  * As mentioned, we should only touch the MSR when we've got a suitable
3303  * microcode loaded on this CPU.
3304  */
3305 static void
3306 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3307 {
3308 	uint64_t val;
3309 
3310 	switch (taa) {
3311 	case X86_TAA_TSX_DISABLE:
3312 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3313 			return;
3314 		val = rdmsr(MSR_IA32_TSX_CTRL);
3315 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3316 		wrmsr(MSR_IA32_TSX_CTRL, val);
3317 		break;
3318 	case X86_TAA_TSX_FORCE_ABORT:
3319 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3320 			return;
3321 		val = rdmsr(MSR_IA32_TSX_CTRL);
3322 		val |= IA32_TSX_CTRL_RTM_DISABLE;
3323 		wrmsr(MSR_IA32_TSX_CTRL, val);
3324 		break;
3325 	case X86_TAA_HW_MITIGATED:
3326 	case X86_TAA_MD_CLEAR:
3327 	case X86_TAA_DISABLED:
3328 	case X86_TAA_NOTHING:
3329 		break;
3330 	}
3331 }
3332 
3333 static void
3334 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3335 {
3336 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3337 	x86_spectrev2_mitigation_t v2mit;
3338 
3339 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3340 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3341 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3342 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3343 			add_x86_feature(featureset, X86FSET_IBPB);
3344 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3345 			add_x86_feature(featureset, X86FSET_IBRS);
3346 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3347 			add_x86_feature(featureset, X86FSET_STIBP);
3348 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3349 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3350 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3351 			add_x86_feature(featureset, X86FSET_SSBD);
3352 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3353 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3354 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3355 			add_x86_feature(featureset, X86FSET_SSB_NO);
3356 
3357 		/*
3358 		 * Rather than Enhanced IBRS, AMD has a different feature that
3359 		 * is a bit in EFER that can be enabled and will basically do
3360 		 * the right thing while executing in the kernel.
3361 		 */
3362 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3363 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3364 		    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3365 		    (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3366 			add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3367 		}
3368 
3369 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3370 	    cpi->cpi_maxeax >= 7) {
3371 		struct cpuid_regs *ecp;
3372 		ecp = &cpi->cpi_std[7];
3373 
3374 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3375 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3376 		}
3377 
3378 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3379 			add_x86_feature(featureset, X86FSET_IBRS);
3380 			add_x86_feature(featureset, X86FSET_IBPB);
3381 		}
3382 
3383 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3384 			add_x86_feature(featureset, X86FSET_STIBP);
3385 		}
3386 
3387 		/*
3388 		 * Some prediction controls are enumerated by subleaf 2 of
3389 		 * leaf 7.
3390 		 */
3391 		if (CPI_FEATURES_7_2_EDX(cpi) & CPUID_INTC_EDX_7_2_BHI_CTRL) {
3392 			add_x86_feature(featureset, X86FSET_BHI_CTRL);
3393 		}
3394 
3395 		/*
3396 		 * Don't read the arch caps MSR on xpv where we lack the
3397 		 * on_trap().
3398 		 */
3399 #ifndef __xpv
3400 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3401 			on_trap_data_t otd;
3402 
3403 			/*
3404 			 * Be paranoid and assume we'll get a #GP.
3405 			 */
3406 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3407 				uint64_t reg;
3408 
3409 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3410 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3411 					add_x86_feature(featureset,
3412 					    X86FSET_RDCL_NO);
3413 				}
3414 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3415 					add_x86_feature(featureset,
3416 					    X86FSET_IBRS_ALL);
3417 				}
3418 				if (reg & IA32_ARCH_CAP_RSBA) {
3419 					add_x86_feature(featureset,
3420 					    X86FSET_RSBA);
3421 				}
3422 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3423 					add_x86_feature(featureset,
3424 					    X86FSET_L1D_VM_NO);
3425 				}
3426 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3427 					add_x86_feature(featureset,
3428 					    X86FSET_SSB_NO);
3429 				}
3430 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3431 					add_x86_feature(featureset,
3432 					    X86FSET_MDS_NO);
3433 				}
3434 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3435 					add_x86_feature(featureset,
3436 					    X86FSET_TSX_CTRL);
3437 				}
3438 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3439 					add_x86_feature(featureset,
3440 					    X86FSET_TAA_NO);
3441 				}
3442 				if (reg & IA32_ARCH_CAP_RFDS_NO) {
3443 					add_x86_feature(featureset,
3444 					    X86FSET_RFDS_NO);
3445 				}
3446 				if (reg & IA32_ARCH_CAP_RFDS_CLEAR) {
3447 					add_x86_feature(featureset,
3448 					    X86FSET_RFDS_CLEAR);
3449 				}
3450 				if (reg & IA32_ARCH_CAP_PBRSB_NO) {
3451 					add_x86_feature(featureset,
3452 					    X86FSET_PBRSB_NO);
3453 				}
3454 				if (reg & IA32_ARCH_CAP_BHI_NO) {
3455 					add_x86_feature(featureset,
3456 					    X86FSET_BHI_NO);
3457 				}
3458 			}
3459 			no_trap();
3460 		}
3461 #endif	/* !__xpv */
3462 
3463 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3464 			add_x86_feature(featureset, X86FSET_SSBD);
3465 
3466 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3467 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3468 	}
3469 
3470 	/*
3471 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3472 	 * will have already run this function and determined what we need to
3473 	 * do. This gives us a hook for per-HW thread mitigations such as
3474 	 * enhanced IBRS, or disabling TSX.
3475 	 */
3476 	if (cpu->cpu_id != 0) {
3477 		switch (x86_spectrev2_mitigation) {
3478 		case X86_SPECTREV2_ENHANCED_IBRS:
3479 			cpuid_enable_enhanced_ibrs();
3480 			break;
3481 		case X86_SPECTREV2_AUTO_IBRS:
3482 			cpuid_enable_auto_ibrs();
3483 			break;
3484 		default:
3485 			break;
3486 		}
3487 
3488 		/* If we're committed to BHI_DIS_S, set it for this core. */
3489 		if (x86_bhi_mitigation == X86_BHI_DIS_S)
3490 			cpuid_enable_bhi_dis_s();
3491 
3492 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3493 		return;
3494 	}
3495 
3496 	/*
3497 	 * Go through and initialize various security mechanisms that we should
3498 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3499 	 * TAA.
3500 	 */
3501 
3502 	/*
3503 	 * By default we've come in with retpolines enabled. Check whether we
3504 	 * should disable them or enable enhanced or automatic IBRS.
3505 	 *
3506 	 * Note, we do not allow the use of AMD optimized retpolines as it was
3507 	 * disclosed by AMD in March 2022 that they were still
3508 	 * vulnerable. Prior to that point, we used them.
3509 	 */
3510 	if (x86_disable_spectrev2 != 0) {
3511 		v2mit = X86_SPECTREV2_DISABLED;
3512 	} else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3513 		cpuid_enable_auto_ibrs();
3514 		v2mit = X86_SPECTREV2_AUTO_IBRS;
3515 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3516 		cpuid_enable_enhanced_ibrs();
3517 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3518 	} else {
3519 		v2mit = X86_SPECTREV2_RETPOLINE;
3520 	}
3521 
3522 	cpuid_patch_retpolines(v2mit);
3523 	cpuid_patch_rsb(v2mit, is_x86_feature(featureset, X86FSET_PBRSB_NO));
3524 	x86_bhi_mitigation = cpuid_learn_and_patch_bhi(v2mit, cpu, featureset);
3525 	x86_spectrev2_mitigation = v2mit;
3526 	membar_producer();
3527 
3528 	/*
3529 	 * We need to determine what changes are required for mitigating L1TF
3530 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3531 	 * is required.
3532 	 *
3533 	 * If any of these are present, then we need to flush u-arch state at
3534 	 * various points. For MDS, we need to do so whenever we change to a
3535 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3536 	 * flush the L1D cache at VM entry. When we have microcode that handles
3537 	 * MDS, the L1D flush also clears the other u-arch state that the
3538 	 * md_clear does.
3539 	 */
3540 
3541 	/*
3542 	 * Update whether or not we need to be taking explicit action against
3543 	 * MDS or RFDS.
3544 	 */
3545 	cpuid_update_md_clear(cpu, featureset);
3546 
3547 	/*
3548 	 * Determine whether SMT exclusion is required and whether or not we
3549 	 * need to perform an l1d flush.
3550 	 */
3551 	cpuid_update_l1d_flush(cpu, featureset);
3552 
3553 	/*
3554 	 * Determine what our mitigation strategy should be for TAA and then
3555 	 * also apply TAA mitigations.
3556 	 */
3557 	cpuid_update_tsx(cpu, featureset);
3558 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3559 }
3560 
3561 /*
3562  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3563  */
3564 void
3565 setup_xfem(void)
3566 {
3567 	uint64_t flags = XFEATURE_LEGACY_FP;
3568 
3569 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3570 
3571 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3572 		flags |= XFEATURE_SSE;
3573 
3574 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3575 		flags |= XFEATURE_AVX;
3576 
3577 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3578 		flags |= XFEATURE_AVX512;
3579 
3580 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3581 
3582 	xsave_bv_all = flags;
3583 }
3584 
3585 static void
3586 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3587 {
3588 	struct cpuid_info *cpi;
3589 
3590 	cpi = cpu->cpu_m.mcpu_cpi;
3591 
3592 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3593 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3594 		cpuid_gather_amd_topology_leaves(cpu);
3595 	}
3596 
3597 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3598 
3599 	/*
3600 	 * Before we can calculate the IDs that we should assign to this
3601 	 * processor, we need to understand how many cores and threads it has.
3602 	 */
3603 	switch (cpi->cpi_vendor) {
3604 	case X86_VENDOR_Intel:
3605 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3606 		    &cpi->cpi_ncore_per_chip);
3607 		break;
3608 	case X86_VENDOR_AMD:
3609 	case X86_VENDOR_HYGON:
3610 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3611 		    &cpi->cpi_ncore_per_chip);
3612 		break;
3613 	default:
3614 		/*
3615 		 * If we have some other x86 compatible chip, it's not clear how
3616 		 * it would behave. The most common case is virtualization
3617 		 * today, though there are also 64-bit VIA chips. Assume that
3618 		 * all we can get is the basic Leaf 1 HTT information.
3619 		 */
3620 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3621 			cpi->cpi_ncore_per_chip = 1;
3622 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3623 		}
3624 		break;
3625 	}
3626 
3627 	/*
3628 	 * Based on the calculated number of threads and cores, potentially
3629 	 * assign the HTT and CMT features.
3630 	 */
3631 	if (cpi->cpi_ncore_per_chip > 1) {
3632 		add_x86_feature(featureset, X86FSET_CMP);
3633 	}
3634 
3635 	if (cpi->cpi_ncpu_per_chip > 1 &&
3636 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3637 		add_x86_feature(featureset, X86FSET_HTT);
3638 	}
3639 
3640 	/*
3641 	 * Now that has been set up, we need to go through and calculate all of
3642 	 * the rest of the parameters that exist. If we think the CPU doesn't
3643 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3644 	 * up information in some way. The most likely case for this is
3645 	 * virtualization where we have a lot of partial topology information.
3646 	 */
3647 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3648 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3649 		/*
3650 		 * This is a single core, single-threaded processor.
3651 		 */
3652 		cpi->cpi_procnodes_per_pkg = 1;
3653 		cpi->cpi_cores_per_compunit = 1;
3654 		cpi->cpi_compunitid = 0;
3655 		cpi->cpi_chipid = -1;
3656 		cpi->cpi_clogid = 0;
3657 		cpi->cpi_coreid = cpu->cpu_id;
3658 		cpi->cpi_pkgcoreid = 0;
3659 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3660 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3661 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3662 		} else {
3663 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3664 		}
3665 	} else {
3666 		switch (cpi->cpi_vendor) {
3667 		case X86_VENDOR_Intel:
3668 			cpuid_intel_getids(cpu, featureset);
3669 			break;
3670 		case X86_VENDOR_AMD:
3671 		case X86_VENDOR_HYGON:
3672 			cpuid_amd_getids(cpu, featureset);
3673 			break;
3674 		default:
3675 			/*
3676 			 * In this case, it's hard to say what we should do.
3677 			 * We're going to model them to the OS as single core
3678 			 * threads. We don't have a good identifier for them, so
3679 			 * we're just going to use the cpu id all on a single
3680 			 * chip.
3681 			 *
3682 			 * This case has historically been different from the
3683 			 * case above where we don't have HTT or CMP. While they
3684 			 * could be combined, we've opted to keep it separate to
3685 			 * minimize the risk of topology changes in weird cases.
3686 			 */
3687 			cpi->cpi_procnodes_per_pkg = 1;
3688 			cpi->cpi_cores_per_compunit = 1;
3689 			cpi->cpi_chipid = 0;
3690 			cpi->cpi_coreid = cpu->cpu_id;
3691 			cpi->cpi_clogid = cpu->cpu_id;
3692 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3693 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3694 			cpi->cpi_compunitid = cpi->cpi_coreid;
3695 			break;
3696 		}
3697 	}
3698 }
3699 
3700 /*
3701  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3702  * always gather leaf 6 if it's supported; however, we only look for features on
3703  * Intel systems as AMD does not currently define any of the features we look
3704  * for below.
3705  */
3706 static void
3707 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3708 {
3709 	struct cpuid_regs *cp;
3710 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3711 
3712 	if (cpi->cpi_maxeax < 6) {
3713 		return;
3714 	}
3715 
3716 	cp = &cpi->cpi_std[6];
3717 	cp->cp_eax = 6;
3718 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3719 	(void) __cpuid_insn(cp);
3720 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3721 
3722 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3723 		return;
3724 	}
3725 
3726 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3727 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3728 	}
3729 
3730 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3731 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3732 	}
3733 }
3734 
3735 /*
3736  * This is used when we discover that we have AVX support in cpuid. This
3737  * proceeds to scan for the rest of the AVX derived features.
3738  */
3739 static void
3740 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3741 {
3742 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3743 
3744 	/*
3745 	 * If we don't have AVX, don't bother with most of this.
3746 	 */
3747 	if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3748 		return;
3749 
3750 	add_x86_feature(featureset, X86FSET_AVX);
3751 
3752 	/*
3753 	 * Intel says we can't check these without also
3754 	 * checking AVX.
3755 	 */
3756 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3757 		add_x86_feature(featureset, X86FSET_F16C);
3758 
3759 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3760 		add_x86_feature(featureset, X86FSET_FMA);
3761 
3762 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3763 		add_x86_feature(featureset, X86FSET_BMI1);
3764 
3765 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3766 		add_x86_feature(featureset, X86FSET_BMI2);
3767 
3768 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3769 		add_x86_feature(featureset, X86FSET_AVX2);
3770 
3771 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3772 		add_x86_feature(featureset, X86FSET_VAES);
3773 
3774 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3775 		add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3776 
3777 	/*
3778 	 * The rest of the AVX features require AVX512. Do not check them unless
3779 	 * it is present.
3780 	 */
3781 	if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3782 		return;
3783 	add_x86_feature(featureset, X86FSET_AVX512F);
3784 
3785 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3786 		add_x86_feature(featureset, X86FSET_AVX512DQ);
3787 
3788 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3789 		add_x86_feature(featureset, X86FSET_AVX512FMA);
3790 
3791 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3792 		add_x86_feature(featureset, X86FSET_AVX512PF);
3793 
3794 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3795 		add_x86_feature(featureset, X86FSET_AVX512ER);
3796 
3797 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3798 		add_x86_feature(featureset, X86FSET_AVX512CD);
3799 
3800 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3801 		add_x86_feature(featureset, X86FSET_AVX512BW);
3802 
3803 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3804 		add_x86_feature(featureset, X86FSET_AVX512VL);
3805 
3806 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3807 		add_x86_feature(featureset, X86FSET_AVX512VBMI);
3808 
3809 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3810 		add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3811 
3812 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3813 		add_x86_feature(featureset, X86FSET_AVX512VNNI);
3814 
3815 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3816 		add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3817 
3818 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3819 		add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3820 
3821 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3822 		add_x86_feature(featureset, X86FSET_AVX512NNIW);
3823 
3824 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3825 		add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3826 
3827 	/*
3828 	 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3829 	 * we don't need to.
3830 	 */
3831 	if (cpi->cpi_std[7].cp_eax < 1)
3832 		return;
3833 
3834 	if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3835 		add_x86_feature(featureset, X86FSET_AVX512_BF16);
3836 }
3837 
3838 /*
3839  * PPIN is the protected processor inventory number. On AMD this is an actual
3840  * feature bit. However, on Intel systems we need to read the platform
3841  * information MSR if we're on a specific model.
3842  */
3843 #if !defined(__xpv)
3844 static void
3845 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3846 {
3847 	on_trap_data_t otd;
3848 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3849 
3850 	switch (cpi->cpi_vendor) {
3851 	case X86_VENDOR_AMD:
3852 		/*
3853 		 * This leaf will have already been gathered in the topology
3854 		 * functions.
3855 		 */
3856 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3857 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3858 				add_x86_feature(featureset, X86FSET_PPIN);
3859 			}
3860 		}
3861 		break;
3862 	case X86_VENDOR_Intel:
3863 		if (cpi->cpi_family != 6)
3864 			break;
3865 		switch (cpi->cpi_model) {
3866 		case INTC_MODEL_IVYBRIDGE_XEON:
3867 		case INTC_MODEL_HASWELL_XEON:
3868 		case INTC_MODEL_BROADWELL_XEON:
3869 		case INTC_MODEL_BROADWELL_XEON_D:
3870 		case INTC_MODEL_SKYLAKE_XEON:
3871 		case INTC_MODEL_ICELAKE_XEON:
3872 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3873 				uint64_t value;
3874 
3875 				value = rdmsr(MSR_PLATFORM_INFO);
3876 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3877 					add_x86_feature(featureset,
3878 					    X86FSET_PPIN);
3879 				}
3880 			}
3881 			no_trap();
3882 			break;
3883 		default:
3884 			break;
3885 		}
3886 		break;
3887 	default:
3888 		break;
3889 	}
3890 }
3891 #endif	/* ! __xpv */
3892 
3893 static void
3894 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3895 {
3896 	uchar_t *featureset = (uchar_t *)arg;
3897 
3898 	/*
3899 	 * We don't run on any processor that doesn't have cpuid, and could not
3900 	 * possibly have arrived here.
3901 	 */
3902 	add_x86_feature(featureset, X86FSET_CPUID);
3903 }
3904 
3905 static void
3906 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3907 {
3908 	struct cpuid_info *cpi;
3909 	struct cpuid_regs *cp;
3910 
3911 	/*
3912 	 * We require that virtual/native detection be complete and that PCI
3913 	 * config space access has been set up; at present there is no reliable
3914 	 * way to determine the latter.
3915 	 */
3916 #if !defined(__xpv)
3917 	ASSERT3S(platform_type, !=, -1);
3918 #endif	/* !__xpv */
3919 
3920 	cpi = cpu->cpu_m.mcpu_cpi;
3921 	ASSERT(cpi != NULL);
3922 
3923 	cp = &cpi->cpi_std[0];
3924 	cp->cp_eax = 0;
3925 	cpi->cpi_maxeax = __cpuid_insn(cp);
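	/*
	 * The vendor string is returned in %ebx, %edx, %ecx order; e.g. leaf 0
	 * on an Intel part yields ebx = "Genu", edx = "ineI", ecx = "ntel",
	 * spelling "GenuineIntel".
	 */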
3926 	{
3927 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3928 		*iptr++ = cp->cp_ebx;
3929 		*iptr++ = cp->cp_edx;
3930 		*iptr++ = cp->cp_ecx;
3931 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3932 	}
3933 
3934 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3935 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3936 
3937 	/*
3938 	 * Limit the range in case of weird hardware
3939 	 */
3940 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3941 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3942 	if (cpi->cpi_maxeax < 1)
3943 		return;
3944 
3945 	cp = &cpi->cpi_std[1];
3946 	cp->cp_eax = 1;
3947 	(void) __cpuid_insn(cp);
3948 
3949 	/*
3950 	 * Extract identifying constants for easy access.
3951 	 */
3952 	cpi->cpi_model = CPI_MODEL(cpi);
3953 	cpi->cpi_family = CPI_FAMILY(cpi);
3954 
3955 	if (cpi->cpi_family == 0xf)
3956 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3957 
3958 	/*
3959 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3960 	 * Intel, and presumably everyone else, uses model == 0xf, as
3961 	 * one would expect (max value means possible overflow).  Sigh.
3962 	 */
3963 
3964 	switch (cpi->cpi_vendor) {
3965 	case X86_VENDOR_Intel:
3966 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3967 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3968 		break;
3969 	case X86_VENDOR_AMD:
3970 		if (CPI_FAMILY(cpi) == 0xf)
3971 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3972 		break;
3973 	case X86_VENDOR_HYGON:
3974 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3975 		break;
3976 	default:
3977 		if (cpi->cpi_model == 0xf)
3978 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3979 		break;
3980 	}
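	/*
	 * As a worked example, a leaf 1 %eax signature of 0x306f2 decodes to
	 * base family 6, extended model 3, base model 0xf and stepping 2,
	 * which yields a display model of 0xf + (3 << 4) == 0x3f here.
	 */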
3981 
3982 	cpi->cpi_step = CPI_STEP(cpi);
3983 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3984 
3985 	/*
3986 	 * Synthesize chip "revision" and socket type
3987 	 */
3988 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3989 	    cpi->cpi_model, cpi->cpi_step);
3990 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3991 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3992 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3993 	    cpi->cpi_model, cpi->cpi_step);
3994 	cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
3995 	    cpi->cpi_model, cpi->cpi_step);
3996 }
3997 
3998 static void
3999 cpuid_pass_basic(cpu_t *cpu, void *arg)
4000 {
4001 	uchar_t *featureset = (uchar_t *)arg;
4002 	uint32_t mask_ecx, mask_edx;
4003 	struct cpuid_info *cpi;
4004 	struct cpuid_regs *cp;
4005 	int xcpuid;
4006 #if !defined(__xpv)
4007 	extern int idle_cpu_prefer_mwait;
4008 #endif
4009 
4010 	cpi = cpu->cpu_m.mcpu_cpi;
4011 	ASSERT(cpi != NULL);
4012 
4013 	if (cpi->cpi_maxeax < 1)
4014 		return;
4015 
4016 	/*
4017 	 * This was filled during the identification pass.
4018 	 */
4019 	cp = &cpi->cpi_std[1];
4020 
4021 	/*
4022 	 * *default* assumptions:
4023 	 * - believe %edx feature word
4024 	 * - ignore %ecx feature word
4025 	 * - 32-bit virtual and physical addressing
4026 	 */
4027 	mask_edx = 0xffffffff;
4028 	mask_ecx = 0;
4029 
4030 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
4031 
4032 	switch (cpi->cpi_vendor) {
4033 	case X86_VENDOR_Intel:
4034 		if (cpi->cpi_family == 5)
4035 			x86_type = X86_TYPE_P5;
4036 		else if (IS_LEGACY_P6(cpi)) {
4037 			x86_type = X86_TYPE_P6;
4038 			pentiumpro_bug4046376 = 1;
4039 			/*
4040 			 * Clear the SEP bit when it was set erroneously
4041 			 */
4042 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
4043 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
4044 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
4045 			x86_type = X86_TYPE_P4;
4046 			/*
4047 			 * We don't currently depend on any of the %ecx
4048 			 * features until Prescott, so we'll only check
4049 			 * this from P4 onwards.  We might want to revisit
4050 			 * that idea later.
4051 			 */
4052 			mask_ecx = 0xffffffff;
4053 		} else if (cpi->cpi_family > 0xf)
4054 			mask_ecx = 0xffffffff;
4055 		/*
4056 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
4057 		 * to obtain the monitor linesize.
4058 		 */
4059 		if (cpi->cpi_maxeax < 5)
4060 			mask_ecx &= ~CPUID_INTC_ECX_MON;
4061 		break;
4062 	case X86_VENDOR_IntelClone:
4063 	default:
4064 		break;
4065 	case X86_VENDOR_AMD:
4066 #if defined(OPTERON_ERRATUM_108)
4067 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
4068 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
4069 			cpi->cpi_model = 0xc;
4070 		} else
4071 #endif
4072 		if (cpi->cpi_family == 5) {
4073 			/*
4074 			 * AMD K5 and K6
4075 			 *
4076 			 * These CPUs have an incomplete implementation
4077 			 * of MCA/MCE which we mask away.
4078 			 */
4079 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
4080 
4081 			/*
4082 			 * Model 0 uses the wrong (APIC) bit
4083 			 * to indicate PGE.  Fix it here.
4084 			 */
4085 			if (cpi->cpi_model == 0) {
4086 				if (cp->cp_edx & 0x200) {
4087 					cp->cp_edx &= ~0x200;
4088 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
4089 				}
4090 			}
4091 
4092 			/*
4093 			 * Early models had problems w/ MMX; disable.
4094 			 */
4095 			if (cpi->cpi_model < 6)
4096 				mask_edx &= ~CPUID_INTC_EDX_MMX;
4097 		}
4098 
4099 		/*
4100 		 * For newer families, SSE3 and CX16, at least, are valid;
4101 		 * enable all
4102 		 */
4103 		if (cpi->cpi_family >= 0xf)
4104 			mask_ecx = 0xffffffff;
4105 		/*
4106 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
4107 		 * to obtain the monitor linesize.
4108 		 */
4109 		if (cpi->cpi_maxeax < 5)
4110 			mask_ecx &= ~CPUID_INTC_ECX_MON;
4111 
4112 #if !defined(__xpv)
4113 		/*
4114 		 * AMD has not historically used MWAIT in the CPU's idle loop.
4115 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
4116 		 * know for certain that in at least family 17h, per AMD, mwait
4117 		 * is preferred. Families in-between are less certain.
4118 		 */
4119 		if (cpi->cpi_family < 0x17) {
4120 			idle_cpu_prefer_mwait = 0;
4121 		}
4122 #endif
4123 
4124 		break;
4125 	case X86_VENDOR_HYGON:
4126 		/* Enable all for Hygon Dhyana CPU */
4127 		mask_ecx = 0xffffffff;
4128 		break;
4129 	case X86_VENDOR_TM:
4130 		/*
4131 		 * workaround the NT workaround in CMS 4.1
4132 		 */
4133 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
4134 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
4135 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
4136 		break;
4137 	case X86_VENDOR_Centaur:
4138 		/*
4139 		 * workaround the NT workarounds again
4140 		 */
4141 		if (cpi->cpi_family == 6)
4142 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
4143 		break;
4144 	case X86_VENDOR_Cyrix:
4145 		/*
4146 		 * We rely heavily on the probing in locore
4147 		 * to actually figure out what parts, if any,
4148 		 * of the Cyrix cpuid instruction to believe.
4149 		 */
4150 		switch (x86_type) {
4151 		case X86_TYPE_CYRIX_486:
4152 			mask_edx = 0;
4153 			break;
4154 		case X86_TYPE_CYRIX_6x86:
4155 			mask_edx = 0;
4156 			break;
4157 		case X86_TYPE_CYRIX_6x86L:
4158 			mask_edx =
4159 			    CPUID_INTC_EDX_DE |
4160 			    CPUID_INTC_EDX_CX8;
4161 			break;
4162 		case X86_TYPE_CYRIX_6x86MX:
4163 			mask_edx =
4164 			    CPUID_INTC_EDX_DE |
4165 			    CPUID_INTC_EDX_MSR |
4166 			    CPUID_INTC_EDX_CX8 |
4167 			    CPUID_INTC_EDX_PGE |
4168 			    CPUID_INTC_EDX_CMOV |
4169 			    CPUID_INTC_EDX_MMX;
4170 			break;
4171 		case X86_TYPE_CYRIX_GXm:
4172 			mask_edx =
4173 			    CPUID_INTC_EDX_MSR |
4174 			    CPUID_INTC_EDX_CX8 |
4175 			    CPUID_INTC_EDX_CMOV |
4176 			    CPUID_INTC_EDX_MMX;
4177 			break;
4178 		case X86_TYPE_CYRIX_MediaGX:
4179 			break;
4180 		case X86_TYPE_CYRIX_MII:
4181 		case X86_TYPE_VIA_CYRIX_III:
4182 			mask_edx =
4183 			    CPUID_INTC_EDX_DE |
4184 			    CPUID_INTC_EDX_TSC |
4185 			    CPUID_INTC_EDX_MSR |
4186 			    CPUID_INTC_EDX_CX8 |
4187 			    CPUID_INTC_EDX_PGE |
4188 			    CPUID_INTC_EDX_CMOV |
4189 			    CPUID_INTC_EDX_MMX;
4190 			break;
4191 		default:
4192 			break;
4193 		}
4194 		break;
4195 	}
4196 
4197 #if defined(__xpv)
4198 	/*
4199 	 * Do not support MONITOR/MWAIT under a hypervisor
4200 	 */
4201 	mask_ecx &= ~CPUID_INTC_ECX_MON;
4202 	/*
4203 	 * Do not support XSAVE under a hypervisor for now
4204 	 */
4205 	xsave_force_disable = B_TRUE;
4206 
4207 #endif	/* __xpv */
4208 
4209 	if (xsave_force_disable) {
4210 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
4211 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
4212 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
4213 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
4214 	}
4215 
4216 	/*
4217 	 * Now that we've figured out the masks that determine
4218 	 * which bits we choose to believe, apply the masks
4219 	 * to the feature words, then map the kernel's view
4220 	 * of these feature words into its feature word.
4221 	 */
4222 	cp->cp_edx &= mask_edx;
4223 	cp->cp_ecx &= mask_ecx;
4224 
4225 	/*
4226 	 * apply any platform restrictions (we don't call this
4227 	 * immediately after __cpuid_insn here, because we need the
4228 	 * workarounds applied above first)
4229 	 */
4230 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
4231 
4232 	/*
4233 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
4234 	 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
4235 	 * 7 has sub-leaves determined by ecx.
4236 	 */
4237 	if (cpi->cpi_maxeax >= 7) {
4238 		struct cpuid_regs *ecp;
4239 		ecp = &cpi->cpi_std[7];
4240 		ecp->cp_eax = 7;
4241 		ecp->cp_ecx = 0;
4242 		(void) __cpuid_insn(ecp);
4243 
4244 		/*
4245 		 * If XSAVE has been disabled, just ignore all of the
4246 		 * extended-save-area dependent flags here. By removing most of
4247 		 * the leaf 7, sub-leaf 0 flags, that will ensure that we don't
4248 		 * end up looking at additional xsave dependent leaves right
4249 		 * now.
4250 		 */
4251 		if (xsave_force_disable) {
4252 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4253 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4254 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4255 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
4256 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4257 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4258 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4259 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
4260 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4261 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
4262 		}
4263 
4264 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
4265 			add_x86_feature(featureset, X86FSET_SMEP);
4266 
4267 		/*
4268 		 * We check disable_smap here in addition to in startup_smap()
4269 		 * to ensure CPUs that aren't the boot CPU don't accidentally
4270 		 * include it in the feature set and thus generate a mismatched
4271 		 * x86 feature set across CPUs.
4272 		 */
4273 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
4274 		    disable_smap == 0)
4275 			add_x86_feature(featureset, X86FSET_SMAP);
4276 
4277 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
4278 			add_x86_feature(featureset, X86FSET_RDSEED);
4279 
4280 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
4281 			add_x86_feature(featureset, X86FSET_ADX);
4282 
4283 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4284 			add_x86_feature(featureset, X86FSET_FSGSBASE);
4285 
4286 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4287 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4288 
4289 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4290 			add_x86_feature(featureset, X86FSET_INVPCID);
4291 
4292 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4293 			add_x86_feature(featureset, X86FSET_UMIP);
4294 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4295 			add_x86_feature(featureset, X86FSET_PKU);
4296 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4297 			add_x86_feature(featureset, X86FSET_OSPKE);
4298 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4299 			add_x86_feature(featureset, X86FSET_GFNI);
4300 
4301 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4302 			add_x86_feature(featureset, X86FSET_CLWB);
4303 
4304 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4305 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4306 				add_x86_feature(featureset, X86FSET_MPX);
4307 		}
4308 
4309 		/*
4310 		 * If we have subleaf 1 or 2 available, grab and store
4311 		 * that. This is used for more AVX and related features.
4312 		 */
4313 		if (ecp->cp_eax >= 1) {
4314 			struct cpuid_regs *c71;
4315 			c71 = &cpi->cpi_sub7[0];
4316 			c71->cp_eax = 7;
4317 			c71->cp_ecx = 1;
4318 			(void) __cpuid_insn(c71);
4319 		}
4320 
4321 		/* Subleaf 2 has certain security indicators in it. */
4322 		if (ecp->cp_eax >= 2) {
4323 			struct cpuid_regs *c72;
4324 			c72 = &cpi->cpi_sub7[1];
4325 			c72->cp_eax = 7;
4326 			c72->cp_ecx = 2;
4327 			(void) __cpuid_insn(c72);
4328 		}
4329 	}
4330 
4331 	/*
4332 	 * fold in overrides from the "eeprom" mechanism
4333 	 */
4334 	cp->cp_edx |= cpuid_feature_edx_include;
4335 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
4336 
4337 	cp->cp_ecx |= cpuid_feature_ecx_include;
4338 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4339 
4340 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4341 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
4342 	}
4343 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4344 		add_x86_feature(featureset, X86FSET_TSC);
4345 	}
4346 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4347 		add_x86_feature(featureset, X86FSET_MSR);
4348 	}
4349 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4350 		add_x86_feature(featureset, X86FSET_MTRR);
4351 	}
4352 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4353 		add_x86_feature(featureset, X86FSET_PGE);
4354 	}
4355 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4356 		add_x86_feature(featureset, X86FSET_CMOV);
4357 	}
4358 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4359 		add_x86_feature(featureset, X86FSET_MMX);
4360 	}
4361 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4362 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4363 		add_x86_feature(featureset, X86FSET_MCA);
4364 	}
4365 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4366 		add_x86_feature(featureset, X86FSET_PAE);
4367 	}
4368 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4369 		add_x86_feature(featureset, X86FSET_CX8);
4370 	}
4371 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4372 		add_x86_feature(featureset, X86FSET_CX16);
4373 	}
4374 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4375 		add_x86_feature(featureset, X86FSET_PAT);
4376 	}
4377 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4378 		add_x86_feature(featureset, X86FSET_SEP);
4379 	}
4380 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4381 		/*
4382 		 * In our implementation, fxsave/fxrstor
4383 		 * are prerequisites before we'll even
4384 		 * try and do SSE things.
4385 		 */
4386 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4387 			add_x86_feature(featureset, X86FSET_SSE);
4388 		}
4389 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4390 			add_x86_feature(featureset, X86FSET_SSE2);
4391 		}
4392 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4393 			add_x86_feature(featureset, X86FSET_SSE3);
4394 		}
4395 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4396 			add_x86_feature(featureset, X86FSET_SSSE3);
4397 		}
4398 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4399 			add_x86_feature(featureset, X86FSET_SSE4_1);
4400 		}
4401 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4402 			add_x86_feature(featureset, X86FSET_SSE4_2);
4403 		}
4404 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4405 			add_x86_feature(featureset, X86FSET_AES);
4406 		}
4407 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4408 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4409 		}
4410 
4411 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4412 			add_x86_feature(featureset, X86FSET_SHA);
4413 
4414 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4415 			add_x86_feature(featureset, X86FSET_XSAVE);
4416 
4417 			/* We only test AVX & AVX512 when there is XSAVE */
4418 			cpuid_basic_avx(cpu, featureset);
4419 		}
4420 	}
4421 
4422 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4423 		add_x86_feature(featureset, X86FSET_PCID);
4424 	}
4425 
4426 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4427 		add_x86_feature(featureset, X86FSET_X2APIC);
4428 	}
4429 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4430 		add_x86_feature(featureset, X86FSET_DE);
4431 	}
4432 #if !defined(__xpv)
4433 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4434 
4435 		/*
4436 		 * We require the CLFLUSH instruction for the erratum workaround
4437 		 * needed to use MONITOR/MWAIT.
4438 		 */
4439 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4440 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4441 			add_x86_feature(featureset, X86FSET_MWAIT);
4442 		} else {
4443 			extern int idle_cpu_assert_cflush_monitor;
4444 
4445 			/*
4446 			 * All processors we are aware of which have
4447 			 * MONITOR/MWAIT also have CLFLUSH.
4448 			 */
4449 			if (idle_cpu_assert_cflush_monitor) {
4450 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4451 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4452 			}
4453 		}
4454 	}
4455 #endif	/* __xpv */
4456 
4457 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4458 		add_x86_feature(featureset, X86FSET_VMX);
4459 	}
4460 
4461 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4462 		add_x86_feature(featureset, X86FSET_RDRAND);
4463 
4464 	/*
4465 	 * We only need this the first time; the rest of the CPUs will follow
4466 	 * suit, so in effect we capture it for the boot CPU.
4467 	 */
4468 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4469 		add_x86_feature(featureset, X86FSET_CLFSH);
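		/*
		 * %ebx[15:8] gives the CLFLUSH line size in 8-byte units, so
		 * e.g. a reported value of 8 means a 64-byte line.
		 */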
4470 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4471 	}
4472 	if (is_x86_feature(featureset, X86FSET_PAE))
4473 		cpi->cpi_pabits = 36;
4474 
4475 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4476 		struct cpuid_regs r, *ecp;
4477 
4478 		ecp = &r;
4479 		ecp->cp_eax = 0xD;
4480 		ecp->cp_ecx = 1;
4481 		ecp->cp_edx = ecp->cp_ebx = 0;
4482 		(void) __cpuid_insn(ecp);
4483 
4484 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4485 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4486 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4487 			add_x86_feature(featureset, X86FSET_XSAVEC);
4488 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4489 			add_x86_feature(featureset, X86FSET_XSAVES);
4490 
4491 		/*
4492 		 * Zen 2 family processors suffer from erratum 1386 that causes
4493 		 * xsaves to not function correctly in some circumstances. There
4494 		 * are no supervisor states in Zen 2 and earlier. Practically
4495 		 * speaking this has no impact for us as we currently do not
4496 		 * leverage compressed xsave formats. To safeguard against
4497 		 * issues in the future where we may opt to use it, we remove
4498 		 * it from the feature set now. While Matisse has a microcode
4499 		 * update available with a fix, not all Zen 2 CPUs do, so it's
4500 		 * simpler for the moment to unconditionally remove it.
4501 		 */
4502 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4503 		    uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4504 			remove_x86_feature(featureset, X86FSET_XSAVES);
4505 		}
4506 	}
4507 
4508 	/*
4509 	 * Work on the "extended" feature information, doing
4510 	 * some basic initialization to be used in the extended pass.
4511 	 */
4512 	xcpuid = 0;
4513 	switch (cpi->cpi_vendor) {
4514 	case X86_VENDOR_Intel:
4515 		/*
4516 		 * On KVM we know we will have proper support for extended
4517 		 * cpuid.
4518 		 */
4519 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4520 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4521 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4522 			xcpuid++;
4523 		break;
4524 	case X86_VENDOR_AMD:
4525 		if (cpi->cpi_family > 5 ||
4526 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4527 			xcpuid++;
4528 		break;
4529 	case X86_VENDOR_Cyrix:
4530 		/*
4531 		 * Only these Cyrix CPUs are -known- to support
4532 		 * extended cpuid operations.
4533 		 */
4534 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4535 		    x86_type == X86_TYPE_CYRIX_GXm)
4536 			xcpuid++;
4537 		break;
4538 	case X86_VENDOR_HYGON:
4539 	case X86_VENDOR_Centaur:
4540 	case X86_VENDOR_TM:
4541 	default:
4542 		xcpuid++;
4543 		break;
4544 	}
4545 
4546 	if (xcpuid) {
4547 		cp = &cpi->cpi_extd[0];
4548 		cp->cp_eax = CPUID_LEAF_EXT_0;
4549 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4550 	}
4551 
4552 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4553 
4554 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4555 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4556 
4557 		switch (cpi->cpi_vendor) {
4558 		case X86_VENDOR_Intel:
4559 		case X86_VENDOR_AMD:
4560 		case X86_VENDOR_HYGON:
4561 			if (cpi->cpi_xmaxeax < 0x80000001)
4562 				break;
4563 			cp = &cpi->cpi_extd[1];
4564 			cp->cp_eax = 0x80000001;
4565 			(void) __cpuid_insn(cp);
4566 
4567 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4568 			    cpi->cpi_family == 5 &&
4569 			    cpi->cpi_model == 6 &&
4570 			    cpi->cpi_step == 6) {
4571 				/*
4572 				 * K6 model 6 uses bit 10 to indicate SYSC.
4573 				 * Later models use bit 11. Fix it here.
4574 				 */
4575 				if (cp->cp_edx & 0x400) {
4576 					cp->cp_edx &= ~0x400;
4577 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4578 				}
4579 			}
4580 
4581 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4582 
4583 			/*
4584 			 * Compute the additions to the kernel's feature word.
4585 			 */
4586 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4587 				add_x86_feature(featureset, X86FSET_NX);
4588 			}
4589 
4590 			/*
4591 			 * Regardless of whether or not we boot 64-bit,
4592 			 * we should have a way to identify whether
4593 			 * the CPU is capable of running 64-bit.
4594 			 */
4595 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4596 				add_x86_feature(featureset, X86FSET_64);
4597 			}
4598 
4599 			/* 1 GB large page - enable only for 64 bit kernel */
4600 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4601 				add_x86_feature(featureset, X86FSET_1GPG);
4602 			}
4603 
4604 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4605 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4606 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4607 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4608 				add_x86_feature(featureset, X86FSET_SSE4A);
4609 			}
4610 
4611 			/*
4612 			 * It's really tricky to support syscall/sysret in
4613 			 * the i386 kernel; we rely on sysenter/sysexit
4614 			 * instead.  In the amd64 kernel, things are -way-
4615 			 * better.
4616 			 */
4617 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4618 				add_x86_feature(featureset, X86FSET_ASYSC);
4619 			}
4620 
4621 			/*
4622 			 * While we're thinking about system calls, note
4623 			 * that AMD processors don't support sysenter
4624 			 * in long mode at all, so don't try to program them.
4625 			 */
4626 			if (x86_vendor == X86_VENDOR_AMD ||
4627 			    x86_vendor == X86_VENDOR_HYGON) {
4628 				remove_x86_feature(featureset, X86FSET_SEP);
4629 			}
4630 
4631 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4632 				add_x86_feature(featureset, X86FSET_TSCP);
4633 			}
4634 
4635 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4636 				add_x86_feature(featureset, X86FSET_SVM);
4637 			}
4638 
4639 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4640 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4641 			}
4642 
4643 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4644 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4645 			}
4646 
4647 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4648 				add_x86_feature(featureset, X86FSET_XOP);
4649 			}
4650 
4651 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4652 				add_x86_feature(featureset, X86FSET_FMA4);
4653 			}
4654 
4655 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4656 				add_x86_feature(featureset, X86FSET_TBM);
4657 			}
4658 
4659 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4660 				add_x86_feature(featureset, X86FSET_MONITORX);
4661 			}
4662 			break;
4663 		default:
4664 			break;
4665 		}
4666 
4667 		/*
4668 		 * Get CPUID data about processor cores and hyperthreads.
4669 		 */
4670 		switch (cpi->cpi_vendor) {
4671 		case X86_VENDOR_Intel:
4672 			if (cpi->cpi_maxeax >= 4) {
4673 				cp = &cpi->cpi_std[4];
4674 				cp->cp_eax = 4;
4675 				cp->cp_ecx = 0;
4676 				(void) __cpuid_insn(cp);
4677 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4678 			}
4679 			/*FALLTHROUGH*/
4680 		case X86_VENDOR_AMD:
4681 		case X86_VENDOR_HYGON:
4682 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4683 				break;
4684 			cp = &cpi->cpi_extd[8];
4685 			cp->cp_eax = CPUID_LEAF_EXT_8;
4686 			(void) __cpuid_insn(cp);
4687 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4688 			    cp);
4689 
4690 			/*
4691 			 * AMD uses ebx for some extended functions.
4692 			 */
4693 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4694 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4695 				/*
4696 				 * While we're here, check for the AMD "Error
4697 				 * Pointer Zero/Restore" feature. This can be
4698 				 * used to set up the FP save handlers
4699 				 * appropriately.
4700 				 */
4701 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4702 					cpi->cpi_fp_amd_save = 0;
4703 				} else {
4704 					cpi->cpi_fp_amd_save = 1;
4705 				}
4706 
4707 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4708 					add_x86_feature(featureset,
4709 					    X86FSET_CLZERO);
4710 				}
4711 			}
4712 
4713 			/*
4714 			 * Virtual and physical address limits from
4715 			 * cpuid override previously guessed values.
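			 * Leaf 0x80000008 %eax packs the physical width in
			 * bits 7:0 and the virtual width in bits 15:8; e.g. an
			 * %eax value of 0x3028 reports 40 physical and 48
			 * virtual address bits.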
4716 			 */
4717 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4718 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4719 			break;
4720 		default:
4721 			break;
4722 		}
4723 
4724 		/*
4725 		 * Get CPUID data about TSC Invariance in Deep C-State.
4726 		 */
4727 		switch (cpi->cpi_vendor) {
4728 		case X86_VENDOR_Intel:
4729 		case X86_VENDOR_AMD:
4730 		case X86_VENDOR_HYGON:
4731 			if (cpi->cpi_maxeax >= 7) {
4732 				cp = &cpi->cpi_extd[7];
4733 				cp->cp_eax = 0x80000007;
4734 				cp->cp_ecx = 0;
4735 				(void) __cpuid_insn(cp);
4736 			}
4737 			break;
4738 		default:
4739 			break;
4740 		}
4741 	}
4742 
4743 	/*
4744 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4745 	 * run and thus gathered some of its dependent leaves.
4746 	 */
4747 	cpuid_basic_topology(cpu, featureset);
4748 	cpuid_basic_thermal(cpu, featureset);
4749 #if !defined(__xpv)
4750 	cpuid_basic_ppin(cpu, featureset);
4751 #endif
4752 
4753 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4754 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4755 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4756 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4757 			/* Special handling for AMD FP not necessary. */
4758 			cpi->cpi_fp_amd_save = 0;
4759 		} else {
4760 			cpi->cpi_fp_amd_save = 1;
4761 		}
4762 	}
4763 
4764 	/*
4765 	 * Check (and potentially set) whether lfence is serializing.
4766 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4767 	 */
4768 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4769 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4770 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4771 		/*
4772 		 * The AMD white paper Software Techniques For Managing
4773 		 * Speculation on AMD Processors details circumstances for when
4774 		 * lfence instructions are serializing.
4775 		 *
4776 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4777 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4778 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4779 		 * committed to supporting that MSR on all later CPUs.
4780 		 */
4781 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4782 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4783 		} else if (cpi->cpi_family >= 0x10) {
4784 #if !defined(__xpv)
4785 			uint64_t val;
4786 
4787 			/*
4788 			 * Be careful when attempting to enable the bit, and
4789 			 * verify that it was actually set in case we are
4790 			 * running in a hypervisor which is less than faithful
4791 			 * about its emulation of this feature.
4792 			 */
4793 			on_trap_data_t otd;
4794 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4795 				val = rdmsr(MSR_AMD_DE_CFG);
4796 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4797 				wrmsr(MSR_AMD_DE_CFG, val);
4798 				val = rdmsr(MSR_AMD_DE_CFG);
4799 			} else {
4800 				val = 0;
4801 			}
4802 			no_trap();
4803 
4804 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4805 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4806 			}
4807 #endif
4808 		}
4809 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4810 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4811 		/*
4812 		 * Documentation and other OSes indicate that lfence is always
4813 		 * serializing on Intel CPUs.
4814 		 */
4815 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4816 	}
4817 
4818 
4819 	/*
4820 	 * Check the processor leaves that are used for security features. Grab
4821 	 * any additional processor-specific leaves that we may not have yet.
4822 	 */
4823 	switch (cpi->cpi_vendor) {
4824 	case X86_VENDOR_AMD:
4825 	case X86_VENDOR_HYGON:
4826 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4827 			cp = &cpi->cpi_extd[7];
4828 			cp->cp_eax = CPUID_LEAF_EXT_21;
4829 			cp->cp_ecx = 0;
4830 			(void) __cpuid_insn(cp);
4831 		}
4832 		break;
4833 	default:
4834 		break;
4835 	}
4836 
4837 	cpuid_scan_security(cpu, featureset);
4838 }
4839 
4840 /*
4841  * Make copies of the cpuid table entries we depend on, in
4842  * part for ease of parsing now, in part so that we have only
4843  * one place to correct any of it, in part for ease of
4844  * later export to userland, and in part so we can look at
4845  * this stuff in a crash dump.
4846  */
4847 
4848 static void
4849 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4850 {
4851 	uint_t n, nmax;
4852 	int i;
4853 	struct cpuid_regs *cp;
4854 	uint8_t *dp;
4855 	uint32_t *iptr;
4856 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4857 
4858 	if (cpi->cpi_maxeax < 1)
4859 		return;
4860 
4861 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4862 		nmax = NMAX_CPI_STD;
4863 	/*
4864 	 * (We already handled n == 0 and n == 1 in the basic pass)
4865 	 */
4866 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4867 		/*
4868 		 * leaves 6 and 7 were handled in the basic pass
4869 		 */
4870 		if (n == 6 || n == 7)
4871 			continue;
4872 
4873 		cp->cp_eax = n;
4874 
4875 		/*
4876 		 * CPUID function 4 expects %ecx to be initialized
4877 		 * with an index which indicates which cache to return
4878 		 * information about. The OS is expected to call function 4
4879 		 * with %ecx set to 0, 1, 2, ... until it returns with
4880 		 * EAX[4:0] set to 0, which indicates there are no more
4881 		 * caches.
4882 		 *
4883 		 * Here, populate cpi_std[4] with the information returned by
4884 		 * function 4 when %ecx == 0, and do the rest in a later pass
4885 		 * when dynamic memory allocation becomes available.
4886 		 *
4887 		 * Note: we need to explicitly initialize %ecx here, since
4888 		 * function 4 may have been previously invoked.
4889 		 */
4890 		if (n == 4)
4891 			cp->cp_ecx = 0;
4892 
4893 		(void) __cpuid_insn(cp);
4894 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4895 		switch (n) {
4896 		case 2:
4897 			/*
4898 			 * "the lower 8 bits of the %eax register
4899 			 * contain a value that identifies the number
4900 			 * of times the cpuid [instruction] has to be
4901 			 * executed to obtain a complete image of the
4902 			 * processor's caching systems."
4903 			 *
4904 			 * How *do* they make this stuff up?
4905 			 */
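			/*
			 * Each non-zero descriptor byte names a cache or TLB;
			 * e.g. descriptor 0x2c identifies a 32KB, 8-way L1
			 * data cache with 64-byte lines on Intel parts.
			 */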
4906 			cpi->cpi_ncache = sizeof (*cp) *
4907 			    BITX(cp->cp_eax, 7, 0);
4908 			if (cpi->cpi_ncache == 0)
4909 				break;
4910 			cpi->cpi_ncache--;	/* skip count byte */
4911 
4912 			/*
4913 			 * Well, for now, rather than attempt to implement
4914 			 * this slightly dubious algorithm, we just look
4915 			 * at the first 15 ..
4916 			 */
4917 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4918 				cpi->cpi_ncache = sizeof (*cp) - 1;
4919 
4920 			dp = cpi->cpi_cacheinfo;
4921 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4922 				uint8_t *p = (void *)&cp->cp_eax;
4923 				for (i = 1; i < 4; i++)
4924 					if (p[i] != 0)
4925 						*dp++ = p[i];
4926 			}
4927 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4928 				uint8_t *p = (void *)&cp->cp_ebx;
4929 				for (i = 0; i < 4; i++)
4930 					if (p[i] != 0)
4931 						*dp++ = p[i];
4932 			}
4933 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4934 				uint8_t *p = (void *)&cp->cp_ecx;
4935 				for (i = 0; i < 4; i++)
4936 					if (p[i] != 0)
4937 						*dp++ = p[i];
4938 			}
4939 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4940 				uint8_t *p = (void *)&cp->cp_edx;
4941 				for (i = 0; i < 4; i++)
4942 					if (p[i] != 0)
4943 						*dp++ = p[i];
4944 			}
4945 			break;
4946 
4947 		case 3:	/* Processor serial number, if PSN supported */
4948 			break;
4949 
4950 		case 4:	/* Deterministic cache parameters */
4951 			break;
4952 
4953 		case 5:	/* Monitor/Mwait parameters */
4954 		{
4955 			size_t mwait_size;
4956 
4957 			/*
4958 			 * check cpi_mwait.support which was set in
4959 			 * cpuid_pass_basic()
4960 			 */
4961 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4962 				break;
4963 
4964 			/*
4965 			 * Protect ourselves from an insane mwait line size.
4966 			 * Workaround for incomplete hardware emulator(s).
4967 			 */
4968 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4969 			if (mwait_size < sizeof (uint32_t) ||
4970 			    !ISP2(mwait_size)) {
4971 #if DEBUG
4972 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4973 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4974 #endif
4975 				break;
4976 			}
4977 
4978 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4979 			cpi->cpi_mwait.mon_max = mwait_size;
4980 			if (MWAIT_EXTENSION(cpi)) {
4981 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4982 				if (MWAIT_INT_ENABLE(cpi))
4983 					cpi->cpi_mwait.support |=
4984 					    MWAIT_ECX_INT_ENABLE;
4985 			}
4986 			break;
4987 		}
4988 		default:
4989 			break;
4990 		}
4991 	}
4992 
4993 	/*
4994 	 * XSAVE enumeration
4995 	 */
4996 	if (cpi->cpi_maxeax >= 0xD) {
4997 		struct cpuid_regs regs;
4998 		boolean_t cpuid_d_valid = B_TRUE;
4999 
5000 		cp = &regs;
5001 		cp->cp_eax = 0xD;
5002 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
5003 
5004 		(void) __cpuid_insn(cp);
5005 
5006 		/*
5007 		 * Sanity checks for debug
5008 		 */
5009 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
5010 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
5011 			cpuid_d_valid = B_FALSE;
5012 		}
5013 
5014 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
5015 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
5016 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
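		/*
		 * Sub-leaves of leaf 0xD describe the individual extended
		 * state components: %ecx == 2 is the AVX ymm state, 3 and 4
		 * are the MPX BNDREGS and BNDCSR state, and 5 through 7 are
		 * the AVX-512 opmask, zmm-low and zmm-high state, matching
		 * the queries below.
		 */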
5017 
5018 		/*
5019 		 * If the hw supports AVX, get the size and offset in the save
5020 		 * area for the ymm state.
5021 		 */
5022 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
5023 			cp->cp_eax = 0xD;
5024 			cp->cp_ecx = 2;
5025 			cp->cp_edx = cp->cp_ebx = 0;
5026 
5027 			(void) __cpuid_insn(cp);
5028 
5029 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
5030 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
5031 				cpuid_d_valid = B_FALSE;
5032 			}
5033 
5034 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
5035 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
5036 		}
5037 
5038 		/*
5039 		 * If the hw supports MPX, get the size and offset in the
5040 		 * save area for BNDREGS and BNDCSR.
5041 		 */
5042 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
5043 			cp->cp_eax = 0xD;
5044 			cp->cp_ecx = 3;
5045 			cp->cp_edx = cp->cp_ebx = 0;
5046 
5047 			(void) __cpuid_insn(cp);
5048 
5049 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
5050 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
5051 
5052 			cp->cp_eax = 0xD;
5053 			cp->cp_ecx = 4;
5054 			cp->cp_edx = cp->cp_ebx = 0;
5055 
5056 			(void) __cpuid_insn(cp);
5057 
5058 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
5059 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
5060 		}
5061 
5062 		/*
5063 		 * If the hw supports AVX512, get the size and offset in the
5064 		 * save area for the opmask registers and zmm state.
5065 		 */
5066 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
5067 			cp->cp_eax = 0xD;
5068 			cp->cp_ecx = 5;
5069 			cp->cp_edx = cp->cp_ebx = 0;
5070 
5071 			(void) __cpuid_insn(cp);
5072 
5073 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
5074 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
5075 
5076 			cp->cp_eax = 0xD;
5077 			cp->cp_ecx = 6;
5078 			cp->cp_edx = cp->cp_ebx = 0;
5079 
5080 			(void) __cpuid_insn(cp);
5081 
5082 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
5083 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
5084 
5085 			cp->cp_eax = 0xD;
5086 			cp->cp_ecx = 7;
5087 			cp->cp_edx = cp->cp_ebx = 0;
5088 
5089 			(void) __cpuid_insn(cp);
5090 
5091 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
5092 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
5093 		}
5094 
5095 		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
5096 			xsave_state_size = 0;
5097 		} else if (cpuid_d_valid) {
5098 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
5099 		} else {
5100 			/* Broken CPUID 0xD, probably in HVM */
5101 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
5102 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
5103 			    ", ymm_size = %d, ymm_offset = %d\n",
5104 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
5105 			    cpi->cpi_xsave.xsav_hw_features_high,
5106 			    (int)cpi->cpi_xsave.xsav_max_size,
5107 			    (int)cpi->cpi_xsave.ymm_size,
5108 			    (int)cpi->cpi_xsave.ymm_offset);
5109 
5110 			if (xsave_state_size != 0) {
5111 				/*
5112 				 * This must be a non-boot CPU. We cannot
5113 				 * continue, because boot cpu has already
5114 				 * enabled XSAVE.
5115 				 */
5116 				ASSERT(cpu->cpu_id != 0);
5117 				cmn_err(CE_PANIC, "cpu%d: we have already "
5118 				    "enabled XSAVE on boot cpu, cannot "
5119 				    "continue.", cpu->cpu_id);
5120 			} else {
5121 				/*
5122 				 * If we reached here on the boot CPU, it's also
5123 				 * almost certain that we'll reach here on the
5124 				 * non-boot CPUs. When we're here on a boot CPU
5125 				 * we should disable the feature; on a non-boot
5126 				 * CPU we need to confirm that we have.
5127 				 */
5128 				if (cpu->cpu_id == 0) {
5129 					remove_x86_feature(x86_featureset,
5130 					    X86FSET_XSAVE);
5131 					remove_x86_feature(x86_featureset,
5132 					    X86FSET_AVX);
5133 					remove_x86_feature(x86_featureset,
5134 					    X86FSET_F16C);
5135 					remove_x86_feature(x86_featureset,
5136 					    X86FSET_BMI1);
5137 					remove_x86_feature(x86_featureset,
5138 					    X86FSET_BMI2);
5139 					remove_x86_feature(x86_featureset,
5140 					    X86FSET_FMA);
5141 					remove_x86_feature(x86_featureset,
5142 					    X86FSET_AVX2);
5143 					remove_x86_feature(x86_featureset,
5144 					    X86FSET_MPX);
5145 					remove_x86_feature(x86_featureset,
5146 					    X86FSET_AVX512F);
5147 					remove_x86_feature(x86_featureset,
5148 					    X86FSET_AVX512DQ);
5149 					remove_x86_feature(x86_featureset,
5150 					    X86FSET_AVX512PF);
5151 					remove_x86_feature(x86_featureset,
5152 					    X86FSET_AVX512ER);
5153 					remove_x86_feature(x86_featureset,
5154 					    X86FSET_AVX512CD);
5155 					remove_x86_feature(x86_featureset,
5156 					    X86FSET_AVX512BW);
5157 					remove_x86_feature(x86_featureset,
5158 					    X86FSET_AVX512VL);
5159 					remove_x86_feature(x86_featureset,
5160 					    X86FSET_AVX512FMA);
5161 					remove_x86_feature(x86_featureset,
5162 					    X86FSET_AVX512VBMI);
5163 					remove_x86_feature(x86_featureset,
5164 					    X86FSET_AVX512VNNI);
5165 					remove_x86_feature(x86_featureset,
5166 					    X86FSET_AVX512VPOPCDQ);
5167 					remove_x86_feature(x86_featureset,
5168 					    X86FSET_AVX512NNIW);
5169 					remove_x86_feature(x86_featureset,
5170 					    X86FSET_AVX512FMAPS);
5171 					remove_x86_feature(x86_featureset,
5172 					    X86FSET_VAES);
5173 					remove_x86_feature(x86_featureset,
5174 					    X86FSET_VPCLMULQDQ);
5175 					remove_x86_feature(x86_featureset,
5176 					    X86FSET_GFNI);
5177 					remove_x86_feature(x86_featureset,
5178 					    X86FSET_AVX512_VP2INT);
5179 					remove_x86_feature(x86_featureset,
5180 					    X86FSET_AVX512_BITALG);
5181 					remove_x86_feature(x86_featureset,
5182 					    X86FSET_AVX512_VBMI2);
5183 					remove_x86_feature(x86_featureset,
5184 					    X86FSET_AVX512_BF16);
5185 
5186 					xsave_force_disable = B_TRUE;
5187 				} else {
5188 					VERIFY(is_x86_feature(x86_featureset,
5189 					    X86FSET_XSAVE) == B_FALSE);
5190 				}
5191 			}
5192 		}
5193 	}
5194 
5195 
5196 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
5197 		return;
5198 
5199 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
5200 		nmax = NMAX_CPI_EXTD;
5201 	/*
5202 	 * Copy the extended properties, fixing them as we go. We start at 2
5203 	 * because we've already handled a few cases in the basic pass; the
5204 	 * rest we simply let ourselves grab again (e.g. 0x8, 0x21).
5205 	 */
5206 	iptr = (void *)cpi->cpi_brandstr;
5207 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
5208 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
5209 		(void) __cpuid_insn(cp);
5210 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
5211 		    cp);
5212 		switch (n) {
5213 		case 2:
5214 		case 3:
5215 		case 4:
5216 			/*
5217 			 * Extract the brand string
5218 			 */
5219 			*iptr++ = cp->cp_eax;
5220 			*iptr++ = cp->cp_ebx;
5221 			*iptr++ = cp->cp_ecx;
5222 			*iptr++ = cp->cp_edx;
5223 			break;
5224 		case 5:
5225 			switch (cpi->cpi_vendor) {
5226 			case X86_VENDOR_AMD:
5227 				/*
5228 				 * The Athlon and Duron were the first
5229 				 * parts to report the sizes of the
5230 				 * TLB for large pages. Before then,
5231 				 * we don't trust the data.
5232 				 */
5233 				if (cpi->cpi_family < 6 ||
5234 				    (cpi->cpi_family == 6 &&
5235 				    cpi->cpi_model < 1))
5236 					cp->cp_eax = 0;
5237 				break;
5238 			default:
5239 				break;
5240 			}
5241 			break;
5242 		case 6:
5243 			switch (cpi->cpi_vendor) {
5244 			case X86_VENDOR_AMD:
5245 				/*
5246 				 * The Athlon and Duron were the first
5247 				 * AMD parts with L2 TLB's.
5248 				 * Before then, don't trust the data.
5249 				 */
5250 				if (cpi->cpi_family < 6 ||
5251 				    (cpi->cpi_family == 6 &&
5252 				    cpi->cpi_model < 1))
5253 					cp->cp_eax = cp->cp_ebx = 0;
5254 				/*
5255 				 * AMD Duron rev A0 reports L2
5256 				 * cache size incorrectly as 1K
5257 				 * when it is really 64K
5258 				 */
5259 				if (cpi->cpi_family == 6 &&
5260 				    cpi->cpi_model == 3 &&
5261 				    cpi->cpi_step == 0) {
5262 					cp->cp_ecx &= 0xffff;
5263 					cp->cp_ecx |= 0x400000;
5264 				}
5265 				break;
5266 			case X86_VENDOR_Cyrix:	/* VIA C3 */
5267 				/*
5268 				 * VIA C3 processors are a bit messed
5269 				 * up w.r.t. encoding cache sizes in %ecx
5270 				 */
5271 				if (cpi->cpi_family != 6)
5272 					break;
5273 				/*
5274 				 * model 7 and 8 were incorrectly encoded
5275 				 *
5276 				 * xxx is model 8 really broken?
5277 				 */
5278 				if (cpi->cpi_model == 7 ||
5279 				    cpi->cpi_model == 8)
5280 					cp->cp_ecx =
5281 					    BITX(cp->cp_ecx, 31, 24) << 16 |
5282 					    BITX(cp->cp_ecx, 23, 16) << 12 |
5283 					    BITX(cp->cp_ecx, 15, 8) << 8 |
5284 					    BITX(cp->cp_ecx, 7, 0);
5285 				/*
5286 				 * model 9 stepping 1 has wrong associativity
5287 				 */
5288 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
5289 					cp->cp_ecx |= 8 << 12;
5290 				break;
5291 			case X86_VENDOR_Intel:
5292 				/*
5293 				 * Extended L2 Cache features function.
5294 				 * First appeared on Prescott.
5295 				 */
5296 			default:
5297 				break;
5298 			}
5299 			break;
5300 		default:
5301 			break;
5302 		}
5303 	}
5304 }
5305 
5306 static const char *
5307 intel_cpubrand(const struct cpuid_info *cpi)
5308 {
5309 	int i;
5310 
5311 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5312 
5313 	switch (cpi->cpi_family) {
5314 	case 5:
5315 		return ("Intel Pentium(r)");
5316 	case 6:
5317 		switch (cpi->cpi_model) {
5318 			uint_t celeron, xeon;
5319 			const struct cpuid_regs *cp;
5320 		case 0:
5321 		case 1:
5322 		case 2:
5323 			return ("Intel Pentium(r) Pro");
5324 		case 3:
5325 		case 4:
5326 			return ("Intel Pentium(r) II");
5327 		case 6:
5328 			return ("Intel Celeron(r)");
5329 		case 5:
5330 		case 7:
5331 			celeron = xeon = 0;
5332 			cp = &cpi->cpi_std[2];	/* cache info */
5333 
5334 			for (i = 1; i < 4; i++) {
5335 				uint_t tmp;
5336 
5337 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5338 				if (tmp == 0x40)
5339 					celeron++;
5340 				if (tmp >= 0x44 && tmp <= 0x45)
5341 					xeon++;
5342 			}
5343 
5344 			for (i = 0; i < 2; i++) {
5345 				uint_t tmp;
5346 
5347 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5348 				if (tmp == 0x40)
5349 					celeron++;
5350 				else if (tmp >= 0x44 && tmp <= 0x45)
5351 					xeon++;
5352 			}
5353 
5354 			for (i = 0; i < 4; i++) {
5355 				uint_t tmp;
5356 
5357 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5358 				if (tmp == 0x40)
5359 					celeron++;
5360 				else if (tmp >= 0x44 && tmp <= 0x45)
5361 					xeon++;
5362 			}
5363 
5364 			for (i = 0; i < 4; i++) {
5365 				uint_t tmp;
5366 
5367 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5368 				if (tmp == 0x40)
5369 					celeron++;
5370 				else if (tmp >= 0x44 && tmp <= 0x45)
5371 					xeon++;
5372 			}
5373 
5374 			if (celeron)
5375 				return ("Intel Celeron(r)");
5376 			if (xeon)
5377 				return (cpi->cpi_model == 5 ?
5378 				    "Intel Pentium(r) II Xeon(tm)" :
5379 				    "Intel Pentium(r) III Xeon(tm)");
5380 			return (cpi->cpi_model == 5 ?
5381 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5382 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5383 		default:
5384 			break;
5385 		}
5386 	default:
5387 		break;
5388 	}
5389 
5390 	/* BrandID is present if the field is nonzero */
5391 	if (cpi->cpi_brandid != 0) {
5392 		static const struct {
5393 			uint_t bt_bid;
5394 			const char *bt_str;
5395 		} brand_tbl[] = {
5396 			{ 0x1,	"Intel(r) Celeron(r)" },
5397 			{ 0x2,	"Intel(r) Pentium(r) III" },
5398 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
5399 			{ 0x4,	"Intel(r) Pentium(r) III" },
5400 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
5401 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
5402 			{ 0x8,	"Intel(r) Pentium(r) 4" },
5403 			{ 0x9,	"Intel(r) Pentium(r) 4" },
5404 			{ 0xa,	"Intel(r) Celeron(r)" },
5405 			{ 0xb,	"Intel(r) Xeon(tm)" },
5406 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
5407 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
5408 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
5409 			{ 0x11, "Mobile Genuine Intel(r)" },
5410 			{ 0x12, "Intel(r) Celeron(r) M" },
5411 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
5412 			{ 0x14, "Intel(r) Celeron(r)" },
5413 			{ 0x15, "Mobile Genuine Intel(r)" },
5414 			{ 0x16,	"Intel(r) Pentium(r) M" },
5415 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
5416 		};
5417 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5418 		uint_t sgn;
5419 
5420 		sgn = (cpi->cpi_family << 8) |
5421 		    (cpi->cpi_model << 4) | cpi->cpi_step;
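		/*
		 * sgn packs family, model and stepping into one value; e.g.
		 * 0x6b1 corresponds to family 6, model 0xb, stepping 1.
		 */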
5422 
5423 		for (i = 0; i < btblmax; i++)
5424 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5425 				break;
5426 		if (i < btblmax) {
5427 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5428 				return ("Intel(r) Celeron(r)");
5429 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5430 				return ("Intel(r) Xeon(tm) MP");
5431 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5432 				return ("Intel(r) Xeon(tm)");
5433 			return (brand_tbl[i].bt_str);
5434 		}
5435 	}
5436 
5437 	return (NULL);
5438 }
5439 
5440 static const char *
5441 amd_cpubrand(const struct cpuid_info *cpi)
5442 {
5443 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5444 
5445 	switch (cpi->cpi_family) {
5446 	case 5:
5447 		switch (cpi->cpi_model) {
5448 		case 0:
5449 		case 1:
5450 		case 2:
5451 		case 3:
5452 		case 4:
5453 		case 5:
5454 			return ("AMD-K5(r)");
5455 		case 6:
5456 		case 7:
5457 			return ("AMD-K6(r)");
5458 		case 8:
5459 			return ("AMD-K6(r)-2");
5460 		case 9:
5461 			return ("AMD-K6(r)-III");
5462 		default:
5463 			return ("AMD (family 5)");
5464 		}
5465 	case 6:
5466 		switch (cpi->cpi_model) {
5467 		case 1:
5468 			return ("AMD-K7(tm)");
5469 		case 0:
5470 		case 2:
5471 		case 4:
5472 			return ("AMD Athlon(tm)");
5473 		case 3:
5474 		case 7:
5475 			return ("AMD Duron(tm)");
5476 		case 6:
5477 		case 8:
5478 		case 10:
5479 			/*
5480 			 * Use the L2 cache size to distinguish
5481 			 */
5482 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5483 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5484 		default:
5485 			return ("AMD (family 6)");
5486 		}
5487 	default:
5488 		break;
5489 	}
5490 
5491 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5492 	    cpi->cpi_brandid != 0) {
5493 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5494 		case 3:
5495 			return ("AMD Opteron(tm) UP 1xx");
5496 		case 4:
5497 			return ("AMD Opteron(tm) DP 2xx");
5498 		case 5:
5499 			return ("AMD Opteron(tm) MP 8xx");
5500 		default:
5501 			return ("AMD Opteron(tm)");
5502 		}
5503 	}
5504 
5505 	return (NULL);
5506 }
5507 
5508 static const char *
5509 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5510 {
5511 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5512 
5513 	switch (type) {
5514 	case X86_TYPE_CYRIX_6x86:
5515 		return ("Cyrix 6x86");
5516 	case X86_TYPE_CYRIX_6x86L:
5517 		return ("Cyrix 6x86L");
5518 	case X86_TYPE_CYRIX_6x86MX:
5519 		return ("Cyrix 6x86MX");
5520 	case X86_TYPE_CYRIX_GXm:
5521 		return ("Cyrix GXm");
5522 	case X86_TYPE_CYRIX_MediaGX:
5523 		return ("Cyrix MediaGX");
5524 	case X86_TYPE_CYRIX_MII:
5525 		return ("Cyrix M2");
5526 	case X86_TYPE_VIA_CYRIX_III:
5527 		return ("VIA Cyrix M3");
5528 	default:
5529 		/*
5530 		 * Have another wild guess ..
5531 		 */
5532 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5533 			return ("Cyrix 5x86");
5534 		else if (cpi->cpi_family == 5) {
5535 			switch (cpi->cpi_model) {
5536 			case 2:
5537 				return ("Cyrix 6x86");	/* Cyrix M1 */
5538 			case 4:
5539 				return ("Cyrix MediaGX");
5540 			default:
5541 				break;
5542 			}
5543 		} else if (cpi->cpi_family == 6) {
5544 			switch (cpi->cpi_model) {
5545 			case 0:
5546 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5547 			case 5:
5548 			case 6:
5549 			case 7:
5550 			case 8:
5551 			case 9:
5552 				return ("VIA C3");
5553 			default:
5554 				break;
5555 			}
5556 		}
5557 		break;
5558 	}
5559 	return (NULL);
5560 }
5561 
5562 /*
5563  * This only gets called in the case that the CPU extended-feature
5564  * brand string leaves (0x80000002, 0x80000003, 0x80000004)
5565  * aren't available, or contain null bytes for some reason.
5566  */
5567 static void
5568 fabricate_brandstr(struct cpuid_info *cpi)
5569 {
5570 	const char *brand = NULL;
5571 
5572 	switch (cpi->cpi_vendor) {
5573 	case X86_VENDOR_Intel:
5574 		brand = intel_cpubrand(cpi);
5575 		break;
5576 	case X86_VENDOR_AMD:
5577 		brand = amd_cpubrand(cpi);
5578 		break;
5579 	case X86_VENDOR_Cyrix:
5580 		brand = cyrix_cpubrand(cpi, x86_type);
5581 		break;
5582 	case X86_VENDOR_NexGen:
5583 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5584 			brand = "NexGen Nx586";
5585 		break;
5586 	case X86_VENDOR_Centaur:
5587 		if (cpi->cpi_family == 5)
5588 			switch (cpi->cpi_model) {
5589 			case 4:
5590 				brand = "Centaur C6";
5591 				break;
5592 			case 8:
5593 				brand = "Centaur C2";
5594 				break;
5595 			case 9:
5596 				brand = "Centaur C3";
5597 				break;
5598 			default:
5599 				break;
5600 			}
5601 		break;
5602 	case X86_VENDOR_Rise:
5603 		if (cpi->cpi_family == 5 &&
5604 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5605 			brand = "Rise mP6";
5606 		break;
5607 	case X86_VENDOR_SiS:
5608 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5609 			brand = "SiS 55x";
5610 		break;
5611 	case X86_VENDOR_TM:
5612 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5613 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5614 		break;
5615 	case X86_VENDOR_NSC:
5616 	case X86_VENDOR_UMC:
5617 	default:
5618 		break;
5619 	}
5620 	if (brand) {
5621 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5622 		return;
5623 	}
5624 
5625 	/*
5626 	 * If all else fails ...
5627 	 */
5628 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5629 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5630 	    cpi->cpi_model, cpi->cpi_step);
5631 }
5632 
5633 /*
5634  * This routine is called just after kernel memory allocation
5635  * becomes available on cpu0, and as part of mp_startup() on
5636  * the other cpus.
5637  *
5638  * Fix up the brand string, and collect any information from cpuid
5639  * that requires dynamically allocated storage to represent.
5640  */
5641 
5642 static void
5643 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5644 {
5645 	int	i, max, shft, level, size;
5646 	struct cpuid_regs regs;
5647 	struct cpuid_regs *cp;
5648 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5649 
5650 	/*
5651 	 * Deterministic cache parameters
5652 	 *
5653 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5654 	 * values that are present are currently defined to be the same. This
5655 	 * means we can use the same logic to parse it as long as we use the
5656 	 * appropriate leaf to get the data. If you're updating this, make sure
5657 	 * you're careful about which vendor supports which aspect.
5658 	 *
5659 	 * Take this opportunity to detect the number of threads sharing the
5660 	 * last level cache, and construct a corresponding cache id. The
5661 	 * respective cpuid_info members are initialized to the default case of
5662 	 * "no last level cache sharing".
5663 	 */
5664 	cpi->cpi_ncpu_shr_last_cache = 1;
5665 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5666 
5667 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5668 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5669 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5670 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5671 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5672 		uint32_t leaf;
5673 
5674 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5675 			leaf = 4;
5676 		} else {
5677 			leaf = CPUID_LEAF_EXT_1d;
5678 		}
5679 
5680 		/*
5681 		 * Find the # of elements (size) returned by the leaf and along
5682 		 * the way detect last level cache sharing details.
5683 		 */
5684 		bzero(&regs, sizeof (regs));
5685 		cp = &regs;
5686 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5687 			cp->cp_eax = leaf;
5688 			cp->cp_ecx = i;
5689 
5690 			(void) __cpuid_insn(cp);
5691 
5692 			if (CPI_CACHE_TYPE(cp) == 0)
5693 				break;
5694 			level = CPI_CACHE_LVL(cp);
5695 			if (level > max) {
5696 				max = level;
5697 				cpi->cpi_ncpu_shr_last_cache =
5698 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5699 			}
5700 		}
5701 		cpi->cpi_cache_leaf_size = size = i;
5702 
5703 		/*
5704 		 * Allocate the cpi_cache_leaves array. The first element
5705 		 * references the regs for the corresponding leaf with %ecx set
5706 		 * to 0. This was gathered in cpuid_pass_extended().
5707 		 */
5708 		if (size > 0) {
5709 			cpi->cpi_cache_leaves =
5710 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5711 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5712 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5713 			} else {
5714 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5715 			}
5716 
5717 			/*
5718 			 * Allocate storage to hold the additional regs
5719 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5720 			 *
5721 			 * The regs for the leaf, %ecx == 0 has already
5722 			 * been allocated as indicated above.
5723 			 */
5724 			for (i = 1; i < size; i++) {
5725 				cp = cpi->cpi_cache_leaves[i] =
5726 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5727 				cp->cp_eax = leaf;
5728 				cp->cp_ecx = i;
5729 
5730 				(void) __cpuid_insn(cp);
5731 			}
5732 		}
5733 		/*
5734 		 * Determine the number of bits needed to represent
5735 		 * the number of CPUs sharing the last level cache.
5736 		 *
5737 		 * Shift off that number of bits from the APIC id to
5738 		 * derive the cache id.
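		 *
		 * For example, with 16 CPUs sharing the last level cache,
		 * shft becomes 4 and APIC ids 0x20 through 0x2f all map to
		 * cache id 0x2.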
5739 		 */
5740 		shft = 0;
5741 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5742 			shft++;
5743 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5744 	}
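	/*
	 * Illustrative example (hypothetical values, not taken from any
	 * specific part): if cpi_ncpu_shr_last_cache were detected as 12
	 * above, the loop leaves shft == 4 (the smallest power of two that
	 * covers 12 sharers), so CPUs whose APIC ids differ only in their
	 * low 4 bits end up with the same cpi_last_lvl_cacheid.
	 */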
5745 
5746 	/*
5747 	 * Now fixup the brand string
5748 	 */
5749 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5750 		fabricate_brandstr(cpi);
5751 	} else {
5752 
5753 		/*
5754 		 * If we successfully extracted a brand string from the cpuid
5755 		 * instruction, clean it up by removing leading spaces and
5756 		 * similar junk.
5757 		 */
5758 		if (cpi->cpi_brandstr[0]) {
5759 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5760 			char *src, *dst;
5761 
5762 			dst = src = (char *)cpi->cpi_brandstr;
5763 			src[maxlen - 1] = '\0';
5764 			/*
5765 			 * strip leading spaces
5766 			 */
5767 			while (*src == ' ')
5768 				src++;
5769 			/*
5770 			 * Remove any "Genuine" or "Authentic" prefixes
5771 			 */
5772 			if (strncmp(src, "Genuine ", 8) == 0)
5773 				src += 8;
5774 			if (strncmp(src, "Authentic ", 10) == 0)
5775 				src += 10;
5776 
5777 			/*
5778 			 * Now do an in-place copy.
5779 			 * Map (R) to (r) and (TM) to (tm).
5780 			 * The era of teletypes is long gone, and there's
5781 			 * -really- no need to shout.
5782 			 */
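			/*
			 * For instance (a made-up input, purely for
			 * illustration), a raw brand string of
			 * "  Genuine Intel(R) CPU  " would leave this
			 * cleanup as "Intel(r) CPU".
			 */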
5783 			while (*src != '\0') {
5784 				if (src[0] == '(') {
5785 					if (strncmp(src + 1, "R)", 2) == 0) {
5786 						(void) strncpy(dst, "(r)", 3);
5787 						src += 3;
5788 						dst += 3;
5789 						continue;
5790 					}
5791 					if (strncmp(src + 1, "TM)", 3) == 0) {
5792 						(void) strncpy(dst, "(tm)", 4);
5793 						src += 4;
5794 						dst += 4;
5795 						continue;
5796 					}
5797 				}
5798 				*dst++ = *src++;
5799 			}
5800 			*dst = '\0';
5801 
5802 			/*
5803 			 * Finally, remove any trailing spaces
5804 			 */
5805 			while (--dst > cpi->cpi_brandstr)
5806 				if (*dst == ' ')
5807 					*dst = '\0';
5808 				else
5809 					break;
5810 		} else
5811 			fabricate_brandstr(cpi);
5812 	}
5813 }
5814 
5815 typedef struct {
5816 	uint32_t avm_av;
5817 	uint32_t avm_feat;
5818 } av_feat_map_t;
5819 
5820 /*
5821  * These arrays are used to map features that we should add based on x86
5822  * features that are present. As a large number depend on kernel features,
5823  * rather than rechecking and clearing CPUID everywhere, we simply map these.
5824  * There is an array of these for each hwcap word. Some features aren't tracked
5825  * in the kernel x86 featureset and that's ok. They will not show up in here.
5826  */
5827 static const av_feat_map_t x86fset_to_av1[] = {
5828 	{ AV_386_CX8, X86FSET_CX8 },
5829 	{ AV_386_SEP, X86FSET_SEP },
5830 	{ AV_386_AMD_SYSC, X86FSET_ASYSC },
5831 	{ AV_386_CMOV, X86FSET_CMOV },
5832 	{ AV_386_FXSR, X86FSET_SSE },
5833 	{ AV_386_SSE, X86FSET_SSE },
5834 	{ AV_386_SSE2, X86FSET_SSE2 },
5835 	{ AV_386_SSE3, X86FSET_SSE3 },
5836 	{ AV_386_CX16, X86FSET_CX16 },
5837 	{ AV_386_TSCP, X86FSET_TSCP },
5838 	{ AV_386_AMD_SSE4A, X86FSET_SSE4A },
5839 	{ AV_386_SSSE3, X86FSET_SSSE3 },
5840 	{ AV_386_SSE4_1, X86FSET_SSE4_1 },
5841 	{ AV_386_SSE4_2, X86FSET_SSE4_2 },
5842 	{ AV_386_AES, X86FSET_AES },
5843 	{ AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5844 	{ AV_386_XSAVE, X86FSET_XSAVE },
5845 	{ AV_386_AVX, X86FSET_AVX },
5846 	{ AV_386_VMX, X86FSET_VMX },
5847 	{ AV_386_AMD_SVM, X86FSET_SVM }
5848 };
5849 
5850 static const av_feat_map_t x86fset_to_av2[] = {
5851 	{ AV_386_2_F16C, X86FSET_F16C },
5852 	{ AV_386_2_RDRAND, X86FSET_RDRAND },
5853 	{ AV_386_2_BMI1, X86FSET_BMI1 },
5854 	{ AV_386_2_BMI2, X86FSET_BMI2 },
5855 	{ AV_386_2_FMA, X86FSET_FMA },
5856 	{ AV_386_2_AVX2, X86FSET_AVX2 },
5857 	{ AV_386_2_ADX, X86FSET_ADX },
5858 	{ AV_386_2_RDSEED, X86FSET_RDSEED },
5859 	{ AV_386_2_AVX512F, X86FSET_AVX512F },
5860 	{ AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5861 	{ AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5862 	{ AV_386_2_AVX512PF, X86FSET_AVX512PF },
5863 	{ AV_386_2_AVX512ER, X86FSET_AVX512ER },
5864 	{ AV_386_2_AVX512CD, X86FSET_AVX512CD },
5865 	{ AV_386_2_AVX512BW, X86FSET_AVX512BW },
5866 	{ AV_386_2_AVX512VL, X86FSET_AVX512VL },
5867 	{ AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5868 	{ AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5869 	{ AV_386_2_SHA, X86FSET_SHA },
5870 	{ AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5871 	{ AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5872 	{ AV_386_2_CLWB, X86FSET_CLWB },
5873 	{ AV_386_2_MONITORX, X86FSET_MONITORX },
5874 	{ AV_386_2_CLZERO, X86FSET_CLZERO },
5875 	{ AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5876 	{ AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5877 	{ AV_386_2_VAES, X86FSET_VAES },
5878 	{ AV_386_2_GFNI, X86FSET_GFNI },
5879 	{ AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5880 	{ AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5881 };
5882 
5883 static const av_feat_map_t x86fset_to_av3[] = {
5884 	{ AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5885 	{ AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5886 };
5887 
5888 /*
5889  * This routine is called out of bind_hwcap() much later in the life
5890  * of the kernel (post_startup()).  The job of this routine is to resolve
5891  * the hardware feature support and kernel support for those features into
5892  * what we're actually going to tell applications via the aux vector.
5893  *
5894  * Most of the aux vector is derived from the x86_featureset array vector where
5895  * a given feature indicates that an aux vector should be plumbed through. This
5896  * allows the kernel to use one tracking mechanism for these based on whether or
5897  * not it has the required hardware support (most often xsave). Most newer
5898  * features are added there in case we need them in the kernel. Otherwise,
5899  * features are evaluated based on looking at the cpuid features that remain. If
5900  * you find yourself wanting to clear out cpuid features for some reason, they
5901  * should instead be driven by the feature set so we have a consistent view.
5902  */
5903 
5904 static void
5905 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5906 {
5907 	uint_t *hwcap_out = (uint_t *)arg;
5908 	struct cpuid_info *cpi;
5909 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5910 
5911 	cpi = cpu->cpu_m.mcpu_cpi;
5912 
5913 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5914 		if (is_x86_feature(x86_featureset,
5915 		    x86fset_to_av1[i].avm_feat)) {
5916 			hwcap_flags |= x86fset_to_av1[i].avm_av;
5917 		}
5918 	}
5919 
5920 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5921 		if (is_x86_feature(x86_featureset,
5922 		    x86fset_to_av2[i].avm_feat)) {
5923 			hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5924 		}
5925 	}
5926 
5927 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5928 		if (is_x86_feature(x86_featureset,
5929 		    x86fset_to_av3[i].avm_feat)) {
5930 			hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5931 		}
5932 	}
5933 
5934 	/*
5935 	 * From here on out we're working through features that don't have
5936 	 * corresponding kernel feature flags for various reasons that are
5937 	 * mostly just due to the historical implementation.
5938 	 */
5939 	if (cpi->cpi_maxeax >= 1) {
5940 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5941 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5942 
5943 		*edx = CPI_FEATURES_EDX(cpi);
5944 		*ecx = CPI_FEATURES_ECX(cpi);
5945 
5946 		/*
5947 		 * [no explicit support required beyond x87 fp context]
5948 		 */
5949 		if (!fpu_exists)
5950 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5951 
5952 		/*
5953 		 * Now map the supported feature vector to things that we
5954 		 * think userland will care about.
5955 		 */
5956 		if (*ecx & CPUID_INTC_ECX_MOVBE)
5957 			hwcap_flags |= AV_386_MOVBE;
5958 
5959 		if (*ecx & CPUID_INTC_ECX_POPCNT)
5960 			hwcap_flags |= AV_386_POPCNT;
5961 		if (*edx & CPUID_INTC_EDX_FPU)
5962 			hwcap_flags |= AV_386_FPU;
5963 		if (*edx & CPUID_INTC_EDX_MMX)
5964 			hwcap_flags |= AV_386_MMX;
5965 		if (*edx & CPUID_INTC_EDX_TSC)
5966 			hwcap_flags |= AV_386_TSC;
5967 	}
5968 
5969 	/*
5970 	 * Check a few miscellaneous features.
5971 	 */
5972 	if (cpi->cpi_xmaxeax < 0x80000001)
5973 		goto resolve_done;
5974 
5975 	switch (cpi->cpi_vendor) {
5976 		uint32_t *edx, *ecx;
5977 
5978 	case X86_VENDOR_Intel:
5979 		/*
5980 		 * Seems like Intel duplicated what was necessary
5981 		 * here to make the initial crop of 64-bit OSes work.
5982 		 * Hopefully, those are the only "extended" bits
5983 		 * they'll add.
5984 		 */
5985 		/*FALLTHROUGH*/
5986 
5987 	case X86_VENDOR_AMD:
5988 	case X86_VENDOR_HYGON:
5989 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5990 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5991 
5992 		*edx = CPI_FEATURES_XTD_EDX(cpi);
5993 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5994 
5995 		/*
5996 		 * [no explicit support required beyond
5997 		 * x87 fp context and exception handlers]
5998 		 */
5999 		if (!fpu_exists)
6000 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
6001 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
6002 
6003 		/*
6004 		 * Now map the supported feature vector to
6005 		 * things that we think userland will care about.
6006 		 */
6007 		if (*edx & CPUID_AMD_EDX_MMXamd)
6008 			hwcap_flags |= AV_386_AMD_MMX;
6009 		if (*edx & CPUID_AMD_EDX_3DNow)
6010 			hwcap_flags |= AV_386_AMD_3DNow;
6011 		if (*edx & CPUID_AMD_EDX_3DNowx)
6012 			hwcap_flags |= AV_386_AMD_3DNowx;
6013 
6014 		switch (cpi->cpi_vendor) {
6015 		case X86_VENDOR_AMD:
6016 		case X86_VENDOR_HYGON:
6017 			if (*ecx & CPUID_AMD_ECX_AHF64)
6018 				hwcap_flags |= AV_386_AHF;
6019 			if (*ecx & CPUID_AMD_ECX_LZCNT)
6020 				hwcap_flags |= AV_386_AMD_LZCNT;
6021 			break;
6022 
6023 		case X86_VENDOR_Intel:
6024 			if (*ecx & CPUID_AMD_ECX_LZCNT)
6025 				hwcap_flags |= AV_386_AMD_LZCNT;
6026 			/*
6027 			 * Aarrgh.
6028 			 * Intel uses a different bit in the same word.
6029 			 */
6030 			if (*ecx & CPUID_INTC_ECX_AHF64)
6031 				hwcap_flags |= AV_386_AHF;
6032 			break;
6033 		default:
6034 			break;
6035 		}
6036 		break;
6037 
6038 	default:
6039 		break;
6040 	}
6041 
6042 resolve_done:
6043 	if (hwcap_out != NULL) {
6044 		hwcap_out[0] = hwcap_flags;
6045 		hwcap_out[1] = hwcap_flags_2;
6046 		hwcap_out[2] = hwcap_flags_3;
6047 	}
6048 }
6049 
6050 
6051 /*
6052  * Simulate the cpuid instruction using the data we previously
6053  * captured about this CPU.  We try our best to return the truth
6054  * about the hardware, independently of kernel support.
6055  */
6056 uint32_t
6057 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
6058 {
6059 	struct cpuid_info *cpi;
6060 	struct cpuid_regs *xcp;
6061 
6062 	if (cpu == NULL)
6063 		cpu = CPU;
6064 	cpi = cpu->cpu_m.mcpu_cpi;
6065 
6066 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
6067 
6068 	/*
6069 	 * CPUID data is cached in two separate places: cpi_std for standard
6070 	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
6071 	 */
6072 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
6073 		xcp = &cpi->cpi_std[cp->cp_eax];
6074 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
6075 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
6076 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
6077 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
6078 	} else {
6079 		/*
6080 		 * The caller is asking for data from an input parameter which
6081 		 * the kernel has not cached.  In this case we go fetch from
6082 		 * the hardware and return the data directly to the user.
6083 		 */
6084 		return (__cpuid_insn(cp));
6085 	}
6086 
6087 	cp->cp_eax = xcp->cp_eax;
6088 	cp->cp_ebx = xcp->cp_ebx;
6089 	cp->cp_ecx = xcp->cp_ecx;
6090 	cp->cp_edx = xcp->cp_edx;
6091 	return (cp->cp_eax);
6092 }
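/*
 * Minimal usage sketch (illustrative only, not a complete consumer): a caller
 * that wants the cached view of the standard feature leaf might do
 *
 *	struct cpuid_regs cp = { 0 };
 *	cp.cp_eax = 1;
 *	(void) cpuid_insn(NULL, &cp);
 *
 * and then inspect cp.cp_edx/cp.cp_ecx. Leaves inside the cached standard and
 * extended ranges are answered from cpi_std/cpi_extd; anything else falls
 * through to __cpuid_insn() above.
 */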
6093 
6094 boolean_t
6095 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
6096 {
6097 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
6098 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
6099 }
6100 
6101 int
6102 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
6103 {
6104 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
6105 
6106 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
6107 }
6108 
6109 int
6110 cpuid_is_cmt(cpu_t *cpu)
6111 {
6112 	if (cpu == NULL)
6113 		cpu = CPU;
6114 
6115 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6116 
6117 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
6118 }
6119 
6120 /*
6121  * AMD and Intel both implement the 64-bit variant of the syscall
6122  * instruction (syscallq), so if there's -any- support for syscall,
6123  * cpuid currently says "yes, we support this".
6124  *
6125  * However, Intel decided to -not- implement the 32-bit variant of the
6126  * syscall instruction, so we provide a predicate to allow our caller
6127  * to test that subtlety here.
6128  *
6129  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
6130  *	even in the case where the hardware would in fact support it.
6131  */
6132 /*ARGSUSED*/
6133 int
6134 cpuid_syscall32_insn(cpu_t *cpu)
6135 {
6136 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
6137 
6138 #if !defined(__xpv)
6139 	if (cpu == NULL)
6140 		cpu = CPU;
6141 
6142 	/*CSTYLED*/
6143 	{
6144 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6145 
6146 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
6147 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
6148 		    cpi->cpi_xmaxeax >= 0x80000001 &&
6149 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
6150 			return (1);
6151 	}
6152 #endif
6153 	return (0);
6154 }
6155 
6156 int
6157 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
6158 {
6159 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6160 
6161 	static const char fmt[] =
6162 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
6163 	static const char fmt_ht[] =
6164 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
6165 
6166 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6167 
6168 	if (cpuid_is_cmt(cpu))
6169 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
6170 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6171 		    cpi->cpi_family, cpi->cpi_model,
6172 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6173 	return (snprintf(s, n, fmt,
6174 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
6175 	    cpi->cpi_family, cpi->cpi_model,
6176 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
6177 }
6178 
6179 const char *
6180 cpuid_getvendorstr(cpu_t *cpu)
6181 {
6182 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6183 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
6184 }
6185 
6186 uint_t
6187 cpuid_getvendor(cpu_t *cpu)
6188 {
6189 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6190 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
6191 }
6192 
6193 uint_t
6194 cpuid_getfamily(cpu_t *cpu)
6195 {
6196 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6197 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
6198 }
6199 
6200 uint_t
6201 cpuid_getmodel(cpu_t *cpu)
6202 {
6203 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6204 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
6205 }
6206 
6207 uint_t
6208 cpuid_get_ncpu_per_chip(cpu_t *cpu)
6209 {
6210 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6211 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
6212 }
6213 
6214 uint_t
6215 cpuid_get_ncore_per_chip(cpu_t *cpu)
6216 {
6217 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6218 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
6219 }
6220 
6221 uint_t
6222 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
6223 {
6224 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6225 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
6226 }
6227 
6228 id_t
6229 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
6230 {
6231 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6232 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6233 }
6234 
6235 uint_t
6236 cpuid_getstep(cpu_t *cpu)
6237 {
6238 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6239 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
6240 }
6241 
6242 uint_t
6243 cpuid_getsig(struct cpu *cpu)
6244 {
6245 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6246 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
6247 }
6248 
6249 uint32_t
6250 cpuid_getchiprev(struct cpu *cpu)
6251 {
6252 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6253 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
6254 }
6255 
6256 const char *
6257 cpuid_getchiprevstr(struct cpu *cpu)
6258 {
6259 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6260 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
6261 }
6262 
6263 uint32_t
6264 cpuid_getsockettype(struct cpu *cpu)
6265 {
6266 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6267 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
6268 }
6269 
6270 const char *
6271 cpuid_getsocketstr(cpu_t *cpu)
6272 {
6273 	static const char *socketstr = NULL;
6274 	struct cpuid_info *cpi;
6275 
6276 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6277 	cpi = cpu->cpu_m.mcpu_cpi;
6278 
6279 	/* Assume that socket types are the same across the system */
6280 	if (socketstr == NULL)
6281 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
6282 		    cpi->cpi_model, cpi->cpi_step);
6283 
6284 
6285 	return (socketstr);
6286 }
6287 
6288 x86_uarchrev_t
6289 cpuid_getuarchrev(cpu_t *cpu)
6290 {
6291 	return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
6292 }
6293 
6294 int
6295 cpuid_get_chipid(cpu_t *cpu)
6296 {
6297 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6298 
6299 	if (cpuid_is_cmt(cpu))
6300 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6301 	return (cpu->cpu_id);
6302 }
6303 
6304 id_t
6305 cpuid_get_coreid(cpu_t *cpu)
6306 {
6307 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6308 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6309 }
6310 
6311 int
6312 cpuid_get_pkgcoreid(cpu_t *cpu)
6313 {
6314 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6315 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6316 }
6317 
6318 int
6319 cpuid_get_clogid(cpu_t *cpu)
6320 {
6321 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6322 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6323 }
6324 
6325 int
6326 cpuid_get_cacheid(cpu_t *cpu)
6327 {
6328 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6329 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6330 }
6331 
6332 uint_t
6333 cpuid_get_procnodeid(cpu_t *cpu)
6334 {
6335 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6336 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6337 }
6338 
6339 uint_t
6340 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6341 {
6342 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6343 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6344 }
6345 
6346 uint_t
6347 cpuid_get_compunitid(cpu_t *cpu)
6348 {
6349 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6350 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6351 }
6352 
6353 uint_t
6354 cpuid_get_cores_per_compunit(cpu_t *cpu)
6355 {
6356 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6357 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6358 }
6359 
6360 uint32_t
6361 cpuid_get_apicid(cpu_t *cpu)
6362 {
6363 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6364 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6365 		return (UINT32_MAX);
6366 	} else {
6367 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6368 	}
6369 }
6370 
6371 void
6372 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6373 {
6374 	struct cpuid_info *cpi;
6375 
6376 	if (cpu == NULL)
6377 		cpu = CPU;
6378 	cpi = cpu->cpu_m.mcpu_cpi;
6379 
6380 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6381 
6382 	if (pabits)
6383 		*pabits = cpi->cpi_pabits;
6384 	if (vabits)
6385 		*vabits = cpi->cpi_vabits;
6386 }
6387 
6388 size_t
6389 cpuid_get_xsave_size(void)
6390 {
6391 	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6392 	    sizeof (struct xsave_state)));
6393 }
6394 
6395 /*
6396  * Export information about known offsets to the kernel. We only care about
6397  * things we have actually enabled support for in %xcr0.
6398  */
6399 void
6400 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6401 {
6402 	size_t size, off;
6403 
6404 	VERIFY3U(bit & xsave_bv_all, !=, 0);
6405 
6406 	if (sizep == NULL)
6407 		sizep = &size;
6408 	if (offp == NULL)
6409 		offp = &off;
6410 
6411 	switch (bit) {
6412 	case XFEATURE_LEGACY_FP:
6413 	case XFEATURE_SSE:
6414 		*sizep = sizeof (struct fxsave_state);
6415 		*offp = 0;
6416 		break;
6417 	case XFEATURE_AVX:
6418 		*sizep = cpuid_info0.cpi_xsave.ymm_size;
6419 		*offp = cpuid_info0.cpi_xsave.ymm_offset;
6420 		break;
6421 	case XFEATURE_AVX512_OPMASK:
6422 		*sizep = cpuid_info0.cpi_xsave.opmask_size;
6423 		*offp = cpuid_info0.cpi_xsave.opmask_offset;
6424 		break;
6425 	case XFEATURE_AVX512_ZMM:
6426 		*sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6427 		*offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6428 		break;
6429 	case XFEATURE_AVX512_HI_ZMM:
6430 		*sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6431 		*offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6432 		break;
6433 	default:
6434 		panic("asked for unsupported xsave feature: 0x%lx", bit);
6435 	}
6436 }
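/*
 * Sketch of the intended use (hypothetical consumer, for illustration only):
 * assuming AVX state is enabled in %xcr0 (per the VERIFY above),
 *
 *	size_t size, off;
 *	cpuid_get_xsave_info(XFEATURE_AVX, &size, &off);
 *
 * yields the size and offset of the ymm save area within an xsave image, as
 * reported by the hardware and cached in cpuid_info0.
 */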
6437 
6438 /*
6439  * Return true if the CPUs on this system require 'pointer clearing' for the
6440  * floating point error pointer exception handling. In the past, this has been
6441  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6442  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6443  * feature bit and is reflected in the cpi_fp_amd_save member.
6444  */
6445 boolean_t
6446 cpuid_need_fp_excp_handling(void)
6447 {
6448 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6449 	    cpuid_info0.cpi_fp_amd_save != 0);
6450 }
6451 
6452 /*
6453  * Returns the number of data TLB entries for a corresponding
6454  * pagesize.  If it can't be computed, or isn't known, the
6455  * routine returns zero.  If you ask about an architecturally
6456  * impossible pagesize, the routine will panic (so that the
6457  * hat implementor knows that things are inconsistent.)
6458  */
6459 uint_t
6460 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6461 {
6462 	struct cpuid_info *cpi;
6463 	uint_t dtlb_nent = 0;
6464 
6465 	if (cpu == NULL)
6466 		cpu = CPU;
6467 	cpi = cpu->cpu_m.mcpu_cpi;
6468 
6469 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6470 
6471 	/*
6472 	 * Check the L2 TLB info
6473 	 */
6474 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6475 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6476 
6477 		switch (pagesize) {
6478 
6479 		case 4 * 1024:
6480 			/*
6481 			 * All zero in the top 16 bits of the register
6482 			 * indicates a unified TLB. Size is in low 16 bits.
6483 			 */
6484 			if ((cp->cp_ebx & 0xffff0000) == 0)
6485 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6486 			else
6487 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6488 			break;
6489 
6490 		case 2 * 1024 * 1024:
6491 			if ((cp->cp_eax & 0xffff0000) == 0)
6492 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6493 			else
6494 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6495 			break;
6496 
6497 		default:
6498 			panic("unknown L2 pagesize");
6499 			/*NOTREACHED*/
6500 		}
6501 	}
6502 
6503 	if (dtlb_nent != 0)
6504 		return (dtlb_nent);
6505 
6506 	/*
6507 	 * No L2 TLB support for this size, try L1.
6508 	 */
6509 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6510 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6511 
6512 		switch (pagesize) {
6513 		case 4 * 1024:
6514 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6515 			break;
6516 		case 2 * 1024 * 1024:
6517 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6518 			break;
6519 		default:
6520 			panic("unknown L1 d-TLB pagesize");
6521 			/*NOTREACHED*/
6522 		}
6523 	}
6524 
6525 	return (dtlb_nent);
6526 }
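/*
 * Worked example of the decode above (hypothetical register value): if leaf
 * 0x80000006 %ebx were 0x42004200, the top 16 bits are non-zero, so the
 * 4K-page case takes BITX(cp_ebx, 27, 16) == 0x200, i.e. a 512-entry L2
 * d-TLB, rather than treating the low 16 bits as a unified TLB size.
 */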
6527 
6528 /*
6529  * Return 0 if the erratum is not present or not applicable, positive
6530  * if it is, and negative if the status of the erratum is unknown.
6531  *
6532  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6533  * Processors" #25759, Rev 3.57, August 2005
6534  */
6535 int
6536 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6537 {
6538 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6539 	uint_t eax;
6540 
6541 	/*
6542 	 * Bail out if this CPU isn't an AMD CPU, or if it's
6543 	 * a legacy (32-bit) AMD CPU.
6544 	 */
6545 	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6546 	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6547 	    cpi->cpi_family == 6) {
6548 		return (0);
6549 	}
6550 
6551 	eax = cpi->cpi_std[1].cp_eax;
6552 
6553 #define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
6554 #define	SH_B3(eax)	(eax == 0xf51)
6555 #define	B(eax)		(SH_B0(eax) || SH_B3(eax))
6556 
6557 #define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
6558 
6559 #define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6560 #define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6561 #define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
6562 #define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6563 
6564 #define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6565 #define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
6566 #define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
6567 #define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6568 
6569 #define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6570 #define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
6571 #define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
6572 #define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
6573 #define	BH_E4(eax)	(eax == 0x20fb1)
6574 #define	SH_E5(eax)	(eax == 0x20f42)
6575 #define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
6576 #define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
6577 #define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6578 			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6579 			    DH_E6(eax) || JH_E6(eax))
6580 
6581 #define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6582 #define	DR_B0(eax)	(eax == 0x100f20)
6583 #define	DR_B1(eax)	(eax == 0x100f21)
6584 #define	DR_BA(eax)	(eax == 0x100f2a)
6585 #define	DR_B2(eax)	(eax == 0x100f22)
6586 #define	DR_B3(eax)	(eax == 0x100f23)
6587 #define	RB_C0(eax)	(eax == 0x100f40)
6588 
6589 	switch (erratum) {
6590 	case 1:
6591 		return (cpi->cpi_family < 0x10);
6592 	case 51:	/* what does the asterisk mean? */
6593 		return (B(eax) || SH_C0(eax) || CG(eax));
6594 	case 52:
6595 		return (B(eax));
6596 	case 57:
6597 		return (cpi->cpi_family <= 0x11);
6598 	case 58:
6599 		return (B(eax));
6600 	case 60:
6601 		return (cpi->cpi_family <= 0x11);
6602 	case 61:
6603 	case 62:
6604 	case 63:
6605 	case 64:
6606 	case 65:
6607 	case 66:
6608 	case 68:
6609 	case 69:
6610 	case 70:
6611 	case 71:
6612 		return (B(eax));
6613 	case 72:
6614 		return (SH_B0(eax));
6615 	case 74:
6616 		return (B(eax));
6617 	case 75:
6618 		return (cpi->cpi_family < 0x10);
6619 	case 76:
6620 		return (B(eax));
6621 	case 77:
6622 		return (cpi->cpi_family <= 0x11);
6623 	case 78:
6624 		return (B(eax) || SH_C0(eax));
6625 	case 79:
6626 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6627 	case 80:
6628 	case 81:
6629 	case 82:
6630 		return (B(eax));
6631 	case 83:
6632 		return (B(eax) || SH_C0(eax) || CG(eax));
6633 	case 85:
6634 		return (cpi->cpi_family < 0x10);
6635 	case 86:
6636 		return (SH_C0(eax) || CG(eax));
6637 	case 88:
6638 		return (B(eax) || SH_C0(eax));
6639 	case 89:
6640 		return (cpi->cpi_family < 0x10);
6641 	case 90:
6642 		return (B(eax) || SH_C0(eax) || CG(eax));
6643 	case 91:
6644 	case 92:
6645 		return (B(eax) || SH_C0(eax));
6646 	case 93:
6647 		return (SH_C0(eax));
6648 	case 94:
6649 		return (B(eax) || SH_C0(eax) || CG(eax));
6650 	case 95:
6651 		return (B(eax) || SH_C0(eax));
6652 	case 96:
6653 		return (B(eax) || SH_C0(eax) || CG(eax));
6654 	case 97:
6655 	case 98:
6656 		return (SH_C0(eax) || CG(eax));
6657 	case 99:
6658 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6659 	case 100:
6660 		return (B(eax) || SH_C0(eax));
6661 	case 101:
6662 	case 103:
6663 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6664 	case 104:
6665 		return (SH_C0(eax) || CG(eax) || D0(eax));
6666 	case 105:
6667 	case 106:
6668 	case 107:
6669 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6670 	case 108:
6671 		return (DH_CG(eax));
6672 	case 109:
6673 		return (SH_C0(eax) || CG(eax) || D0(eax));
6674 	case 110:
6675 		return (D0(eax) || EX(eax));
6676 	case 111:
6677 		return (CG(eax));
6678 	case 112:
6679 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6680 	case 113:
6681 		return (eax == 0x20fc0);
6682 	case 114:
6683 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6684 	case 115:
6685 		return (SH_E0(eax) || JH_E1(eax));
6686 	case 116:
6687 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6688 	case 117:
6689 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6690 	case 118:
6691 		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6692 		    JH_E6(eax));
6693 	case 121:
6694 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6695 	case 122:
6696 		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6697 	case 123:
6698 		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6699 	case 131:
6700 		return (cpi->cpi_family < 0x10);
6701 	case 6336786:
6702 
6703 		/*
6704 		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6705 		 * if this is a K8 family or newer processor. We're testing for
6706 		 * this 'erratum' to determine whether or not we have a constant
6707 		 * TSC.
6708 		 *
6709 		 * Our current fix for this is to disable the C1-Clock ramping.
6710 		 * However, this doesn't work on newer processor families nor
6711 		 * does it work when virtualized as those devices don't exist.
6712 		 */
6713 		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6714 			return (0);
6715 		}
6716 
6717 		if (CPI_FAMILY(cpi) == 0xf) {
6718 			struct cpuid_regs regs;
6719 			regs.cp_eax = 0x80000007;
6720 			(void) __cpuid_insn(&regs);
6721 			return (!(regs.cp_edx & 0x100));
6722 		}
6723 		return (0);
6724 	case 147:
6725 		/*
6726 		 * This erratum (K8 #147) is not present on family 10 and newer.
6727 		 */
6728 		if (cpi->cpi_family >= 0x10) {
6729 			return (0);
6730 		}
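		/*
		 * The expression below reassembles the display family (in
		 * the high byte) and display model (in the low byte) from
		 * the leaf-1 %eax signature and compares against 0xf40;
		 * i.e. the erratum is reported for K8 parts prior to
		 * family 0xf, model 0x40.
		 */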
6731 		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6732 		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6733 
6734 	case 6671130:
6735 		/*
6736 		 * Check for processors (pre-Shanghai) that do not provide
6737 		 * optimal management of 1GB PTEs in their TLB.
6738 		 */
6739 		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6740 
6741 	case 298:
6742 		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6743 		    DR_B2(eax) || RB_C0(eax));
6744 
6745 	case 721:
6746 		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6747 
6748 	default:
6749 		return (-1);
6750 
6751 	}
6752 }
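/*
 * Illustrative caller pattern (not taken from any specific consumer): code
 * that needs a workaround would typically test
 *
 *	if (cpuid_opteron_erratum(CPU, 95) > 0)
 *		apply_the_workaround();
 *
 * where apply_the_workaround() is a hypothetical stand-in. A negative return
 * means the status is unknown; for OSVW-visible errata such as 298, callers
 * can also consult osvw_opteron_erratum() below.
 */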
6753 
6754 /*
6755  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6756  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6757  */
6758 int
6759 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6760 {
6761 	struct cpuid_info	*cpi;
6762 	uint_t			osvwid;
6763 	static int		osvwfeature = -1;
6764 	uint64_t		osvwlength;
6765 
6766 
6767 	cpi = cpu->cpu_m.mcpu_cpi;
6768 
6769 	/* confirm OSVW supported */
6770 	if (osvwfeature == -1) {
6771 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6772 	} else {
6773 		/* assert that osvw feature setting is consistent on all cpus */
6774 		ASSERT(osvwfeature ==
6775 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6776 	}
6777 	if (!osvwfeature)
6778 		return (-1);
6779 
6780 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6781 
6782 	switch (erratum) {
6783 	case 298:	/* osvwid is 0 */
6784 		osvwid = 0;
6785 		if (osvwlength <= (uint64_t)osvwid) {
6786 			/* osvwid 0 is unknown */
6787 			return (-1);
6788 		}
6789 
6790 		/*
6791 		 * Check the OSVW STATUS MSR to determine the state
6792 		 * of the erratum where:
6793 		 *   0 - fixed by HW
6794 		 *   1 - BIOS has applied the workaround when BIOS
6795 		 *   workaround is available. (Or for other errata,
6796 		 *   OS workaround is required.)
6797 		 * For a value of 1, caller will confirm that the
6798 		 * erratum 298 workaround has indeed been applied by BIOS.
6799 		 *
6800 		 * A 1 may be set in cpus that have a HW fix
6801 		 * in a mixed cpu system. Regarding erratum 298:
6802 		 *   In a multiprocessor platform, the workaround above
6803 		 *   should be applied to all processors regardless of
6804 		 *   silicon revision when an affected processor is
6805 		 *   present.
6806 		 */
6807 
6808 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6809 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6810 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6811 
6812 	default:
6813 		return (-1);
6814 	}
6815 }
6816 
6817 static const char assoc_str[] = "associativity";
6818 static const char line_str[] = "line-size";
6819 static const char size_str[] = "size";
6820 
6821 static void
6822 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6823     uint32_t val)
6824 {
6825 	char buf[128];
6826 
6827 	/*
6828 	 * ndi_prop_update_int() is used because it is desirable for
6829 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6830 	 */
6831 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6832 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6833 }
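/*
 * For example, a call with label "l2-cache" and type "size" creates an
 * integer property named "l2-cache-size" on the cpu devinfo node.
 */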
6834 
6835 /*
6836  * Intel-style cache/tlb description
6837  *
6838  * Standard cpuid level 2 gives a randomly ordered
6839  * selection of tags that index into a table that describes
6840  * cache and tlb properties.
6841  */
6842 
6843 static const char l1_icache_str[] = "l1-icache";
6844 static const char l1_dcache_str[] = "l1-dcache";
6845 static const char l2_cache_str[] = "l2-cache";
6846 static const char l3_cache_str[] = "l3-cache";
6847 static const char itlb4k_str[] = "itlb-4K";
6848 static const char dtlb4k_str[] = "dtlb-4K";
6849 static const char itlb2M_str[] = "itlb-2M";
6850 static const char itlb4M_str[] = "itlb-4M";
6851 static const char dtlb4M_str[] = "dtlb-4M";
6852 static const char dtlb24_str[] = "dtlb0-2M-4M";
6853 static const char itlb424_str[] = "itlb-4K-2M-4M";
6854 static const char itlb24_str[] = "itlb-2M-4M";
6855 static const char dtlb44_str[] = "dtlb-4K-4M";
6856 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6857 static const char sl2_cache_str[] = "sectored-l2-cache";
6858 static const char itrace_str[] = "itrace-cache";
6859 static const char sl3_cache_str[] = "sectored-l3-cache";
6860 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6861 
6862 static const struct cachetab {
6863 	uint8_t		ct_code;
6864 	uint8_t		ct_assoc;
6865 	uint16_t	ct_line_size;
6866 	size_t		ct_size;
6867 	const char	*ct_label;
6868 } intel_ctab[] = {
6869 	/*
6870 	 * maintain descending order!
6871 	 *
6872 	 * Codes ignored - Reason
6873 	 * ----------------------
6874 	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6875 	 * f0H/f1H - Currently we do not interpret prefetch size by design
6876 	 */
6877 	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6878 	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6879 	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6880 	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6881 	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6882 	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6883 	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6884 	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6885 	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6886 	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6887 	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6888 	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6889 	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6890 	{ 0xc0, 4, 0, 8, dtlb44_str },
6891 	{ 0xba, 4, 0, 64, dtlb4k_str },
6892 	{ 0xb4, 4, 0, 256, dtlb4k_str },
6893 	{ 0xb3, 4, 0, 128, dtlb4k_str },
6894 	{ 0xb2, 4, 0, 64, itlb4k_str },
6895 	{ 0xb0, 4, 0, 128, itlb4k_str },
6896 	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6897 	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6898 	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6899 	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6900 	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6901 	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6902 	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6903 	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6904 	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6905 	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6906 	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6907 	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6908 	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6909 	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6910 	{ 0x73, 8, 0, 64*1024, itrace_str},
6911 	{ 0x72, 8, 0, 32*1024, itrace_str},
6912 	{ 0x71, 8, 0, 16*1024, itrace_str},
6913 	{ 0x70, 8, 0, 12*1024, itrace_str},
6914 	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6915 	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6916 	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6917 	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6918 	{ 0x5d, 0, 0, 256, dtlb44_str},
6919 	{ 0x5c, 0, 0, 128, dtlb44_str},
6920 	{ 0x5b, 0, 0, 64, dtlb44_str},
6921 	{ 0x5a, 4, 0, 32, dtlb24_str},
6922 	{ 0x59, 0, 0, 16, dtlb4k_str},
6923 	{ 0x57, 4, 0, 16, dtlb4k_str},
6924 	{ 0x56, 4, 0, 16, dtlb4M_str},
6925 	{ 0x55, 0, 0, 7, itlb24_str},
6926 	{ 0x52, 0, 0, 256, itlb424_str},
6927 	{ 0x51, 0, 0, 128, itlb424_str},
6928 	{ 0x50, 0, 0, 64, itlb424_str},
6929 	{ 0x4f, 0, 0, 32, itlb4k_str},
6930 	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6931 	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6932 	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6933 	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6934 	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6935 	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6936 	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6937 	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6938 	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6939 	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6940 	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
6941 	{ 0x43, 4, 32, 512*1024, l2_cache_str},
6942 	{ 0x42, 4, 32, 256*1024, l2_cache_str},
6943 	{ 0x41, 4, 32, 128*1024, l2_cache_str},
6944 	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
6945 	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
6946 	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
6947 	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
6948 	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
6949 	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
6950 	{ 0x30, 8, 64, 32*1024, l1_icache_str},
6951 	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
6952 	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
6953 	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
6954 	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
6955 	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
6956 	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
6957 	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
6958 	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
6959 	{ 0x0b, 4, 0, 4, itlb4M_str},
6960 	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
6961 	{ 0x08, 4, 32, 16*1024, l1_icache_str},
6962 	{ 0x06, 4, 32, 8*1024, l1_icache_str},
6963 	{ 0x05, 4, 0, 32, dtlb4M_str},
6964 	{ 0x04, 4, 0, 8, dtlb4M_str},
6965 	{ 0x03, 4, 0, 64, dtlb4k_str},
6966 	{ 0x02, 4, 0, 2, itlb4M_str},
6967 	{ 0x01, 4, 0, 32, itlb4k_str},
6968 	{ 0 }
6969 };
6970 
6971 static const struct cachetab cyrix_ctab[] = {
6972 	{ 0x70, 4, 0, 32, "tlb-4K" },
6973 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6974 	{ 0 }
6975 };
6976 
6977 /*
6978  * Search a cache table for a matching entry
6979  */
6980 static const struct cachetab *
6981 find_cacheent(const struct cachetab *ct, uint_t code)
6982 {
6983 	if (code != 0) {
6984 		for (; ct->ct_code != 0; ct++)
6985 			if (ct->ct_code <= code)
6986 				break;
6987 		if (ct->ct_code == code)
6988 			return (ct);
6989 	}
6990 	return (NULL);
6991 }
6992 
6993 /*
6994  * Populate cachetab entry with L2 or L3 cache-information using
6995  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6996  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6997  * information is found.
6998  */
6999 static int
7000 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
7001 {
7002 	uint32_t level, i;
7003 	int ret = 0;
7004 
7005 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
7006 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
7007 
7008 		if (level == 2 || level == 3) {
7009 			ct->ct_assoc =
7010 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
7011 			ct->ct_line_size =
7012 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
7013 			ct->ct_size = ct->ct_assoc *
7014 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
7015 			    ct->ct_line_size *
7016 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
7017 
7018 			if (level == 2) {
7019 				ct->ct_label = l2_cache_str;
7020 			} else if (level == 3) {
7021 				ct->ct_label = l3_cache_str;
7022 			}
7023 			ret = 1;
7024 		}
7025 	}
7026 
7027 	return (ret);
7028 }
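/*
 * The size computation above is the usual deterministic cache parameters
 * formula: ways * partitions * line size * sets. As a purely illustrative
 * example, 16 ways, 1 partition, 64-byte lines and 8192 sets describe an
 * 8MB cache (16 * 1 * 64 * 8192).
 */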
7029 
7030 /*
7031  * Walk the cacheinfo descriptor, applying 'func' to every valid element.
7032  * The walk is terminated if the walker returns non-zero.
7033  */
7034 static void
7035 intel_walk_cacheinfo(struct cpuid_info *cpi,
7036     void *arg, int (*func)(void *, const struct cachetab *))
7037 {
7038 	const struct cachetab *ct;
7039 	struct cachetab des_49_ct, des_b1_ct;
7040 	uint8_t *dp;
7041 	int i;
7042 
7043 	if ((dp = cpi->cpi_cacheinfo) == NULL)
7044 		return;
7045 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
7046 		/*
7047 		 * For overloaded descriptor 0x49 we use cpuid function 4
7048 		 * if supported by the current processor, to create
7049 		 * cache information.
7050 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
7051 		 * to disambiguate the cache information.
7052 		 */
7053 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
7054 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
7055 				ct = &des_49_ct;
7056 		} else if (*dp == 0xb1) {
7057 			des_b1_ct.ct_code = 0xb1;
7058 			des_b1_ct.ct_assoc = 4;
7059 			des_b1_ct.ct_line_size = 0;
7060 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
7061 				des_b1_ct.ct_size = 8;
7062 				des_b1_ct.ct_label = itlb2M_str;
7063 			} else {
7064 				des_b1_ct.ct_size = 4;
7065 				des_b1_ct.ct_label = itlb4M_str;
7066 			}
7067 			ct = &des_b1_ct;
7068 		} else {
7069 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
7070 				continue;
7071 			}
7072 		}
7073 
7074 		if (func(arg, ct) != 0) {
7075 			break;
7076 		}
7077 	}
7078 }
7079 
7080 /*
7081  * (Like the Intel one, except for Cyrix CPUs)
7082  */
7083 static void
7084 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
7085     void *arg, int (*func)(void *, const struct cachetab *))
7086 {
7087 	const struct cachetab *ct;
7088 	uint8_t *dp;
7089 	int i;
7090 
7091 	if ((dp = cpi->cpi_cacheinfo) == NULL)
7092 		return;
7093 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
7094 		/*
7095 		 * Search Cyrix-specific descriptor table first ..
7096 		 */
7097 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
7098 			if (func(arg, ct) != 0)
7099 				break;
7100 			continue;
7101 		}
7102 		/*
7103 		 * .. else fall back to the Intel one
7104 		 */
7105 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
7106 			if (func(arg, ct) != 0)
7107 				break;
7108 			continue;
7109 		}
7110 	}
7111 }
7112 
7113 /*
7114  * A cacheinfo walker that adds associativity, line-size, and size properties
7115  * to the devinfo node it is passed as an argument.
7116  */
7117 static int
7118 add_cacheent_props(void *arg, const struct cachetab *ct)
7119 {
7120 	dev_info_t *devi = arg;
7121 
7122 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
7123 	if (ct->ct_line_size != 0)
7124 		add_cache_prop(devi, ct->ct_label, line_str,
7125 		    ct->ct_line_size);
7126 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
7127 	return (0);
7128 }
7129 
7130 
7131 static const char fully_assoc[] = "fully-associative?";
7132 
7133 /*
7134  * AMD style cache/tlb description
7135  *
7136  * Extended functions 5 and 6 directly describe properties of
7137  * tlbs and various cache levels.
7138  */
7139 static void
7140 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7141 {
7142 	switch (assoc) {
7143 	case 0:	/* reserved; ignore */
7144 		break;
7145 	default:
7146 		add_cache_prop(devi, label, assoc_str, assoc);
7147 		break;
7148 	case 0xff:
7149 		add_cache_prop(devi, label, fully_assoc, 1);
7150 		break;
7151 	}
7152 }
7153 
7154 static void
7155 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7156 {
7157 	if (size == 0)
7158 		return;
7159 	add_cache_prop(devi, label, size_str, size);
7160 	add_amd_assoc(devi, label, assoc);
7161 }
7162 
7163 static void
7164 add_amd_cache(dev_info_t *devi, const char *label,
7165     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7166 {
7167 	if (size == 0 || line_size == 0)
7168 		return;
7169 	add_amd_assoc(devi, label, assoc);
7170 	/*
7171 	 * Most AMD parts have a sectored cache. Multiple cache lines are
7172 	 * associated with each tag. A sector consists of all cache lines
7173 	 * associated with a tag. For example, the AMD K6-III has a sector
7174 	 * size of 2 cache lines per tag.
7175 	 */
7176 	if (lines_per_tag != 0)
7177 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7178 	add_cache_prop(devi, label, line_str, line_size);
7179 	add_cache_prop(devi, label, size_str, size * 1024);
7180 }
7181 
7182 static void
7183 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
7184 {
7185 	switch (assoc) {
7186 	case 0:	/* off */
7187 		break;
7188 	case 1:
7189 	case 2:
7190 	case 4:
7191 		add_cache_prop(devi, label, assoc_str, assoc);
7192 		break;
7193 	case 6:
7194 		add_cache_prop(devi, label, assoc_str, 8);
7195 		break;
7196 	case 8:
7197 		add_cache_prop(devi, label, assoc_str, 16);
7198 		break;
7199 	case 0xf:
7200 		add_cache_prop(devi, label, fully_assoc, 1);
7201 		break;
7202 	default: /* reserved; ignore */
7203 		break;
7204 	}
7205 }
7206 
7207 static void
7208 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7209 {
7210 	if (size == 0 || assoc == 0)
7211 		return;
7212 	add_amd_l2_assoc(devi, label, assoc);
7213 	add_cache_prop(devi, label, size_str, size);
7214 }
7215 
7216 static void
7217 add_amd_l2_cache(dev_info_t *devi, const char *label,
7218     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7219 {
7220 	if (size == 0 || assoc == 0 || line_size == 0)
7221 		return;
7222 	add_amd_l2_assoc(devi, label, assoc);
7223 	if (lines_per_tag != 0)
7224 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7225 	add_cache_prop(devi, label, line_str, line_size);
7226 	add_cache_prop(devi, label, size_str, size * 1024);
7227 }
7228 
7229 static void
7230 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
7231 {
7232 	struct cpuid_regs *cp;
7233 
7234 	if (cpi->cpi_xmaxeax < 0x80000005)
7235 		return;
7236 	cp = &cpi->cpi_extd[5];
7237 
7238 	/*
7239 	 * 4M/2M L1 TLB configuration
7240 	 *
7241 	 * We report the size for 2M pages because AMD uses two
7242 	 * TLB entries for one 4M page.
7243 	 */
7244 	add_amd_tlb(devi, "dtlb-2M",
7245 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
7246 	add_amd_tlb(devi, "itlb-2M",
7247 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
7248 
7249 	/*
7250 	 * 4K L1 TLB configuration
7251 	 */
7252 
7253 	switch (cpi->cpi_vendor) {
7254 		uint_t nentries;
7255 	case X86_VENDOR_TM:
7256 		if (cpi->cpi_family >= 5) {
7257 			/*
7258 			 * Crusoe processors have 256 TLB entries, but
7259 			 * cpuid data format constrains them to only
7260 			 * reporting 255 of them.
7261 			 */
7262 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
7263 				nentries = 256;
7264 			/*
7265 			 * Crusoe processors also have a unified TLB
7266 			 */
7267 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
7268 			    nentries);
7269 			break;
7270 		}
7271 		/*FALLTHROUGH*/
7272 	default:
7273 		add_amd_tlb(devi, itlb4k_str,
7274 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
7275 		add_amd_tlb(devi, dtlb4k_str,
7276 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
7277 		break;
7278 	}
7279 
7280 	/*
7281 	 * data L1 cache configuration
7282 	 */
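	/*
	 * As consumed by add_amd_cache() below: %ecx bits 31:24 give the L1
	 * data cache size in KB, 23:16 the associativity, 15:8 the lines per
	 * tag and 7:0 the line size in bytes (mirroring the argument order
	 * of the call).
	 */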
7283 
7284 	add_amd_cache(devi, l1_dcache_str,
7285 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
7286 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
7287 
7288 	/*
7289 	 * code L1 cache configuration
7290 	 */
7291 
7292 	add_amd_cache(devi, l1_icache_str,
7293 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
7294 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
7295 
7296 	if (cpi->cpi_xmaxeax < 0x80000006)
7297 		return;
7298 	cp = &cpi->cpi_extd[6];
7299 
7300 	/* Check for a unified L2 TLB for large pages */
7301 
7302 	if (BITX(cp->cp_eax, 31, 16) == 0)
7303 		add_amd_l2_tlb(devi, "l2-tlb-2M",
7304 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7305 	else {
7306 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
7307 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7308 		add_amd_l2_tlb(devi, "l2-itlb-2M",
7309 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7310 	}
7311 
7312 	/* Check for a unified L2 TLB for 4K pages */
7313 
7314 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
7315 		add_amd_l2_tlb(devi, "l2-tlb-4K",
7316 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7317 	} else {
7318 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
7319 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7320 		add_amd_l2_tlb(devi, "l2-itlb-4K",
7321 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7322 	}
7323 
7324 	add_amd_l2_cache(devi, l2_cache_str,
7325 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7326 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7327 }
7328 
7329 /*
7330  * There are two basic ways that the x86 world describes its cache
7331  * and tlb architecture - Intel's way and AMD's way.
7332  *
7333  * Return which flavor of cache architecture we should use
7334  */
7335 static int
7336 x86_which_cacheinfo(struct cpuid_info *cpi)
7337 {
7338 	switch (cpi->cpi_vendor) {
7339 	case X86_VENDOR_Intel:
7340 		if (cpi->cpi_maxeax >= 2)
7341 			return (X86_VENDOR_Intel);
7342 		break;
7343 	case X86_VENDOR_AMD:
7344 		/*
7345 		 * The K5 model 1 was the first part from AMD that reported
7346 		 * cache sizes via extended cpuid functions.
7347 		 */
7348 		if (cpi->cpi_family > 5 ||
7349 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7350 			return (X86_VENDOR_AMD);
7351 		break;
7352 	case X86_VENDOR_HYGON:
7353 		return (X86_VENDOR_AMD);
7354 	case X86_VENDOR_TM:
7355 		if (cpi->cpi_family >= 5)
7356 			return (X86_VENDOR_AMD);
7357 		/*FALLTHROUGH*/
7358 	default:
7359 		/*
7360 		 * If they have extended CPU data for 0x80000005
7361 		 * then we assume they have AMD-format cache
7362 		 * information.
7363 		 *
7364 		 * If not, and the vendor happens to be Cyrix,
7365 		 * then try our Cyrix-specific handler.
7366 		 *
7367 		 * If we're not Cyrix, then assume we're using Intel's
7368 		 * table-driven format instead.
7369 		 */
7370 		if (cpi->cpi_xmaxeax >= 0x80000005)
7371 			return (X86_VENDOR_AMD);
7372 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7373 			return (X86_VENDOR_Cyrix);
7374 		else if (cpi->cpi_maxeax >= 2)
7375 			return (X86_VENDOR_Intel);
7376 		break;
7377 	}
7378 	return (-1);
7379 }
7380 
7381 void
7382 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7383     struct cpuid_info *cpi)
7384 {
7385 	dev_info_t *cpu_devi;
7386 	int create;
7387 
7388 	cpu_devi = (dev_info_t *)dip;
7389 
7390 	/* device_type */
7391 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7392 	    "device_type", "cpu");
7393 
7394 	/* reg */
7395 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7396 	    "reg", cpu_id);
7397 
7398 	/* cpu-mhz, and clock-frequency */
7399 	if (cpu_freq > 0) {
7400 		long long mul;
7401 
7402 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7403 		    "cpu-mhz", cpu_freq);
7404 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7405 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7406 			    "clock-frequency", (int)mul);
7407 	}
7408 
7409 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7410 
7411 	/* vendor-id */
7412 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7413 	    "vendor-id", cpi->cpi_vendorstr);
7414 
7415 	if (cpi->cpi_maxeax == 0) {
7416 		return;
7417 	}
7418 
7419 	/*
7420 	 * family, model, and step
7421 	 */
7422 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7423 	    "family", CPI_FAMILY(cpi));
7424 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7425 	    "cpu-model", CPI_MODEL(cpi));
7426 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7427 	    "stepping-id", CPI_STEP(cpi));
7428 
7429 	/* type */
7430 	switch (cpi->cpi_vendor) {
7431 	case X86_VENDOR_Intel:
7432 		create = 1;
7433 		break;
7434 	default:
7435 		create = 0;
7436 		break;
7437 	}
7438 	if (create)
7439 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7440 		    "type", CPI_TYPE(cpi));
7441 
7442 	/* ext-family */
7443 	switch (cpi->cpi_vendor) {
7444 	case X86_VENDOR_Intel:
7445 	case X86_VENDOR_AMD:
7446 		create = cpi->cpi_family >= 0xf;
7447 		break;
7448 	case X86_VENDOR_HYGON:
7449 		create = 1;
7450 		break;
7451 	default:
7452 		create = 0;
7453 		break;
7454 	}
7455 	if (create)
7456 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7457 		    "ext-family", CPI_FAMILY_XTD(cpi));
7458 
7459 	/* ext-model */
7460 	switch (cpi->cpi_vendor) {
7461 	case X86_VENDOR_Intel:
7462 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7463 		break;
7464 	case X86_VENDOR_AMD:
7465 		create = CPI_FAMILY(cpi) == 0xf;
7466 		break;
7467 	case X86_VENDOR_HYGON:
7468 		create = 1;
7469 		break;
7470 	default:
7471 		create = 0;
7472 		break;
7473 	}
7474 	if (create)
7475 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7476 		    "ext-model", CPI_MODEL_XTD(cpi));
7477 
7478 	/* generation */
7479 	switch (cpi->cpi_vendor) {
7480 	case X86_VENDOR_AMD:
7481 	case X86_VENDOR_HYGON:
7482 		/*
7483 		 * AMD K5 model 1 was the first part to support this
7484 		 */
7485 		create = cpi->cpi_xmaxeax >= 0x80000001;
7486 		break;
7487 	default:
7488 		create = 0;
7489 		break;
7490 	}
7491 	if (create)
7492 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7493 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7494 
7495 	/* brand-id */
7496 	switch (cpi->cpi_vendor) {
7497 	case X86_VENDOR_Intel:
7498 		/*
7499 		 * brand id first appeared on Pentium III Xeon model 8 and
7500 		 * Celeron model 8 processors, and on Opteron
7501 		 */
7502 		create = cpi->cpi_family > 6 ||
7503 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7504 		break;
7505 	case X86_VENDOR_AMD:
7506 		create = cpi->cpi_family >= 0xf;
7507 		break;
7508 	case X86_VENDOR_HYGON:
7509 		create = 1;
7510 		break;
7511 	default:
7512 		create = 0;
7513 		break;
7514 	}
7515 	if (create && cpi->cpi_brandid != 0) {
7516 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7517 		    "brand-id", cpi->cpi_brandid);
7518 	}
7519 
7520 	/* chunks, and apic-id */
7521 	switch (cpi->cpi_vendor) {
7522 		/*
7523 		 * first available on Pentium IV and Opteron (K8)
7524 		 */
7525 	case X86_VENDOR_Intel:
7526 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7527 		break;
7528 	case X86_VENDOR_AMD:
7529 		create = cpi->cpi_family >= 0xf;
7530 		break;
7531 	case X86_VENDOR_HYGON:
7532 		create = 1;
7533 		break;
7534 	default:
7535 		create = 0;
7536 		break;
7537 	}
7538 	if (create) {
7539 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7540 		    "chunks", CPI_CHUNKS(cpi));
7541 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7542 		    "apic-id", cpi->cpi_apicid);
7543 		if (cpi->cpi_chipid >= 0) {
7544 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7545 			    "chip#", cpi->cpi_chipid);
7546 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7547 			    "clog#", cpi->cpi_clogid);
7548 		}
7549 	}
7550 
7551 	/* cpuid-features */
7552 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7553 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7554 
7555 
7556 	/* cpuid-features-ecx */
7557 	switch (cpi->cpi_vendor) {
7558 	case X86_VENDOR_Intel:
7559 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7560 		break;
7561 	case X86_VENDOR_AMD:
7562 		create = cpi->cpi_family >= 0xf;
7563 		break;
7564 	case X86_VENDOR_HYGON:
7565 		create = 1;
7566 		break;
7567 	default:
7568 		create = 0;
7569 		break;
7570 	}
7571 	if (create)
7572 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7573 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7574 
7575 	/* ext-cpuid-features */
7576 	switch (cpi->cpi_vendor) {
7577 	case X86_VENDOR_Intel:
7578 	case X86_VENDOR_AMD:
7579 	case X86_VENDOR_HYGON:
7580 	case X86_VENDOR_Cyrix:
7581 	case X86_VENDOR_TM:
7582 	case X86_VENDOR_Centaur:
7583 		create = cpi->cpi_xmaxeax >= 0x80000001;
7584 		break;
7585 	default:
7586 		create = 0;
7587 		break;
7588 	}
7589 	if (create) {
7590 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7591 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7592 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7593 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7594 	}
7595 
7596 	/*
7597 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7598 	 * model 1, and Cyrix GXm.  On earlier models we try to
7599 	 * simulate something similar, so this string should always
7600 	 * say -something- about the processor, however lame.
7601 	 */
7602 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7603 	    "brand-string", cpi->cpi_brandstr);
7604 
7605 	/*
7606 	 * Finally, cache and tlb information
7607 	 */
7608 	switch (x86_which_cacheinfo(cpi)) {
7609 	case X86_VENDOR_Intel:
7610 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7611 		break;
7612 	case X86_VENDOR_Cyrix:
7613 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7614 		break;
7615 	case X86_VENDOR_AMD:
7616 		amd_cache_info(cpi, cpu_devi);
7617 		break;
7618 	default:
7619 		break;
7620 	}
7621 }
7622 
7623 struct l2info {
7624 	int *l2i_csz;
7625 	int *l2i_lsz;
7626 	int *l2i_assoc;
7627 	int l2i_ret;
7628 };
7629 
7630 /*
7631  * A cacheinfo walker that fetches the size, line-size and associativity
7632  * of the L2 cache
7633  */
7634 static int
7635 intel_l2cinfo(void *arg, const struct cachetab *ct)
7636 {
7637 	struct l2info *l2i = arg;
7638 	int *ip;
7639 
7640 	if (ct->ct_label != l2_cache_str &&
7641 	    ct->ct_label != sl2_cache_str)
7642 		return (0);	/* not an L2 -- keep walking */
7643 
7644 	if ((ip = l2i->l2i_csz) != NULL)
7645 		*ip = ct->ct_size;
7646 	if ((ip = l2i->l2i_lsz) != NULL)
7647 		*ip = ct->ct_line_size;
7648 	if ((ip = l2i->l2i_assoc) != NULL)
7649 		*ip = ct->ct_assoc;
7650 	l2i->l2i_ret = ct->ct_size;
7651 	return (1);		/* was an L2 -- terminate walk */
7652 }
7653 
7654 /*
7655  * AMD L2/L3 Cache and TLB Associativity Field Definition:
7656  *
7657  *	Unlike the associativity for the L1 cache and tlb where the 8 bit
7658  *	value is the associativity, the associativity for the L2 cache and
7659  *	tlb is encoded in the following table. The 4 bit L2 value serves as
7660  *	an index into the amd_afd[] array to determine the associativity.
7661  *	-1 is undefined. 0 is fully associative.
7662  */
7663 
7664 static int amd_afd[] =
7665 	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
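
/*
 * Illustrative decode (hypothetical register value) of cpuid extended leaf
 * 0x80000006 %ecx as read by amd_l2cacheinfo() below.  For ecx = 0x02006140:
 *
 *	line size	BITX(ecx,  7,  0) = 0x40	64 byte lines
 *	assoc index	BITX(ecx, 15, 12) = 0x6		amd_afd[6] = 8-way
 *	size		BITX(ecx, 31, 16) = 0x200	512 Kbyte L2
 */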
7666 
7667 static void
7668 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7669 {
7670 	struct cpuid_regs *cp;
7671 	uint_t size, assoc;
7672 	int i;
7673 	int *ip;
7674 
7675 	if (cpi->cpi_xmaxeax < 0x80000006)
7676 		return;
7677 	cp = &cpi->cpi_extd[6];
7678 
7679 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7680 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7681 		uint_t cachesz = size * 1024;
7682 		assoc = amd_afd[i];
7683 
7684 		ASSERT(assoc != -1);
7685 
7686 		if ((ip = l2i->l2i_csz) != NULL)
7687 			*ip = cachesz;
7688 		if ((ip = l2i->l2i_lsz) != NULL)
7689 			*ip = BITX(cp->cp_ecx, 7, 0);
7690 		if ((ip = l2i->l2i_assoc) != NULL)
7691 			*ip = assoc;
7692 		l2i->l2i_ret = cachesz;
7693 	}
7694 }
7695 
7696 int
7697 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7698 {
7699 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7700 	struct l2info __l2info, *l2i = &__l2info;
7701 
7702 	l2i->l2i_csz = csz;
7703 	l2i->l2i_lsz = lsz;
7704 	l2i->l2i_assoc = assoc;
7705 	l2i->l2i_ret = -1;
7706 
7707 	switch (x86_which_cacheinfo(cpi)) {
7708 	case X86_VENDOR_Intel:
7709 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7710 		break;
7711 	case X86_VENDOR_Cyrix:
7712 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7713 		break;
7714 	case X86_VENDOR_AMD:
7715 		amd_l2cacheinfo(cpi, l2i);
7716 		break;
7717 	default:
7718 		break;
7719 	}
7720 	return (l2i->l2i_ret);
7721 }
7722 
7723 #if !defined(__xpv)
7724 
7725 uint32_t *
7726 cpuid_mwait_alloc(cpu_t *cpu)
7727 {
7728 	uint32_t	*ret;
7729 	size_t		mwait_size;
7730 
7731 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7732 
7733 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7734 	if (mwait_size == 0)
7735 		return (NULL);
7736 
7737 	/*
7738 	 * kmem_alloc() returns cache-line-aligned data for mwait_size
7739 	 * allocations.  mwait_size is currently cache line sized.  Neither
7740 	 * of these implementation details is guaranteed to be true in the
7741 	 * future.
7742 	 *
7743 	 * First try allocating mwait_size as kmem_alloc() currently returns
7744 	 * correctly aligned memory.  If kmem_alloc() does not return
7745 	 * mwait_size-aligned memory, then over-allocate and round up (P2ROUNDUP).
7746 	 *
7747 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7748 	 * decide to free this memory.
7749 	 */
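	/*
	 * As a sketch with hypothetical numbers: if mwait_size is 0x40 and
	 * kmem_zalloc() returns 0x1040, P2ROUNDUP(0x1040, 0x40) == 0x1040 and
	 * the buffer is used directly.  If it instead returned 0x1050, we free
	 * it, allocate 2 * 0x40 bytes, and round the new pointer up to the
	 * next 0x40 boundary, which is guaranteed to fall within the larger
	 * buffer.
	 */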
7750 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7751 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7752 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7753 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7754 		*ret = MWAIT_RUNNING;
7755 		return (ret);
7756 	} else {
7757 		kmem_free(ret, mwait_size);
7758 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7759 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7760 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7761 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7762 		*ret = MWAIT_RUNNING;
7763 		return (ret);
7764 	}
7765 }
7766 
7767 void
7768 cpuid_mwait_free(cpu_t *cpu)
7769 {
7770 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7771 		return;
7772 	}
7773 
7774 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7775 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7776 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7777 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7778 	}
7779 
7780 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7781 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7782 }
7783 
7784 void
7785 patch_tsc_read(int flag)
7786 {
7787 	size_t cnt;
7788 
7789 	switch (flag) {
7790 	case TSC_NONE:
7791 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7792 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7793 		break;
7794 	case TSC_RDTSC_LFENCE:
7795 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7796 		(void) memcpy((void *)tsc_read,
7797 		    (void *)&_tsc_lfence_start, cnt);
7798 		break;
7799 	case TSC_TSCP:
7800 		cnt = &_tscp_end - &_tscp_start;
7801 		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7802 		break;
7803 	default:
7804 		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7805 		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7806 		break;
7807 	}
7808 	tsc_type = flag;
7809 }
7810 
7811 int
7812 cpuid_deep_cstates_supported(void)
7813 {
7814 	struct cpuid_info *cpi;
7815 	struct cpuid_regs regs;
7816 
7817 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7818 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7819 
7820 	cpi = CPU->cpu_m.mcpu_cpi;
7821 
7822 	switch (cpi->cpi_vendor) {
7823 	case X86_VENDOR_Intel:
7824 		if (cpi->cpi_xmaxeax < 0x80000007)
7825 			return (0);
7826 
7827 		/*
7828 		 * Does TSC run at a constant rate in all C-states?
7829 		 */
7830 		regs.cp_eax = 0x80000007;
7831 		(void) __cpuid_insn(&regs);
7832 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7833 
7834 	default:
7835 		return (0);
7836 	}
7837 }
7838 
7839 #endif	/* !__xpv */
7840 
7841 void
7842 post_startup_cpu_fixups(void)
7843 {
7844 #ifndef __xpv
7845 	/*
7846 	 * Some AMD processors support C1E state. Entering this state will
7847 	 * cause the local APIC timer to stop, which we can't deal with at
7848 	 * this time.
7849 	 */
7850 	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7851 		on_trap_data_t otd;
7852 		uint64_t reg;
7853 
7854 		if (!on_trap(&otd, OT_DATA_ACCESS)) {
7855 			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7856 			/* Disable C1E state if it is enabled by BIOS */
7857 			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7858 			    AMD_ACTONCMPHALT_MASK) {
7859 				reg &= ~(AMD_ACTONCMPHALT_MASK <<
7860 				    AMD_ACTONCMPHALT_SHIFT);
7861 				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7862 			}
7863 		}
7864 		no_trap();
7865 	}
7866 #endif	/* !__xpv */
7867 }
7868 
7869 void
7870 enable_pcid(void)
7871 {
7872 	if (x86_use_pcid == -1)
7873 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7874 
7875 	if (x86_use_invpcid == -1) {
7876 		x86_use_invpcid = is_x86_feature(x86_featureset,
7877 		    X86FSET_INVPCID);
7878 	}
7879 
7880 	if (!x86_use_pcid)
7881 		return;
7882 
7883 	/*
7884 	 * Intel says that on setting PCIDE, it immediately starts using the PCID
7885 	 * bits; better make sure there's nothing there.
7886 	 */
7887 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7888 
7889 	setcr4(getcr4() | CR4_PCIDE);
7890 }
7891 
7892 /*
7893  * Setup necessary registers to enable XSAVE feature on this processor.
7894  * This function needs to be called early enough, so that no xsave/xrstor
7895  * ops will execute on the processor before the MSRs are properly set up.
7896  *
7897  * Current implementation has the following assumption:
7898  * - cpuid_pass_basic() is done, so that X86 features are known.
7899  * - fpu_probe() is done, so that fp_save_mech is chosen.
7900  */
7901 void
7902 xsave_setup_msr(cpu_t *cpu)
7903 {
7904 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7905 	ASSERT(fp_save_mech == FP_XSAVE);
7906 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7907 
7908 	/* Enable OSXSAVE in CR4. */
7909 	setcr4(getcr4() | CR4_OSXSAVE);
7910 	/*
7911 	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7912 	 * correct value.
7913 	 */
7914 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7915 	setup_xfem();
7916 }
7917 
7918 /*
7919  * Starting with the Westmere processor the local
7920  * Starting with the Westmere processor, the local
7921  * including the deepest C-states.
7922  */
7923 int
7924 cpuid_arat_supported(void)
7925 {
7926 	struct cpuid_info *cpi;
7927 	struct cpuid_regs regs;
7928 
7929 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7930 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7931 
7932 	cpi = CPU->cpu_m.mcpu_cpi;
7933 
7934 	switch (cpi->cpi_vendor) {
7935 	case X86_VENDOR_Intel:
7936 		/*
7937 		 * Always-running Local APIC Timer is
7938 		 * indicated by CPUID.6.EAX[2].
7939 		 */
7940 		if (cpi->cpi_maxeax >= 6) {
7941 			regs.cp_eax = 6;
7942 			(void) cpuid_insn(NULL, &regs);
7943 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7944 		} else {
7945 			return (0);
7946 		}
7947 	default:
7948 		return (0);
7949 	}
7950 }
7951 
7952 /*
7953  * Check support for Intel ENERGY_PERF_BIAS feature
7954  */
7955 int
7956 cpuid_iepb_supported(struct cpu *cp)
7957 {
7958 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7959 	struct cpuid_regs regs;
7960 
7961 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7962 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7963 
7964 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7965 		return (0);
7966 	}
7967 
7968 	/*
7969 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
7970 	 * capability bit CPUID.6.ECX.3
7971 	 */
7972 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7973 		return (0);
7974 
7975 	regs.cp_eax = 0x6;
7976 	(void) cpuid_insn(NULL, &regs);
7977 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7978 }
7979 
7980 /*
7981  * Check support for TSC deadline timer
7982  *
7983  * TSC deadline timer provides a superior software programming
7984  * model over local APIC timer that eliminates "time drifts".
7985  * Instead of specifying a relative time, software specifies an
7986  * absolute time as the target at which the processor should
7987  * generate a timer event.
7988  */
7989 int
7990 cpuid_deadline_tsc_supported(void)
7991 {
7992 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7993 	struct cpuid_regs regs;
7994 
7995 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7996 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7997 
7998 	switch (cpi->cpi_vendor) {
7999 	case X86_VENDOR_Intel:
8000 		if (cpi->cpi_maxeax >= 1) {
8001 			regs.cp_eax = 1;
8002 			(void) cpuid_insn(NULL, &regs);
8003 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
8004 		} else {
8005 			return (0);
8006 		}
8007 	default:
8008 		return (0);
8009 	}
8010 }
8011 
8012 #if !defined(__xpv)
8013 /*
8014  * Patch in versions of bcopy for high-performance Intel Nehalem (Nhm) processors
8015  * and later...
8016  */
8017 void
8018 patch_memops(uint_t vendor)
8019 {
8020 	size_t cnt, i;
8021 	caddr_t to, from;
8022 
8023 	if ((vendor == X86_VENDOR_Intel) &&
8024 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
8025 		cnt = &bcopy_patch_end - &bcopy_patch_start;
8026 		to = &bcopy_ck_size;
8027 		from = &bcopy_patch_start;
8028 		for (i = 0; i < cnt; i++) {
8029 			*to++ = *from++;
8030 		}
8031 	}
8032 }
8033 #endif  /*  !__xpv */
8034 
8035 /*
8036  * We're being asked to tell the system how many bits are required to represent
8037  * the various thread and strand IDs. While it's tempting to derive this based
8038  * the various core and strand IDs. While it's tempting to derive this based
8039  * correct. Instead, this needs to be based on the number of bits that the APIC
8040  * allows for these different configurations. We only update these to a larger
8041  * value if we find one.
8042  */
8043 void
8044 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
8045 {
8046 	struct cpuid_info *cpi;
8047 
8048 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
8049 	cpi = cpu->cpu_m.mcpu_cpi;
8050 
8051 	if (cpi->cpi_ncore_bits > *core_nbits) {
8052 		*core_nbits = cpi->cpi_ncore_bits;
8053 	}
8054 
8055 	if (cpi->cpi_nthread_bits > *strand_nbits) {
8056 		*strand_nbits = cpi->cpi_nthread_bits;
8057 	}
8058 }
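
/*
 * As a hedged sketch (not the actual consumer code), a caller might use the
 * widths reported above to decompose an APIC ID in the conventional way:
 *
 *	strand = apicid & ((1 << strand_nbits) - 1);
 *	core = (apicid >> strand_nbits) & ((1 << core_nbits) - 1);
 *	pkg = apicid >> (strand_nbits + core_nbits);
 */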
8059 
8060 void
8061 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
8062 {
8063 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
8064 	struct cpuid_regs cp;
8065 
8066 	/*
8067 	 * Reread the CPUID portions that we need for various security
8068 	 * information.
8069 	 */
8070 	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
8071 		/*
8072 		 * Check if we now have leaf 7 available to us.
8073 		 */
8074 		if (cpi->cpi_maxeax < 7) {
8075 			bzero(&cp, sizeof (cp));
8076 			cp.cp_eax = 0;
8077 			cpi->cpi_maxeax = __cpuid_insn(&cp);
8078 			if (cpi->cpi_maxeax < 7)
8079 				return;
8080 		}
8081 
8082 		bzero(&cp, sizeof (cp));
8083 		cp.cp_eax = 7;
8084 		cp.cp_ecx = 0;
8085 		(void) __cpuid_insn(&cp);
8086 		cpi->cpi_std[7] = cp;
8087 	} else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
8088 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
8089 		/* No xcpuid support */
8090 		if (cpi->cpi_family < 5 ||
8091 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
8092 			return;
8093 
8094 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
8095 			bzero(&cp, sizeof (cp));
8096 			cp.cp_eax = CPUID_LEAF_EXT_0;
8097 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
8098 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
8099 				return;
8100 			}
8101 		}
8102 
8103 		/*
8104 		 * Most AMD features are in extended leaf 8. Automatic IBRS was
8105 		 * added in extended leaf 0x21, so we also check that.
8106 		 */
8107 		bzero(&cp, sizeof (cp));
8108 		cp.cp_eax = CPUID_LEAF_EXT_8;
8109 		(void) __cpuid_insn(&cp);
8110 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
8111 		cpi->cpi_extd[8] = cp;
8112 
8113 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21) {
8114 			return;
8115 		}
8116 
8117 		bzero(&cp, sizeof (cp));
8118 		cp.cp_eax = CPUID_LEAF_EXT_21;
8119 		(void) __cpuid_insn(&cp);
8120 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
8121 		cpi->cpi_extd[0x21] = cp;
8122 	} else {
8123 		/*
8124 		 * Nothing to do here. Return an empty set which has already
8125 		 * been zeroed for us.
8126 		 */
8127 		return;
8128 	}
8129 	cpuid_scan_security(cpu, fset);
8130 }
8131 
8132 /* ARGSUSED */
8133 static int
8134 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
8135 {
8136 	uchar_t *fset;
8137 	boolean_t first_pass = (boolean_t)arg1;
8138 
8139 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
8140 	if (first_pass && CPU->cpu_id != 0)
8141 		return (0);
8142 	if (!first_pass && CPU->cpu_id == 0)
8143 		return (0);
8144 	cpuid_pass_ucode(CPU, fset);
8145 
8146 	return (0);
8147 }
8148 
8149 /*
8150  * After a microcode update where the version has changed, we need to
8151  * rescan CPUID. To do this we check every CPU to make sure that they have the
8152  * same microcode. Then we perform a cross call to all such CPUs. It's the
8153  * caller's job to make sure that no one else can end up doing an update while
8154  * this is going on.
8155  *
8156  * We assume that the system is microcode capable if we're called.
8157  */
8158 void
8159 cpuid_post_ucodeadm(void)
8160 {
8161 	uint32_t rev;
8162 	int i;
8163 	struct cpu *cpu;
8164 	cpuset_t cpuset;
8165 	void *argdata;
8166 	uchar_t *f0;
8167 
8168 	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
8169 
8170 	mutex_enter(&cpu_lock);
8171 	cpu = cpu_get(0);
8172 	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
8173 	CPUSET_ONLY(cpuset, 0);
8174 	for (i = 1; i < max_ncpus; i++) {
8175 		if ((cpu = cpu_get(i)) == NULL)
8176 			continue;
8177 
8178 		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
8179 			panic("post microcode update CPU %d has differing "
8180 			    "microcode revision (%u) from CPU 0 (%u)",
8181 			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
8182 		}
8183 		CPUSET_ADD(cpuset, i);
8184 	}
8185 
8186 	/*
8187 	 * We do the cross calls in two passes. The first pass is only for the
8188 	 * boot CPU. The second pass is for all of the other CPUs. This allows
8189 	 * the boot CPU to go through and change behavior related to patching or
8190 	 * whether or not Enhanced IBRS needs to be enabled and then allow all
8191 	 * other CPUs to follow suit.
8192 	 */
8193 	kpreempt_disable();
8194 	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
8195 	    cpuid_post_ucodeadm_xc);
8196 	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
8197 	    cpuid_post_ucodeadm_xc);
8198 	kpreempt_enable();
8199 
8200 	/*
8201 	 * OK, now look at each CPU and see if their feature sets are equal.
8202 	 */
8203 	f0 = argdata;
8204 	for (i = 1; i < max_ncpus; i++) {
8205 		uchar_t *fset;
8206 		if (!CPU_IN_SET(cpuset, i))
8207 			continue;
8208 
8209 		fset = (uchar_t *)((uintptr_t)argdata +
8210 		    sizeof (x86_featureset) * i);
8211 
8212 		if (!compare_x86_featureset(f0, fset)) {
8213 			panic("Post microcode update CPU %d has "
8214 			    "differing security feature (%p) set from CPU 0 "
8215 			    "(%p), not appending to feature set", i,
8216 			    (void *)fset, (void *)f0);
8217 		}
8218 	}
8219 
8220 	mutex_exit(&cpu_lock);
8221 
8222 	for (i = 0; i < NUM_X86_FEATURES; i++) {
8223 		if (is_x86_feature(f0, i)) {
8224 			cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
8225 			    x86_feature_names[i]);
8226 			add_x86_feature(x86_featureset, i);
8227 		}
8228 	}
8229 	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
8230 }
8231 
8232 typedef void (*cpuid_pass_f)(cpu_t *, void *);
8233 
8234 typedef struct cpuid_pass_def {
8235 	cpuid_pass_t cpd_pass;
8236 	cpuid_pass_f cpd_func;
8237 } cpuid_pass_def_t;
8238 
8239 /*
8240  * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
8241  * normal sense and should not appear here.
8242  */
8243 static const cpuid_pass_def_t cpuid_pass_defs[] = {
8244 	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
8245 	{ CPUID_PASS_IDENT, cpuid_pass_ident },
8246 	{ CPUID_PASS_BASIC, cpuid_pass_basic },
8247 	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
8248 	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
8249 	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
8250 };
8251 
8252 void
8253 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
8254 {
8255 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
8256 
8257 	if (cp == NULL)
8258 		cp = CPU;
8259 
8260 	/*
8261 	 * Space statically allocated for BSP, ensure pointer is set
8262 	 */
8263 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
8264 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
8265 
8266 	ASSERT(cpuid_checkpass(cp, pass - 1));
8267 
8268 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
8269 		if (cpuid_pass_defs[i].cpd_pass == pass) {
8270 			cpuid_pass_defs[i].cpd_func(cp, arg);
8271 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
8272 			return;
8273 		}
8274 	}
8275 
8276 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
8277 	    pass, cp->cpu_id);
8278 }
8279 
8280 /*
8281  * Extract the processor family from a chiprev.  Processor families are not the
8282  * same as cpuid families; see comments above and in x86_archext.h.
8283  */
8284 x86_processor_family_t
8285 chiprev_family(const x86_chiprev_t cr)
8286 {
8287 	return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
8288 }
8289 
8290 /*
8291  * A chiprev matches its template if the vendor and family are identical and the
8292  * revision of the chiprev matches one of the bits set in the template.  Callers
8293  * may bitwise-OR together chiprevs of the same vendor and family to form the
8294  * template, or use the _ANY variant.  It is not possible to match chiprevs of
8295  * multiple vendors or processor families with a single call.  Note that this
8296  * function operates on processor families, not cpuid families.
8297  */
8298 boolean_t
8299 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
8300 {
8301 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
8302 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
8303 	    (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
8304 }
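
/*
 * Hedged usage sketch (the revision constants here are hypothetical, for
 * illustration only): a caller wanting to accept either of two revisions of
 * the same family might write
 *
 *	if (chiprev_matches(cr,
 *	    X86_CHIPREV_VENDOR_FAM_REV_B0 | X86_CHIPREV_VENDOR_FAM_REV_B1))
 *		...
 *
 * or pass the family's _ANY template to accept any revision of that family.
 */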
8305 
8306 /*
8307  * A chiprev is at least min if the vendor and family are identical and the
8308  * revision of the chiprev is at least as recent as that of min.  Processor
8309  * families are considered unordered and cannot be compared using this function.
8310  * Note that this function operates on processor families, not cpuid families.
8311  * Use of the _ANY chiprev variant with this function is not useful; it will
8312  * always return B_FALSE if the _ANY variant is supplied as the minimum
8313  * revision.  To determine only whether a chiprev is of a given processor
8314  * family, test the return value of chiprev_family() instead.
8315  */
8316 boolean_t
8317 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
8318 {
8319 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
8320 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
8321 	    _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
8322 }
8323 
8324 /*
8325  * The uarch functions operate in a manner similar to the chiprev functions
8326  * above.  While it is tempting to allow these to operate on microarchitectures
8327  * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
8328  * than ZEN2), we elect not to do so because a manufacturer may supply
8329  * processors of multiple different microarchitecture families, each of which
8330  * may be internally ordered but unordered with respect to other families.
8331  */
8332 x86_uarch_t
8333 uarchrev_uarch(const x86_uarchrev_t ur)
8334 {
8335 	return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8336 }
8337 
8338 boolean_t
8339 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8340 {
8341 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8342 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8343 	    (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8344 }
8345 
8346 boolean_t
8347 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8348 {
8349 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8350 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8351 	    _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8352 }
8353 
8354 /*
8355  * Topology-related cache information. This is yet another cache interface
8356  * that we expose, intended for use when we have either Intel leaf 4 or AMD
8357  * extended leaf 0x8000001D (introduced with Zen 1).
8358  */
8359 static boolean_t
8360 cpuid_cache_topo_sup(const struct cpuid_info *cpi)
8361 {
8362 	switch (cpi->cpi_vendor) {
8363 	case X86_VENDOR_Intel:
8364 		if (cpi->cpi_maxeax >= 4) {
8365 			return (B_TRUE);
8366 		}
8367 		break;
8368 	case X86_VENDOR_AMD:
8369 	case X86_VENDOR_HYGON:
8370 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
8371 		    is_x86_feature(x86_featureset, X86FSET_TOPOEXT)) {
8372 			return (B_TRUE);
8373 		}
8374 		break;
8375 	default:
8376 		break;
8377 	}
8378 
8379 	return (B_FALSE);
8380 }
8381 
8382 int
8383 cpuid_getncaches(struct cpu *cpu, uint32_t *ncache)
8384 {
8385 	const struct cpuid_info *cpi;
8386 
8387 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8388 	cpi = cpu->cpu_m.mcpu_cpi;
8389 
8390 	if (!cpuid_cache_topo_sup(cpi)) {
8391 		return (ENOTSUP);
8392 	}
8393 
8394 	*ncache = cpi->cpi_cache_leaf_size;
8395 	return (0);
8396 }
8397 
8398 int
8399 cpuid_getcache(struct cpu *cpu, uint32_t cno, x86_cache_t *cache)
8400 {
8401 	const struct cpuid_info *cpi;
8402 	const struct cpuid_regs *cp;
8403 
8404 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8405 	cpi = cpu->cpu_m.mcpu_cpi;
8406 
8407 	if (!cpuid_cache_topo_sup(cpi)) {
8408 		return (ENOTSUP);
8409 	}
8410 
8411 	if (cno >= cpi->cpi_cache_leaf_size) {
8412 		return (EINVAL);
8413 	}
8414 
8415 	bzero(cache, sizeof (*cache));
8416 	cp = cpi->cpi_cache_leaves[cno];
8417 	switch (CPI_CACHE_TYPE(cp)) {
8418 	case CPI_CACHE_TYPE_DATA:
8419 		cache->xc_type = X86_CACHE_TYPE_DATA;
8420 		break;
8421 	case CPI_CACHE_TYPE_INSTR:
8422 		cache->xc_type = X86_CACHE_TYPE_INST;
8423 		break;
8424 	case CPI_CACHE_TYPE_UNIFIED:
8425 		cache->xc_type = X86_CACHE_TYPE_UNIFIED;
8426 		break;
8427 	case CPI_CACHE_TYPE_DONE:
8428 	default:
8429 		return (EINVAL);
8430 	}
8431 	cache->xc_level = CPI_CACHE_LVL(cp);
8432 	if (CPI_FULL_ASSOC_CACHE(cp) != 0) {
8433 		cache->xc_flags |= X86_CACHE_F_FULL_ASSOC;
8434 	}
8435 	cache->xc_nparts = CPI_CACHE_PARTS(cp) + 1;
8436 	/*
8437 	 * The number of sets is reserved on AMD if the CPU is tagged as fully
8438 	 * associative, whereas it is considered valid on Intel.
8439 	 */
8440 	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
8441 	    CPI_FULL_ASSOC_CACHE(cp) != 0) {
8442 		cache->xc_nsets = 1;
8443 	} else {
8444 		cache->xc_nsets = CPI_CACHE_SETS(cp) + 1;
8445 	}
8446 	cache->xc_nways = CPI_CACHE_WAYS(cp) + 1;
8447 	cache->xc_line_size = CPI_CACHE_COH_LN_SZ(cp) + 1;
8448 	cache->xc_size = cache->xc_nparts * cache->xc_nsets * cache->xc_nways *
8449 	    cache->xc_line_size;
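	/*
	 * With hypothetical leaf values, 1 partition x 1024 sets x 8 ways x
	 * 64-byte lines works out to a 512 Kbyte cache.
	 */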
8450 	/*
8451 	 * We want the number of bits needed to cover the number of logical CPUs
8452 	 * sharing this cache.  Normally we would take highbit(value - 1), but the
8453 	 * CPUID field is already encoded as the count minus one, so we use it
8454 	 * unmodified.
8455 	 */
8456 	cache->xc_apic_shift = highbit(CPI_NTHR_SHR_CACHE(cp));
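	/*
	 * For example (hypothetical leaf contents), a cache shared by the two
	 * hardware threads of one core reports CPI_NTHR_SHR_CACHE() == 1, so
	 * highbit(1) == 1 and we shift the APIC ID right by one bit; a cache
	 * shared by 16 logical CPUs reports 15, giving a shift of 4.
	 */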
8457 
8458 	/*
8459 	 * To construct a unique ID we construct a uint64_t that looks as
8460 	 * follows:
8461 	 *
8462 	 * [47:40] cache level
8463 	 * [39:32] CPUID cache type
8464 	 * [31:00] shifted APIC ID
8465 	 *
8466 	 * The shifted APIC ID gives us a guarantee that a given cache entry is
8467 	 * unique within its peers. The other two fields ensure that the ID is
8468 	 * also unique within a single CPU. If we just had the
8469 	 * APIC ID shifted over by the indicated number of bits we'd end up with
8470 	 * an ID of zero for the L1I, L1D, L2, and L3.
8471 	 *
8472 	 * The format of this ID is private to the system and can change across
8473 	 * a reboot for the time being.
8474 	 */
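	/*
	 * Hypothetical example: an L2 cache (xc_level 2) shared by the two
	 * strands of a core on a CPU with APIC ID 0x5 and xc_apic_shift of 1
	 * would get an ID of (2ULL << 40) | ((uint64_t)xc_type << 32) | 0x2,
	 * distinct from the L1I and L1D of the same core, which differ in the
	 * upper bits.
	 */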
8475 	cache->xc_id = (uint64_t)cache->xc_level << 40;
8476 	cache->xc_id |= (uint64_t)cache->xc_type << 32;
8477 	cache->xc_id |= (uint64_t)cpi->cpi_apicid >> cache->xc_apic_shift;
8478 
8479 	return (0);
8480 }
8481