xref: /illumos-gate/usr/src/uts/intel/os/cpuid.c (revision 2faf06a0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2023 Oxide Computer Company
28  * Copyright 2022 MNX Cloud, Inc.
29  */
30 /*
31  * Copyright (c) 2010, Intel Corporation.
32  * All rights reserved.
33  */
34 /*
35  * Portions Copyright 2009 Advanced Micro Devices, Inc.
36  */
37 
38 /*
39  * CPU Identification logic
40  *
41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42  * with the identification of CPUs, their features, and their topologies. More
43  * specifically, this file helps drive the following:
44  *
45  * 1. Enumeration of features of the processor which are used by the kernel to
46  *    determine what features to enable or disable. These may be instruction set
47  *    enhancements or features that we use.
48  *
49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
50  *    will be told about through the auxiliary vector.
51  *
52  * 3. Understanding the physical topology of the CPU such as the number of
53  *    caches, how many cores it has, whether or not it supports simultaneous
54  *    multi-threading (SMT), etc.
55  *
56  * ------------------------
57  * CPUID History and Basics
58  * ------------------------
59  *
60  * The cpuid instruction was added by Intel roughly around the time that the
61  * original Pentium was introduced. The purpose of cpuid was to tell in a
62  * programmatic fashion information about the CPU that previously was guessed
63  * at. For example, an important part of cpuid is that we can know what
64  * extensions to the ISA exist. If you use an invalid opcode you would get a
65  * #UD, so this method allows a program (whether a user program or the kernel)
66  * to determine what exists without crashing or getting a SIGILL. Of course,
67  * this was also during the era of the clones and the AMD Am5x86. The vendor
68  * name shows up first in cpuid for a reason.
69  *
70  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72  * its own meaning. The different leaves are broken down into different regions:
73  *
74  *	[ 0, 7fffffff ]			This region is called the 'basic'
75  *					region. This region is generally defined
76  *					by Intel, though some of the original
77  *					portions have different meanings based
78  *					on the manufacturer. These days, Intel
79  *					adds most new features to this region.
80  *					AMD adds non-Intel compatible
81  *					information in the third, extended
82  *					region. Intel uses this for everything
83  *					including ISA extensions, CPU
84  *					features, cache information, topology,
85  *					and more.
86  *
87  *					There is a hole carved out of this
88  *					region which is reserved for
89  *					hypervisors.
90  *
91  *	[ 40000000, 4fffffff ]		This region, which is found in the
92  *					middle of the previous region, is
93  *					explicitly promised to never be used by
94  *					CPUs. Instead, it is used by hypervisors
95  *					to communicate information about
96  *					themselves to the operating system. The
97  *					values and details are unique for each
98  *					hypervisor.
99  *
100  *	[ 80000000, ffffffff ]		This region is called the 'extended'
101  *					region. Some of the low leaves mirror
102  *					parts of the basic leaves. This region
103  *					has generally been used by AMD for
104  *					various extensions. For example, AMD-
105  *					specific information about caches,
106  *					features, and topology are found in this
107  *					region.
108  *
109  * To query a leaf, you place its number into %eax, zero %ebx, %ecx,
110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111  * the ranges, one of the primary things returned is the maximum valid leaf in
112  * that range. This allows for discovery of what range of CPUID is valid.
113  *
114  * The CPUs have potentially surprising behavior when using an invalid leaf or
115  * unimplemented leaf. If the requested leaf is within the valid basic or
116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117  * set to zero. However, if you specify a leaf that is outside of a valid range,
118  * then instead it will be filled with the last valid _basic_ leaf. For example,
119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120  * an invalid extended leaf will return the information for leaf 3.
121  *
122  * Some leaves are broken down into sub-leaves. This means that the value
123  * depends on both the leaf asked for in %eax and a secondary register. For
124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125  * additional information. Or when getting topology information in leaf 0xb, the
126  * initial value in %ecx changes which level of the topology that you are
127  * getting information about.
128  *
129  * cpuid values are always kept to 32 bits regardless of whether or not the
130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
131  * 32 bits of the register are always set to zero so that the values are the
132  * same regardless of execution mode.
133  *
134  * ----------------------
135  * Identifying Processors
136  * ----------------------
137  *
138  * We can identify a processor in two steps. The first step looks at cpuid leaf
139  * 0. Leaf 0 contains the processor's vendor information. This is done by
140  * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
141  * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
142  *
143  * From there, a processor is identified by a combination of three different
144  * values:
145  *
146  *  1. Family
147  *  2. Model
148  *  3. Stepping
149  *
150  * Each vendor uses the family and model to uniquely identify a processor. The
151  * way that family and model are changed depends on the vendor. For example,
152  * Intel has been using family 0x6 for almost all of their processors since
153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
154  * identify the exact processor. Different models are often used for the client
155  * (consumer) and server parts. Even though each processor often has major
156  * architectural differences, they still are considered the same family by
157  * Intel.
158  *
159  * On the other hand, each major AMD architecture generally has its own family.
160  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
161  * the model number is used to help identify specific processors.  As AMD's
162  * product lines have expanded, they have started putting a mixed bag of
163  * processors into the same family, with each processor under a single
164  * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
165  * refer to each such collection as a processor family, distinct from cpuid
166  * family.  Importantly, each processor family has a BIOS and Kernel Developer's
167  * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168  * defines the processor family's non-architectural features.  In general, we'll
169  * use "family" here to mean the family number reported by the cpuid instruction
170  * and distinguish the processor family from it where appropriate.
171  *
172  * The stepping is used to refer to a revision of a specific microprocessor. The
173  * term comes from equipment used to produce masks that are used to create
174  * integrated circuits.
175  *
176  * The information is present in leaf 1, %eax. In technical documentation you
177  * will see the terms extended model and extended family. The original family,
178  * model, and stepping fields were each 4 bits wide. If the base family is 0xf
179  * (or, for the model on Intel, 0x6), one consults the extended family and
180  * extended model, which take previously reserved bits: the extended family is
181  * added to the base family and the extended model forms the model's high bits.
182  *
183  * When we process this information, we store the full family, model, and
184  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185  * cpi_step, respectively. Whenever you are performing comparisons with the
186  * family, model, and stepping, you should use these members and not the raw
187  * values from cpuid. If you must use the raw values from cpuid directly, you
188  * must make sure that you add the extended model and family to the base model
189  * and family.
190  *
191  * In general, we do not use information about the family, model, and stepping
192  * to determine whether or not a feature is present; that is generally driven by
193  * specific leaves. However, when something we care about on the processor is
194  * not considered 'architectural' meaning that it is specific to a set of
195  * processors and not promised in the architecture model to be consistent from
196  * generation to generation, then we will fall back on this information. The
197  * most common cases where this comes up are when we have to work around errata
198  * in the processor, are dealing with processor-specific features such as CPU
199  * performance counters, or we want to provide additional information for things
200  * such as fault management.
201  *
202  * While processors also do have a brand string, which is the name that people
203  * are familiar with when buying the processor, it is not meant for
204  * programmatic consumption. That is what the family, model, and stepping are
205  * for.
206  *
207  * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208  * and stepping(s) that refer to a single or very closely related set of silicon
209  * implementations; while there are sometimes more specific ways to learn of the
210  * presence or absence of a particular erratum or workaround, one may generally
211  * assume that all processors of the same chiprev have the same errata and we
212  * have chosen to represent them this way precisely because that is how AMD
213  * groups them in their revision guides (errata documentation).  The processor
214  * family (x86_processor_family_t) may be extracted from the chiprev if that
215  * level of detail is not needed.  Processor families are considered unordered
216  * but revisions within a family may be compared for either an exact match or at
217  * least as recent as a reference revision.  See the chiprev_xxx() functions
218  * below.
219  *
220  * Similarly, each processor family implements a particular microarchitecture,
221  * which itself may have multiple revisions.  In general, non-architectural
222  * features are specific to a processor family, but some may exist across
223  * families containing cores that implement the same microarchitectural revision
224  * (and, such cores share common bugs, too).  We provide utility routines
225  * analogous to those for extracting and comparing chiprevs for
226  * microarchitectures as well; see the uarch_xxx() functions.
227  *
228  * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229  * present used and available only for AMD and AMD-like processors.
230  *
231  * ------------
232  * CPUID Passes
233  * ------------
234  *
235  * As part of performing feature detection, we break this into several different
236  * passes. There used to be a pass 0 that was done from assembly in locore.s to
237  * support processors that have a missing or broken cpuid instruction (notably
238  * certain Cyrix processors) but those were all 32-bit processors which are no
239  * longer supported. Passes are no longer numbered explicitly to make it easier
240  * to break them up or move them around as needed; however, they still have a
241  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242  * x86_archext.h. The external interface to execute a cpuid pass or determine
243  * whether a pass has been completed consists of cpuid_execpass() and
244  * cpuid_checkpass() respectively.  The passes now, in that execution order,
245  * are as follows:
246  *
247  *	PRELUDE		This pass does not have any dependencies on system
248  *			setup; in particular, unlike all subsequent passes it is
249  *			guaranteed not to require PCI config space access.  It
250  *			sets the flag indicating that the processor we are
251  *			running on supports the cpuid instruction, which all
252  *			64-bit processors do.  This would also be the place to
253  *			add any other basic state that is required later on and
254  *			can be learned without dependencies.
255  *
256  *	IDENT		Determine which vendor manufactured the CPU, the family,
257  *			model, and stepping information, and compute basic
258  *			identifying tags from those values.  This is done first
259  *			so that machine-dependent code can control the features
260  *			the cpuid instruction will report during subsequent
261  *			passes if needed, and so that any intervening
262  *			machine-dependent code that needs basic identity will
263  *			have it available.  This includes synthesised
264  *			identifiers such as chiprev and uarchrev as well as the
265  *			values obtained directly from cpuid.  Prior to executing
266  *			this pass, machine-dependent boot code is responsible for
267  *			ensuring that the PCI configuration space access
268  *			functions have been set up and, if necessary, that
269  *			determine_platform() has been called.
270  *
271  *	BASIC		This is the primary pass and is responsible for doing a
272  *			large number of different things:
273  *
274  *			1. Gathering a large number of feature flags to
275  *			determine which features the CPU supports and which
276  *			indicate things that we need to do other work in the OS
277  *			to enable. Features detected this way are added to the
278  *			x86_featureset which can be queried to
279  *			determine what we should do. This includes processing
280  *			all of the basic and extended CPU features that we care
281  *			about.
282  *
283  *			2. Determining the CPU's topology. This includes
284  *			information about how many cores and threads are present
285  *			in the package. It also is responsible for figuring out
286  *			which logical CPUs are potentially part of the same core
287  *			and what other resources they might share. For more
288  *			information see the 'Topology' section.
289  *
290  *			3. Determining the set of CPU security-specific features
291  *			that we need to worry about and determine the
292  *			appropriate set of workarounds.
293  *
294  *			This pass on the boot CPU occurs before KMDB is started.
295  *
296  *	EXTENDED	The second pass is done after startup(). Here, we check
297  *			other miscellaneous features. Most of this is gathering
298  *			additional basic and extended features that we'll use in
299  *			later passes or for debugging support.
300  *
301  *	DYNAMIC		The third pass occurs after the kernel memory allocator
302  *			has been fully initialized. This gathers information
303  *			where we might need dynamic memory available for our
304  *			uses. This includes several varying width leaves that
305  *			have cache information and the processor's brand string.
306  *
307  *	RESOLVE		The fourth and final normal pass is performed after the
308  *			kernel has brought most everything online. This is
309  *			invoked from post_startup(). In this pass, we go through
310  *			the set of features that we have enabled and turn that
311  *			into the hardware auxiliary vector features that
312  *			userland receives. This is used by userland, primarily
313  *			by the run-time link-editor (RTLD), though userland
314  *			software could also refer to it directly.
315  *
316  * The function that performs a pass is currently assumed to be infallible, and
317  * all existing implementations are.  This simplifies callers by allowing
318  * cpuid_execpass() to return void. Similarly, implementers do not need to check
319  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320  * Both of these assumptions can be relaxed if needed by future developments.
321  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322  * error to attempt to execute a pass before all previous passes have been
323  * completed on the specified CPU, or to request cpuid information before the
324  * pass that captures it has been executed.  These conditions can be tested
325  * using cpuid_checkpass().
326  *
327  * The Microcode Pass
328  *
329  * After a microcode update, we do a selective rescan of the cpuid leaves to
330  * determine what features have changed. Microcode updates can provide more
331  * details about security related features to deal with issues like Spectre and
332  * L1TF. On occasion, vendors have violated their contract and removed bits.
333  * However, we don't try to detect that because that puts us in a situation that
334  * we really can't deal with. As such, the only thing we rescan are security
335  * related features today. See cpuid_pass_ucode().  This pass may be run in a
336  * different sequence on APs and therefore is not part of the sequential order;
337  * it is invoked directly instead of by cpuid_execpass() and its completion
338  * status cannot be checked by cpuid_checkpass().  This could be integrated with
339  * a more complex dependency mechanism if warranted by future developments.
340  *
341  * All of the passes are run on all CPUs. However, for the most part we only
342  * care about what the boot CPU says about this information and use the other
343  * CPUs as a rough guide to sanity check that we have the same feature set.
344  *
345  * We do not support running multiple logical CPUs with different, let alone
346  * disjoint, feature sets.
347  *
348  * ------------------
349  * Processor Topology
350  * ------------------
351  *
352  * One of the important things that we need to do is to understand the topology
353  * of the underlying processor. When we say topology in this case, we're trying
354  * to understand the relationship between the logical CPUs that the operating
355  * system sees and the underlying physical layout. Different logical CPUs may
356  * share different resources which can have important consequences for the
357  * performance of the system. For example, they may share caches, execution
358  * units, and more.
359  *
360  * The topology of the processor changes from generation to generation and
361  * vendor to vendor.  Along with that, different vendors use different
362  * terminology, and the operating system itself uses occasionally overlapping
363  * terminology. It's important to understand what this topology looks like so
364  * one can understand the different things that we try to calculate and
365  * determine.
366  *
367  * To get started, let's talk about a little bit of terminology that we've used
368  * so far, is used throughout this file, and is fairly generic across multiple
369  * vendors:
370  *
371  * CPU
372  *	A central processing unit (CPU) refers to a logical and/or virtual
373  *	entity that the operating system can execute instructions on. The
374  *	underlying resources for this CPU may be shared between multiple
375  *	entities; however, to the operating system it is a discrete unit.
376  *
377  * PROCESSOR and PACKAGE
378  *
379  *	Generally, when we use the term 'processor' on its own, we are referring
380  *	to the physical entity that one buys and plugs into a board. However,
381  *	because processor has been overloaded and one might see it used to mean
382  *	multiple different levels, we will instead use the term 'package' for
383  *	the rest of this file. The term package comes from the electrical
384  *	engineering side and refers to the physical entity that encloses the
385  *	electronics inside. Strictly speaking the package can contain more than
386  *	just the CPU, for example, on many processors it may also have what's
387  *	called an 'integrated graphical processing unit (GPU)'. Because the
388  *	package can encapsulate multiple units, it is the largest physical unit
389  *	that we refer to.
390  *
391  * SOCKET
392  *
393  *	A socket refers to a unit on a system board (generally the motherboard)
394  *	that can receive a package. A single package, or processor, is plugged
395  *	into a single socket. A system may have multiple sockets. Often times,
396  *	the term socket is used interchangeably with package and refers to the
397  *	electrical component that is plugged in, and not the receptacle itself.
398  *
399  * CORE
400  *
401  *	A core refers to the physical instantiation of a CPU, generally, with a
402  *	full set of hardware resources available to it. A package may contain
403  *	multiple cores inside of it or it may just have a single one. A
404  *	processor with more than one core is often referred to as 'multi-core'.
405  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
406  *	that has 'multi-core' processors.
407  *
408  *	A core may expose a single logical CPU to the operating system, or it
409  *	may expose multiple CPUs, which we call threads, defined below.
410  *
411  *	Some resources may still be shared by cores in the same package. For
412  *	example, many processors will share the level 3 cache between cores.
413  *	Some AMD generations share hardware resources between cores. For more
414  *	information on that see the section 'AMD Topology'.
415  *
416  * THREAD and STRAND
417  *
418  *	In this file, generally a thread refers to a hardware resource and not
419  *	the operating system's logical abstraction. A thread is always exposed
420  *	as an independent logical CPU to the operating system. A thread belongs
421  *	to a specific core. A core may have more than one thread. When that is
422  *	the case, the threads that are part of the same core are often referred
423  *	to as 'siblings'.
424  *
425  *	When multiple threads exist, this is generally referred to as
426  *	simultaneous multi-threading (SMT). When Intel introduced this in their
427  *	processors they called it hyper-threading (HT). When multiple threads
428  *	are active in a core, they split the resources of the core. For example,
429  *	two threads may share the same set of hardware execution units.
430  *
431  *	The operating system often uses the term 'strand' to refer to a thread.
432  *	This helps disambiguate it from the software concept.
433  *
434  * CHIP
435  *
436  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
437  *	base meaning, it is used to refer to a single integrated circuit, which
438  *	may or may not be the only thing in the package. In illumos, when you
439  *	see the term 'chip' it is almost always referring to the same thing as
440  *	the 'package'. However, many vendors may use chip to refer to one of
441  *	many integrated circuits that have been placed in the package. As an
442  *	example, see the subsequent definition.
443  *
444  *	To try and keep things consistent, we will only use chip when referring
445  *	to the entire integrated circuit package, with the exception of the
446  *	definition of multi-chip module (because it is in the name) and use the
447  *	term 'die' when we want the more general, potential sub-component
448  *	definition.
449  *
450  * DIE
451  *
452  *	A die refers to an integrated circuit. Inside of the package there may
453  *	be a single die or multiple dies. This is sometimes called a 'chip' in
454  *	vendor's parlance, but in this file, we use the term die to refer to a
455  *	subcomponent.
456  *
457  * MULTI-CHIP MODULE
458  *
459  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
460  *	are connected together in the same package. When a multi-chip design is
461  *	used, generally each chip is manufactured independently and then joined
462  *	together in the package. For example, on AMD's Zen microarchitecture
463  *	(family 0x17), the package contains several dies (the second meaning of
464  *	chip from above) that are connected together.
465  *
466  * CACHE
467  *
468  *	A cache is a part of the processor that maintains copies of recently
469  *	accessed memory. Caches are split into levels and then into types.
470  *	Commonly there are one to three levels, called level one, two, and
471  *	three. The lower the level, the smaller it is, the closer it is to the
472  *	execution units of the CPU, and the faster it is to access. The layout
473  *	and design of the cache come in many different flavors; consult other
474  *	resources for a discussion of those.
475  *
476  *	Caches are generally split into two types, the instruction and data
477  *	cache. The caches contain what their names suggest, the instruction
478  *	cache has executable program text, while the data cache has all other
479  *	memory that the processor accesses. As of this writing, data is kept
480  *	coherent between all of the caches on x86, so if one modifies program
481  *	text before it is executed, that will be in the data cache, and the
482  *	instruction cache will be synchronized with that change when the
483  *	processor actually executes those instructions. This coherency also
484  *	covers the fact that data could show up in multiple caches.
485  *
486  *	Generally, the lowest level caches are specific to a core. However, the
487  *	last level cache is shared between some number of cores. The number of
488  *	CPUs sharing this last level cache is important. This has implications
489  *	for the choices that the scheduler makes, as accessing memory that might
490  *	be in a remote cache after thread migration can be quite expensive.
491  *
492  *	Sometimes, the word cache is abbreviated with a '$', because in US
493  *	English the word cache is pronounced the same as cash. So L1D$ refers to
494  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
495  *	in the rest of this theory statement for clarity.
496  *
497  * MEMORY CONTROLLER
498  *
499  *	The memory controller is a component that provides access to DRAM. Each
500  *	memory controller can access a set number of DRAM channels. Each channel
501  *	can have a number of DIMMs (sticks of memory) associated with it. A
502  *	given package may have more than one memory controller. The association
503  *	of the memory controller to a group of cores is important as it is
504  *	cheaper to access memory on the controller that you are associated with.
505  *
506  * NUMA
507  *
508  *	NUMA or non-uniform memory access, describes a way that systems are
509  *	built. On x86, any processor core can address all of the memory in the
510  *	system. However, when using multiple sockets or possibly within a
511  *	multi-chip module, some of that memory is physically closer and some of
512  *	it is further. Memory that is further away is more expensive to access.
513  *	Consider the following image of multiple sockets with memory:
514  *
515  *	+--------+                                                +--------+
516  *	| DIMM A |         +----------+      +----------+         | DIMM D |
517  *	+--------+-+       |          |      |          |       +-+------+-+
518  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
519  *	  +--------+-+     |          |      |          |     +-+------+-+
520  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
521  *	    +--------+                                        +--------+
522  *
523  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
524  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
525  *	access DIMMs A-C and more expensive to access D-F as it has to go
526  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
527  *	D-F are cheaper than A-C. While the socket form is the most common, when
528  *	using multi-chip modules, this can also sometimes occur. For another
529  *	example of this that's more involved, see the AMD topology section.
530  *
531  *
532  * Intel Topology
533  * --------------
534  *
535  * Most Intel processors since Nehalem (as of this writing, the current gen
536  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
537  * the package is a single monolithic die. MCMs currently aren't used. Most
538  * parts have three levels of caches, with the L3 cache being shared between
539  * all of the cores on the package. The L1/L2 cache is generally specific to
540  * an individual core. The following image shows at a simplified level what
541  * this looks like. The memory controller is commonly part of something called
542  * the 'Uncore', that used to be separate physical chips that were not a part of
543  * the package, but are now part of the same chip.
544  *
545  *  +-----------------------------------------------------------------------+
546  *  | Package                                                               |
547  *  |  +-------------------+  +-------------------+  +-------------------+  |
548  *  |  | Core              |  | Core              |  | Core              |  |
549  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
550  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
551  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
552  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
553  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
554  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
555  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
556  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
557  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
558  *  |  +-------------------+  +-------------------+  +-------------------+  |
559  *  | +-------------------------------------------------------------------+ |
560  *  | |                         Shared L3 Cache                           | |
561  *  | +-------------------------------------------------------------------+ |
562  *  | +-------------------------------------------------------------------+ |
563  *  | |                        Memory Controller                          | |
564  *  | +-------------------------------------------------------------------+ |
565  *  +-----------------------------------------------------------------------+
566  *
567  * A side effect of this current architecture is that what we care about from a
568  * scheduling and topology perspective, is simplified. In general we care about
569  * understanding which logical CPUs are part of the same core and socket.
570  *
571  * To determine the relationship between threads and cores, Intel initially used
572  * the identifier in the advanced programmable interrupt controller (APIC). They
573  * also added cpuid leaf 4 to give additional information about the number of
574  * threads and CPUs in the processor. With the addition of x2apic (which
575  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
576  * additional cpuid topology leaf 0xB was added.
577  *
578  * AMD Topology
579  * ------------
580  *
581  * When discussing AMD topology, we want to break this into three distinct
582  * generations of topology. There's the basic topology that has been used in
583  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
584  * with family 0x15 (Bulldozer), and there's the topology that was introduced
585  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
586  * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
587  * additional terminology that's worth talking about.
588  *
589  * Until the introduction of family 0x17 (Zen), AMD did not implement something
590  * that they considered SMT. Whether or not the AMD processors have SMT
591  * influences many things including scheduling and reliability, availability,
592  * and serviceability (RAS) features.
593  *
594  * NODE
595  *
596  *	AMD uses the term node to refer to a die that contains a number of cores
597  *	and I/O resources. Depending on the processor family and model, more
598  *	than one node can be present in the package. When there is more than one
599  *	node this indicates a multi-chip module. Usually each node has its own
600  *	access to memory and I/O devices. This is important and generally
601  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
602  *	result, we track this relationship in the operating system.
603  *
604  *	In processors with an L3 cache, the L3 cache is generally shared across
605  *	the entire node, though the way this is carved up varies from generation
606  *	to generation.
607  *
608  * BULLDOZER
609  *
610  *	Starting with the Bulldozer family (0x15) and continuing until the
611  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
612  *	compute unit. In a compute unit, two traditional cores share a number of
613  *	hardware resources. Critically, they share the FPU, L1 instruction
614  *	cache, and the L2 cache. Several compute units were then combined inside
615  *	of a single node.  Because the integer execution units, L1 data cache,
616  *	and some other resources were not shared between the cores, AMD never
617  *	considered this to be SMT.
618  *
619  * ZEN
620  *
621  *	The Zen family (0x17) uses a multi-chip module (MCM) design; the module
622  *	is called Zeppelin. These modules are similar to the idea of nodes used
623  *	previously. Each of these nodes has two DRAM channels which all of the
624  *	cores in the node can access uniformly. These nodes are linked together
625  *	in the package, creating a NUMA environment.
626  *
627  *	The Zeppelin die itself contains two different 'core complexes'. Each
628  *	core complex consists of four cores which each have two threads, for a
629  *	total of 8 logical CPUs per complex. Unlike other generations,
630  *	where all the logical CPUs in a given node share the L3 cache, here each
631  *	core complex has its own shared L3 cache.
632  *
633  *	A further thing that we need to consider is that in some configurations,
634  *	particularly with the Threadripper line of processors, not every die
635  *	actually has its memory controllers wired up to actual memory channels.
636  *	This means that some cores have memory attached to them and others
637  *	don't.
638  *
639  *	To put Zen in perspective, consider the following images:
640  *
641  *      +--------------------------------------------------------+
642  *      | Core Complex                                           |
643  *      | +-------------------+    +-------------------+  +---+  |
644  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
645  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
646  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
647  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
648  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
649  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
650  *      | +-------------------+    +-------------------+  | C |  |
651  *      | +-------------------+    +-------------------+  | a |  |
652  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
653  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
654  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
655  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
656  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
657  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
658  *      | +-------------------+    +-------------------+  +---+  |
659  *      |                                                        |
660  *	+--------------------------------------------------------+
661  *
662  *  This first image represents a single Zen core complex that consists of four
663  *  cores.
664  *
665  *
666  *	+--------------------------------------------------------+
667  *	| Zeppelin Die                                           |
668  *	|  +--------------------------------------------------+  |
669  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
670  *	|  +--------------------------------------------------+  |
671  *      |                           HH                           |
672  *	|          +-----------+    HH    +-----------+          |
673  *	|          |           |    HH    |           |          |
674  *	|          |    Core   |==========|    Core   |          |
675  *	|          |  Complex  |==========|  Complex  |          |
676  *	|          |           |    HH    |           |          |
677  *	|          +-----------+    HH    +-----------+          |
678  *      |                           HH                           |
679  *	|  +--------------------------------------------------+  |
680  *	|  |                Memory Controller                 |  |
681  *	|  +--------------------------------------------------+  |
682  *      |                                                        |
683  *	+--------------------------------------------------------+
684  *
685  *  This image represents a single Zeppelin Die. Note how both core complexes
686  *  are connected to the same memory controller and I/O units. While each core
687  *  complex has its own L3 cache as seen in the first image, they both have
688  *  uniform access to memory.
689  *
690  *
691  *                      PP                     PP
692  *                      PP                     PP
693  *           +----------PP---------------------PP---------+
694  *           |          PP                     PP         |
695  *           |    +-----------+          +-----------+    |
696  *           |    |           |          |           |    |
697  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
698  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
699  *           |    |           |          |           |    |
700  *           |    +-----------+ooo    ...+-----------+    |
701  *           |          HH      ooo  ...       HH         |
702  *           |          HH        oo..         HH         |
703  *           |          HH        ..oo         HH         |
704  *           |          HH      ...  ooo       HH         |
705  *           |    +-----------+...    ooo+-----------+    |
706  *           |    |           |          |           |    |
707  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
708  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
709  *           |    |           |          |           |    |
710  *           |    +-----------+          +-----------+    |
711  *           |          PP                     PP         |
712  *           +----------PP---------------------PP---------+
713  *                      PP                     PP
714  *                      PP                     PP
715  *
716  *  This image represents a single Zen package. In this example, it has four
717  *  Zeppelin dies, though some configurations only have a single one. In this
718  *  example, each die is directly connected to the next. Also, each die is
719  *  represented as being connected to memory by the 'M' character and connected
720  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
721  *  die is made up of two core complexes, we have multiple different NUMA
722  *  domains that we care about for these systems.
723  *
724  * ZEN 2
725  *
726  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
727  *	each Zeppelin Die had its own I/O die, that has been moved out of the
728  *	core complex in Zen 2. The actual core complex looks pretty similar, but
729  *	now the die actually looks much simpler:
730  *
731  *      +--------------------------------------------------------+
732  *      | Zen 2 Core Complex Die    HH                           |
733  *      |                           HH                           |
734  *      |          +-----------+    HH    +-----------+          |
735  *      |          |           |    HH    |           |          |
736  *      |          |    Core   |==========|    Core   |          |
737  *      |          |  Complex  |==========|  Complex  |          |
738  *      |          |           |    HH    |           |          |
739  *      |          +-----------+    HH    +-----------+          |
740  *      |                           HH                           |
741  *      |                           HH                           |
742  *      +--------------------------------------------------------+
743  *
744  *	From here, when we add the central I/O die, this changes things a bit.
745  *	Each die is connected to the I/O die, rather than trying to interconnect
746  *	them directly. The following image takes the same Zen 1 image that we
747  *	had earlier and shows what it looks like with the I/O die instead:
748  *
749  *                                 PP    PP
750  *                                 PP    PP
751  *           +---------------------PP----PP---------------------+
752  *           |                     PP    PP                     |
753  *           |  +-----------+      PP    PP      +-----------+  |
754  *           |  |           |      PP    PP      |           |  |
755  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
756  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
757  *           |  |         |o|oooo|          |oooo|o|         |  |
758  *           |  +-----------+    |          |    +-----------+  |
759  *           |                   |   I/O    |                   |
760  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
761  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
762  *           |                   |          |                   |
763  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
764  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
765  *           |                   |          |                   |
766  *           |  +-----------+    |          |    +-----------+  |
767  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
768  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
769  *           |  |    Die    |      PP    PP      |    Die    |  |
770  *           |  |           |      PP    PP      |           |  |
771  *           |  +-----------+      PP    PP      +-----------+  |
772  *           |                     PP    PP                     |
773  *           +---------------------PP----PP---------------------+
774  *                                 PP    PP
775  *                                 PP    PP
776  *
777  *	The above has four core complex dies installed, though the Zen 2 EPYC
778  *	and ThreadRipper parts allow for up to eight, while the Ryzen parts
779  *	generally only have one to two. The more notable difference here is how
780  *	everything communicates. Note that memory and PCIe come out of the
781  *	central die. This changes the way that one die accesses a resource. It
782  *	basically always has to go to the I/O die, whereas in Zen 1 it may have
783  *	satisfied it locally. In general, this ends up being a better strategy
784  *	for most things, though it is possible to still treat everything in four
785  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
786  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
787  *	now there is only one 'node' present.
788  *
789  * ZEN 3
790  *
791  *	From an architectural perspective, Zen 3 is a much smaller change from
792  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
793  *	its microarchitectural changes. The biggest thing for us is how the die
794  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
795  *	cache. However, in Zen 3, the L3 is now shared between the entire core
796  *	complex die and is no longer partitioned between each core complex. This
797  *	means that all cores on the die can share the same L3 cache. Otherwise,
798  *	the general layout of the overall package with various core complexes
799  *	and an I/O die stays the same. Here's what the Core Complex Die looks
800  *	like in a bit more detail:
801  *
802  *               +-------------------------------------------------+
803  *               | Zen 3 Core Complex Die                          |
804  *               | +-------------------+    +-------------------+  |
805  *               | | Core       +----+ |    | Core       +----+ |  |
806  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
807  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
808  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
809  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
810  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
811  *               | +-------------------+    +-------------------+  |
812  *               | +-------------------+    +-------------------+  |
813  *               | | Core       +----+ |    | Core       +----+ |  |
814  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
815  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
816  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
817  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
818  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
819  *               | +-------------------+    +-------------------+  |
820  *               |                                                 |
821  *               | +--------------------------------------------+  |
822  *               | |                 L3 Cache                   |  |
823  *               | +--------------------------------------------+  |
824  *               |                                                 |
825  *               | +-------------------+    +-------------------+  |
826  *               | | Core       +----+ |    | Core       +----+ |  |
827  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
828  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
829  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
830  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
831  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
832  *               | +-------------------+    +-------------------+  |
833  *               | +-------------------+    +-------------------+  |
834  *               | | Core       +----+ |    | Core       +----+ |  |
835  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
836  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
837  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
838  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
839  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
840  *               | +-------------------+    +-------------------+  |
841  *               +-------------------------------------------------+
842  *
843  *	While it is not pictured, there are connections from the die to the
844  *	broader data fabric and additional functional blocks to support that
845  *	communication and coherency.
846  *
847  * CPUID LEAVES
848  *
849  * There are a few different CPUID leaves that we can use to try and understand
850  * the actual state of the world. As part of the introduction of family 0xf, AMD
851  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
852  * processors that are in the system. Because families before Zen didn't have
853  * SMT, this was always the number of cores that were in the system. However, it
854  * should always be thought of as the number of logical threads to be consistent
855  * between generations. In addition we also get the size of the APIC ID that is
856  * used to represent the number of logical processors. This is important for
857  * deriving topology information.
858  *
859  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
860  * bit between Bulldozer and later families, but it is quite useful in
861  * determining the topology information. Because this information has changed
862  * across family generations, it's worth calling out what these mean
863  * explicitly. The registers have the following meanings:
864  *
865  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
866  *		APIC ID, even though on systems without x2apic support, it will
867  *		be limited to 8 bits.
868  *
869  *	%ebx	On Bulldozer-era systems this contains information about the
870  *		number of cores that are in a compute unit (cores that share
871  *		resources). It also contains a per-package compute unit ID that
872  *		identifies which compute unit the logical CPU is a part of.
873  *
874  *		On Zen-era systems this instead contains the number of threads
875  *		per core and the ID of the core that the logical CPU is a part
876  *		of. Note, this ID is unique only to the package, it is not
877  *		globally unique across the entire system.
878  *
879  *	%ecx	This contains the number of nodes that exist in the package. It
880  *		also contains an ID that identifies which node the logical CPU
881  *		is a part of.
882  *
883  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
884  * cache layout to determine which logical CPUs are sharing which caches.
885  *
886  * illumos Topology
887  * ----------------
888  *
889  * Based on the above we synthesize the information into several different
890  * variables that we store in the 'struct cpuid_info'. We'll go into the details
891  * of what each member is supposed to represent and their uniqueness. In
892  * general, there are two levels of uniqueness that we care about. We care about
893  * an ID that is globally unique. That means that it will be unique across all
894  * entities in the system. For example, the default logical CPU ID is globally
895  * unique. On the other hand, there is some information that we only care about
896  * being unique within the context of a single package / socket. Here are the
897  * variables that we keep track of and their meaning.
898  *
899  * Several of the values that are asking for an identifier, with the exception
900  * of cpi_apicid, are allowed to be synthetic.
901  *
902  *
903  * cpi_apicid
904  *
905  *	This is the value of the CPU's APIC id. This should be the full 32-bit
906  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
907  *	APIC ID. This value is globally unique between all logical CPUs across
908  *	all packages. This is usually required by the APIC.
909  *
910  * cpi_chipid
911  *
912  *	This value indicates the ID of the package that the logical CPU is a
913  *	part of. This value is allowed to be synthetic. It is usually derived by
914  *	taking the CPU's APIC ID and determining how many bits are used to
915  *	represent CPU cores in the package. All logical CPUs that are part of
916  *	the same package must have the same value.
917  *
918  * cpi_coreid
919  *
920  *	This represents the ID of a CPU core. Two logical CPUs should only have
921  *	the same cpi_coreid value if they are part of the same core. These
922  *	values may be synthetic. On systems that support SMT, this value is
923  *	usually derived from the APIC ID, otherwise it is often synthetic and
924  *	just set to the value of the cpu_id in the cpu_t.
925  *
926  * cpi_pkgcoreid
927  *
928  *	This is similar to the cpi_coreid in that logical CPUs that are part of
929  *	the same core should have the same ID. The main difference is that these
930  *	values are only required to be unique to a given socket.
931  *
932  * cpi_clogid
933  *
934  *	This represents the logical ID of a logical CPU. This value should be
935  *	unique within a given socket for each logical CPU. This is allowed to be
936  *	synthetic, though it is usually based off of the CPU's apic ID. The
937  *	broader system expects that logical CPUs that are part of the same
938  *	core have contiguous numbers. For example, if there were two threads per
939  *	core, then their IDs divided by two should be the same, with the first
940  *	modulo two being zero and the second being one. For example, IDs 4 and 5
941  *	indicate two logical CPUs that are part of the same core. But IDs 5 and
942  *	6 represent two logical CPUs that are part of different cores.
943  *
944  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
945  *	from the same source, strictly speaking, they don't have to be and the
946  *	two values should be considered logically independent. One should not
947  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
948  *	some kind of relationship. While this is tempting, we've seen cases on
949  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
950  *
951  * cpi_ncpu_per_chip
952  *
953  *	This value indicates the total number of logical CPUs that exist in the
954  *	physical package. Critically, this is not the number of logical CPUs
955  *	that exist for just the single core.
956  *
957  *	This value should be the same for all logical CPUs in the same package.
958  *
959  * cpi_ncore_per_chip
960  *
961  *	This value indicates the total number of physical CPU cores that exist
962  *	in the package. The system compares this value with cpi_ncpu_per_chip to
963  *	determine if simultaneous multi-threading (SMT) is enabled. When
964  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
965  *	the X86FSET_HTT feature is not set. If this value is greater than one,
966  *	then we consider the processor to have the feature X86FSET_CMP, to
967  *	indicate that there is support for more than one core.
968  *
969  *	This value should be the same for all logical CPUs in the same package.
970  *
971  * cpi_procnodes_per_pkg
972  *
973  *	This value indicates the number of 'nodes' that exist in the package.
974  *	When processors are actually a multi-chip module, this represents the
975  *	number of such modules that exist in the package. Currently, on Intel
976  *	based systems this member is always set to 1.
977  *
978  *	This value should be the same for all logical CPUs in the same package.
979  *
980  * cpi_procnodeid
981  *
982  *	This value indicates the ID of the node that the logical CPU is a part
983  *	of. All logical CPUs that are in the same node must have the same value
984  *	here. This value must be unique across all of the packages in the
985  *	system.  On Intel based systems, this is currently set to the value in
986  *	cpi_chipid because there is only one node.
987  *
988  * cpi_cores_per_compunit
989  *
990  *	This value indicates the number of cores that are part of a compute
991  *	unit. See the AMD topology section for this. This member only has real
992  *	meaning currently for AMD Bulldozer family processors. For all other
993  *	processors, this should currently be set to 1.
994  *
995  * cpi_compunitid
996  *
997  *	This indicates the compute unit that the logical CPU belongs to. For
998  *	processors without AMD Bulldozer-style compute units this should be set
999  *	to the value of cpi_coreid.
1000  *
1001  * cpi_ncpu_shr_last_cache
1002  *
1003  *	This indicates the number of logical CPUs that are sharing the same last
1004  *	level cache. This value should be the same for all CPUs that are sharing
1005  *	that cache. The last cache refers to the cache that is closest to memory
1006  *	and furthest away from the CPU.
1007  *
1008  * cpi_last_lvl_cacheid
1009  *
1010  *	This indicates the ID of the last cache that the logical CPU uses. This
1011  *	cache is often shared between multiple logical CPUs and is the cache
1012  *	that is closest to memory and furthest away from the CPU. This value
1013  *	should be the same for a group of logical CPUs only if they actually
1014  *	share the same last level cache. IDs should not overlap between
1015  *	packages.
1016  *
1017  * cpi_ncore_bits
1018  *
1019  *	This indicates the number of bits that are required to represent all of
1020  *	the cores in the system. As cores are derived based on their APIC IDs,
1021  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1022  *	this value to be larger than the actual number of IDs that are present
1023  *	in the system. This is used to size tables by the CMI framework. It is
1024  *	only filled in for Intel and AMD CPUs.
1025  *
1026  * cpi_nthread_bits
1027  *
1028  *	This indicates the number of bits required to represent all of the IDs
1029  *	that cover the logical CPUs that exist on a given core. It's OK for this
1030  *	value to be larger than the actual number of IDs that are present in the
1031  *	system.  This is used to size tables by the CMI framework. It is
1032  *	only filled in for Intel and AMD CPUs.
1033  *
1034  * -----------
1035  * Hypervisors
1036  * -----------
1037  *
1038  * If trying to manage the differences between vendors wasn't bad enough, it can
1039  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1040  * the ability to interpose on all cpuid instructions and change them to suit
1041  * their purposes. In general, this is necessary as the hypervisor wants to be
1042  * able to present a more uniform set of features or not necessarily give the
1043  * guest operating system kernel knowledge of all features so it can be
1044  * more easily migrated between systems.
1045  *
1046  * When it comes to trying to determine topology information, this can be a
1047  * double edged sword. When a hypervisor doesn't actually implement a cpuid
1048  * leaf, it'll often return all zeros. Because of that, you'll often see various
1049  * checks scattered about fields being non-zero before we assume we can use
1050  * them.
1051  *
1052  * When it comes to topology information, the hypervisor is often incentivized
1053  * to lie to you about topology. This is because it doesn't always actually
1054  * guarantee that topology at all. The topology path we take in the system
1055  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1056  * or AMD CPU, then we basically do our normal path. However, when they don't
1057  * use an actual vendor, then that usually turns into multiple one-core CPUs
1058  * that we enumerate that are often on different sockets. The actual behavior
1059  * depends greatly on what the hypervisor actually exposes to us.
1060  *
1061  * --------------------
1062  * Exposing Information
1063  * --------------------
1064  *
1065  * We expose CPUID information in three different forms in the system.
1066  *
1067  * The first is through the x86_featureset variable. This is used in conjunction
1068  * with the is_x86_feature() function. This is queried by x86-specific functions
1069  * to determine which features are or aren't present in the system and to make
1070  * decisions based upon them. For example, users of this include everything from
1071  * parts of the system dedicated to reliability, availability, and
1072  * serviceability (RAS), to making decisions about how to handle security
1073  * mitigations, to various x86-specific drivers. General purpose or
1074  * architecture independent drivers should never be calling this function.
1075  *
1076  * The second means is through the auxiliary vector. The auxiliary vector is a
1077  * series of tagged data that the kernel passes down to a user program when it
1078  * begins executing. This information is used to indicate to programs what
1079  * instruction set extensions are present. For example, information about the
1080  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1081  * since user programs cannot make use of it. However, things like the AVX
1082  * instruction sets are. Programs use this information to make run-time
1083  * decisions about what features they should use. As an example, the run-time
1084  * link-editor (rtld) can relocate different functions depending on the hardware
1085  * support available.
1086  *
1087  * The final form is through a series of accessor functions that all have the
1088  * form cpuid_get*. This is used by a number of different subsystems in the
1089  * kernel to determine more detailed information about what we're running on,
1090  * topology information, etc. Some of these subsystems include processor groups
1091  * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1092  * microcode, and performance monitoring. These functions all ASSERT that the
1093  * CPU they're being called on has reached a certain cpuid pass. If the passes
1094  * are rearranged, then this needs to be adjusted.
1095  *
1096  * -----------------------------------------------
1097  * Speculative Execution CPU Side Channel Security
1098  * -----------------------------------------------
1099  *
1100  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1101  * execution in the CPU to create side channels there have been a number of
1102  * different attacks and corresponding issues that the operating system needs to
1103  * mitigate against. The following list is some of the common, but not
1104  * exhaustive, set of issues that we know about and have done some or need to do
1105  * more work in the system to mitigate against:
1106  *
1107  *   - Spectre v1
1108  *   - swapgs (Spectre v1 variant)
1109  *   - Spectre v2
1110  *   - Meltdown (Spectre v3)
1111  *   - Rogue Register Read (Spectre v3a)
1112  *   - Speculative Store Bypass (Spectre v4)
1113  *   - ret2spec, SpectreRSB
1114  *   - L1 Terminal Fault (L1TF)
1115  *   - Microarchitectural Data Sampling (MDS)
1116  *
1117  * Each of these requires different sets of mitigations and has different attack
1118  * surfaces. For the most part, this discussion is about protecting the kernel
1119  * from non-kernel executing environments such as user processes and hardware
1120  * virtual machines. Unfortunately, there are a number of user vs. user
1121  * scenarios that exist with these. The rest of this section will describe the
1122  * overall approach that the system has taken to address these as well as their
1123  * shortcomings. Unfortunately, not all of the above have been handled today.
1124  *
1125  * SPECTRE v2, ret2spec, SpectreRSB
1126  *
1127  * The second variant of the spectre attack focuses on performing branch target
1128  * injection. This generally impacts indirect call instructions in the system.
1129  * There are four different ways to mitigate this issue that are commonly
1130  * described today:
1131  *
1132  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1133  *  2. Using Retpolines and RSB Stuffing
1134  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1135  *  4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1136  *
1137  * IBRS uses a feature added to microcode to restrict speculation, among other
1138  * things. This form of mitigation has not been used as it has been generally
1139  * seen as too expensive and requires reactivation upon various transitions in
1140  * the system.
1141  *
1142  * As a less impactful alternative to IBRS, retpolines were developed by
1143  * Google. These basically require one to replace indirect calls with a specific
1144  * trampoline that will cause speculation to fail and break the attack.
1145  * Retpolines require compiler support. We always build with retpolines in the
1146  * external thunk mode. This means that a traditional indirect call is replaced
1147  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1148  * of this is that all indirect function calls are performed through a register.
1149  *
1150  * We have to use a common external location of the thunk and not inline it into
1151  * the callsite so that way we can have a single place to patch these functions.
1152  * As it turns out, we currently have two different forms of retpolines that
1153  * exist in the system:
1154  *
1155  *  1. A full retpoline
1156  *  2. A no-op version
1157  *
1158  * The first one is used in the general case. Historically, there was an
1159  * AMD-specific optimized retpoline variant that was based around using a
1160  * serializing lfence instruction; however, in March 2022 it was announced that
1161  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1162  * use it and it is no longer available in the system.
1163  *
1164  * The third form of mitigation described above, eIBRS, is the most curious. It
1165  * turns out that the way that retpolines are implemented is that they rely on
1166  * how speculation is performed on a 'ret' instruction. Intel has continued to
1167  * optimize this process (which is partly why we need to have return stack
1168  * buffer stuffing, but more on that in a bit) and in processors starting with
1169  * Cascade Lake on the server side, it's dangerous to rely on retpolines.
1170  * Instead, a new mechanism has been introduced called Enhanced IBRS (eIBRS).
1171  *
1172  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1173  * physical core. However, if this is the case, we don't want to use retpolines
1174  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1175  * function (called a thunk) into a jmp instruction. This means that we're still
1176  * paying the cost of an extra jump to the external thunk, but it gives us
1177  * flexibility and the ability to have a single kernel image that works across a
1178  * wide variety of systems and hardware features.
1179  *
1180  * Unfortunately, this alone is insufficient. First, Skylake systems have
1181  * additional speculation for the Return Stack Buffer (RSB) which is used to
1182  * return from call instructions which retpolines take advantage of. However,
1183  * this problem is not just limited to Skylake and is actually more pernicious.
1184  * The SpectreRSB paper introduces several more problems that can arise with
1185  * dealing with this. The RSB can be poisoned just like the indirect branch
1186  * predictor. This means that one needs to clear the RSB when transitioning
1187  * between two different privilege domains. Some examples include:
1188  *
1189  *  - Switching between two different user processes
1190  *  - Going between user land and the kernel
1191  *  - Returning to the kernel from a hardware virtual machine
1192  *
1193  * Mitigating this involves combining a couple of different things. The first is
1194  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1195  * Bridge. When an RSB entry refers to a user address and we're executing in the
1196  * kernel, speculation through it will be stopped when SMEP is enabled. This
1197  * protects against a number of the different cases that we would normally be
1198  * worried about such as when we enter the kernel from user land.
1199  *
1200  * To prevent against additional manipulation of the RSB from other contexts
1201  * such as a non-root VMX context attacking the kernel we first look to
1202  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1203  * nothing else that we need to do to protect the kernel at this time.
1204  *
1205  * Unfortunately, eIBRS or not, we need to manually overwrite the contents of
1206  * the return stack buffer. We do this through the x86_rsb_stuff() function.
1207  * Currently this is employed on context switch and vmx_exit. The
1208  * x86_rsb_stuff() function is disabled only when mitigations in general are.
1209  *
1210  * If SMEP is not present, then we would have to stuff the RSB every time we
1211  * transitioned from user mode to the kernel, which isn't very practical right
1212  * now.
1213  *
1214  * To fully protect user to user and vmx to vmx attacks from these classes of
1215  * issues, we would also need to allow them to opt into performing an Indirect
1216  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1217  *
1218  * The fourth form of mitigation here is specific to AMD and is called Automated
1219  * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1220  * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1221  * (extended feature enable register) MSR. This bit basically says that IBRS
1222  * acts as though it is always active when executing at CPL0 and when executing
1223  * in the 'host' context when SEV-SNP is enabled.
1224  *
1225  * When this is active, AMD states that the RSB is cleared on VMEXIT and
1226  * therefore stuffing it there is unnecessary. While this handles RSB stuffing
1227  * attacks from SVM to the kernel, we must still consider the remaining cases
1228  * that exist, just like above. While traditionally AMD employed a 32 entry RSB
1229  * allowing the traditional technique to work, this is not true on all CPUs.
1230  * While a write to IBRS would clear the RSB if the processor supports more
1231  * than 32 entries (but not otherwise), AMD states that as long as at least a
1232  * single 4 KiB unmapped guard page is present between user and kernel address
1233  * spaces and SMEP is enabled, then there is no need to clear the RSB at all.
1234  *
1235  * By default, the system will enable RSB stuffing and the required variant of
1236  * retpolines and store that information in the x86_spectrev2_mitigation value.
1237  * This will be evaluated after a microcode update as well, though it is
1238  * expected that microcode updates will not take away features. This may mean
1239  * that a late loaded microcode may not end up in the optimal configuration
1240  * (though this should be rare).
1241  *
1242  * Currently we do not build kmdb with retpolines or perform any additional side
1243  * channel security mitigations for it. One complication with kmdb is that it
1244  * requires its own retpoline thunks and it would need to adjust itself based on
1245  * what the kernel does. The threat model of kmdb is more limited and therefore
1246  * it may make more sense to investigate using prediction barriers as the whole
1247  * system is only executing a single instruction at a time while in kmdb.
1248  *
1249  * SPECTRE v1, v4
1250  *
1251  * The v1 and v4 variants of spectre are not currently mitigated in the
1252  * system and require other classes of changes to occur in the code.
1253  *
1254  * SPECTRE v1 (SWAPGS VARIANT)
1255  *
1256  * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1257  * can generally affect any branch-dependent code. The swapgs issue is one
1258  * variant of this. If we are coming in from userspace, we can have code like
1259  * this:
1260  *
1261  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1262  *	je	1f
1263  *	movq	$0, REGOFF_SAVFP(%rsp)
1264  *	swapgs
1265  *	1:
1266  *	movq	%gs:CPU_THREAD, %rax
1267  *
1268  * If an attacker can cause a mis-speculation of the branch here, we could skip
1269  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1270  * load. If subsequent code can act as the usual Spectre cache gadget, this
1271  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1272  * any use of the %gs override.
1273  *
1274  * The other case is also an issue: if we're coming into a trap from kernel
1275  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1276  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1277  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1278  * case, and the fix is the same in both cases (an lfence at the branch target
1279  * 1: in this example), we'll just do it unconditionally.
1280  *
1281  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1282  * harder for user-space to actually set a useful %gsbase value: although it's
1283  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1284  * mitigate anyway.
1285  *
1286  * MELTDOWN
1287  *
1288  * Meltdown, or spectre v3, allowed a user process to read any data in their
1289  * address space regardless of whether or not the page tables in question
1290  * allowed the user to have the ability to read them. The solution to meltdown
1291  * is kernel page table isolation. In this world, there are two page tables that
1292  * are used for a process, one in user land and one in the kernel. To implement
1293  * this we use per-CPU page tables and switch between the user and kernel
1294  * variants when entering and exiting the kernel.  For more information about
1295  * this process and how the trampolines work, please see the big theory
1296  * statements and additional comments in:
1297  *
1298  *  - uts/i86pc/ml/kpti_trampolines.s
1299  *  - uts/i86pc/vm/hat_i86.c
1300  *
1301  * While Meltdown only impacted Intel systems and there are also Intel systems
1302  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1303  * kernel page table isolation enabled. While this may at first seem weird, an
1304  * important thing to remember is that you can't speculatively read an address
1305  * if it's never in your page table at all. Having user processes without kernel
1306  * pages present provides us with an important layer of defense in the kernel
1307  * against any other side channel attacks that exist and have yet to be
1308  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1309  * default, no matter the x86 system.
1310  *
1311  * L1 TERMINAL FAULT
1312  *
1313  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1314  * execution uses page table entries. Effectively, it is two different problems.
1315  * The first is that it ignores the not present bit in the page table entries
1316  * when performing speculative execution. This means that something can
1317  * speculatively read the listed physical address if it's present in the L1
1318  * cache under certain conditions (see Intel's documentation for the full set of
1319  * conditions). Secondly, this can be used to bypass hardware virtualization
1320  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1321  * instructions.
1322  *
1323  * For the non-hardware virtualized case, this is relatively easy to deal with.
1324  * We must make sure that all unmapped pages have an address of zero. This means
1325  * that they could read the first 4k of physical memory; however, we never use
1326  * that first page in the operating system and always skip putting it in our
1327  * memory map, even if firmware tells us we can use it in our memory map. While
1328  * other systems try to put extra metadata in the address and reserved bits,
1329  * which led to this being problematic in those cases, we do not.
1330  *
1331  * For hardware virtual machines things are more complicated. Because they can
1332  * construct their own page tables, it isn't hard for them to perform this
1333  * attack against any physical address. The one wrinkle is that this physical
1334  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1335  * to flush the L1 data cache. We wrap this up in the function
1336  * spec_uarch_flush(). This function is also used in the mitigation of
1337  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1338  * hypervisors such as KVM or bhyve are responsible for performing this before
1339  * entering the guest.
1340  *
1341  * Because this attack takes place in the L1 cache, there's another wrinkle
1342  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1343  * designs. This means that when a thread enters a hardware virtualized context
1344  * and flushes the L1 data cache, the other thread on the processor may then go
1345  * ahead and put new data in it that can be potentially attacked. While one
1346  * solution is to disable SMT on the system, another option that is available is
1347  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1348  * goes through and makes sure that if a HVM is being scheduled on one thread,
1349  * then the thing on the other thread is from the same hardware virtual machine.
1350  * If an interrupt comes in or the guest exits to the broader system, then the
1351  * other SMT thread will be kicked out.
1352  *
1353  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1354  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1355  * perform L1TF related mitigations.
1356  *
1357  * MICROARCHITECTURAL DATA SAMPLING
1358  *
1359  * Microarchitectural data sampling (MDS) is a combination of four discrete
1360  * vulnerabilities that are similar issues affecting various parts of the CPU's
1361  * microarchitectural implementation around load, store, and fill buffers.
1362  * Specifically it is made up of the following subcomponents:
1363  *
1364  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1365  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1366  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1367  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1368  *
1369  * To begin addressing these, Intel has introduced another feature in microcode
1370  * called MD_CLEAR. This changes the verw instruction to operate in a different
1371  * way. This allows us to execute the verw instruction in a particular way to
1372  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1373  * updated when this microcode is present to flush this state.
1374  *
1375  * Primarily we need to flush this state whenever we transition from the kernel
1376  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1377  * little bit different. Here the structures are statically sized when a logical
1378  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1379  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1380  * mwait, or another ACPI method. To perform these flushes, we call
1381  * x86_md_clear() at all of these transition points.
1382  *
1383  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1384  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1385  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1386  * a no-op.
1387  *
1388  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1389  * particular, everything we've discussed above is only valid for a single
1390  * thread executing on a core. In the case where you have hyper-threading
1391  * present, this attack can be performed between threads. The theoretical fix
1392  * for this is to ensure that both threads are always in the same security
1393  * domain. This means that they are executing in the same ring and mutually
1394  * trust each other. Practically speaking, this would mean that a system call
1395  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1396  * Rather than implement this, we recommend that one disables hyper-threading
1397  * through the use of psradm -aS.
1398  *
1399  * TSX ASYNCHRONOUS ABORT
1400  *
1401  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1402  * behaves like MDS, but leverages Intel's transactional instructions as another
1403  * vector. Effectively, when a transaction hits one of these cases (unmapped
1404  * page, various cache snoop activity, etc.) then the same data can be exposed
1405  * as in the case of MDS. This means that you can attack your twin.
1406  *
1407  * Intel has described that there are two different ways that we can mitigate
1408  * this problem on affected processors:
1409  *
1410  *   1) We can use the same techniques used to deal with MDS. Flushing the
1411  *      microarchitectural buffers and disabling hyperthreading will mitigate
1412  *      this in the same way.
1413  *
1414  *   2) Using microcode to disable TSX.
1415  *
1416  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1417  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1418  * That's OK as we're already doing all such mitigations. On the other hand,
1419  * processors with MDS_NO are all supposed to receive microcode updates that
1420  * enumerate support for disabling TSX. In general, we'd rather use this method
1421  * when available as it doesn't require disabling hyperthreading to be
1422  * effective. Currently we basically are relying on microcode for processors
1423  * that enumerate MDS_NO.
1424  *
1425  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1426  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1427  * different powers. The first allows us to cause all transactions to
1428  * immediately abort. The second gives us a means of disabling TSX completely,
1429  * which includes removing it from cpuid. If we have support for this in
1430  * microcode during the first cpuid pass, then we'll disable TSX completely such
1431  * that user land never has a chance to observe the bit. However, if we are late
1432  * loading the microcode, then we must use the functionality to cause
1433  * transactions to automatically abort. This is necessary for user land's sake.
1434  * Once a program sees a cpuid bit, it must not be taken away.
1435  *
1436  * We track whether or not we should do this based on what cpuid pass we're in.
1437  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1438  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1439  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1440  * second time after we do the initial microcode update.  As a result we need to
1441  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1442  * suitable microcode on the current CPU (which happens prior to
1443  * cpuid_pass_ucode()).
1444  *
1445  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1446  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1447  * unfortunate feature in a number of ways, and taking the opportunity to
1448  * finally be able to turn it off is likely to be of benefit in the future.
1449  *
1450  * SUMMARY
1451  *
1452  * The following table attempts to summarize the mitigations for various issues
1453  * and what's done in various places:
1454  *
1455  *  - Spectre v1: Not currently mitigated
1456  *  - swapgs: lfences after swapgs paths
1457  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1458  *  - Meltdown: Kernel Page Table Isolation
1459  *  - Spectre v3a: Updated CPU microcode
1460  *  - Spectre v4: Not currently mitigated
1461  *  - SpectreRSB: SMEP and RSB Stuffing
1462  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1463  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1464  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1465  *
1466  * The following table indicates the x86 feature set bits that indicate that a
1467  * given problem has been solved or a notable feature is present:
1468  *
1469  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1470  *  - MDS_NO: All forms of MDS
1471  *  - TAA_NO: TAA
1472  */
1473 
1474 #include <sys/types.h>
1475 #include <sys/archsystm.h>
1476 #include <sys/x86_archext.h>
1477 #include <sys/kmem.h>
1478 #include <sys/systm.h>
1479 #include <sys/cmn_err.h>
1480 #include <sys/sunddi.h>
1481 #include <sys/sunndi.h>
1482 #include <sys/cpuvar.h>
1483 #include <sys/processor.h>
1484 #include <sys/sysmacros.h>
1485 #include <sys/pg.h>
1486 #include <sys/fp.h>
1487 #include <sys/controlregs.h>
1488 #include <sys/bitmap.h>
1489 #include <sys/auxv_386.h>
1490 #include <sys/memnode.h>
1491 #include <sys/pci_cfgspace.h>
1492 #include <sys/comm_page.h>
1493 #include <sys/mach_mmu.h>
1494 #include <sys/ucode.h>
1495 #include <sys/tsc.h>
1496 #include <sys/kobj.h>
1497 #include <sys/asm_misc.h>
1498 #include <sys/bitmap.h>
1499 
1500 #ifdef __xpv
1501 #include <sys/hypervisor.h>
1502 #else
1503 #include <sys/ontrap.h>
1504 #endif
1505 
1506 uint_t x86_vendor = X86_VENDOR_IntelClone;
1507 uint_t x86_type = X86_TYPE_OTHER;
1508 uint_t x86_clflush_size = 0;
1509 
1510 #if defined(__xpv)
1511 int x86_use_pcid = 0;
1512 int x86_use_invpcid = 0;
1513 #else
1514 int x86_use_pcid = -1;
1515 int x86_use_invpcid = -1;
1516 #endif
1517 
/*
 * The strategy in use against Spectre v2 (branch target injection). The
 * numeric values are spelled out but are the same ones the compiler would
 * assign implicitly.
 */
typedef enum {
	/* Retpoline thunks (combined with RSB stuffing) */
	X86_SPECTREV2_RETPOLINE = 0,
	/* Enhanced Indirect Branch Restricted Speculation (eIBRS) */
	X86_SPECTREV2_ENHANCED_IBRS = 1,
	/* Automated Indirect Branch Restricted Speculation (AIBRS) */
	X86_SPECTREV2_AUTO_IBRS = 2,
	/* No Spectre v2 mitigation is applied */
	X86_SPECTREV2_DISABLED = 3
} x86_spectrev2_mitigation_t;
1524 
1525 uint_t x86_disable_spectrev2 = 0;
1526 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1527     X86_SPECTREV2_RETPOLINE;
1528 
/*
 * The mitigation status for TSX Asynchronous Abort (TAA). The numeric values
 * are spelled out but are the same ones the compiler would assign implicitly.
 */
typedef enum {
	/* no mitigation available for TAA side-channels */
	X86_TAA_NOTHING = 0,
	/* mitigation disabled via x86_disable_taa */
	X86_TAA_DISABLED = 1,
	/* MDS mitigation also suffices for TAA */
	X86_TAA_MD_CLEAR = 2,
	/* transactions are forced to abort */
	X86_TAA_TSX_FORCE_ABORT = 3,
	/* force abort transactions and hide from CPUID */
	X86_TAA_TSX_DISABLE = 4,
	/* TSX potentially active but H/W not TAA-vulnerable */
	X86_TAA_HW_MITIGATED = 5
} x86_taa_mitigation_t;
1546 
1547 uint_t x86_disable_taa = 0;
1548 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1549 
1550 uint_t pentiumpro_bug4046376;
1551 
1552 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1553 
1554 static char *x86_feature_names[NUM_X86_FEATURES] = {
1555 	"lgpg",
1556 	"tsc",
1557 	"msr",
1558 	"mtrr",
1559 	"pge",
1560 	"de",
1561 	"cmov",
1562 	"mmx",
1563 	"mca",
1564 	"pae",
1565 	"cv8",
1566 	"pat",
1567 	"sep",
1568 	"sse",
1569 	"sse2",
1570 	"htt",
1571 	"asysc",
1572 	"nx",
1573 	"sse3",
1574 	"cx16",
1575 	"cmp",
1576 	"tscp",
1577 	"mwait",
1578 	"sse4a",
1579 	"cpuid",
1580 	"ssse3",
1581 	"sse4_1",
1582 	"sse4_2",
1583 	"1gpg",
1584 	"clfsh",
1585 	"64",
1586 	"aes",
1587 	"pclmulqdq",
1588 	"xsave",
1589 	"avx",
1590 	"vmx",
1591 	"svm",
1592 	"topoext",
1593 	"f16c",
1594 	"rdrand",
1595 	"x2apic",
1596 	"avx2",
1597 	"bmi1",
1598 	"bmi2",
1599 	"fma",
1600 	"smep",
1601 	"smap",
1602 	"adx",
1603 	"rdseed",
1604 	"mpx",
1605 	"avx512f",
1606 	"avx512dq",
1607 	"avx512pf",
1608 	"avx512er",
1609 	"avx512cd",
1610 	"avx512bw",
1611 	"avx512vl",
1612 	"avx512fma",
1613 	"avx512vbmi",
1614 	"avx512_vpopcntdq",
1615 	"avx512_4vnniw",
1616 	"avx512_4fmaps",
1617 	"xsaveopt",
1618 	"xsavec",
1619 	"xsaves",
1620 	"sha",
1621 	"umip",
1622 	"pku",
1623 	"ospke",
1624 	"pcid",
1625 	"invpcid",
1626 	"ibrs",
1627 	"ibpb",
1628 	"stibp",
1629 	"ssbd",
1630 	"ssbd_virt",
1631 	"rdcl_no",
1632 	"ibrs_all",
1633 	"rsba",
1634 	"ssb_no",
1635 	"stibp_all",
1636 	"flush_cmd",
1637 	"l1d_vmentry_no",
1638 	"fsgsbase",
1639 	"clflushopt",
1640 	"clwb",
1641 	"monitorx",
1642 	"clzero",
1643 	"xop",
1644 	"fma4",
1645 	"tbm",
1646 	"avx512_vnni",
1647 	"amd_pcec",
1648 	"md_clear",
1649 	"mds_no",
1650 	"core_thermal",
1651 	"pkg_thermal",
1652 	"tsx_ctrl",
1653 	"taa_no",
1654 	"ppin",
1655 	"vaes",
1656 	"vpclmulqdq",
1657 	"lfence_serializing",
1658 	"gfni",
1659 	"avx512_vp2intersect",
1660 	"avx512_bitalg",
1661 	"avx512_vbmi2",
1662 	"avx512_bf16",
1663 	"auto_ibrs"
1664 };
1665 
1666 boolean_t
1667 is_x86_feature(void *featureset, uint_t feature)
1668 {
1669 	ASSERT(feature < NUM_X86_FEATURES);
1670 	return (BT_TEST((ulong_t *)featureset, feature));
1671 }
1672 
1673 void
1674 add_x86_feature(void *featureset, uint_t feature)
1675 {
1676 	ASSERT(feature < NUM_X86_FEATURES);
1677 	BT_SET((ulong_t *)featureset, feature);
1678 }
1679 
1680 void
1681 remove_x86_feature(void *featureset, uint_t feature)
1682 {
1683 	ASSERT(feature < NUM_X86_FEATURES);
1684 	BT_CLEAR((ulong_t *)featureset, feature);
1685 }
1686 
1687 boolean_t
1688 compare_x86_featureset(void *setA, void *setB)
1689 {
1690 	/*
1691 	 * We assume that the unused bits of the bitmap are always zero.
1692 	 */
1693 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1694 		return (B_TRUE);
1695 	} else {
1696 		return (B_FALSE);
1697 	}
1698 }
1699 
1700 void
1701 print_x86_featureset(void *featureset)
1702 {
1703 	uint_t i;
1704 
1705 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1706 		if (is_x86_feature(featureset, i)) {
1707 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1708 			    x86_feature_names[i]);
1709 		}
1710 	}
1711 }
1712 
1713 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1714 static size_t xsave_state_size = 0;
1715 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1716 boolean_t xsave_force_disable = B_FALSE;
1717 extern int disable_smap;
1718 
1719 /*
1720  * This is set to platform type we are running on.
1721  */
1722 static int platform_type = -1;
1723 
1724 #if !defined(__xpv)
1725 /*
1726  * Variable to patch if hypervisor platform detection needs to be
1727  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1728  */
1729 int enable_platform_detection = 1;
1730 #endif
1731 
/*
 * MONITOR/MWAIT bookkeeping.
 *
 * buf_actual and size_actual record the address and size of the memory that
 * was really allocated in order to obtain a properly aligned mwait buffer;
 * these are the values that should be passed to kmem_free(). Currently
 * kmem_alloc() and mwait both happen to use processor cache-line alignment,
 * but this is not guaranteed in the future.
 */
struct mwait_info {
	size_t		mon_min;	/* smallest size avoiding missed wakeups */
	size_t		mon_max;	/* size that avoids false wakeups */
	size_t		size_actual;	/* bytes actually allocated */
	void		*buf_actual;	/* start of the actual allocation */
	uint32_t	support;	/* monitor/mwait support on this CPU */
};
1747 
1748 /*
1749  * xsave/xrestor info.
1750  *
1751  * This structure contains HW feature bits and the size of the xsave save area.
1752  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1753  * (xsave_state) to describe the xsave layout. However, at runtime the
1754  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1755  * xsave_state structure simply represents the legacy layout of the beginning
1756  * of the xsave area.
1757  */
1758 struct xsave_info {
1759 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1760 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1761 	size_t		xsav_max_size;  /* max size save area for HW features */
1762 	size_t		ymm_size;	/* AVX: size of ymm save area */
1763 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1764 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1765 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1766 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1767 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1768 	size_t		opmask_size;	/* AVX512: size of opmask save */
1769 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1770 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1771 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1772 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1773 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1774 };
1775 
1776 
/*
 * Sizes of the cpuid leaf caches kept in the cpuid_info data structure.
 * Leaves beyond these ranges are not cached and remain accessible through
 * the cpuid instruction.
 */

#define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
#define	NMAX_CPI_EXTD	0x22		/* eax = 0x80000000 .. 0x80000021 */
#define	NMAX_CPI_TOPO	0x10		/* Sanity check on leaf 8X26, 1F */
1786 
/*
 * Per-CPU cache of raw CPUID register values plus the identification and
 * topology data derived from them. One of these hangs off each CPU's
 * cpu_m.mcpu_cpi pointer.
 *
 * See the big theory statement for a more detailed explanation of what some of
 * these members mean.
 */
struct cpuid_info {
	uint_t cpi_pass;		/* last pass completed */
	/*
	 * standard function information
	 */
	uint_t cpi_maxeax;		/* fn 0: %eax */
	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
	uint_t cpi_vendor;		/* enum of cpi_vendorstr */

	uint_t cpi_family;		/* fn 1: extended family */
	uint_t cpi_model;		/* fn 1: extended model */
	uint_t cpi_step;		/* fn 1: stepping */
	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
					/*		AMD: package/socket # */
	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
	int cpi_clogid;			/* fn 1: %ebx: thread # */
	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
	uint_t cpi_ncache;		/* fn 2: number of elements */
	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
					/* Intel fn: 4, AMD fn: 8000001d */
	/*
	 * Array of cpi_cache_leaf_size pointers. Entry 0 refers to the
	 * standard cpuid storage and is not separately allocated; entries 1
	 * and up are individually allocated (see cpuid_free_space()).
	 */
	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
	struct cpuid_regs cpi_sub7[1];	/* Leaf 7, sub-leaf 1 */
	/*
	 * extended function information
	 */
	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
	char cpi_brandstr[49];		/* fn 0x8000000[234] */
	/*
	 * NOTE(review): the address-size fields are architecturally reported
	 * by fn 0x80000008 %eax, so the 0x80000006 in the two comments below
	 * looks like a typo -- confirm against the code that fills them in.
	 */
	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */

	id_t cpi_coreid;		/* same coreid => strands share core */
	int cpi_pkgcoreid;		/* core number within single package */
	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
					/* Intel: fn 4: %eax[31-26] */

	/*
	 * These values represent the number of bits that are required to store
	 * information about the number of cores and threads.
	 */
	uint_t cpi_ncore_bits;
	uint_t cpi_nthread_bits;
	/*
	 * supported feature information
	 *
	 * The names below are presumably the indices into cpi_support[]; there
	 * is one name per array slot.
	 */
	uint32_t cpi_support[6];
#define	STD_EDX_FEATURES	0
#define	AMD_EDX_FEATURES	1
#define	TM_EDX_FEATURES		2
#define	STD_ECX_FEATURES	3
#define	AMD_ECX_FEATURES	4
#define	STD_EBX_FEATURES	5
	/*
	 * Synthesized information, where known.
	 */
	x86_chiprev_t cpi_chiprev;	/* See X86_CHIPREV_* in x86_archext.h */
	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
	uint32_t cpi_socket;		/* Chip package/socket type */
	x86_uarchrev_t cpi_uarchrev;	/* Microarchitecture and revision */

	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
	uint32_t cpi_apicid;		/* see cpuid_gather_apicid() */
	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
					/* Intel: 1 */
	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */

	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */

	/*
	 * AMD and Intel extended topology information. Leaf 8X26 (AMD) and
	 * eventually leaf 0x1F (Intel).
	 *
	 * cpi_topo_nleaves is the number of sub-leaves gathered into
	 * cpi_topo[] before the terminating entry was seen (see
	 * cpuid_gather_ext_topo_leaf()).
	 */
	uint_t cpi_topo_nleaves;
	struct cpuid_regs cpi_topo[NMAX_CPI_TOPO];
};
1873 
1874 
/*
 * Statically allocated cpuid_info. cpuid_free_space() asserts that it is never
 * asked to free this instance; presumably it backs the boot CPU, which is set
 * up before memory allocation is available (see cpuid_alloc_space()).
 */
static struct cpuid_info cpuid_info0;
1876 
/*
 * These bit fields are defined by the Intel Application Note AP-485
 * "Intel Processor Identification and the CPUID Instruction"
 *
 * Every macro takes a pointer to a struct cpuid_info and reads the register
 * values cached in it; none of them executes the cpuid instruction.
 */
/* Fields of the leaf 1 %eax value. */
#define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
#define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
#define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
#define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
#define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
#define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)

/*
 * Whole feature registers: leaf 1, extended leaf 1 (cpi_extd[1]), leaf 7
 * (cpi_std[7]) and leaf 7 sub-leaf 1 (cpi_sub7[0]). These expand to lvalues.
 */
#define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
#define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
#define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
#define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
#define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
#define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
#define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
#define	CPI_FEATURES_7_1_EAX(cpi)	((cpi)->cpi_sub7[0].cp_eax)
1896 
/*
 * Fields of the leaf 1 %ebx value. Each byte of the register is a separate
 * field:
 *
 *	[7:0]	brand index
 *	[15:8]	CLFLUSH line size, in 8-byte chunks
 *	[23:16]	logical processor count
 *	[31:24]	initial (8-bit) APIC ID
 *
 * CPI_CHUNKS must start at bit 8; starting at bit 7 would fold the top bit of
 * the brand index into the chunk count.
 */
#define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
#define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 8)
#define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
#define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)

/*
 * Upper bounds used as sanity checks on values reported by the hardware.
 */
#define	CPI_MAXEAX_MAX		0x100		/* sanity control */
#define	CPI_XMAXEAX_MAX		0x80000100
#define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
#define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1906 
/*
 * Function 4 (Deterministic Cache Parameters) macros
 * Defined by Intel Application Note AP-485
 *
 * Unlike the CPI_* macros that take a struct cpuid_info pointer, these take a
 * pointer to a single struct cpuid_regs.
 */
#define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
#define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
#define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
#define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
#define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
#define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
/* Values of CPI_CACHE_TYPE() */
#define	CPI_CACHE_TYPE_DONE	0
#define	CPI_CACHE_TYPE_DATA	1
#define	CPI_CACHE_TYPE_INSTR	2
#define	CPI_CACHE_TYPE_UNIFIED	3
/*
 * Despite living in this group, this one is applied to the registers returned
 * by leaf 0xB (see cpuid_leafB_getids()).
 */
#define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)

#define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
#define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
#define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)

#define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)

#define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1930 
1931 
/*
 * A couple of shorthand macros to identify "later" P6-family chips
 * like the Pentium M and Core.  First, the "older" P6-based stuff
 * (loosely defined as "pre-Pentium-4"):
 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
 *
 * All three macros take a pointer to a structure with cpi_family and
 * cpi_model members (normally a struct cpuid_info) and evaluate to non-zero
 * when the predicate holds.  The argument is parenthesized at every use so
 * that any pointer-valued expression may be passed, not just a plain
 * identifier; it is evaluated more than once, so it must be free of side
 * effects.
 */
#define	IS_LEGACY_P6(cpi) (			\
	(cpi)->cpi_family == 6 &&		\
		((cpi)->cpi_model == 1 ||	\
		(cpi)->cpi_model == 3 ||	\
		(cpi)->cpi_model == 5 ||	\
		(cpi)->cpi_model == 6 ||	\
		(cpi)->cpi_model == 7 ||	\
		(cpi)->cpi_model == 8 ||	\
		(cpi)->cpi_model == 0xA ||	\
		(cpi)->cpi_model == 0xB)	\
)

/* A "new F6" is everything with family 6 that's not the above */
#define	IS_NEW_F6(cpi) (((cpi)->cpi_family == 6) && !IS_LEGACY_P6(cpi))

/* Extended family/model support */
#define	IS_EXTENDED_MODEL_INTEL(cpi) ((cpi)->cpi_family == 0x6 || \
	(cpi)->cpi_family >= 0xf)
1956 
/*
 * Info for monitor/mwait idle loop.
 *
 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
 * 2006.
 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
 * Documentation Updates" #33633, Rev 2.05, December 2006.
 *
 * The MWAIT_*(cpi) macros take a pointer to a struct cpuid_info and read the
 * cached leaf 1 and leaf 5 registers.
 */
#define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
#define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
#define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
#define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
#define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
#define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
#define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
#define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
/*
 * Number of sub-cstates for a given c-state.
 *
 * c_state is the bit offset of the 4-bit field to extract from leaf 5 %edx.
 * It is parenthesized so that an arbitrary expression can be passed; it is
 * evaluated twice, so it must be free of side effects.
 */
#define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
	BITX((cpi)->cpi_std[5].cp_edx, (c_state) + 3, (c_state))
1979 
/*
 * XSAVE leaf 0xD enumeration
 *
 * NOTE(review): presumably the expected offset and size, in bytes, of the
 * YMM state component described by leaf 0xD sub-leaf 2; the consumer is not
 * in this part of the file -- confirm.
 */
#define	CPUID_LEAFD_2_YMM_OFFSET	576
#define	CPUID_LEAFD_2_YMM_SIZE		256

/*
 * Common extended leaf names to cut down on typos.
 *
 * The suffix is the low part of the leaf number in hex, e.g.
 * CPUID_LEAF_EXT_1e is leaf 0x8000001e.
 */
#define	CPUID_LEAF_EXT_0		0x80000000
#define	CPUID_LEAF_EXT_8		0x80000008
#define	CPUID_LEAF_EXT_1d		0x8000001d
#define	CPUID_LEAF_EXT_1e		0x8000001e
#define	CPUID_LEAF_EXT_21		0x80000021
#define	CPUID_LEAF_EXT_26		0x80000026
1995 
1996 /*
1997  * Functions we consume from cpuid_subr.c;  don't publish these in a header
1998  * file to try and keep people using the expected cpuid_* interfaces.
1999  */
2000 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
2001 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
2002 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
2003 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
2004 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
2005 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
2006 
/*
 * Apply the platform-dependent restrictions that make a CPU look less capable
 * than its cpuid instruction would imply.  Only the __xpv build has anything
 * to hide; every other build gets an empty macro.
 *
 * vendor is the X86_VENDOR_* code of the CPU, eax is the leaf that was
 * queried, and cp holds that leaf's results, which are edited in place.
 */
#if defined(__xpv)
static void
platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
{
	/*
	 * First the masking that applies regardless of vendor.
	 */
	if (eax == 1) {
		uint32_t hidden;

		hidden = CPUID_INTC_EDX_PSE | CPUID_INTC_EDX_VME |
		    CPUID_INTC_EDX_DE | CPUID_INTC_EDX_SEP |
		    CPUID_INTC_EDX_MTRR | CPUID_INTC_EDX_PGE |
		    CPUID_INTC_EDX_PAT | CPUID_AMD_EDX_SYSC |
		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT;

		/* MCA is left visible only in the initial domain. */
		if (!DOMAIN_IS_INITDOMAIN(xen_info))
			hidden |= CPUID_INTC_EDX_MCA;

		cp->cp_edx &= ~hidden;
	} else if (eax == 0x80000001) {
		uint32_t hidden;

		hidden = CPUID_AMD_EDX_PSE | CPUID_INTC_EDX_VME |
		    CPUID_INTC_EDX_DE | CPUID_AMD_EDX_MTRR |
		    CPUID_AMD_EDX_PGE | CPUID_AMD_EDX_PAT |
		    CPUID_AMD_EDX_PSE36 | CPUID_AMD_EDX_SYSC |
		    CPUID_INTC_EDX_SEP | CPUID_AMD_EDX_TSCP;

		cp->cp_edx &= ~hidden;
		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
	}

	/*
	 * Then the vendor-specific pieces.
	 */
	if (vendor == X86_VENDOR_Intel) {
		if (eax == 4) {
			/* Zero out the (ncores-per-chip - 1) field */
			cp->cp_eax &= 0x3fffffff;
		}
	} else if (vendor == X86_VENDOR_AMD || vendor == X86_VENDOR_HYGON) {
		if (eax == 0x80000001) {
			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
		} else if (eax == CPUID_LEAF_EXT_8) {
			/* Zero out the (ncores-per-chip - 1) field */
			cp->cp_ecx &= 0xffffff00;
		}
	}
}
#else
#define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
#endif
2083 
/*
 *  Some undocumented ways of patching the results of the cpuid
 *  instruction to permit running Solaris 10 on future cpus that
 *  we don't currently support.  Could be set to non-zero values
 *  via settings in eeprom.
 *
 *  NOTE(review): presumably the include masks are OR-ed into, and the
 *  exclude masks cleared from, the leaf 1 %ecx/%edx feature words; the
 *  consumer is not in this part of the file -- confirm.
 */

uint32_t cpuid_feature_ecx_include;
uint32_t cpuid_feature_ecx_exclude;
uint32_t cpuid_feature_edx_include;
uint32_t cpuid_feature_edx_exclude;
2095 
2096 /*
2097  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2098  */
2099 void
2100 cpuid_alloc_space(cpu_t *cpu)
2101 {
2102 	/*
2103 	 * By convention, cpu0 is the boot cpu, which is set up
2104 	 * before memory allocation is available.  All other cpus get
2105 	 * their cpuid_info struct allocated here.
2106 	 */
2107 	ASSERT(cpu->cpu_id != 0);
2108 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2109 	cpu->cpu_m.mcpu_cpi =
2110 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2111 }
2112 
2113 void
2114 cpuid_free_space(cpu_t *cpu)
2115 {
2116 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2117 	int i;
2118 
2119 	ASSERT(cpi != NULL);
2120 	ASSERT(cpi != &cpuid_info0);
2121 
2122 	/*
2123 	 * Free up any cache leaf related dynamic storage. The first entry was
2124 	 * cached from the standard cpuid storage, so we should not free it.
2125 	 */
2126 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2127 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2128 	if (cpi->cpi_cache_leaf_size > 0)
2129 		kmem_free(cpi->cpi_cache_leaves,
2130 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2131 
2132 	kmem_free(cpi, sizeof (*cpi));
2133 	cpu->cpu_m.mcpu_cpi = NULL;
2134 }
2135 
2136 #if !defined(__xpv)
2137 /*
2138  * Determine the type of the underlying platform. This is used to customize
2139  * initialization of various subsystems (e.g. TSC). determine_platform() must
2140  * only ever be called once to prevent two processors from seeing different
2141  * values of platform_type. Must be called before cpuid_pass_ident(), the
2142  * earliest consumer to execute; the identification pass will call
2143  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2144  */
2145 void
2146 determine_platform(void)
2147 {
2148 	struct cpuid_regs cp;
2149 	uint32_t base;
2150 	uint32_t regs[4];
2151 	char *hvstr = (char *)regs;
2152 
2153 	ASSERT(platform_type == -1);
2154 
2155 	platform_type = HW_NATIVE;
2156 
2157 	if (!enable_platform_detection)
2158 		return;
2159 
2160 	/*
2161 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2162 	 * vendor signature, and set platform type accordingly.
2163 	 *
2164 	 * References:
2165 	 * http://lkml.org/lkml/2008/10/1/246
2166 	 * http://kb.vmware.com/kb/1009458
2167 	 */
2168 	cp.cp_eax = 0x1;
2169 	(void) __cpuid_insn(&cp);
2170 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2171 		cp.cp_eax = 0x40000000;
2172 		(void) __cpuid_insn(&cp);
2173 		regs[0] = cp.cp_ebx;
2174 		regs[1] = cp.cp_ecx;
2175 		regs[2] = cp.cp_edx;
2176 		regs[3] = 0;
2177 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2178 			platform_type = HW_XEN_HVM;
2179 			return;
2180 		}
2181 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2182 			platform_type = HW_VMWARE;
2183 			return;
2184 		}
2185 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2186 			platform_type = HW_KVM;
2187 			return;
2188 		}
2189 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2190 			platform_type = HW_BHYVE;
2191 			return;
2192 		}
2193 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) {
2194 			platform_type = HW_MICROSOFT;
2195 			return;
2196 		}
2197 		if (strcmp(hvstr, HVSIG_QEMU_TCG) == 0) {
2198 			platform_type = HW_QEMU_TCG;
2199 			return;
2200 		}
2201 	} else {
2202 		/*
2203 		 * Check older VMware hardware versions. VMware hypervisor is
2204 		 * detected by performing an IN operation to VMware hypervisor
2205 		 * port and checking that value returned in %ebx is VMware
2206 		 * hypervisor magic value.
2207 		 *
2208 		 * References: http://kb.vmware.com/kb/1009458
2209 		 */
2210 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2211 		if (regs[1] == VMWARE_HVMAGIC) {
2212 			platform_type = HW_VMWARE;
2213 			return;
2214 		}
2215 	}
2216 
2217 	/*
2218 	 * Check Xen hypervisor. In a fully virtualized domain,
2219 	 * Xen's pseudo-cpuid function returns a string representing the
2220 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2221 	 * supported cpuid function. We need at least a (base + 2) leaf value
2222 	 * to do what we want to do. Try different base values, since the
2223 	 * hypervisor might use a different one depending on whether Hyper-V
2224 	 * emulation is switched on by default or not.
2225 	 */
2226 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2227 		cp.cp_eax = base;
2228 		(void) __cpuid_insn(&cp);
2229 		regs[0] = cp.cp_ebx;
2230 		regs[1] = cp.cp_ecx;
2231 		regs[2] = cp.cp_edx;
2232 		regs[3] = 0;
2233 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2234 		    cp.cp_eax >= (base + 2)) {
2235 			platform_type &= ~HW_NATIVE;
2236 			platform_type |= HW_XEN_HVM;
2237 			return;
2238 		}
2239 	}
2240 }
2241 
2242 int
2243 get_hwenv(void)
2244 {
2245 	ASSERT(platform_type != -1);
2246 	return (platform_type);
2247 }
2248 
/*
 * Outside of the __xpv build there is no control domain, so this always
 * returns 0.
 */
int
is_controldom(void)
{
	return (0);
}
2254 
2255 #else
2256 
/*
 * In the __xpv build no detection is performed: the platform is always
 * reported as HW_XEN_PV.
 */
int
get_hwenv(void)
{
	return (HW_XEN_PV);
}
2262 
/*
 * In the __xpv build, report whether xen_info describes the initial domain,
 * as determined by DOMAIN_IS_INITDOMAIN().
 */
int
is_controldom(void)
{
	return (DOMAIN_IS_INITDOMAIN(xen_info));
}
2268 
2269 #endif	/* __xpv */
2270 
2271 /*
2272  * Gather the extended topology information. This should be the same for both
2273  * AMD leaf 8X26 and Intel leaf 0x1F (though the data interpretation varies).
2274  */
2275 static void
2276 cpuid_gather_ext_topo_leaf(struct cpuid_info *cpi, uint32_t leaf)
2277 {
2278 	uint_t i;
2279 
2280 	for (i = 0; i < ARRAY_SIZE(cpi->cpi_topo); i++) {
2281 		struct cpuid_regs *regs = &cpi->cpi_topo[i];
2282 
2283 		bzero(regs, sizeof (struct cpuid_regs));
2284 		regs->cp_eax = leaf;
2285 		regs->cp_ecx = i;
2286 
2287 		(void) __cpuid_insn(regs);
2288 		if (CPUID_AMD_8X26_ECX_TYPE(regs->cp_ecx) ==
2289 		    CPUID_AMD_8X26_TYPE_DONE) {
2290 			break;
2291 		}
2292 	}
2293 
2294 	cpi->cpi_topo_nleaves = i;
2295 }
2296 
2297 /*
2298  * Make sure that we have gathered all of the CPUID leaves that we might need to
2299  * determine topology. We assume that the standard leaf 1 has already been done
2300  * and that xmaxeax has already been calculated.
2301  */
2302 static void
2303 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2304 {
2305 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2306 
2307 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2308 		struct cpuid_regs *cp;
2309 
2310 		cp = &cpi->cpi_extd[8];
2311 		cp->cp_eax = CPUID_LEAF_EXT_8;
2312 		(void) __cpuid_insn(cp);
2313 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2314 	}
2315 
2316 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2317 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2318 		struct cpuid_regs *cp;
2319 
2320 		cp = &cpi->cpi_extd[0x1e];
2321 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2322 		(void) __cpuid_insn(cp);
2323 	}
2324 
2325 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_26) {
2326 		cpuid_gather_ext_topo_leaf(cpi, CPUID_LEAF_EXT_26);
2327 	}
2328 }
2329 
2330 /*
2331  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2332  * it to everything else. If not, and we're on an AMD system where 8000001e is
2333  * valid, then we use that. Othewrise, we fall back to the default value for the
2334  * APIC ID in leaf 1.
2335  */
2336 static uint32_t
2337 cpuid_gather_apicid(struct cpuid_info *cpi)
2338 {
2339 	/*
2340 	 * Leaf B changes based on the arguments to it. Because we don't cache
2341 	 * it, we need to gather it again.
2342 	 */
2343 	if (cpi->cpi_maxeax >= 0xB) {
2344 		struct cpuid_regs regs;
2345 		struct cpuid_regs *cp;
2346 
2347 		cp = &regs;
2348 		cp->cp_eax = 0xB;
2349 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2350 		(void) __cpuid_insn(cp);
2351 
2352 		if (cp->cp_ebx != 0) {
2353 			return (cp->cp_edx);
2354 		}
2355 	}
2356 
2357 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2358 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2359 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2360 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2361 		return (cpi->cpi_extd[0x1e].cp_eax);
2362 	}
2363 
2364 	return (CPI_APIC_ID(cpi));
2365 }
2366 
2367 /*
2368  * For AMD processors, attempt to calculate the number of chips and cores that
2369  * exist. The way that we do this varies based on the generation, because the
2370  * generations themselves have changed dramatically.
2371  *
2372  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2373  * However, with the advent of family 17h (Zen) it actually tells us the number
2374  * of threads, so we need to look at leaf 0x8000001e if available to determine
2375  * its value. Otherwise, for all prior families, the number of enabled cores is
2376  * the same as threads.
2377  *
2378  * If we do not have leaf 0x80000008, then we assume that this processor does
2379  * not have anything. AMD's older CPUID specification says there's no reason to
2380  * fall back to leaf 1.
2381  *
2382  * In some virtualization cases we will not have leaf 8000001e or it will be
2383  * zero. When that happens we assume the number of threads is one.
2384  */
2385 static void
2386 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2387 {
2388 	uint_t nthreads, nthread_per_core;
2389 
2390 	nthreads = nthread_per_core = 1;
2391 
2392 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2393 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2394 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2395 		nthreads = CPI_CPU_COUNT(cpi);
2396 	}
2397 
2398 	/*
2399 	 * For us to have threads, and know about it, we have to be at least at
2400 	 * family 17h and have the cpuid bit that says we have extended
2401 	 * topology.
2402 	 */
2403 	if (cpi->cpi_family >= 0x17 &&
2404 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2405 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2406 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2407 	}
2408 
2409 	*ncpus = nthreads;
2410 	*ncores = nthreads / nthread_per_core;
2411 }
2412 
2413 /*
2414  * Seed the initial values for the cores and threads for an Intel based
2415  * processor. These values will be overwritten if we detect that the processor
2416  * supports CPUID leaf 0xb.
2417  */
2418 static void
2419 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2420 {
2421 	/*
2422 	 * Only seed the number of physical cores from the first level leaf 4
2423 	 * information. The number of threads there indicate how many share the
2424 	 * L1 cache, which may or may not have anything to do with the number of
2425 	 * logical CPUs per core.
2426 	 */
2427 	if (cpi->cpi_maxeax >= 4) {
2428 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2429 	} else {
2430 		*ncores = 1;
2431 	}
2432 
2433 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2434 		*ncpus = CPI_CPU_COUNT(cpi);
2435 	} else {
2436 		*ncpus = *ncores;
2437 	}
2438 }
2439 
2440 static boolean_t
2441 cpuid_leafB_getids(cpu_t *cpu)
2442 {
2443 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2444 	struct cpuid_regs regs;
2445 	struct cpuid_regs *cp;
2446 
2447 	if (cpi->cpi_maxeax < 0xB)
2448 		return (B_FALSE);
2449 
2450 	cp = &regs;
2451 	cp->cp_eax = 0xB;
2452 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2453 
2454 	(void) __cpuid_insn(cp);
2455 
2456 	/*
2457 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2458 	 * indicates that the extended topology enumeration leaf is
2459 	 * available.
2460 	 */
2461 	if (cp->cp_ebx != 0) {
2462 		uint32_t x2apic_id = 0;
2463 		uint_t coreid_shift = 0;
2464 		uint_t ncpu_per_core = 1;
2465 		uint_t chipid_shift = 0;
2466 		uint_t ncpu_per_chip = 1;
2467 		uint_t i;
2468 		uint_t level;
2469 
2470 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2471 			cp->cp_eax = 0xB;
2472 			cp->cp_ecx = i;
2473 
2474 			(void) __cpuid_insn(cp);
2475 			level = CPI_CPU_LEVEL_TYPE(cp);
2476 
2477 			if (level == 1) {
2478 				x2apic_id = cp->cp_edx;
2479 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2480 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2481 			} else if (level == 2) {
2482 				x2apic_id = cp->cp_edx;
2483 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2484 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2485 			}
2486 		}
2487 
2488 		/*
2489 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2490 		 */
2491 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2492 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2493 		    ncpu_per_core;
2494 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2495 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2496 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2497 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2498 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2499 		cpi->cpi_compunitid = cpi->cpi_coreid;
2500 
2501 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2502 			cpi->cpi_nthread_bits = coreid_shift;
2503 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2504 		}
2505 
2506 		return (B_TRUE);
2507 	} else {
2508 		return (B_FALSE);
2509 	}
2510 }
2511 
2512 static void
2513 cpuid_intel_getids(cpu_t *cpu, void *feature)
2514 {
2515 	uint_t i;
2516 	uint_t chipid_shift = 0;
2517 	uint_t coreid_shift = 0;
2518 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2519 
2520 	/*
2521 	 * There are no compute units or processor nodes currently on Intel.
2522 	 * Always set these to one.
2523 	 */
2524 	cpi->cpi_procnodes_per_pkg = 1;
2525 	cpi->cpi_cores_per_compunit = 1;
2526 
2527 	/*
2528 	 * If cpuid Leaf B is present, use that to try and get this information.
2529 	 * It will be the most accurate for Intel CPUs.
2530 	 */
2531 	if (cpuid_leafB_getids(cpu))
2532 		return;
2533 
2534 	/*
2535 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2536 	 * and ncore_per_chip. These represent the largest power of two values
2537 	 * that we need to cover all of the IDs in the system. Therefore, we use
2538 	 * those values to seed the number of bits needed to cover information
2539 	 * in the case when leaf B is not available. These values will probably
2540 	 * be larger than required, but that's OK.
2541 	 */
2542 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2543 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2544 
2545 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2546 		chipid_shift++;
2547 
2548 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2549 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2550 
2551 	if (is_x86_feature(feature, X86FSET_CMP)) {
2552 		/*
2553 		 * Multi-core (and possibly multi-threaded)
2554 		 * processors.
2555 		 */
2556 		uint_t ncpu_per_core = 0;
2557 
2558 		if (cpi->cpi_ncore_per_chip == 1)
2559 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2560 		else if (cpi->cpi_ncore_per_chip > 1)
2561 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2562 			    cpi->cpi_ncore_per_chip;
2563 		/*
2564 		 * 8bit APIC IDs on dual core Pentiums
2565 		 * look like this:
2566 		 *
2567 		 * +-----------------------+------+------+
2568 		 * | Physical Package ID   |  MC  |  HT  |
2569 		 * +-----------------------+------+------+
2570 		 * <------- chipid -------->
2571 		 * <------- coreid --------------->
2572 		 *			   <--- clogid -->
2573 		 *			   <------>
2574 		 *			   pkgcoreid
2575 		 *
2576 		 * Where the number of bits necessary to
2577 		 * represent MC and HT fields together equals
2578 		 * to the minimum number of bits necessary to
2579 		 * store the value of cpi->cpi_ncpu_per_chip.
2580 		 * Of those bits, the MC part uses the number
2581 		 * of bits necessary to store the value of
2582 		 * cpi->cpi_ncore_per_chip.
2583 		 */
2584 		for (i = 1; i < ncpu_per_core; i <<= 1)
2585 			coreid_shift++;
2586 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2587 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2588 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2589 		/*
2590 		 * Single-core multi-threaded processors.
2591 		 */
2592 		cpi->cpi_coreid = cpi->cpi_chipid;
2593 		cpi->cpi_pkgcoreid = 0;
2594 	} else {
2595 		/*
2596 		 * Single-core single-thread processors.
2597 		 */
2598 		cpi->cpi_coreid = cpu->cpu_id;
2599 		cpi->cpi_pkgcoreid = 0;
2600 	}
2601 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2602 	cpi->cpi_compunitid = cpi->cpi_coreid;
2603 }
2604 
2605 /*
2606  * Historically, AMD has had CMP chips with only a single thread per core.
2607  * However, starting in family 17h (Zen), this has changed and they now have
2608  * multiple threads. Our internal core id needs to be a unique value.
2609  *
2610  * To determine the core id of an AMD system, if we're from a family before 17h,
2611  * then we just use the cpu id, as that gives us a good value that will be
2612  * unique for each core. If instead, we're on family 17h or later, then we need
2613  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2614  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2615  * We can't use the normal core id in that leaf as it's only unique within the
2616  * socket, which is perfect for cpi_pkgcoreid, but not us.
2617  */
2618 static id_t
2619 cpuid_amd_get_coreid(cpu_t *cpu)
2620 {
2621 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2622 
2623 	if (cpi->cpi_family >= 0x17 &&
2624 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2625 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2626 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2627 		if (nthreads > 1) {
2628 			VERIFY3U(nthreads, ==, 2);
2629 			return (cpi->cpi_apicid >> 1);
2630 		}
2631 	}
2632 
2633 	return (cpu->cpu_id);
2634 }
2635 
2636 /*
2637  * IDs on AMD is a more challenging task. This is notable because of the
2638  * following two facts:
2639  *
2640  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2641  *     also no way to get an actual unique core id from the system. As such, we
2642  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2643  *     however, guarantee that sibling cores of a chip will have sequential
2644  *     coreids starting at a multiple of the number of cores per chip - that is
2645  *     usually the case, but if the APIC IDs have been set up in a different
2646  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2647  *
 *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2649  *     called compute units. These compute units share the L1I cache, L2 cache,
2650  *     and the FPU. To deal with this, a new topology leaf was added in
2651  *     0x8000001e. However, parts of this leaf have different meanings
2652  *     once we get to family 0x17.
2653  */
2654 
2655 static void
2656 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2657 {
2658 	int i, first_half, coreidsz;
2659 	uint32_t nb_caps_reg;
2660 	uint_t node2_1;
2661 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2662 	struct cpuid_regs *cp;
2663 
2664 	/*
2665 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2666 	 * hasn't been stripped by virtualization). We always set the compute
2667 	 * unit id to the same value. Also, initialize the default number of
2668 	 * cores per compute unit and nodes per package. This will be
2669 	 * overwritten when we know information about a particular family.
2670 	 */
2671 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2672 	cpi->cpi_compunitid = cpi->cpi_coreid;
2673 	cpi->cpi_cores_per_compunit = 1;
2674 	cpi->cpi_procnodes_per_pkg = 1;
2675 
2676 	/*
2677 	 * To construct the logical ID, we need to determine how many APIC IDs
2678 	 * are dedicated to the cores and threads. This is provided for us in
2679 	 * 0x80000008. However, if it's not present (say due to virtualization),
2680 	 * then we assume it's one. This should be present on all 64-bit AMD
2681 	 * processors.  It was added in family 0xf (Hammer).
2682 	 */
2683 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2684 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2685 
2686 		/*
2687 		 * In AMD parlance chip is really a node while illumos
2688 		 * uses chip as equivalent to socket/package.
2689 		 */
2690 		if (coreidsz == 0) {
2691 			/* Use legacy method */
2692 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2693 				coreidsz++;
2694 			if (coreidsz == 0)
2695 				coreidsz = 1;
2696 		}
2697 	} else {
2698 		/* Assume single-core part */
2699 		coreidsz = 1;
2700 	}
2701 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2702 
2703 	/*
2704 	 * The package core ID varies depending on the family. While it may be
2705 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2706 	 * this value is the core id in the given node. For non-virtualized
2707 	 * family 17h, we need to take the logical core id and shift off the
2708 	 * threads like we do when getting the core id.  Otherwise, we can use
2709 	 * the clogid as is. When family 17h is virtualized, the clogid should
2710 	 * be sufficient as if we don't have valid data in the leaf, then we
2711 	 * won't think we have SMT, in which case the cpi_clogid should be
2712 	 * sufficient.
2713 	 */
2714 	if (cpi->cpi_family >= 0x17 &&
2715 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2716 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2717 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2718 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2719 		if (nthreads > 1) {
2720 			VERIFY3U(nthreads, ==, 2);
2721 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2722 		} else {
2723 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2724 		}
2725 	} else {
2726 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2727 	}
2728 
2729 	/*
2730 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2731 	 * (bulldozer) or newer, then we can derive all of this from leaf
2732 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2733 	 */
2734 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2735 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2736 		cp = &cpi->cpi_extd[0x1e];
2737 
2738 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2739 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2740 
2741 		/*
2742 		 * For Bulldozer-era CPUs, recalculate the compute unit
2743 		 * information.
2744 		 */
2745 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2746 			cpi->cpi_cores_per_compunit =
2747 			    BITX(cp->cp_ebx, 15, 8) + 1;
2748 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2749 			    (cpi->cpi_ncore_per_chip /
2750 			    cpi->cpi_cores_per_compunit) *
2751 			    (cpi->cpi_procnodeid /
2752 			    cpi->cpi_procnodes_per_pkg);
2753 		}
2754 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2755 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2756 	} else if (cpi->cpi_family == 0x10) {
2757 		/*
2758 		 * See if we are a multi-node processor.
2759 		 * All processors in the system have the same number of nodes
2760 		 */
2761 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2762 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2763 			/* Single-node */
2764 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2765 			    coreidsz);
2766 		} else {
2767 
2768 			/*
2769 			 * Multi-node revision D (2 nodes per package
2770 			 * are supported)
2771 			 */
2772 			cpi->cpi_procnodes_per_pkg = 2;
2773 
2774 			first_half = (cpi->cpi_pkgcoreid <=
2775 			    (cpi->cpi_ncore_per_chip/2 - 1));
2776 
2777 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2778 				/* We are BSP */
2779 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2780 			} else {
2781 
2782 				/* We are AP */
2783 				/* NodeId[2:1] bits to use for reading F3xe8 */
2784 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2785 
2786 				nb_caps_reg =
2787 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2788 
2789 				/*
2790 				 * Check IntNodeNum bit (31:30, but bit 31 is
2791 				 * always 0 on dual-node processors)
2792 				 */
2793 				if (BITX(nb_caps_reg, 30, 30) == 0)
2794 					cpi->cpi_procnodeid = node2_1 +
2795 					    !first_half;
2796 				else
2797 					cpi->cpi_procnodeid = node2_1 +
2798 					    first_half;
2799 			}
2800 		}
2801 	} else {
2802 		cpi->cpi_procnodeid = 0;
2803 	}
2804 
2805 	cpi->cpi_chipid =
2806 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2807 
2808 	cpi->cpi_ncore_bits = coreidsz;
2809 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2810 	    cpi->cpi_ncore_per_chip);
2811 }
2812 
/*
 * Flush routine that deliberately does nothing. It is the initial value of
 * the spec_uarch_flush function pointer and is installed again whenever no
 * micro-architectural flush is required or available.
 */
static void
spec_uarch_flush_noop(void)
{
	/* Intentionally empty. */
}
2817 
2818 /*
2819  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2820  * MDS-related micro-architectural state that would normally happen by calling
2821  * x86_md_clear().
2822  */
2823 static void
2824 spec_uarch_flush_msr(void)
2825 {
2826 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2827 }
2828 
2829 /*
2830  * This function points to a function that will flush certain
2831  * micro-architectural state on the processor. This flush is used to mitigate
2832  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2833  * function can point to one of three functions:
2834  *
2835  * - A noop which is done because we either are vulnerable, but do not have
2836  *   microcode available to help deal with a fix, or because we aren't
2837  *   vulnerable.
2838  *
2839  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2840  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2841  *   however, it only flushes the MDS related micro-architectural state on the
2842  *   current hyperthread, it does not do anything for the twin.
2843  *
2844  * - x86_md_clear which will flush the MDS related state. This is done when we
2845  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2846  *   (RDCL_NO is set).
2847  */
2848 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2849 
2850 static void
2851 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2852 {
2853 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2854 
2855 	/*
2856 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2857 	 * has been fixed in hardware, it doesn't cover everything related to
2858 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2859 	 * need to mitigate this.
2860 	 */
2861 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2862 	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2863 		return;
2864 	}
2865 
2866 	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2867 		const uint8_t nop = NOP_INSTR;
2868 		uint8_t *md = (uint8_t *)x86_md_clear;
2869 
2870 		*md = nop;
2871 	}
2872 
2873 	membar_producer();
2874 }
2875 
2876 static void
2877 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2878 {
2879 	boolean_t need_l1d, need_mds;
2880 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2881 
2882 	/*
2883 	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2884 	 * hardware, then there's nothing left for us to do for enabling the
2885 	 * flush. We can also go ahead and say that SMT exclusion is
2886 	 * unnecessary.
2887 	 */
2888 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2889 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2890 	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2891 		extern int smt_exclusion;
2892 		smt_exclusion = 0;
2893 		spec_uarch_flush = spec_uarch_flush_noop;
2894 		membar_producer();
2895 		return;
2896 	}
2897 
2898 	/*
2899 	 * The locations where we need to perform an L1D flush are required both
2900 	 * for mitigating L1TF and MDS. When verw support is present in
2901 	 * microcode, then the L1D flush will take care of doing that as well.
2902 	 * However, if we have a system where RDCL_NO is present, but we don't
2903 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2904 	 * L1D flush.
2905 	 */
2906 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2907 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2908 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2909 		need_l1d = B_TRUE;
2910 	} else {
2911 		need_l1d = B_FALSE;
2912 	}
2913 
2914 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2915 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2916 		need_mds = B_TRUE;
2917 	} else {
2918 		need_mds = B_FALSE;
2919 	}
2920 
2921 	if (need_l1d) {
2922 		spec_uarch_flush = spec_uarch_flush_msr;
2923 	} else if (need_mds) {
2924 		spec_uarch_flush = x86_md_clear;
2925 	} else {
2926 		/*
2927 		 * We have no hardware mitigations available to us.
2928 		 */
2929 		spec_uarch_flush = spec_uarch_flush_noop;
2930 	}
2931 	membar_producer();
2932 }
2933 
2934 /*
2935  * We default to enabling RSB mitigations.
2936  *
2937  * NOTE: We used to skip RSB mitigations with eIBRS, but developments around
2938  * post-barrier RSB guessing suggests we should enable RSB mitigations always
2939  * unless specifically instructed not to.
2940  *
2941  * AMD indicates that when Automatic IBRS is enabled we do not need to implement
2942  * return stack buffer clearing for VMEXIT as it takes care of it. The manual
2943  * also states that as long as SMEP and we maintain at least one page between
2944  * the kernel and user space (we have much more of a red zone), then we do not
2945  * need to clear the RSB. We constrain this to only when Automatic IRBS is
2946  * present.
2947  */
2948 static void
2949 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2950 {
2951 	const uint8_t ret = RET_INSTR;
2952 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2953 
2954 	switch (mit) {
2955 	case X86_SPECTREV2_AUTO_IBRS:
2956 	case X86_SPECTREV2_DISABLED:
2957 		*stuff = ret;
2958 		break;
2959 	default:
2960 		break;
2961 	}
2962 }
2963 
2964 static void
2965 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2966 {
2967 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2968 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2969 	    "_r14", "_r15" };
2970 	const uint_t nthunks = ARRAY_SIZE(thunks);
2971 	const char *type;
2972 	uint_t i;
2973 
2974 	if (mit == x86_spectrev2_mitigation)
2975 		return;
2976 
2977 	switch (mit) {
2978 	case X86_SPECTREV2_RETPOLINE:
2979 		type = "gen";
2980 		break;
2981 	case X86_SPECTREV2_AUTO_IBRS:
2982 	case X86_SPECTREV2_ENHANCED_IBRS:
2983 	case X86_SPECTREV2_DISABLED:
2984 		type = "jmp";
2985 		break;
2986 	default:
2987 		panic("asked to update retpoline state with unknown state!");
2988 	}
2989 
2990 	for (i = 0; i < nthunks; i++) {
2991 		uintptr_t source, dest;
2992 		int ssize, dsize;
2993 		char sourcebuf[64], destbuf[64];
2994 
2995 		(void) snprintf(destbuf, sizeof (destbuf),
2996 		    "__x86_indirect_thunk%s", thunks[i]);
2997 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
2998 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
2999 
3000 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
3001 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
3002 		VERIFY3U(source, !=, 0);
3003 		VERIFY3U(dest, !=, 0);
3004 		VERIFY3S(dsize, >=, ssize);
3005 		bcopy((void *)source, (void *)dest, ssize);
3006 	}
3007 }
3008 
3009 static void
3010 cpuid_enable_enhanced_ibrs(void)
3011 {
3012 	uint64_t val;
3013 
3014 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3015 	val |= IA32_SPEC_CTRL_IBRS;
3016 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3017 }
3018 
3019 static void
3020 cpuid_enable_auto_ibrs(void)
3021 {
3022 	uint64_t val;
3023 
3024 	val = rdmsr(MSR_AMD_EFER);
3025 	val |= AMD_EFER_AIBRSE;
3026 	wrmsr(MSR_AMD_EFER, val);
3027 }
3028 
3029 /*
3030  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
3031  * we can disable TSX, we do so.
3032  *
3033  * This determination is done only on the boot CPU, potentially after loading
3034  * updated microcode.
3035  */
3036 static void
3037 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
3038 {
3039 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3040 
3041 	VERIFY(cpu->cpu_id == 0);
3042 
3043 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3044 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3045 		return;
3046 	}
3047 
3048 	if (x86_disable_taa) {
3049 		x86_taa_mitigation = X86_TAA_DISABLED;
3050 		return;
3051 	}
3052 
3053 	/*
3054 	 * If we do not have the ability to disable TSX, then our only
3055 	 * mitigation options are in hardware (TAA_NO), or by using our existing
3056 	 * MDS mitigation as described above.  The latter relies upon us having
3057 	 * configured MDS mitigations correctly! This includes disabling SMT if
3058 	 * we want to cross-CPU-thread protection.
3059 	 */
3060 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3061 		/*
3062 		 * It's not clear whether any parts will enumerate TAA_NO
3063 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
3064 		 */
3065 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3066 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3067 			return;
3068 		}
3069 
3070 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3071 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3072 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
3073 		} else {
3074 			x86_taa_mitigation = X86_TAA_NOTHING;
3075 		}
3076 		return;
3077 	}
3078 
3079 	/*
3080 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3081 	 * enough in boot.
3082 	 *
3083 	 * Otherwise, we'll fall back to causing transactions to abort as our
3084 	 * mitigation. TSX-using code will always take the fallback path.
3085 	 */
3086 	if (cpi->cpi_pass < 4) {
3087 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3088 	} else {
3089 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3090 	}
3091 }
3092 
3093 /*
3094  * As mentioned, we should only touch the MSR when we've got a suitable
3095  * microcode loaded on this CPU.
3096  */
3097 static void
3098 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3099 {
3100 	uint64_t val;
3101 
3102 	switch (taa) {
3103 	case X86_TAA_TSX_DISABLE:
3104 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3105 			return;
3106 		val = rdmsr(MSR_IA32_TSX_CTRL);
3107 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3108 		wrmsr(MSR_IA32_TSX_CTRL, val);
3109 		break;
3110 	case X86_TAA_TSX_FORCE_ABORT:
3111 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3112 			return;
3113 		val = rdmsr(MSR_IA32_TSX_CTRL);
3114 		val |= IA32_TSX_CTRL_RTM_DISABLE;
3115 		wrmsr(MSR_IA32_TSX_CTRL, val);
3116 		break;
3117 	case X86_TAA_HW_MITIGATED:
3118 	case X86_TAA_MD_CLEAR:
3119 	case X86_TAA_DISABLED:
3120 	case X86_TAA_NOTHING:
3121 		break;
3122 	}
3123 }
3124 
3125 static void
3126 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3127 {
3128 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3129 	x86_spectrev2_mitigation_t v2mit;
3130 
3131 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3132 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3133 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3134 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3135 			add_x86_feature(featureset, X86FSET_IBPB);
3136 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3137 			add_x86_feature(featureset, X86FSET_IBRS);
3138 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3139 			add_x86_feature(featureset, X86FSET_STIBP);
3140 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3141 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3142 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3143 			add_x86_feature(featureset, X86FSET_SSBD);
3144 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3145 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3146 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3147 			add_x86_feature(featureset, X86FSET_SSB_NO);
3148 
3149 		/*
3150 		 * Rather than Enhanced IBRS, AMD has a different feature that
3151 		 * is a bit in EFER that can be enabled and will basically do
3152 		 * the right thing while executing in the kernel.
3153 		 */
3154 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3155 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3156 		    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3157 		    (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3158 			add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3159 		}
3160 
3161 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3162 	    cpi->cpi_maxeax >= 7) {
3163 		struct cpuid_regs *ecp;
3164 		ecp = &cpi->cpi_std[7];
3165 
3166 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3167 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3168 		}
3169 
3170 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3171 			add_x86_feature(featureset, X86FSET_IBRS);
3172 			add_x86_feature(featureset, X86FSET_IBPB);
3173 		}
3174 
3175 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3176 			add_x86_feature(featureset, X86FSET_STIBP);
3177 		}
3178 
3179 		/*
3180 		 * Don't read the arch caps MSR on xpv where we lack the
3181 		 * on_trap().
3182 		 */
3183 #ifndef __xpv
3184 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3185 			on_trap_data_t otd;
3186 
3187 			/*
3188 			 * Be paranoid and assume we'll get a #GP.
3189 			 */
3190 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3191 				uint64_t reg;
3192 
3193 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3194 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3195 					add_x86_feature(featureset,
3196 					    X86FSET_RDCL_NO);
3197 				}
3198 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3199 					add_x86_feature(featureset,
3200 					    X86FSET_IBRS_ALL);
3201 				}
3202 				if (reg & IA32_ARCH_CAP_RSBA) {
3203 					add_x86_feature(featureset,
3204 					    X86FSET_RSBA);
3205 				}
3206 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3207 					add_x86_feature(featureset,
3208 					    X86FSET_L1D_VM_NO);
3209 				}
3210 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3211 					add_x86_feature(featureset,
3212 					    X86FSET_SSB_NO);
3213 				}
3214 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3215 					add_x86_feature(featureset,
3216 					    X86FSET_MDS_NO);
3217 				}
3218 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3219 					add_x86_feature(featureset,
3220 					    X86FSET_TSX_CTRL);
3221 				}
3222 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3223 					add_x86_feature(featureset,
3224 					    X86FSET_TAA_NO);
3225 				}
3226 			}
3227 			no_trap();
3228 		}
3229 #endif	/* !__xpv */
3230 
3231 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3232 			add_x86_feature(featureset, X86FSET_SSBD);
3233 
3234 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3235 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3236 	}
3237 
3238 	/*
3239 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3240 	 * will have already run this function and determined what we need to
3241 	 * do. This gives us a hook for per-HW thread mitigations such as
3242 	 * enhanced IBRS, or disabling TSX.
3243 	 */
3244 	if (cpu->cpu_id != 0) {
3245 		switch (x86_spectrev2_mitigation) {
3246 		case X86_SPECTREV2_ENHANCED_IBRS:
3247 			cpuid_enable_enhanced_ibrs();
3248 			break;
3249 		case X86_SPECTREV2_AUTO_IBRS:
3250 			cpuid_enable_auto_ibrs();
3251 			break;
3252 		default:
3253 			break;
3254 		}
3255 
3256 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3257 		return;
3258 	}
3259 
3260 	/*
3261 	 * Go through and initialize various security mechanisms that we should
3262 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3263 	 * TAA.
3264 	 */
3265 
3266 	/*
3267 	 * By default we've come in with retpolines enabled. Check whether we
3268 	 * should disable them or enable enhanced or automatic IBRS. RSB
3269 	 * stuffing is enabled by default. Note, we do not allow the use of AMD
3270 	 * optimized retpolines as it was disclosed by AMD in March 2022 that
3271 	 * they were still vulnerable. Prior to that point, we used them.
3272 	 */
3273 	if (x86_disable_spectrev2 != 0) {
3274 		v2mit = X86_SPECTREV2_DISABLED;
3275 	} else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3276 		cpuid_enable_auto_ibrs();
3277 		v2mit = X86_SPECTREV2_AUTO_IBRS;
3278 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3279 		cpuid_enable_enhanced_ibrs();
3280 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3281 	} else {
3282 		v2mit = X86_SPECTREV2_RETPOLINE;
3283 	}
3284 
3285 	cpuid_patch_retpolines(v2mit);
3286 	cpuid_patch_rsb(v2mit);
3287 	x86_spectrev2_mitigation = v2mit;
3288 	membar_producer();
3289 
3290 	/*
3291 	 * We need to determine what changes are required for mitigating L1TF
3292 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3293 	 * is required.
3294 	 *
3295 	 * If any of these are present, then we need to flush u-arch state at
3296 	 * various points. For MDS, we need to do so whenever we change to a
3297 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3298 	 * flush the L1D cache at VM entry. When we have microcode that handles
3299 	 * MDS, the L1D flush also clears the other u-arch state that the
3300 	 * md_clear does.
3301 	 */
3302 
3303 	/*
3304 	 * Update whether or not we need to be taking explicit action against
3305 	 * MDS.
3306 	 */
3307 	cpuid_update_md_clear(cpu, featureset);
3308 
3309 	/*
3310 	 * Determine whether SMT exclusion is required and whether or not we
3311 	 * need to perform an l1d flush.
3312 	 */
3313 	cpuid_update_l1d_flush(cpu, featureset);
3314 
3315 	/*
3316 	 * Determine what our mitigation strategy should be for TAA and then
3317 	 * also apply TAA mitigations.
3318 	 */
3319 	cpuid_update_tsx(cpu, featureset);
3320 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3321 }
3322 
3323 /*
3324  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3325  */
3326 void
3327 setup_xfem(void)
3328 {
3329 	uint64_t flags = XFEATURE_LEGACY_FP;
3330 
3331 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3332 
3333 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3334 		flags |= XFEATURE_SSE;
3335 
3336 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3337 		flags |= XFEATURE_AVX;
3338 
3339 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3340 		flags |= XFEATURE_AVX512;
3341 
3342 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3343 
3344 	xsave_bv_all = flags;
3345 }
3346 
3347 static void
3348 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3349 {
3350 	struct cpuid_info *cpi;
3351 
3352 	cpi = cpu->cpu_m.mcpu_cpi;
3353 
3354 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3355 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3356 		cpuid_gather_amd_topology_leaves(cpu);
3357 	}
3358 
3359 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3360 
3361 	/*
3362 	 * Before we can calculate the IDs that we should assign to this
3363 	 * processor, we need to understand how many cores and threads it has.
3364 	 */
3365 	switch (cpi->cpi_vendor) {
3366 	case X86_VENDOR_Intel:
3367 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3368 		    &cpi->cpi_ncore_per_chip);
3369 		break;
3370 	case X86_VENDOR_AMD:
3371 	case X86_VENDOR_HYGON:
3372 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3373 		    &cpi->cpi_ncore_per_chip);
3374 		break;
3375 	default:
3376 		/*
3377 		 * If we have some other x86 compatible chip, it's not clear how
3378 		 * they would behave. The most common case is virtualization
3379 		 * today, though there are also 64-bit VIA chips. Assume that
3380 		 * all we can get is the basic Leaf 1 HTT information.
3381 		 */
3382 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3383 			cpi->cpi_ncore_per_chip = 1;
3384 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3385 		}
3386 		break;
3387 	}
3388 
3389 	/*
3390 	 * Based on the calculated number of threads and cores, potentially
3391 	 * assign the HTT and CMT features.
3392 	 */
3393 	if (cpi->cpi_ncore_per_chip > 1) {
3394 		add_x86_feature(featureset, X86FSET_CMP);
3395 	}
3396 
3397 	if (cpi->cpi_ncpu_per_chip > 1 &&
3398 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3399 		add_x86_feature(featureset, X86FSET_HTT);
3400 	}
3401 
3402 	/*
3403 	 * Now that has been set up, we need to go through and calculate all of
3404 	 * the rest of the parameters that exist. If we think the CPU doesn't
3405 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3406 	 * up information in some way. The most likely case for this is
3407 	 * virtualization where we have a lot of partial topology information.
3408 	 */
3409 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3410 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3411 		/*
3412 		 * This is a single core, single-threaded processor.
3413 		 */
3414 		cpi->cpi_procnodes_per_pkg = 1;
3415 		cpi->cpi_cores_per_compunit = 1;
3416 		cpi->cpi_compunitid = 0;
3417 		cpi->cpi_chipid = -1;
3418 		cpi->cpi_clogid = 0;
3419 		cpi->cpi_coreid = cpu->cpu_id;
3420 		cpi->cpi_pkgcoreid = 0;
3421 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3422 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3423 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3424 		} else {
3425 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3426 		}
3427 	} else {
3428 		switch (cpi->cpi_vendor) {
3429 		case X86_VENDOR_Intel:
3430 			cpuid_intel_getids(cpu, featureset);
3431 			break;
3432 		case X86_VENDOR_AMD:
3433 		case X86_VENDOR_HYGON:
3434 			cpuid_amd_getids(cpu, featureset);
3435 			break;
3436 		default:
3437 			/*
3438 			 * In this case, it's hard to say what we should do.
3439 			 * We're going to model them to the OS as single core
3440 			 * threads. We don't have a good identifier for them, so
3441 			 * we're just going to use the cpu id all on a single
3442 			 * chip.
3443 			 *
3444 			 * This case has historically been different from the
3445 			 * case above where we don't have HTT or CMP. While they
3446 			 * could be combined, we've opted to keep it separate to
3447 			 * minimize the risk of topology changes in weird cases.
3448 			 */
3449 			cpi->cpi_procnodes_per_pkg = 1;
3450 			cpi->cpi_cores_per_compunit = 1;
3451 			cpi->cpi_chipid = 0;
3452 			cpi->cpi_coreid = cpu->cpu_id;
3453 			cpi->cpi_clogid = cpu->cpu_id;
3454 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3455 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3456 			cpi->cpi_compunitid = cpi->cpi_coreid;
3457 			break;
3458 		}
3459 	}
3460 }
3461 
3462 /*
3463  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3464  * always gather leaf 6 if it's supported; however, we only look for features on
3465  * Intel systems as AMD does not currently define any of the features we look
3466  * for below.
3467  */
3468 static void
3469 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3470 {
3471 	struct cpuid_regs *cp;
3472 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3473 
3474 	if (cpi->cpi_maxeax < 6) {
3475 		return;
3476 	}
3477 
3478 	cp = &cpi->cpi_std[6];
3479 	cp->cp_eax = 6;
3480 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3481 	(void) __cpuid_insn(cp);
3482 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3483 
3484 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3485 		return;
3486 	}
3487 
3488 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3489 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3490 	}
3491 
3492 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3493 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3494 	}
3495 }
3496 
3497 /*
3498  * This is used when we discover that we have AVX support in cpuid. This
3499  * proceeds to scan for the rest of the AVX derived features.
3500  */
3501 static void
3502 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3503 {
3504 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3505 
3506 	/*
3507 	 * If we don't have AVX, don't bother with most of this.
3508 	 */
3509 	if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3510 		return;
3511 
3512 	add_x86_feature(featureset, X86FSET_AVX);
3513 
3514 	/*
3515 	 * Intel says we can't check these without also
3516 	 * checking AVX.
3517 	 */
3518 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3519 		add_x86_feature(featureset, X86FSET_F16C);
3520 
3521 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3522 		add_x86_feature(featureset, X86FSET_FMA);
3523 
3524 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3525 		add_x86_feature(featureset, X86FSET_BMI1);
3526 
3527 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3528 		add_x86_feature(featureset, X86FSET_BMI2);
3529 
3530 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3531 		add_x86_feature(featureset, X86FSET_AVX2);
3532 
3533 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3534 		add_x86_feature(featureset, X86FSET_VAES);
3535 
3536 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3537 		add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3538 
3539 	/*
3540 	 * The rest of the AVX features require AVX512. Do not check them unless
3541 	 * it is present.
3542 	 */
3543 	if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3544 		return;
3545 	add_x86_feature(featureset, X86FSET_AVX512F);
3546 
3547 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3548 		add_x86_feature(featureset, X86FSET_AVX512DQ);
3549 
3550 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3551 		add_x86_feature(featureset, X86FSET_AVX512FMA);
3552 
3553 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3554 		add_x86_feature(featureset, X86FSET_AVX512PF);
3555 
3556 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3557 		add_x86_feature(featureset, X86FSET_AVX512ER);
3558 
3559 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3560 		add_x86_feature(featureset, X86FSET_AVX512CD);
3561 
3562 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3563 		add_x86_feature(featureset, X86FSET_AVX512BW);
3564 
3565 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3566 		add_x86_feature(featureset, X86FSET_AVX512VL);
3567 
3568 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3569 		add_x86_feature(featureset, X86FSET_AVX512VBMI);
3570 
3571 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3572 		add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3573 
3574 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3575 		add_x86_feature(featureset, X86FSET_AVX512VNNI);
3576 
3577 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3578 		add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3579 
3580 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3581 		add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3582 
3583 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3584 		add_x86_feature(featureset, X86FSET_AVX512NNIW);
3585 
3586 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3587 		add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3588 
3589 	/*
3590 	 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3591 	 * we don't need to.
3592 	 */
3593 	if (cpi->cpi_std[7].cp_eax < 1)
3594 		return;
3595 
3596 	if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3597 		add_x86_feature(featureset, X86FSET_AVX512_BF16);
3598 }
3599 
3600 /*
3601  * PPIN is the protected processor inventory number. On AMD this is an actual
3602  * feature bit. However, on Intel systems we need to read the platform
3603  * information MSR if we're on a specific model.
3604  */
3605 #if !defined(__xpv)
3606 static void
3607 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3608 {
3609 	on_trap_data_t otd;
3610 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3611 
3612 	switch (cpi->cpi_vendor) {
3613 	case X86_VENDOR_AMD:
3614 		/*
3615 		 * This leaf will have already been gathered in the topology
3616 		 * functions.
3617 		 */
3618 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3619 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3620 				add_x86_feature(featureset, X86FSET_PPIN);
3621 			}
3622 		}
3623 		break;
3624 	case X86_VENDOR_Intel:
3625 		if (cpi->cpi_family != 6)
3626 			break;
3627 		switch (cpi->cpi_model) {
3628 		case INTC_MODEL_IVYBRIDGE_XEON:
3629 		case INTC_MODEL_HASWELL_XEON:
3630 		case INTC_MODEL_BROADWELL_XEON:
3631 		case INTC_MODEL_BROADWELL_XEON_D:
3632 		case INTC_MODEL_SKYLAKE_XEON:
3633 		case INTC_MODEL_ICELAKE_XEON:
3634 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3635 				uint64_t value;
3636 
3637 				value = rdmsr(MSR_PLATFORM_INFO);
3638 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3639 					add_x86_feature(featureset,
3640 					    X86FSET_PPIN);
3641 				}
3642 			}
3643 			no_trap();
3644 			break;
3645 		default:
3646 			break;
3647 		}
3648 		break;
3649 	default:
3650 		break;
3651 	}
3652 }
3653 #endif	/* ! __xpv */
3654 
3655 static void
3656 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3657 {
3658 	uchar_t *featureset = (uchar_t *)arg;
3659 
3660 	/*
3661 	 * We don't run on any processor that doesn't have cpuid, and could not
3662 	 * possibly have arrived here.
3663 	 */
3664 	add_x86_feature(featureset, X86FSET_CPUID);
3665 }
3666 
3667 static void
3668 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3669 {
3670 	struct cpuid_info *cpi;
3671 	struct cpuid_regs *cp;
3672 
3673 	/*
3674 	 * We require that virtual/native detection be complete and that PCI
3675 	 * config space access has been set up; at present there is no reliable
3676 	 * way to determine the latter.
3677 	 */
3678 #if !defined(__xpv)
3679 	ASSERT3S(platform_type, !=, -1);
3680 #endif	/* !__xpv */
3681 
3682 	cpi = cpu->cpu_m.mcpu_cpi;
3683 	ASSERT(cpi != NULL);
3684 
3685 	cp = &cpi->cpi_std[0];
3686 	cp->cp_eax = 0;
3687 	cpi->cpi_maxeax = __cpuid_insn(cp);
3688 	{
3689 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3690 		*iptr++ = cp->cp_ebx;
3691 		*iptr++ = cp->cp_edx;
3692 		*iptr++ = cp->cp_ecx;
3693 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3694 	}
3695 
3696 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3697 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3698 
3699 	/*
3700 	 * Limit the range in case of weird hardware
3701 	 */
3702 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3703 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3704 	if (cpi->cpi_maxeax < 1)
3705 		return;
3706 
3707 	cp = &cpi->cpi_std[1];
3708 	cp->cp_eax = 1;
3709 	(void) __cpuid_insn(cp);
3710 
3711 	/*
3712 	 * Extract identifying constants for easy access.
3713 	 */
3714 	cpi->cpi_model = CPI_MODEL(cpi);
3715 	cpi->cpi_family = CPI_FAMILY(cpi);
3716 
3717 	if (cpi->cpi_family == 0xf)
3718 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3719 
3720 	/*
3721 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3722 	 * Intel, and presumably everyone else, uses model == 0xf, as
3723 	 * one would expect (max value means possible overflow).  Sigh.
3724 	 */
3725 
3726 	switch (cpi->cpi_vendor) {
3727 	case X86_VENDOR_Intel:
3728 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3729 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3730 		break;
3731 	case X86_VENDOR_AMD:
3732 		if (CPI_FAMILY(cpi) == 0xf)
3733 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3734 		break;
3735 	case X86_VENDOR_HYGON:
3736 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3737 		break;
3738 	default:
3739 		if (cpi->cpi_model == 0xf)
3740 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3741 		break;
3742 	}
3743 
3744 	cpi->cpi_step = CPI_STEP(cpi);
3745 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3746 
3747 	/*
3748 	 * Synthesize chip "revision" and socket type
3749 	 */
3750 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3751 	    cpi->cpi_model, cpi->cpi_step);
3752 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3753 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3754 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3755 	    cpi->cpi_model, cpi->cpi_step);
3756 	cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
3757 	    cpi->cpi_model, cpi->cpi_step);
3758 }
3759 
3760 static void
3761 cpuid_pass_basic(cpu_t *cpu, void *arg)
3762 {
3763 	uchar_t *featureset = (uchar_t *)arg;
3764 	uint32_t mask_ecx, mask_edx;
3765 	struct cpuid_info *cpi;
3766 	struct cpuid_regs *cp;
3767 	int xcpuid;
3768 #if !defined(__xpv)
3769 	extern int idle_cpu_prefer_mwait;
3770 #endif
3771 
3772 	cpi = cpu->cpu_m.mcpu_cpi;
3773 	ASSERT(cpi != NULL);
3774 
3775 	if (cpi->cpi_maxeax < 1)
3776 		return;
3777 
3778 	/*
3779 	 * This was filled during the identification pass.
3780 	 */
3781 	cp = &cpi->cpi_std[1];
3782 
3783 	/*
3784 	 * *default* assumptions:
3785 	 * - believe %edx feature word
3786 	 * - ignore %ecx feature word
3787 	 * - 32-bit virtual and physical addressing
3788 	 */
3789 	mask_edx = 0xffffffff;
3790 	mask_ecx = 0;
3791 
3792 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3793 
3794 	switch (cpi->cpi_vendor) {
3795 	case X86_VENDOR_Intel:
3796 		if (cpi->cpi_family == 5)
3797 			x86_type = X86_TYPE_P5;
3798 		else if (IS_LEGACY_P6(cpi)) {
3799 			x86_type = X86_TYPE_P6;
3800 			pentiumpro_bug4046376 = 1;
3801 			/*
3802 			 * Clear the SEP bit when it was set erroneously
3803 			 */
3804 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3805 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3806 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3807 			x86_type = X86_TYPE_P4;
3808 			/*
3809 			 * We don't currently depend on any of the %ecx
3810 			 * features until Prescott, so we'll only check
3811 			 * this from P4 onwards.  We might want to revisit
3812 			 * that idea later.
3813 			 */
3814 			mask_ecx = 0xffffffff;
3815 		} else if (cpi->cpi_family > 0xf)
3816 			mask_ecx = 0xffffffff;
3817 		/*
3818 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3819 		 * to obtain the monitor linesize.
3820 		 */
3821 		if (cpi->cpi_maxeax < 5)
3822 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3823 		break;
3824 	case X86_VENDOR_IntelClone:
3825 	default:
3826 		break;
3827 	case X86_VENDOR_AMD:
3828 #if defined(OPTERON_ERRATUM_108)
3829 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3830 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3831 			cpi->cpi_model = 0xc;
3832 		} else
3833 #endif
3834 		if (cpi->cpi_family == 5) {
3835 			/*
3836 			 * AMD K5 and K6
3837 			 *
3838 			 * These CPUs have an incomplete implementation
3839 			 * of MCA/MCE which we mask away.
3840 			 */
3841 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3842 
3843 			/*
3844 			 * Model 0 uses the wrong (APIC) bit
3845 			 * to indicate PGE.  Fix it here.
3846 			 */
3847 			if (cpi->cpi_model == 0) {
3848 				if (cp->cp_edx & 0x200) {
3849 					cp->cp_edx &= ~0x200;
3850 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3851 				}
3852 			}
3853 
3854 			/*
3855 			 * Early models had problems w/ MMX; disable.
3856 			 */
3857 			if (cpi->cpi_model < 6)
3858 				mask_edx &= ~CPUID_INTC_EDX_MMX;
3859 		}
3860 
3861 		/*
3862 		 * For newer families, SSE3 and CX16, at least, are valid;
3863 		 * enable all
3864 		 */
3865 		if (cpi->cpi_family >= 0xf)
3866 			mask_ecx = 0xffffffff;
3867 		/*
3868 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3869 		 * to obtain the monitor linesize.
3870 		 */
3871 		if (cpi->cpi_maxeax < 5)
3872 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3873 
3874 #if !defined(__xpv)
3875 		/*
3876 		 * AMD has not historically used MWAIT in the CPU's idle loop.
3877 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3878 		 * know for certain that in at least family 17h, per AMD, mwait
3879 		 * is preferred. Families in-between are less certain.
3880 		 */
3881 		if (cpi->cpi_family < 0x17) {
3882 			idle_cpu_prefer_mwait = 0;
3883 		}
3884 #endif
3885 
3886 		break;
3887 	case X86_VENDOR_HYGON:
3888 		/* Enable all for Hygon Dhyana CPU */
3889 		mask_ecx = 0xffffffff;
3890 		break;
3891 	case X86_VENDOR_TM:
3892 		/*
3893 		 * workaround the NT workaround in CMS 4.1
3894 		 */
3895 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3896 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3897 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3898 		break;
3899 	case X86_VENDOR_Centaur:
3900 		/*
3901 		 * workaround the NT workarounds again
3902 		 */
3903 		if (cpi->cpi_family == 6)
3904 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3905 		break;
3906 	case X86_VENDOR_Cyrix:
3907 		/*
3908 		 * We rely heavily on the probing in locore
3909 		 * to actually figure out what parts, if any,
3910 		 * of the Cyrix cpuid instruction to believe.
3911 		 */
3912 		switch (x86_type) {
3913 		case X86_TYPE_CYRIX_486:
3914 			mask_edx = 0;
3915 			break;
3916 		case X86_TYPE_CYRIX_6x86:
3917 			mask_edx = 0;
3918 			break;
3919 		case X86_TYPE_CYRIX_6x86L:
3920 			mask_edx =
3921 			    CPUID_INTC_EDX_DE |
3922 			    CPUID_INTC_EDX_CX8;
3923 			break;
3924 		case X86_TYPE_CYRIX_6x86MX:
3925 			mask_edx =
3926 			    CPUID_INTC_EDX_DE |
3927 			    CPUID_INTC_EDX_MSR |
3928 			    CPUID_INTC_EDX_CX8 |
3929 			    CPUID_INTC_EDX_PGE |
3930 			    CPUID_INTC_EDX_CMOV |
3931 			    CPUID_INTC_EDX_MMX;
3932 			break;
3933 		case X86_TYPE_CYRIX_GXm:
3934 			mask_edx =
3935 			    CPUID_INTC_EDX_MSR |
3936 			    CPUID_INTC_EDX_CX8 |
3937 			    CPUID_INTC_EDX_CMOV |
3938 			    CPUID_INTC_EDX_MMX;
3939 			break;
3940 		case X86_TYPE_CYRIX_MediaGX:
3941 			break;
3942 		case X86_TYPE_CYRIX_MII:
3943 		case X86_TYPE_VIA_CYRIX_III:
3944 			mask_edx =
3945 			    CPUID_INTC_EDX_DE |
3946 			    CPUID_INTC_EDX_TSC |
3947 			    CPUID_INTC_EDX_MSR |
3948 			    CPUID_INTC_EDX_CX8 |
3949 			    CPUID_INTC_EDX_PGE |
3950 			    CPUID_INTC_EDX_CMOV |
3951 			    CPUID_INTC_EDX_MMX;
3952 			break;
3953 		default:
3954 			break;
3955 		}
3956 		break;
3957 	}
3958 
3959 #if defined(__xpv)
3960 	/*
3961 	 * Do not support MONITOR/MWAIT under a hypervisor
3962 	 */
3963 	mask_ecx &= ~CPUID_INTC_ECX_MON;
3964 	/*
3965 	 * Do not support XSAVE under a hypervisor for now
3966 	 */
3967 	xsave_force_disable = B_TRUE;
3968 
3969 #endif	/* __xpv */
3970 
3971 	if (xsave_force_disable) {
3972 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3973 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
3974 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
3975 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
3976 	}
3977 
3978 	/*
3979 	 * Now we've figured out the masks that determine
3980 	 * which bits we choose to believe, apply the masks
3981 	 * to the feature words, then map the kernel's view
3982 	 * of these feature words into its feature word.
3983 	 */
3984 	cp->cp_edx &= mask_edx;
3985 	cp->cp_ecx &= mask_ecx;
3986 
3987 	/*
3988 	 * apply any platform restrictions (we don't call this
3989 	 * immediately after __cpuid_insn here, because we need the
3990 	 * workarounds applied above first)
3991 	 */
3992 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3993 
3994 	/*
3995 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3996 	 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
3997 	 * 7 has sub-leaves determined by ecx.
3998 	 */
3999 	if (cpi->cpi_maxeax >= 7) {
4000 		struct cpuid_regs *ecp;
4001 		ecp = &cpi->cpi_std[7];
4002 		ecp->cp_eax = 7;
4003 		ecp->cp_ecx = 0;
4004 		(void) __cpuid_insn(ecp);
4005 
4006 		/*
4007 		 * If XSAVE has been disabled, just ignore all of the
4008 		 * extended-save-area dependent flags here. By removing most of
4009 		 * the leaf 7, sub-leaf 0 flags, that will ensure tha we don't
4010 		 * end up looking at additional xsave dependent leaves right
4011 		 * now.
4012 		 */
4013 		if (xsave_force_disable) {
4014 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4015 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4016 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4017 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
4018 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4019 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4020 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4021 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
4022 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4023 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
4024 		}
4025 
4026 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
4027 			add_x86_feature(featureset, X86FSET_SMEP);
4028 
4029 		/*
4030 		 * We check disable_smap here in addition to in startup_smap()
4031 		 * to ensure CPUs that aren't the boot CPU don't accidentally
4032 		 * include it in the feature set and thus generate a mismatched
4033 		 * x86 feature set across CPUs.
4034 		 */
4035 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
4036 		    disable_smap == 0)
4037 			add_x86_feature(featureset, X86FSET_SMAP);
4038 
4039 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
4040 			add_x86_feature(featureset, X86FSET_RDSEED);
4041 
4042 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
4043 			add_x86_feature(featureset, X86FSET_ADX);
4044 
4045 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4046 			add_x86_feature(featureset, X86FSET_FSGSBASE);
4047 
4048 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4049 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4050 
4051 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4052 			add_x86_feature(featureset, X86FSET_INVPCID);
4053 
4054 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4055 			add_x86_feature(featureset, X86FSET_UMIP);
4056 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4057 			add_x86_feature(featureset, X86FSET_PKU);
4058 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4059 			add_x86_feature(featureset, X86FSET_OSPKE);
4060 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4061 			add_x86_feature(featureset, X86FSET_GFNI);
4062 
4063 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4064 			add_x86_feature(featureset, X86FSET_CLWB);
4065 
4066 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4067 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4068 				add_x86_feature(featureset, X86FSET_MPX);
4069 		}
4070 
4071 		/*
4072 		 * If we have subleaf 1 available, grab and store that. This is
4073 		 * used for more AVX and related features.
4074 		 */
4075 		if (ecp->cp_eax >= 1) {
4076 			struct cpuid_regs *c71;
4077 			c71 = &cpi->cpi_sub7[0];
4078 			c71->cp_eax = 7;
4079 			c71->cp_ecx = 1;
4080 			(void) __cpuid_insn(c71);
4081 		}
4082 	}
4083 
4084 	/*
4085 	 * fold in overrides from the "eeprom" mechanism
4086 	 */
4087 	cp->cp_edx |= cpuid_feature_edx_include;
4088 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
4089 
4090 	cp->cp_ecx |= cpuid_feature_ecx_include;
4091 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4092 
4093 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4094 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
4095 	}
4096 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4097 		add_x86_feature(featureset, X86FSET_TSC);
4098 	}
4099 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4100 		add_x86_feature(featureset, X86FSET_MSR);
4101 	}
4102 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4103 		add_x86_feature(featureset, X86FSET_MTRR);
4104 	}
4105 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4106 		add_x86_feature(featureset, X86FSET_PGE);
4107 	}
4108 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4109 		add_x86_feature(featureset, X86FSET_CMOV);
4110 	}
4111 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4112 		add_x86_feature(featureset, X86FSET_MMX);
4113 	}
4114 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4115 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4116 		add_x86_feature(featureset, X86FSET_MCA);
4117 	}
4118 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4119 		add_x86_feature(featureset, X86FSET_PAE);
4120 	}
4121 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4122 		add_x86_feature(featureset, X86FSET_CX8);
4123 	}
4124 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4125 		add_x86_feature(featureset, X86FSET_CX16);
4126 	}
4127 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4128 		add_x86_feature(featureset, X86FSET_PAT);
4129 	}
4130 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4131 		add_x86_feature(featureset, X86FSET_SEP);
4132 	}
4133 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4134 		/*
4135 		 * In our implementation, fxsave/fxrstor
4136 		 * are prerequisites before we'll even
4137 		 * try and do SSE things.
4138 		 */
4139 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4140 			add_x86_feature(featureset, X86FSET_SSE);
4141 		}
4142 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4143 			add_x86_feature(featureset, X86FSET_SSE2);
4144 		}
4145 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4146 			add_x86_feature(featureset, X86FSET_SSE3);
4147 		}
4148 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4149 			add_x86_feature(featureset, X86FSET_SSSE3);
4150 		}
4151 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4152 			add_x86_feature(featureset, X86FSET_SSE4_1);
4153 		}
4154 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4155 			add_x86_feature(featureset, X86FSET_SSE4_2);
4156 		}
4157 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4158 			add_x86_feature(featureset, X86FSET_AES);
4159 		}
4160 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4161 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4162 		}
4163 
4164 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4165 			add_x86_feature(featureset, X86FSET_SHA);
4166 
4167 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4168 			add_x86_feature(featureset, X86FSET_XSAVE);
4169 
4170 			/* We only test AVX & AVX512 when there is XSAVE */
4171 			cpuid_basic_avx(cpu, featureset);
4172 		}
4173 	}
4174 
4175 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4176 		add_x86_feature(featureset, X86FSET_PCID);
4177 	}
4178 
4179 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4180 		add_x86_feature(featureset, X86FSET_X2APIC);
4181 	}
4182 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4183 		add_x86_feature(featureset, X86FSET_DE);
4184 	}
4185 #if !defined(__xpv)
4186 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4187 
4188 		/*
4189 		 * We require the CLFLUSH instruction for erratum workaround
4190 		 * to use MONITOR/MWAIT.
4191 		 */
4192 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4193 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4194 			add_x86_feature(featureset, X86FSET_MWAIT);
4195 		} else {
4196 			extern int idle_cpu_assert_cflush_monitor;
4197 
4198 			/*
4199 			 * All processors we are aware of which have
4200 			 * MONITOR/MWAIT also have CLFLUSH.
4201 			 */
4202 			if (idle_cpu_assert_cflush_monitor) {
4203 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4204 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4205 			}
4206 		}
4207 	}
4208 #endif	/* __xpv */
4209 
4210 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4211 		add_x86_feature(featureset, X86FSET_VMX);
4212 	}
4213 
4214 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4215 		add_x86_feature(featureset, X86FSET_RDRAND);
4216 
4217 	/*
4218 	 * Only need it first time, rest of the cpus would follow suit.
4219 	 * we only capture this for the bootcpu.
4220 	 */
4221 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4222 		add_x86_feature(featureset, X86FSET_CLFSH);
4223 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4224 	}
4225 	if (is_x86_feature(featureset, X86FSET_PAE))
4226 		cpi->cpi_pabits = 36;
4227 
4228 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4229 		struct cpuid_regs r, *ecp;
4230 
4231 		ecp = &r;
4232 		ecp->cp_eax = 0xD;
4233 		ecp->cp_ecx = 1;
4234 		ecp->cp_edx = ecp->cp_ebx = 0;
4235 		(void) __cpuid_insn(ecp);
4236 
4237 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4238 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4239 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4240 			add_x86_feature(featureset, X86FSET_XSAVEC);
4241 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4242 			add_x86_feature(featureset, X86FSET_XSAVES);
4243 
4244 		/*
4245 		 * Zen 2 family processors suffer from erratum 1386 that causes
4246 		 * xsaves to not function correctly in some circumstances. There
4247 		 * are no supervisor states in Zen 2 and earlier. Practically
4248 		 * speaking this has no impact for us as we currently do not
4249 		 * leverage compressed xsave formats. To safeguard against
4250 		 * issues in the future where we may opt to using it, we remove
4251 		 * it from the feature set now. While Matisse has a microcode
4252 		 * update available with a fix, not all Zen 2 CPUs do so it's
4253 		 * simpler for the moment to unconditionally remove it.
4254 		 */
4255 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4256 		    uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4257 			remove_x86_feature(featureset, X86FSET_XSAVES);
4258 		}
4259 	}
4260 
4261 	/*
4262 	 * Work on the "extended" feature information, doing
4263 	 * some basic initialization to be used in the extended pass.
4264 	 */
4265 	xcpuid = 0;
4266 	switch (cpi->cpi_vendor) {
4267 	case X86_VENDOR_Intel:
4268 		/*
4269 		 * On KVM we know we will have proper support for extended
4270 		 * cpuid.
4271 		 */
4272 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4273 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4274 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4275 			xcpuid++;
4276 		break;
4277 	case X86_VENDOR_AMD:
4278 		if (cpi->cpi_family > 5 ||
4279 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4280 			xcpuid++;
4281 		break;
4282 	case X86_VENDOR_Cyrix:
4283 		/*
4284 		 * Only these Cyrix CPUs are -known- to support
4285 		 * extended cpuid operations.
4286 		 */
4287 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4288 		    x86_type == X86_TYPE_CYRIX_GXm)
4289 			xcpuid++;
4290 		break;
4291 	case X86_VENDOR_HYGON:
4292 	case X86_VENDOR_Centaur:
4293 	case X86_VENDOR_TM:
4294 	default:
4295 		xcpuid++;
4296 		break;
4297 	}
4298 
4299 	if (xcpuid) {
4300 		cp = &cpi->cpi_extd[0];
4301 		cp->cp_eax = CPUID_LEAF_EXT_0;
4302 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4303 	}
4304 
4305 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4306 
4307 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4308 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4309 
4310 		switch (cpi->cpi_vendor) {
4311 		case X86_VENDOR_Intel:
4312 		case X86_VENDOR_AMD:
4313 		case X86_VENDOR_HYGON:
4314 			if (cpi->cpi_xmaxeax < 0x80000001)
4315 				break;
4316 			cp = &cpi->cpi_extd[1];
4317 			cp->cp_eax = 0x80000001;
4318 			(void) __cpuid_insn(cp);
4319 
4320 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4321 			    cpi->cpi_family == 5 &&
4322 			    cpi->cpi_model == 6 &&
4323 			    cpi->cpi_step == 6) {
4324 				/*
4325 				 * K6 model 6 uses bit 10 to indicate SYSC
4326 				 * Later models use bit 11. Fix it here.
4327 				 */
4328 				if (cp->cp_edx & 0x400) {
4329 					cp->cp_edx &= ~0x400;
4330 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4331 				}
4332 			}
4333 
4334 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4335 
4336 			/*
4337 			 * Compute the additions to the kernel's feature word.
4338 			 */
4339 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4340 				add_x86_feature(featureset, X86FSET_NX);
4341 			}
4342 
4343 			/*
4344 			 * Regardless whether or not we boot 64-bit,
4345 			 * we should have a way to identify whether
4346 			 * the CPU is capable of running 64-bit.
4347 			 */
4348 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4349 				add_x86_feature(featureset, X86FSET_64);
4350 			}
4351 
4352 			/* 1 GB large page - enable only for 64 bit kernel */
4353 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4354 				add_x86_feature(featureset, X86FSET_1GPG);
4355 			}
4356 
4357 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4358 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4359 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4360 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4361 				add_x86_feature(featureset, X86FSET_SSE4A);
4362 			}
4363 
4364 			/*
4365 			 * It's really tricky to support syscall/sysret in
4366 			 * the i386 kernel; we rely on sysenter/sysexit
4367 			 * instead.  In the amd64 kernel, things are -way-
4368 			 * better.
4369 			 */
4370 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4371 				add_x86_feature(featureset, X86FSET_ASYSC);
4372 			}
4373 
4374 			/*
4375 			 * While we're thinking about system calls, note
4376 			 * that AMD processors don't support sysenter
4377 			 * in long mode at all, so don't try to program them.
4378 			 */
4379 			if (x86_vendor == X86_VENDOR_AMD ||
4380 			    x86_vendor == X86_VENDOR_HYGON) {
4381 				remove_x86_feature(featureset, X86FSET_SEP);
4382 			}
4383 
4384 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4385 				add_x86_feature(featureset, X86FSET_TSCP);
4386 			}
4387 
4388 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4389 				add_x86_feature(featureset, X86FSET_SVM);
4390 			}
4391 
4392 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4393 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4394 			}
4395 
4396 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4397 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4398 			}
4399 
4400 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4401 				add_x86_feature(featureset, X86FSET_XOP);
4402 			}
4403 
4404 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4405 				add_x86_feature(featureset, X86FSET_FMA4);
4406 			}
4407 
4408 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4409 				add_x86_feature(featureset, X86FSET_TBM);
4410 			}
4411 
4412 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4413 				add_x86_feature(featureset, X86FSET_MONITORX);
4414 			}
4415 			break;
4416 		default:
4417 			break;
4418 		}
4419 
4420 		/*
4421 		 * Get CPUID data about processor cores and hyperthreads.
4422 		 */
4423 		switch (cpi->cpi_vendor) {
4424 		case X86_VENDOR_Intel:
4425 			if (cpi->cpi_maxeax >= 4) {
4426 				cp = &cpi->cpi_std[4];
4427 				cp->cp_eax = 4;
4428 				cp->cp_ecx = 0;
4429 				(void) __cpuid_insn(cp);
4430 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4431 			}
4432 			/*FALLTHROUGH*/
4433 		case X86_VENDOR_AMD:
4434 		case X86_VENDOR_HYGON:
4435 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4436 				break;
4437 			cp = &cpi->cpi_extd[8];
4438 			cp->cp_eax = CPUID_LEAF_EXT_8;
4439 			(void) __cpuid_insn(cp);
4440 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4441 			    cp);
4442 
4443 			/*
4444 			 * AMD uses ebx for some extended functions.
4445 			 */
4446 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4447 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4448 				/*
4449 				 * While we're here, check for the AMD "Error
4450 				 * Pointer Zero/Restore" feature. This can be
4451 				 * used to setup the FP save handlers
4452 				 * appropriately.
4453 				 */
4454 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4455 					cpi->cpi_fp_amd_save = 0;
4456 				} else {
4457 					cpi->cpi_fp_amd_save = 1;
4458 				}
4459 
4460 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4461 					add_x86_feature(featureset,
4462 					    X86FSET_CLZERO);
4463 				}
4464 			}
4465 
4466 			/*
4467 			 * Virtual and physical address limits from
4468 			 * cpuid override previously guessed values.
4469 			 */
4470 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4471 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4472 			break;
4473 		default:
4474 			break;
4475 		}
4476 
4477 		/*
4478 		 * Get CPUID data about TSC Invariance in Deep C-State.
4479 		 */
4480 		switch (cpi->cpi_vendor) {
4481 		case X86_VENDOR_Intel:
4482 		case X86_VENDOR_AMD:
4483 		case X86_VENDOR_HYGON:
4484 			if (cpi->cpi_maxeax >= 7) {
4485 				cp = &cpi->cpi_extd[7];
4486 				cp->cp_eax = 0x80000007;
4487 				cp->cp_ecx = 0;
4488 				(void) __cpuid_insn(cp);
4489 			}
4490 			break;
4491 		default:
4492 			break;
4493 		}
4494 	}
4495 
4496 	/*
4497 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4498 	 * run and thus gathered some of its dependent leaves.
4499 	 */
4500 	cpuid_basic_topology(cpu, featureset);
4501 	cpuid_basic_thermal(cpu, featureset);
4502 #if !defined(__xpv)
4503 	cpuid_basic_ppin(cpu, featureset);
4504 #endif
4505 
4506 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4507 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4508 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4509 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4510 			/* Special handling for AMD FP not necessary. */
4511 			cpi->cpi_fp_amd_save = 0;
4512 		} else {
4513 			cpi->cpi_fp_amd_save = 1;
4514 		}
4515 	}
4516 
4517 	/*
4518 	 * Check (and potentially set) if lfence is serializing.
4519 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4520 	 */
4521 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4522 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4523 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4524 		/*
4525 		 * The AMD white paper Software Techniques For Managing
4526 		 * Speculation on AMD Processors details circumstances for when
4527 		 * lfence instructions are serializing.
4528 		 *
4529 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4530 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4531 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4532 		 * committed to supporting that MSR on all later CPUs.
4533 		 */
4534 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4535 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4536 		} else if (cpi->cpi_family >= 0x10) {
4537 #if !defined(__xpv)
4538 			uint64_t val;
4539 
4540 			/*
4541 			 * Be careful when attempting to enable the bit, and
4542 			 * verify that it was actually set in case we are
4543 			 * running in a hypervisor which is less than faithful
4544 			 * about its emulation of this feature.
4545 			 */
4546 			on_trap_data_t otd;
4547 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4548 				val = rdmsr(MSR_AMD_DE_CFG);
4549 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4550 				wrmsr(MSR_AMD_DE_CFG, val);
4551 				val = rdmsr(MSR_AMD_DE_CFG);
4552 			} else {
4553 				val = 0;
4554 			}
4555 			no_trap();
4556 
4557 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4558 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4559 			}
4560 #endif
4561 		}
4562 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4563 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4564 		/*
4565 		 * Documentation and other OSes indicate that lfence is always
4566 		 * serializing on Intel CPUs.
4567 		 */
4568 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4569 	}
4570 
4571 
4572 	/*
4573 	 * Check the processor leaves that are used for security features. Grab
4574 	 * any additional processor-specific leaves that we may not have yet.
4575 	 */
4576 	switch (cpi->cpi_vendor) {
4577 	case X86_VENDOR_AMD:
4578 	case X86_VENDOR_HYGON:
4579 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4580 			cp = &cpi->cpi_extd[7];
4581 			cp->cp_eax = CPUID_LEAF_EXT_21;
4582 			cp->cp_ecx = 0;
4583 			(void) __cpuid_insn(cp);
4584 		}
4585 		break;
4586 	default:
4587 		break;
4588 	}
4589 
4590 	cpuid_scan_security(cpu, featureset);
4591 }
4592 
4593 /*
4594  * Make copies of the cpuid table entries we depend on, in
4595  * part for ease of parsing now, in part so that we have only
4596  * one place to correct any of it, in part for ease of
4597  * later export to userland, and in part so we can look at
4598  * this stuff in a crash dump.
4599  */
4600 
4601 static void
4602 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4603 {
4604 	uint_t n, nmax;
4605 	int i;
4606 	struct cpuid_regs *cp;
4607 	uint8_t *dp;
4608 	uint32_t *iptr;
4609 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4610 
4611 	if (cpi->cpi_maxeax < 1)
4612 		return;
4613 
4614 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4615 		nmax = NMAX_CPI_STD;
4616 	/*
4617 	 * (We already handled n == 0 and n == 1 in the basic pass)
4618 	 */
4619 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4620 		/*
4621 		 * leaves 6 and 7 were handled in the basic pass
4622 		 */
4623 		if (n == 6 || n == 7)
4624 			continue;
4625 
4626 		cp->cp_eax = n;
4627 
4628 		/*
4629 		 * CPUID function 4 expects %ecx to be initialized
4630 		 * with an index which indicates which cache to return
4631 		 * information about. The OS is expected to call function 4
4632 		 * with %ecx set to 0, 1, 2, ... until it returns with
4633 		 * EAX[4:0] set to 0, which indicates there are no more
4634 		 * caches.
4635 		 *
4636 		 * Here, populate cpi_std[4] with the information returned by
4637 		 * function 4 when %ecx == 0, and do the rest in a later pass
4638 		 * when dynamic memory allocation becomes available.
4639 		 *
4640 		 * Note: we need to explicitly initialize %ecx here, since
4641 		 * function 4 may have been previously invoked.
4642 		 */
4643 		if (n == 4)
4644 			cp->cp_ecx = 0;
4645 
4646 		(void) __cpuid_insn(cp);
4647 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4648 		switch (n) {
4649 		case 2:
4650 			/*
4651 			 * "the lower 8 bits of the %eax register
4652 			 * contain a value that identifies the number
4653 			 * of times the cpuid [instruction] has to be
4654 			 * executed to obtain a complete image of the
4655 			 * processor's caching systems."
4656 			 *
4657 			 * How *do* they make this stuff up?
4658 			 */
4659 			cpi->cpi_ncache = sizeof (*cp) *
4660 			    BITX(cp->cp_eax, 7, 0);
4661 			if (cpi->cpi_ncache == 0)
4662 				break;
4663 			cpi->cpi_ncache--;	/* skip count byte */
4664 
4665 			/*
4666 			 * Well, for now, rather than attempt to implement
4667 			 * this slightly dubious algorithm, we just look
4668 			 * at the first 15 ..
4669 			 */
4670 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4671 				cpi->cpi_ncache = sizeof (*cp) - 1;
4672 
4673 			dp = cpi->cpi_cacheinfo;
4674 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4675 				uint8_t *p = (void *)&cp->cp_eax;
4676 				for (i = 1; i < 4; i++)
4677 					if (p[i] != 0)
4678 						*dp++ = p[i];
4679 			}
4680 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4681 				uint8_t *p = (void *)&cp->cp_ebx;
4682 				for (i = 0; i < 4; i++)
4683 					if (p[i] != 0)
4684 						*dp++ = p[i];
4685 			}
4686 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4687 				uint8_t *p = (void *)&cp->cp_ecx;
4688 				for (i = 0; i < 4; i++)
4689 					if (p[i] != 0)
4690 						*dp++ = p[i];
4691 			}
4692 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4693 				uint8_t *p = (void *)&cp->cp_edx;
4694 				for (i = 0; i < 4; i++)
4695 					if (p[i] != 0)
4696 						*dp++ = p[i];
4697 			}
4698 			break;
4699 
4700 		case 3:	/* Processor serial number, if PSN supported */
4701 			break;
4702 
4703 		case 4:	/* Deterministic cache parameters */
4704 			break;
4705 
4706 		case 5:	/* Monitor/Mwait parameters */
4707 		{
4708 			size_t mwait_size;
4709 
4710 			/*
4711 			 * check cpi_mwait.support which was set in
4712 			 * cpuid_pass_basic()
4713 			 */
4714 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4715 				break;
4716 
4717 			/*
4718 			 * Protect ourself from insane mwait line size.
4719 			 * Workaround for incomplete hardware emulator(s).
4720 			 */
4721 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4722 			if (mwait_size < sizeof (uint32_t) ||
4723 			    !ISP2(mwait_size)) {
4724 #if DEBUG
4725 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4726 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4727 #endif
4728 				break;
4729 			}
4730 
4731 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4732 			cpi->cpi_mwait.mon_max = mwait_size;
4733 			if (MWAIT_EXTENSION(cpi)) {
4734 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4735 				if (MWAIT_INT_ENABLE(cpi))
4736 					cpi->cpi_mwait.support |=
4737 					    MWAIT_ECX_INT_ENABLE;
4738 			}
4739 			break;
4740 		}
4741 		default:
4742 			break;
4743 		}
4744 	}
4745 
4746 	/*
4747 	 * XSAVE enumeration
4748 	 */
4749 	if (cpi->cpi_maxeax >= 0xD) {
4750 		struct cpuid_regs regs;
4751 		boolean_t cpuid_d_valid = B_TRUE;
4752 
4753 		cp = &regs;
4754 		cp->cp_eax = 0xD;
4755 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4756 
4757 		(void) __cpuid_insn(cp);
4758 
4759 		/*
4760 		 * Sanity checks for debug
4761 		 */
4762 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4763 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4764 			cpuid_d_valid = B_FALSE;
4765 		}
4766 
4767 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4768 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4769 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4770 
4771 		/*
4772 		 * If the hw supports AVX, get the size and offset in the save
4773 		 * area for the ymm state.
4774 		 */
4775 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4776 			cp->cp_eax = 0xD;
4777 			cp->cp_ecx = 2;
4778 			cp->cp_edx = cp->cp_ebx = 0;
4779 
4780 			(void) __cpuid_insn(cp);
4781 
4782 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4783 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4784 				cpuid_d_valid = B_FALSE;
4785 			}
4786 
4787 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4788 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4789 		}
4790 
4791 		/*
4792 		 * If the hw supports MPX, get the size and offset in the
4793 		 * save area for BNDREGS and BNDCSR.
4794 		 */
4795 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4796 			cp->cp_eax = 0xD;
4797 			cp->cp_ecx = 3;
4798 			cp->cp_edx = cp->cp_ebx = 0;
4799 
4800 			(void) __cpuid_insn(cp);
4801 
4802 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4803 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4804 
4805 			cp->cp_eax = 0xD;
4806 			cp->cp_ecx = 4;
4807 			cp->cp_edx = cp->cp_ebx = 0;
4808 
4809 			(void) __cpuid_insn(cp);
4810 
4811 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4812 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4813 		}
4814 
4815 		/*
4816 		 * If the hw supports AVX512, get the size and offset in the
4817 		 * save area for the opmask registers and zmm state.
4818 		 */
4819 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4820 			cp->cp_eax = 0xD;
4821 			cp->cp_ecx = 5;
4822 			cp->cp_edx = cp->cp_ebx = 0;
4823 
4824 			(void) __cpuid_insn(cp);
4825 
4826 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
4827 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4828 
4829 			cp->cp_eax = 0xD;
4830 			cp->cp_ecx = 6;
4831 			cp->cp_edx = cp->cp_ebx = 0;
4832 
4833 			(void) __cpuid_insn(cp);
4834 
4835 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4836 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4837 
4838 			cp->cp_eax = 0xD;
4839 			cp->cp_ecx = 7;
4840 			cp->cp_edx = cp->cp_ebx = 0;
4841 
4842 			(void) __cpuid_insn(cp);
4843 
4844 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4845 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4846 		}
4847 
4848 		if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4849 			xsave_state_size = 0;
4850 		} else if (cpuid_d_valid) {
4851 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4852 		} else {
4853 			/* Broken CPUID 0xD, probably in HVM */
4854 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4855 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4856 			    ", ymm_size = %d, ymm_offset = %d\n",
4857 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4858 			    cpi->cpi_xsave.xsav_hw_features_high,
4859 			    (int)cpi->cpi_xsave.xsav_max_size,
4860 			    (int)cpi->cpi_xsave.ymm_size,
4861 			    (int)cpi->cpi_xsave.ymm_offset);
4862 
4863 			if (xsave_state_size != 0) {
4864 				/*
4865 				 * This must be a non-boot CPU. We cannot
4866 				 * continue, because boot cpu has already
4867 				 * enabled XSAVE.
4868 				 */
4869 				ASSERT(cpu->cpu_id != 0);
4870 				cmn_err(CE_PANIC, "cpu%d: we have already "
4871 				    "enabled XSAVE on boot cpu, cannot "
4872 				    "continue.", cpu->cpu_id);
4873 			} else {
4874 				/*
4875 				 * If we reached here on the boot CPU, it's also
4876 				 * almost certain that we'll reach here on the
4877 				 * non-boot CPUs. When we're here on a boot CPU
4878 				 * we should disable the feature, on a non-boot
4879 				 * CPU we need to confirm that we have.
4880 				 */
4881 				if (cpu->cpu_id == 0) {
4882 					remove_x86_feature(x86_featureset,
4883 					    X86FSET_XSAVE);
4884 					remove_x86_feature(x86_featureset,
4885 					    X86FSET_AVX);
4886 					remove_x86_feature(x86_featureset,
4887 					    X86FSET_F16C);
4888 					remove_x86_feature(x86_featureset,
4889 					    X86FSET_BMI1);
4890 					remove_x86_feature(x86_featureset,
4891 					    X86FSET_BMI2);
4892 					remove_x86_feature(x86_featureset,
4893 					    X86FSET_FMA);
4894 					remove_x86_feature(x86_featureset,
4895 					    X86FSET_AVX2);
4896 					remove_x86_feature(x86_featureset,
4897 					    X86FSET_MPX);
4898 					remove_x86_feature(x86_featureset,
4899 					    X86FSET_AVX512F);
4900 					remove_x86_feature(x86_featureset,
4901 					    X86FSET_AVX512DQ);
4902 					remove_x86_feature(x86_featureset,
4903 					    X86FSET_AVX512PF);
4904 					remove_x86_feature(x86_featureset,
4905 					    X86FSET_AVX512ER);
4906 					remove_x86_feature(x86_featureset,
4907 					    X86FSET_AVX512CD);
4908 					remove_x86_feature(x86_featureset,
4909 					    X86FSET_AVX512BW);
4910 					remove_x86_feature(x86_featureset,
4911 					    X86FSET_AVX512VL);
4912 					remove_x86_feature(x86_featureset,
4913 					    X86FSET_AVX512FMA);
4914 					remove_x86_feature(x86_featureset,
4915 					    X86FSET_AVX512VBMI);
4916 					remove_x86_feature(x86_featureset,
4917 					    X86FSET_AVX512VNNI);
4918 					remove_x86_feature(x86_featureset,
4919 					    X86FSET_AVX512VPOPCDQ);
4920 					remove_x86_feature(x86_featureset,
4921 					    X86FSET_AVX512NNIW);
4922 					remove_x86_feature(x86_featureset,
4923 					    X86FSET_AVX512FMAPS);
4924 					remove_x86_feature(x86_featureset,
4925 					    X86FSET_VAES);
4926 					remove_x86_feature(x86_featureset,
4927 					    X86FSET_VPCLMULQDQ);
4928 					remove_x86_feature(x86_featureset,
4929 					    X86FSET_GFNI);
4930 					remove_x86_feature(x86_featureset,
4931 					    X86FSET_AVX512_VP2INT);
4932 					remove_x86_feature(x86_featureset,
4933 					    X86FSET_AVX512_BITALG);
4934 					remove_x86_feature(x86_featureset,
4935 					    X86FSET_AVX512_VBMI2);
4936 					remove_x86_feature(x86_featureset,
4937 					    X86FSET_AVX512_BF16);
4938 
4939 					xsave_force_disable = B_TRUE;
4940 				} else {
4941 					VERIFY(is_x86_feature(x86_featureset,
4942 					    X86FSET_XSAVE) == B_FALSE);
4943 				}
4944 			}
4945 		}
4946 	}
4947 
4948 
4949 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4950 		return;
4951 
4952 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4953 		nmax = NMAX_CPI_EXTD;
4954 	/*
4955 	 * Copy the extended properties, fixing them as we go. While we start at
4956 	 * 2 because we've already handled a few cases in the basic pass, the
4957 	 * rest we let ourselves just grab again (e.g. 0x8, 0x21).
4958 	 */
4959 	iptr = (void *)cpi->cpi_brandstr;
4960 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4961 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4962 		(void) __cpuid_insn(cp);
4963 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4964 		    cp);
4965 		switch (n) {
4966 		case 2:
4967 		case 3:
4968 		case 4:
4969 			/*
4970 			 * Extract the brand string
4971 			 */
4972 			*iptr++ = cp->cp_eax;
4973 			*iptr++ = cp->cp_ebx;
4974 			*iptr++ = cp->cp_ecx;
4975 			*iptr++ = cp->cp_edx;
4976 			break;
4977 		case 5:
4978 			switch (cpi->cpi_vendor) {
4979 			case X86_VENDOR_AMD:
4980 				/*
4981 				 * The Athlon and Duron were the first
4982 				 * parts to report the sizes of the
4983 				 * TLB for large pages. Before then,
4984 				 * we don't trust the data.
4985 				 */
4986 				if (cpi->cpi_family < 6 ||
4987 				    (cpi->cpi_family == 6 &&
4988 				    cpi->cpi_model < 1))
4989 					cp->cp_eax = 0;
4990 				break;
4991 			default:
4992 				break;
4993 			}
4994 			break;
4995 		case 6:
4996 			switch (cpi->cpi_vendor) {
4997 			case X86_VENDOR_AMD:
4998 				/*
4999 				 * The Athlon and Duron were the first
5000 				 * AMD parts with L2 TLB's.
5001 				 * Before then, don't trust the data.
5002 				 */
5003 				if (cpi->cpi_family < 6 ||
5004 				    (cpi->cpi_family == 6 &&
5005 				    cpi->cpi_model < 1))
5006 					cp->cp_eax = cp->cp_ebx = 0;
5007 				/*
5008 				 * AMD Duron rev A0 reports L2
5009 				 * cache size incorrectly as 1K
5010 				 * when it is really 64K
5011 				 */
5012 				if (cpi->cpi_family == 6 &&
5013 				    cpi->cpi_model == 3 &&
5014 				    cpi->cpi_step == 0) {
5015 					cp->cp_ecx &= 0xffff;
5016 					cp->cp_ecx |= 0x400000;
5017 				}
5018 				break;
5019 			case X86_VENDOR_Cyrix:	/* VIA C3 */
5020 				/*
5021 				 * VIA C3 processors are a bit messed
5022 				 * up w.r.t. encoding cache sizes in %ecx
5023 				 */
5024 				if (cpi->cpi_family != 6)
5025 					break;
5026 				/*
5027 				 * model 7 and 8 were incorrectly encoded
5028 				 *
5029 				 * xxx is model 8 really broken?
5030 				 */
5031 				if (cpi->cpi_model == 7 ||
5032 				    cpi->cpi_model == 8)
5033 					cp->cp_ecx =
5034 					    BITX(cp->cp_ecx, 31, 24) << 16 |
5035 					    BITX(cp->cp_ecx, 23, 16) << 12 |
5036 					    BITX(cp->cp_ecx, 15, 8) << 8 |
5037 					    BITX(cp->cp_ecx, 7, 0);
5038 				/*
5039 				 * model 9 stepping 1 has wrong associativity
5040 				 */
5041 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
5042 					cp->cp_ecx |= 8 << 12;
5043 				break;
5044 			case X86_VENDOR_Intel:
5045 				/*
5046 				 * Extended L2 Cache features function.
5047 				 * First appeared on Prescott.
5048 				 */
5049 			default:
5050 				break;
5051 			}
5052 			break;
5053 		default:
5054 			break;
5055 		}
5056 	}
5057 }
5058 
5059 static const char *
5060 intel_cpubrand(const struct cpuid_info *cpi)
5061 {
5062 	int i;
5063 
5064 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5065 
5066 	switch (cpi->cpi_family) {
5067 	case 5:
5068 		return ("Intel Pentium(r)");
5069 	case 6:
5070 		switch (cpi->cpi_model) {
5071 			uint_t celeron, xeon;
5072 			const struct cpuid_regs *cp;
5073 		case 0:
5074 		case 1:
5075 		case 2:
5076 			return ("Intel Pentium(r) Pro");
5077 		case 3:
5078 		case 4:
5079 			return ("Intel Pentium(r) II");
5080 		case 6:
5081 			return ("Intel Celeron(r)");
5082 		case 5:
5083 		case 7:
5084 			celeron = xeon = 0;
5085 			cp = &cpi->cpi_std[2];	/* cache info */
5086 
5087 			for (i = 1; i < 4; i++) {
5088 				uint_t tmp;
5089 
5090 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5091 				if (tmp == 0x40)
5092 					celeron++;
5093 				if (tmp >= 0x44 && tmp <= 0x45)
5094 					xeon++;
5095 			}
5096 
5097 			for (i = 0; i < 2; i++) {
5098 				uint_t tmp;
5099 
5100 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5101 				if (tmp == 0x40)
5102 					celeron++;
5103 				else if (tmp >= 0x44 && tmp <= 0x45)
5104 					xeon++;
5105 			}
5106 
5107 			for (i = 0; i < 4; i++) {
5108 				uint_t tmp;
5109 
5110 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5111 				if (tmp == 0x40)
5112 					celeron++;
5113 				else if (tmp >= 0x44 && tmp <= 0x45)
5114 					xeon++;
5115 			}
5116 
5117 			for (i = 0; i < 4; i++) {
5118 				uint_t tmp;
5119 
5120 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5121 				if (tmp == 0x40)
5122 					celeron++;
5123 				else if (tmp >= 0x44 && tmp <= 0x45)
5124 					xeon++;
5125 			}
5126 
5127 			if (celeron)
5128 				return ("Intel Celeron(r)");
5129 			if (xeon)
5130 				return (cpi->cpi_model == 5 ?
5131 				    "Intel Pentium(r) II Xeon(tm)" :
5132 				    "Intel Pentium(r) III Xeon(tm)");
5133 			return (cpi->cpi_model == 5 ?
5134 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5135 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5136 		default:
5137 			break;
5138 		}
5139 	default:
5140 		break;
5141 	}
5142 
5143 	/* BrandID is present if the field is nonzero */
5144 	if (cpi->cpi_brandid != 0) {
5145 		static const struct {
5146 			uint_t bt_bid;
5147 			const char *bt_str;
5148 		} brand_tbl[] = {
5149 			{ 0x1,	"Intel(r) Celeron(r)" },
5150 			{ 0x2,	"Intel(r) Pentium(r) III" },
5151 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
5152 			{ 0x4,	"Intel(r) Pentium(r) III" },
5153 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
5154 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
5155 			{ 0x8,	"Intel(r) Pentium(r) 4" },
5156 			{ 0x9,	"Intel(r) Pentium(r) 4" },
5157 			{ 0xa,	"Intel(r) Celeron(r)" },
5158 			{ 0xb,	"Intel(r) Xeon(tm)" },
5159 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
5160 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
5161 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
5162 			{ 0x11, "Mobile Genuine Intel(r)" },
5163 			{ 0x12, "Intel(r) Celeron(r) M" },
5164 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
5165 			{ 0x14, "Intel(r) Celeron(r)" },
5166 			{ 0x15, "Mobile Genuine Intel(r)" },
5167 			{ 0x16,	"Intel(r) Pentium(r) M" },
5168 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
5169 		};
5170 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5171 		uint_t sgn;
5172 
5173 		sgn = (cpi->cpi_family << 8) |
5174 		    (cpi->cpi_model << 4) | cpi->cpi_step;
5175 
5176 		for (i = 0; i < btblmax; i++)
5177 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5178 				break;
5179 		if (i < btblmax) {
5180 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5181 				return ("Intel(r) Celeron(r)");
5182 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5183 				return ("Intel(r) Xeon(tm) MP");
5184 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5185 				return ("Intel(r) Xeon(tm)");
5186 			return (brand_tbl[i].bt_str);
5187 		}
5188 	}
5189 
5190 	return (NULL);
5191 }
5192 
5193 static const char *
5194 amd_cpubrand(const struct cpuid_info *cpi)
5195 {
5196 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5197 
5198 	switch (cpi->cpi_family) {
5199 	case 5:
5200 		switch (cpi->cpi_model) {
5201 		case 0:
5202 		case 1:
5203 		case 2:
5204 		case 3:
5205 		case 4:
5206 		case 5:
5207 			return ("AMD-K5(r)");
5208 		case 6:
5209 		case 7:
5210 			return ("AMD-K6(r)");
5211 		case 8:
5212 			return ("AMD-K6(r)-2");
5213 		case 9:
5214 			return ("AMD-K6(r)-III");
5215 		default:
5216 			return ("AMD (family 5)");
5217 		}
5218 	case 6:
5219 		switch (cpi->cpi_model) {
5220 		case 1:
5221 			return ("AMD-K7(tm)");
5222 		case 0:
5223 		case 2:
5224 		case 4:
5225 			return ("AMD Athlon(tm)");
5226 		case 3:
5227 		case 7:
5228 			return ("AMD Duron(tm)");
5229 		case 6:
5230 		case 8:
5231 		case 10:
5232 			/*
5233 			 * Use the L2 cache size to distinguish
5234 			 */
5235 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5236 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5237 		default:
5238 			return ("AMD (family 6)");
5239 		}
5240 	default:
5241 		break;
5242 	}
5243 
5244 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5245 	    cpi->cpi_brandid != 0) {
5246 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5247 		case 3:
5248 			return ("AMD Opteron(tm) UP 1xx");
5249 		case 4:
5250 			return ("AMD Opteron(tm) DP 2xx");
5251 		case 5:
5252 			return ("AMD Opteron(tm) MP 8xx");
5253 		default:
5254 			return ("AMD Opteron(tm)");
5255 		}
5256 	}
5257 
5258 	return (NULL);
5259 }
5260 
5261 static const char *
5262 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5263 {
5264 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5265 
5266 	switch (type) {
5267 	case X86_TYPE_CYRIX_6x86:
5268 		return ("Cyrix 6x86");
5269 	case X86_TYPE_CYRIX_6x86L:
5270 		return ("Cyrix 6x86L");
5271 	case X86_TYPE_CYRIX_6x86MX:
5272 		return ("Cyrix 6x86MX");
5273 	case X86_TYPE_CYRIX_GXm:
5274 		return ("Cyrix GXm");
5275 	case X86_TYPE_CYRIX_MediaGX:
5276 		return ("Cyrix MediaGX");
5277 	case X86_TYPE_CYRIX_MII:
5278 		return ("Cyrix M2");
5279 	case X86_TYPE_VIA_CYRIX_III:
5280 		return ("VIA Cyrix M3");
5281 	default:
5282 		/*
5283 		 * Have another wild guess ..
5284 		 */
5285 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5286 			return ("Cyrix 5x86");
5287 		else if (cpi->cpi_family == 5) {
5288 			switch (cpi->cpi_model) {
5289 			case 2:
5290 				return ("Cyrix 6x86");	/* Cyrix M1 */
5291 			case 4:
5292 				return ("Cyrix MediaGX");
5293 			default:
5294 				break;
5295 			}
5296 		} else if (cpi->cpi_family == 6) {
5297 			switch (cpi->cpi_model) {
5298 			case 0:
5299 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5300 			case 5:
5301 			case 6:
5302 			case 7:
5303 			case 8:
5304 			case 9:
5305 				return ("VIA C3");
5306 			default:
5307 				break;
5308 			}
5309 		}
5310 		break;
5311 	}
5312 	return (NULL);
5313 }
5314 
5315 /*
5316  * This only gets called in the case that the CPU extended
5317  * feature brand string (0x80000002, 0x80000003, 0x80000004)
5318  * aren't available, or contain null bytes for some reason.
5319  */
5320 static void
5321 fabricate_brandstr(struct cpuid_info *cpi)
5322 {
5323 	const char *brand = NULL;
5324 
5325 	switch (cpi->cpi_vendor) {
5326 	case X86_VENDOR_Intel:
5327 		brand = intel_cpubrand(cpi);
5328 		break;
5329 	case X86_VENDOR_AMD:
5330 		brand = amd_cpubrand(cpi);
5331 		break;
5332 	case X86_VENDOR_Cyrix:
5333 		brand = cyrix_cpubrand(cpi, x86_type);
5334 		break;
5335 	case X86_VENDOR_NexGen:
5336 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5337 			brand = "NexGen Nx586";
5338 		break;
5339 	case X86_VENDOR_Centaur:
5340 		if (cpi->cpi_family == 5)
5341 			switch (cpi->cpi_model) {
5342 			case 4:
5343 				brand = "Centaur C6";
5344 				break;
5345 			case 8:
5346 				brand = "Centaur C2";
5347 				break;
5348 			case 9:
5349 				brand = "Centaur C3";
5350 				break;
5351 			default:
5352 				break;
5353 			}
5354 		break;
5355 	case X86_VENDOR_Rise:
5356 		if (cpi->cpi_family == 5 &&
5357 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5358 			brand = "Rise mP6";
5359 		break;
5360 	case X86_VENDOR_SiS:
5361 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5362 			brand = "SiS 55x";
5363 		break;
5364 	case X86_VENDOR_TM:
5365 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5366 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5367 		break;
5368 	case X86_VENDOR_NSC:
5369 	case X86_VENDOR_UMC:
5370 	default:
5371 		break;
5372 	}
5373 	if (brand) {
5374 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5375 		return;
5376 	}
5377 
5378 	/*
5379 	 * If all else fails ...
5380 	 */
5381 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5382 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5383 	    cpi->cpi_model, cpi->cpi_step);
5384 }
5385 
5386 /*
5387  * This routine is called just after kernel memory allocation
5388  * becomes available on cpu0, and as part of mp_startup() on
5389  * the other cpus.
5390  *
5391  * Fixup the brand string, and collect any information from cpuid
5392  * that requires dynamically allocated storage to represent.
5393  */
5394 
5395 static void
5396 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5397 {
5398 	int	i, max, shft, level, size;
5399 	struct cpuid_regs regs;
5400 	struct cpuid_regs *cp;
5401 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5402 
5403 	/*
5404 	 * Deterministic cache parameters
5405 	 *
5406 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5407 	 * values that are present are currently defined to be the same. This
5408 	 * means we can use the same logic to parse it as long as we use the
5409 	 * appropriate leaf to get the data. If you're updating this, make sure
5410 	 * you're careful about which vendor supports which aspect.
5411 	 *
5412 	 * Take this opportunity to detect the number of threads sharing the
5413 	 * last level cache, and construct a corresponding cache id. The
5414 	 * respective cpuid_info members are initialized to the default case of
5415 	 * "no last level cache sharing".
5416 	 */
5417 	cpi->cpi_ncpu_shr_last_cache = 1;
5418 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5419 
5420 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5421 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5422 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5423 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5424 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5425 		uint32_t leaf;
5426 
5427 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5428 			leaf = 4;
5429 		} else {
5430 			leaf = CPUID_LEAF_EXT_1d;
5431 		}
5432 
5433 		/*
5434 		 * Find the # of elements (size) returned by the leaf and along
5435 		 * the way detect last level cache sharing details.
5436 		 */
5437 		bzero(&regs, sizeof (regs));
5438 		cp = &regs;
5439 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5440 			cp->cp_eax = leaf;
5441 			cp->cp_ecx = i;
5442 
5443 			(void) __cpuid_insn(cp);
5444 
5445 			if (CPI_CACHE_TYPE(cp) == 0)
5446 				break;
5447 			level = CPI_CACHE_LVL(cp);
5448 			if (level > max) {
5449 				max = level;
5450 				cpi->cpi_ncpu_shr_last_cache =
5451 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5452 			}
5453 		}
5454 		cpi->cpi_cache_leaf_size = size = i;
5455 
5456 		/*
5457 		 * Allocate the cpi_cache_leaves array. The first element
5458 		 * references the regs for the corresponding leaf with %ecx set
5459 		 * to 0. This was gathered in cpuid_pass_extended().
5460 		 */
5461 		if (size > 0) {
5462 			cpi->cpi_cache_leaves =
5463 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5464 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5465 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5466 			} else {
5467 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5468 			}
5469 
5470 			/*
5471 			 * Allocate storage to hold the additional regs
5472 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5473 			 *
5474 			 * The regs for the leaf, %ecx == 0 has already
5475 			 * been allocated as indicated above.
5476 			 */
5477 			for (i = 1; i < size; i++) {
5478 				cp = cpi->cpi_cache_leaves[i] =
5479 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5480 				cp->cp_eax = leaf;
5481 				cp->cp_ecx = i;
5482 
5483 				(void) __cpuid_insn(cp);
5484 			}
5485 		}
5486 		/*
5487 		 * Determine the number of bits needed to represent
5488 		 * the number of CPUs sharing the last level cache.
5489 		 *
5490 		 * Shift off that number of bits from the APIC id to
5491 		 * derive the cache id.
5492 		 */
5493 		shft = 0;
5494 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5495 			shft++;
5496 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5497 	}
5498 
5499 	/*
5500 	 * Now fixup the brand string
5501 	 */
5502 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5503 		fabricate_brandstr(cpi);
5504 	} else {
5505 
5506 		/*
5507 		 * If we successfully extracted a brand string from the cpuid
5508 		 * instruction, clean it up by removing leading spaces and
5509 		 * similar junk.
5510 		 */
5511 		if (cpi->cpi_brandstr[0]) {
5512 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5513 			char *src, *dst;
5514 
5515 			dst = src = (char *)cpi->cpi_brandstr;
5516 			src[maxlen - 1] = '\0';
5517 			/*
5518 			 * strip leading spaces
5519 			 */
5520 			while (*src == ' ')
5521 				src++;
5522 			/*
5523 			 * Remove any 'Genuine' or "Authentic" prefixes
5524 			 */
5525 			if (strncmp(src, "Genuine ", 8) == 0)
5526 				src += 8;
5527 			if (strncmp(src, "Authentic ", 10) == 0)
5528 				src += 10;
5529 
5530 			/*
5531 			 * Now do an in-place copy.
5532 			 * Map (R) to (r) and (TM) to (tm).
5533 			 * The era of teletypes is long gone, and there's
5534 			 * -really- no need to shout.
5535 			 */
5536 			while (*src != '\0') {
5537 				if (src[0] == '(') {
5538 					if (strncmp(src + 1, "R)", 2) == 0) {
5539 						(void) strncpy(dst, "(r)", 3);
5540 						src += 3;
5541 						dst += 3;
5542 						continue;
5543 					}
5544 					if (strncmp(src + 1, "TM)", 3) == 0) {
5545 						(void) strncpy(dst, "(tm)", 4);
5546 						src += 4;
5547 						dst += 4;
5548 						continue;
5549 					}
5550 				}
5551 				*dst++ = *src++;
5552 			}
5553 			*dst = '\0';
5554 
5555 			/*
5556 			 * Finally, remove any trailing spaces
5557 			 */
5558 			while (--dst > cpi->cpi_brandstr)
5559 				if (*dst == ' ')
5560 					*dst = '\0';
5561 				else
5562 					break;
5563 		} else
5564 			fabricate_brandstr(cpi);
5565 	}
5566 }
5567 
/*
 * One entry in a table that pairs a userland hwcap flag with the kernel x86
 * feature that enables it.
 */
typedef struct {
	uint32_t avm_av;	/* AV_386_* flag OR'd into the hwcap word */
	uint32_t avm_feat;	/* X86FSET_* value given to is_x86_feature() */
} av_feat_map_t;
5572 
5573 /*
5574  * These arrays are used to map features that we should add based on x86
5575  * features that are present. As a large number depend on kernel features,
5576  * rather than rechecking and clearing CPUID everywhere, we simply map these.
5577  * There is an array of these for each hwcap word. Some features aren't tracked
5578  * in the kernel x86 featureset and that's ok. They will not show up in here.
5579  */
5580 static const av_feat_map_t x86fset_to_av1[] = {
5581 	{ AV_386_CX8, X86FSET_CX8 },
5582 	{ AV_386_SEP, X86FSET_SEP },
5583 	{ AV_386_AMD_SYSC, X86FSET_ASYSC },
5584 	{ AV_386_CMOV, X86FSET_CMOV },
5585 	{ AV_386_FXSR, X86FSET_SSE },
5586 	{ AV_386_SSE, X86FSET_SSE },
5587 	{ AV_386_SSE2, X86FSET_SSE2 },
5588 	{ AV_386_SSE3, X86FSET_SSE3 },
5589 	{ AV_386_CX16, X86FSET_CX16 },
5590 	{ AV_386_TSCP, X86FSET_TSCP },
5591 	{ AV_386_AMD_SSE4A, X86FSET_SSE4A },
5592 	{ AV_386_SSSE3, X86FSET_SSSE3 },
5593 	{ AV_386_SSE4_1, X86FSET_SSE4_1 },
5594 	{ AV_386_SSE4_2, X86FSET_SSE4_2 },
5595 	{ AV_386_AES, X86FSET_AES },
5596 	{ AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5597 	{ AV_386_XSAVE, X86FSET_XSAVE },
5598 	{ AV_386_AVX, X86FSET_AVX },
5599 	{ AV_386_VMX, X86FSET_VMX },
5600 	{ AV_386_AMD_SVM, X86FSET_SVM }
5601 };
5602 
5603 static const av_feat_map_t x86fset_to_av2[] = {
5604 	{ AV_386_2_F16C, X86FSET_F16C },
5605 	{ AV_386_2_RDRAND, X86FSET_RDRAND },
5606 	{ AV_386_2_BMI1, X86FSET_BMI1 },
5607 	{ AV_386_2_BMI2, X86FSET_BMI2 },
5608 	{ AV_386_2_FMA, X86FSET_FMA },
5609 	{ AV_386_2_AVX2, X86FSET_AVX2 },
5610 	{ AV_386_2_ADX, X86FSET_ADX },
5611 	{ AV_386_2_RDSEED, X86FSET_RDSEED },
5612 	{ AV_386_2_AVX512F, X86FSET_AVX512F },
5613 	{ AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5614 	{ AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5615 	{ AV_386_2_AVX512PF, X86FSET_AVX512PF },
5616 	{ AV_386_2_AVX512ER, X86FSET_AVX512ER },
5617 	{ AV_386_2_AVX512CD, X86FSET_AVX512CD },
5618 	{ AV_386_2_AVX512BW, X86FSET_AVX512BW },
5619 	{ AV_386_2_AVX512VL, X86FSET_AVX512VL },
5620 	{ AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5621 	{ AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5622 	{ AV_386_2_SHA, X86FSET_SHA },
5623 	{ AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5624 	{ AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5625 	{ AV_386_2_CLWB, X86FSET_CLWB },
5626 	{ AV_386_2_MONITORX, X86FSET_MONITORX },
5627 	{ AV_386_2_CLZERO, X86FSET_CLZERO },
5628 	{ AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5629 	{ AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5630 	{ AV_386_2_VAES, X86FSET_VAES },
5631 	{ AV_386_2_GFNI, X86FSET_GFNI },
5632 	{ AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5633 	{ AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5634 };
5635 
5636 static const av_feat_map_t x86fset_to_av3[] = {
5637 	{ AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5638 	{ AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5639 };
5640 
5641 /*
5642  * This routine is called out of bind_hwcap() much later in the life
5643  * of the kernel (post_startup()).  The job of this routine is to resolve
5644  * the hardware feature support and kernel support for those features into
5645  * what we're actually going to tell applications via the aux vector.
5646  *
5647  * Most of the aux vector is derived from the x86_featureset array vector where
5648  * a given feature indicates that an aux vector should be plumbed through. This
5649  * allows the kernel to use one tracking mechanism for these based on whether or
5650  * not it has the required hardware support (most often xsave). Most newer
5651  * features are added there in case we need them in the kernel. Otherwise,
5652  * features are evaluated based on looking at the cpuid features that remain. If
5653  * you find yourself wanting to clear out cpuid features for some reason, they
5654  * should instead be driven by the feature set so we have a consistent view.
5655  */
5656 
5657 static void
5658 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5659 {
5660 	uint_t *hwcap_out = (uint_t *)arg;
5661 	struct cpuid_info *cpi;
5662 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5663 
5664 	cpi = cpu->cpu_m.mcpu_cpi;
5665 
5666 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5667 		if (is_x86_feature(x86_featureset,
5668 		    x86fset_to_av1[i].avm_feat)) {
5669 			hwcap_flags |= x86fset_to_av1[i].avm_av;
5670 		}
5671 	}
5672 
5673 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5674 		if (is_x86_feature(x86_featureset,
5675 		    x86fset_to_av2[i].avm_feat)) {
5676 			hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5677 		}
5678 	}
5679 
5680 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5681 		if (is_x86_feature(x86_featureset,
5682 		    x86fset_to_av3[i].avm_feat)) {
5683 			hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5684 		}
5685 	}
5686 
5687 	/*
5688 	 * From here on out we're working through features that don't have
5689 	 * corresponding kernel feature flags for various reasons that are
5690 	 * mostly just due to the historical implementation.
5691 	 */
5692 	if (cpi->cpi_maxeax >= 1) {
5693 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5694 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5695 
5696 		*edx = CPI_FEATURES_EDX(cpi);
5697 		*ecx = CPI_FEATURES_ECX(cpi);
5698 
5699 		/*
5700 		 * [no explicit support required beyond x87 fp context]
5701 		 */
5702 		if (!fpu_exists)
5703 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5704 
5705 		/*
5706 		 * Now map the supported feature vector to things that we
5707 		 * think userland will care about.
5708 		 */
5709 		if (*ecx & CPUID_INTC_ECX_MOVBE)
5710 			hwcap_flags |= AV_386_MOVBE;
5711 
5712 		if (*ecx & CPUID_INTC_ECX_POPCNT)
5713 			hwcap_flags |= AV_386_POPCNT;
5714 		if (*edx & CPUID_INTC_EDX_FPU)
5715 			hwcap_flags |= AV_386_FPU;
5716 		if (*edx & CPUID_INTC_EDX_MMX)
5717 			hwcap_flags |= AV_386_MMX;
5718 		if (*edx & CPUID_INTC_EDX_TSC)
5719 			hwcap_flags |= AV_386_TSC;
5720 	}
5721 
5722 	/*
5723 	 * Check a few miscellaneous features.
5724 	 */
5725 	if (cpi->cpi_xmaxeax < 0x80000001)
5726 		goto resolve_done;
5727 
5728 	switch (cpi->cpi_vendor) {
5729 		uint32_t *edx, *ecx;
5730 
5731 	case X86_VENDOR_Intel:
5732 		/*
5733 		 * Seems like Intel duplicated what we necessary
5734 		 * here to make the initial crop of 64-bit OS's work.
5735 		 * Hopefully, those are the only "extended" bits
5736 		 * they'll add.
5737 		 */
5738 		/*FALLTHROUGH*/
5739 
5740 	case X86_VENDOR_AMD:
5741 	case X86_VENDOR_HYGON:
5742 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5743 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5744 
5745 		*edx = CPI_FEATURES_XTD_EDX(cpi);
5746 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5747 
5748 		/*
5749 		 * [no explicit support required beyond
5750 		 * x87 fp context and exception handlers]
5751 		 */
5752 		if (!fpu_exists)
5753 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
5754 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5755 
5756 		/*
5757 		 * Now map the supported feature vector to
5758 		 * things that we think userland will care about.
5759 		 */
5760 		if (*edx & CPUID_AMD_EDX_MMXamd)
5761 			hwcap_flags |= AV_386_AMD_MMX;
5762 		if (*edx & CPUID_AMD_EDX_3DNow)
5763 			hwcap_flags |= AV_386_AMD_3DNow;
5764 		if (*edx & CPUID_AMD_EDX_3DNowx)
5765 			hwcap_flags |= AV_386_AMD_3DNowx;
5766 
5767 		switch (cpi->cpi_vendor) {
5768 		case X86_VENDOR_AMD:
5769 		case X86_VENDOR_HYGON:
5770 			if (*ecx & CPUID_AMD_ECX_AHF64)
5771 				hwcap_flags |= AV_386_AHF;
5772 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5773 				hwcap_flags |= AV_386_AMD_LZCNT;
5774 			break;
5775 
5776 		case X86_VENDOR_Intel:
5777 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5778 				hwcap_flags |= AV_386_AMD_LZCNT;
5779 			/*
5780 			 * Aarrgh.
5781 			 * Intel uses a different bit in the same word.
5782 			 */
5783 			if (*ecx & CPUID_INTC_ECX_AHF64)
5784 				hwcap_flags |= AV_386_AHF;
5785 			break;
5786 		default:
5787 			break;
5788 		}
5789 		break;
5790 
5791 	default:
5792 		break;
5793 	}
5794 
5795 resolve_done:
5796 	if (hwcap_out != NULL) {
5797 		hwcap_out[0] = hwcap_flags;
5798 		hwcap_out[1] = hwcap_flags_2;
5799 		hwcap_out[2] = hwcap_flags_3;
5800 	}
5801 }
5802 
5803 
5804 /*
5805  * Simulate the cpuid instruction using the data we previously
5806  * captured about this CPU.  We try our best to return the truth
5807  * about the hardware, independently of kernel support.
5808  */
5809 uint32_t
5810 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5811 {
5812 	struct cpuid_info *cpi;
5813 	struct cpuid_regs *xcp;
5814 
5815 	if (cpu == NULL)
5816 		cpu = CPU;
5817 	cpi = cpu->cpu_m.mcpu_cpi;
5818 
5819 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5820 
5821 	/*
5822 	 * CPUID data is cached in two separate places: cpi_std for standard
5823 	 * CPUID leaves , and cpi_extd for extended CPUID leaves.
5824 	 */
5825 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5826 		xcp = &cpi->cpi_std[cp->cp_eax];
5827 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5828 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
5829 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5830 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5831 	} else {
5832 		/*
5833 		 * The caller is asking for data from an input parameter which
5834 		 * the kernel has not cached.  In this case we go fetch from
5835 		 * the hardware and return the data directly to the user.
5836 		 */
5837 		return (__cpuid_insn(cp));
5838 	}
5839 
5840 	cp->cp_eax = xcp->cp_eax;
5841 	cp->cp_ebx = xcp->cp_ebx;
5842 	cp->cp_ecx = xcp->cp_ecx;
5843 	cp->cp_edx = xcp->cp_edx;
5844 	return (cp->cp_eax);
5845 }
5846 
5847 boolean_t
5848 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
5849 {
5850 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5851 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5852 }
5853 
5854 int
5855 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5856 {
5857 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5858 
5859 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5860 }
5861 
5862 int
5863 cpuid_is_cmt(cpu_t *cpu)
5864 {
5865 	if (cpu == NULL)
5866 		cpu = CPU;
5867 
5868 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5869 
5870 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5871 }
5872 
5873 /*
5874  * AMD and Intel both implement the 64-bit variant of the syscall
5875  * instruction (syscallq), so if there's -any- support for syscall,
5876  * cpuid currently says "yes, we support this".
5877  *
5878  * However, Intel decided to -not- implement the 32-bit variant of the
5879  * syscall instruction, so we provide a predicate to allow our caller
5880  * to test that subtlety here.
5881  *
5882  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
5883  *	even in the case where the hardware would in fact support it.
5884  */
5885 /*ARGSUSED*/
5886 int
5887 cpuid_syscall32_insn(cpu_t *cpu)
5888 {
5889 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
5890 
5891 #if !defined(__xpv)
5892 	if (cpu == NULL)
5893 		cpu = CPU;
5894 
5895 	/*CSTYLED*/
5896 	{
5897 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5898 
5899 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5900 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5901 		    cpi->cpi_xmaxeax >= 0x80000001 &&
5902 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5903 			return (1);
5904 	}
5905 #endif
5906 	return (0);
5907 }
5908 
5909 int
5910 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5911 {
5912 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5913 
5914 	static const char fmt[] =
5915 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
5916 	static const char fmt_ht[] =
5917 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5918 
5919 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5920 
5921 	if (cpuid_is_cmt(cpu))
5922 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5923 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5924 		    cpi->cpi_family, cpi->cpi_model,
5925 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5926 	return (snprintf(s, n, fmt,
5927 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5928 	    cpi->cpi_family, cpi->cpi_model,
5929 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5930 }
5931 
5932 const char *
5933 cpuid_getvendorstr(cpu_t *cpu)
5934 {
5935 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5936 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5937 }
5938 
5939 uint_t
5940 cpuid_getvendor(cpu_t *cpu)
5941 {
5942 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5943 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5944 }
5945 
5946 uint_t
5947 cpuid_getfamily(cpu_t *cpu)
5948 {
5949 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5950 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
5951 }
5952 
5953 uint_t
5954 cpuid_getmodel(cpu_t *cpu)
5955 {
5956 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5957 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
5958 }
5959 
5960 uint_t
5961 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5962 {
5963 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5964 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5965 }
5966 
5967 uint_t
5968 cpuid_get_ncore_per_chip(cpu_t *cpu)
5969 {
5970 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5971 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5972 }
5973 
5974 uint_t
5975 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5976 {
5977 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5978 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5979 }
5980 
5981 id_t
5982 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5983 {
5984 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5985 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5986 }
5987 
5988 uint_t
5989 cpuid_getstep(cpu_t *cpu)
5990 {
5991 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5992 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
5993 }
5994 
5995 uint_t
5996 cpuid_getsig(struct cpu *cpu)
5997 {
5998 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5999 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
6000 }
6001 
6002 uint32_t
6003 cpuid_getchiprev(struct cpu *cpu)
6004 {
6005 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6006 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
6007 }
6008 
6009 const char *
6010 cpuid_getchiprevstr(struct cpu *cpu)
6011 {
6012 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6013 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
6014 }
6015 
6016 uint32_t
6017 cpuid_getsockettype(struct cpu *cpu)
6018 {
6019 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6020 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
6021 }
6022 
6023 const char *
6024 cpuid_getsocketstr(cpu_t *cpu)
6025 {
6026 	static const char *socketstr = NULL;
6027 	struct cpuid_info *cpi;
6028 
6029 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6030 	cpi = cpu->cpu_m.mcpu_cpi;
6031 
6032 	/* Assume that socket types are the same across the system */
6033 	if (socketstr == NULL)
6034 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
6035 		    cpi->cpi_model, cpi->cpi_step);
6036 
6037 
6038 	return (socketstr);
6039 }
6040 
6041 x86_uarchrev_t
6042 cpuid_getuarchrev(cpu_t *cpu)
6043 {
6044 	return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
6045 }
6046 
6047 int
6048 cpuid_get_chipid(cpu_t *cpu)
6049 {
6050 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6051 
6052 	if (cpuid_is_cmt(cpu))
6053 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6054 	return (cpu->cpu_id);
6055 }
6056 
6057 id_t
6058 cpuid_get_coreid(cpu_t *cpu)
6059 {
6060 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6061 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6062 }
6063 
6064 int
6065 cpuid_get_pkgcoreid(cpu_t *cpu)
6066 {
6067 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6068 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6069 }
6070 
6071 int
6072 cpuid_get_clogid(cpu_t *cpu)
6073 {
6074 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6075 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6076 }
6077 
6078 int
6079 cpuid_get_cacheid(cpu_t *cpu)
6080 {
6081 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6082 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6083 }
6084 
6085 uint_t
6086 cpuid_get_procnodeid(cpu_t *cpu)
6087 {
6088 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6089 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6090 }
6091 
6092 uint_t
6093 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6094 {
6095 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6096 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6097 }
6098 
6099 uint_t
6100 cpuid_get_compunitid(cpu_t *cpu)
6101 {
6102 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6103 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6104 }
6105 
6106 uint_t
6107 cpuid_get_cores_per_compunit(cpu_t *cpu)
6108 {
6109 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6110 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6111 }
6112 
6113 uint32_t
6114 cpuid_get_apicid(cpu_t *cpu)
6115 {
6116 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6117 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6118 		return (UINT32_MAX);
6119 	} else {
6120 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6121 	}
6122 }
6123 
6124 void
6125 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6126 {
6127 	struct cpuid_info *cpi;
6128 
6129 	if (cpu == NULL)
6130 		cpu = CPU;
6131 	cpi = cpu->cpu_m.mcpu_cpi;
6132 
6133 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6134 
6135 	if (pabits)
6136 		*pabits = cpi->cpi_pabits;
6137 	if (vabits)
6138 		*vabits = cpi->cpi_vabits;
6139 }
6140 
6141 size_t
6142 cpuid_get_xsave_size(void)
6143 {
6144 	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6145 	    sizeof (struct xsave_state)));
6146 }
6147 
6148 /*
6149  * Export information about known offsets to the kernel. We only care about
6150  * things we have actually enabled support for in %xcr0.
6151  */
6152 void
6153 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6154 {
6155 	size_t size, off;
6156 
6157 	VERIFY3U(bit & xsave_bv_all, !=, 0);
6158 
6159 	if (sizep == NULL)
6160 		sizep = &size;
6161 	if (offp == NULL)
6162 		offp = &off;
6163 
6164 	switch (bit) {
6165 	case XFEATURE_LEGACY_FP:
6166 	case XFEATURE_SSE:
6167 		*sizep = sizeof (struct fxsave_state);
6168 		*offp = 0;
6169 		break;
6170 	case XFEATURE_AVX:
6171 		*sizep = cpuid_info0.cpi_xsave.ymm_size;
6172 		*offp = cpuid_info0.cpi_xsave.ymm_offset;
6173 		break;
6174 	case XFEATURE_AVX512_OPMASK:
6175 		*sizep = cpuid_info0.cpi_xsave.opmask_size;
6176 		*offp = cpuid_info0.cpi_xsave.opmask_offset;
6177 		break;
6178 	case XFEATURE_AVX512_ZMM:
6179 		*sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6180 		*offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6181 		break;
6182 	case XFEATURE_AVX512_HI_ZMM:
6183 		*sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6184 		*offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6185 		break;
6186 	default:
6187 		panic("asked for unsupported xsave feature: 0x%lx", bit);
6188 	}
6189 }
6190 
6191 /*
6192  * Return true if the CPUs on this system require 'pointer clearing' for the
6193  * floating point error pointer exception handling. In the past, this has been
6194  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6195  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6196  * feature bit and is reflected in the cpi_fp_amd_save member.
6197  */
6198 boolean_t
6199 cpuid_need_fp_excp_handling(void)
6200 {
6201 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6202 	    cpuid_info0.cpi_fp_amd_save != 0);
6203 }
6204 
6205 /*
6206  * Returns the number of data TLB entries for a corresponding
6207  * pagesize.  If it can't be computed, or isn't known, the
6208  * routine returns zero.  If you ask about an architecturally
6209  * impossible pagesize, the routine will panic (so that the
6210  * hat implementor knows that things are inconsistent.)
6211  */
6212 uint_t
6213 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6214 {
6215 	struct cpuid_info *cpi;
6216 	uint_t dtlb_nent = 0;
6217 
6218 	if (cpu == NULL)
6219 		cpu = CPU;
6220 	cpi = cpu->cpu_m.mcpu_cpi;
6221 
6222 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6223 
6224 	/*
6225 	 * Check the L2 TLB info
6226 	 */
6227 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6228 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6229 
6230 		switch (pagesize) {
6231 
6232 		case 4 * 1024:
6233 			/*
6234 			 * All zero in the top 16 bits of the register
6235 			 * indicates a unified TLB. Size is in low 16 bits.
6236 			 */
6237 			if ((cp->cp_ebx & 0xffff0000) == 0)
6238 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6239 			else
6240 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6241 			break;
6242 
6243 		case 2 * 1024 * 1024:
6244 			if ((cp->cp_eax & 0xffff0000) == 0)
6245 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6246 			else
6247 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6248 			break;
6249 
6250 		default:
6251 			panic("unknown L2 pagesize");
6252 			/*NOTREACHED*/
6253 		}
6254 	}
6255 
6256 	if (dtlb_nent != 0)
6257 		return (dtlb_nent);
6258 
6259 	/*
6260 	 * No L2 TLB support for this size, try L1.
6261 	 */
6262 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6263 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6264 
6265 		switch (pagesize) {
6266 		case 4 * 1024:
6267 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6268 			break;
6269 		case 2 * 1024 * 1024:
6270 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6271 			break;
6272 		default:
6273 			panic("unknown L1 d-TLB pagesize");
6274 			/*NOTREACHED*/
6275 		}
6276 	}
6277 
6278 	return (dtlb_nent);
6279 }
6280 
/*
 * Return 0 if the erratum is not present or not applicable, positive
 * if it is, and negative if the status of the erratum is unknown.
 *
 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
 * Processors" #25759, Rev 3.57, August 2005
 *
 * cpu is dereferenced unconditionally.  erratum selects the case to
 * evaluate; any value without a case below yields -1.  Most answers are
 * derived purely from the cached family and the cached leaf 1 %eax value.
 * Besides the small erratum numbers, the switch also accepts 6336786 and
 * 6671130 (NOTE(review): these look like bug-tracker ids rather than AMD
 * erratum numbers -- confirm).
 */
int
cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
{
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
	uint_t eax;

	/*
	 * Bail out if this CPU isn't an AMD CPU, or if it's
	 * a legacy (32-bit) AMD CPU.
	 */
	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
	    cpi->cpi_family == 6) {
		return (0);
	}

	/* The same value that cpuid_getsig() returns. */
	eax = cpi->cpi_std[1].cp_eax;

	/*
	 * Each macro below is true when eax equals one of a fixed set of
	 * values; the combined macros (B, CG, D0, EX) OR several of them
	 * together.  NOTE(review): the names presumably follow the silicon
	 * revision names used in the revision guide -- confirm.
	 *
	 * The macros use their argument unparenthesized, so they should
	 * only be handed a simple expression such as the local eax.  They
	 * are not #undef'd within this function.
	 */
#define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
#define	SH_B3(eax)	(eax == 0xf51)
#define	B(eax)		(SH_B0(eax) || SH_B3(eax))

#define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)

#define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
#define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
#define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
#define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))

#define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
#define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
#define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
#define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))

#define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
#define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
#define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
#define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
#define	BH_E4(eax)	(eax == 0x20fb1)
#define	SH_E5(eax)	(eax == 0x20f42)
#define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
#define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
#define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
			    DH_E6(eax) || JH_E6(eax))

	/* DR_B3 is defined here but no case below refers to it. */
#define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
#define	DR_B0(eax)	(eax == 0x100f20)
#define	DR_B1(eax)	(eax == 0x100f21)
#define	DR_BA(eax)	(eax == 0x100f2a)
#define	DR_B2(eax)	(eax == 0x100f22)
#define	DR_B3(eax)	(eax == 0x100f23)
#define	RB_C0(eax)	(eax == 0x100f40)

	switch (erratum) {
	case 1:
		return (cpi->cpi_family < 0x10);
	case 51:	/* what does the asterisk mean? */
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 52:
		return (B(eax));
	case 57:
		return (cpi->cpi_family <= 0x11);
	case 58:
		return (B(eax));
	case 60:
		return (cpi->cpi_family <= 0x11);
	case 61:
	case 62:
	case 63:
	case 64:
	case 65:
	case 66:
	case 68:
	case 69:
	case 70:
	case 71:
		return (B(eax));
	case 72:
		return (SH_B0(eax));
	case 74:
		return (B(eax));
	case 75:
		return (cpi->cpi_family < 0x10);
	case 76:
		return (B(eax));
	case 77:
		return (cpi->cpi_family <= 0x11);
	case 78:
		return (B(eax) || SH_C0(eax));
	case 79:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
	case 80:
	case 81:
	case 82:
		return (B(eax));
	case 83:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 85:
		return (cpi->cpi_family < 0x10);
	case 86:
		return (SH_C0(eax) || CG(eax));
	case 88:
		return (B(eax) || SH_C0(eax));
	case 89:
		return (cpi->cpi_family < 0x10);
	case 90:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 91:
	case 92:
		return (B(eax) || SH_C0(eax));
	case 93:
		return (SH_C0(eax));
	case 94:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 95:
		return (B(eax) || SH_C0(eax));
	case 96:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 97:
	case 98:
		return (SH_C0(eax) || CG(eax));
	case 99:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 100:
		return (B(eax) || SH_C0(eax));
	case 101:
	case 103:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 104:
		return (SH_C0(eax) || CG(eax) || D0(eax));
	case 105:
	case 106:
	case 107:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 108:
		return (DH_CG(eax));
	case 109:
		return (SH_C0(eax) || CG(eax) || D0(eax));
	case 110:
		return (D0(eax) || EX(eax));
	case 111:
		return (CG(eax));
	case 112:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
	case 113:
		return (eax == 0x20fc0);
	case 114:
		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
	case 115:
		return (SH_E0(eax) || JH_E1(eax));
	case 116:
		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
	case 117:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 118:
		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
		    JH_E6(eax));
	case 121:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
	case 122:
		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
	case 123:
		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
	case 131:
		return (cpi->cpi_family < 0x10);
	case 6336786:

		/*
		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
		 * if this is a K8 family or newer processor. We're testing for
		 * this 'erratum' to determine whether or not we have a constant
		 * TSC.
		 *
		 * Our current fix for this is to disable the C1-Clock ramping.
		 * However, this doesn't work on newer processor families nor
		 * does it work when virtualized as those devices don't exist.
		 */
		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
			return (0);
		}

		/*
		 * This path queries leaf 0x80000007 from the hardware
		 * directly rather than from the cached copy, and reports
		 * the erratum as present when bit 8 (0x100) of %edx is
		 * clear.
		 */
		if (CPI_FAMILY(cpi) == 0xf) {
			struct cpuid_regs regs;
			regs.cp_eax = 0x80000007;
			(void) __cpuid_insn(&regs);
			return (!(regs.cp_edx & 0x100));
		}
		return (0);
	case 147:
		/*
		 * This erratum (K8 #147) is not present on family 10 and newer.
		 */
		if (cpi->cpi_family >= 0x10) {
			return (0);
		}
		/*
		 * Repack eax into a single comparable number: bits 27:20
		 * (moved down to 15:8) are added to bits 11:8, and the low
		 * byte is built from bits 19:16 (as the high nibble) and
		 * bits 7:4 (as the low nibble).  The erratum is reported
		 * when that number is below 0xf40.
		 */
		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);

	case 6671130:
		/*
		 * check for processors (pre-Shanghai) that do not provide
		 * optimal management of 1gb ptes in its tlb.
		 */
		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);

	case 298:
		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
		    DR_B2(eax) || RB_C0(eax));

	case 721:
		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);

	default:
		/* Nothing is known about this erratum number. */
		return (-1);

	}
}
6506 
6507 /*
6508  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6509  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6510  */
6511 int
6512 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6513 {
6514 	struct cpuid_info	*cpi;
6515 	uint_t			osvwid;
6516 	static int		osvwfeature = -1;
6517 	uint64_t		osvwlength;
6518 
6519 
6520 	cpi = cpu->cpu_m.mcpu_cpi;
6521 
6522 	/* confirm OSVW supported */
6523 	if (osvwfeature == -1) {
6524 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6525 	} else {
6526 		/* assert that osvw feature setting is consistent on all cpus */
6527 		ASSERT(osvwfeature ==
6528 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6529 	}
6530 	if (!osvwfeature)
6531 		return (-1);
6532 
6533 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6534 
6535 	switch (erratum) {
6536 	case 298:	/* osvwid is 0 */
6537 		osvwid = 0;
6538 		if (osvwlength <= (uint64_t)osvwid) {
6539 			/* osvwid 0 is unknown */
6540 			return (-1);
6541 		}
6542 
6543 		/*
6544 		 * Check the OSVW STATUS MSR to determine the state
6545 		 * of the erratum where:
6546 		 *   0 - fixed by HW
6547 		 *   1 - BIOS has applied the workaround when BIOS
6548 		 *   workaround is available. (Or for other errata,
6549 		 *   OS workaround is required.)
6550 		 * For a value of 1, caller will confirm that the
6551 		 * erratum 298 workaround has indeed been applied by BIOS.
6552 		 *
6553 		 * A 1 may be set in cpus that have a HW fix
6554 		 * in a mixed cpu system. Regarding erratum 298:
6555 		 *   In a multiprocessor platform, the workaround above
6556 		 *   should be applied to all processors regardless of
6557 		 *   silicon revision when an affected processor is
6558 		 *   present.
6559 		 */
6560 
6561 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6562 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6563 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6564 
6565 	default:
6566 		return (-1);
6567 	}
6568 }
6569 
/*
 * Suffixes that add_cache_prop() callers pass as the 'type' half of a
 * "<label>-<type>" property name.
 */
static const char assoc_str[] = "associativity";
static const char line_str[] = "line-size";
static const char size_str[] = "size";
6573 
6574 static void
6575 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6576     uint32_t val)
6577 {
6578 	char buf[128];
6579 
6580 	/*
6581 	 * ndi_prop_update_int() is used because it is desirable for
6582 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6583 	 */
6584 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6585 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6586 }
6587 
/*
 * Intel-style cache/tlb description
 *
 * Standard cpuid level 2 gives a randomly ordered
 * selection of tags that index into a table that describes
 * cache and tlb properties.
 *
 * The strings below are the labels used by the entries of that table
 * (the ct_label member of struct cachetab).
 */

static const char l1_icache_str[] = "l1-icache";
static const char l1_dcache_str[] = "l1-dcache";
static const char l2_cache_str[] = "l2-cache";
static const char l3_cache_str[] = "l3-cache";
static const char itlb4k_str[] = "itlb-4K";
static const char dtlb4k_str[] = "dtlb-4K";
static const char itlb2M_str[] = "itlb-2M";
static const char itlb4M_str[] = "itlb-4M";
static const char dtlb4M_str[] = "dtlb-4M";
static const char dtlb24_str[] = "dtlb0-2M-4M";
static const char itlb424_str[] = "itlb-4K-2M-4M";
static const char itlb24_str[] = "itlb-2M-4M";
static const char dtlb44_str[] = "dtlb-4K-4M";
static const char sl1_dcache_str[] = "sectored-l1-dcache";
static const char sl2_cache_str[] = "sectored-l2-cache";
static const char itrace_str[] = "itrace-cache";
static const char sl3_cache_str[] = "sectored-l3-cache";
static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6614 
/*
 * One row per cpuid leaf 2 descriptor byte.  find_cacheent() matches a
 * descriptor against ct_code, and a table is terminated by an entry whose
 * ct_code is 0.
 *
 * NOTE(review): judging by the values, ct_size appears to be a byte count
 * for cache entries and an entry count for TLB entries, and a ct_assoc of
 * 0 appears to be used where no N-way figure applies -- confirm against
 * the consumers of this table.
 */
static const struct cachetab {
	uint8_t		ct_code;	/* descriptor byte */
	uint8_t		ct_assoc;	/* associativity */
	uint16_t	ct_line_size;	/* line size; 0 for the TLB rows */
	size_t		ct_size;	/* size (see note above) */
	const char	*ct_label;	/* one of the *_str labels */
} intel_ctab[] = {
	/*
	 * maintain descending order!
	 *
	 * find_cacheent() stops at the first entry whose code is <= the
	 * one it is looking for, so an out-of-order row is unreachable.
	 *
	 * Codes ignored - Reason
	 * ----------------------
	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
	 * f0H/f1H - Currently we do not interpret prefetch size by design
	 */
	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
	{ 0xc0, 4, 0, 8, dtlb44_str },
	{ 0xba, 4, 0, 64, dtlb4k_str },
	{ 0xb4, 4, 0, 256, dtlb4k_str },
	{ 0xb3, 4, 0, 128, dtlb4k_str },
	{ 0xb2, 4, 0, 64, itlb4k_str },
	{ 0xb0, 4, 0, 128, itlb4k_str },
	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
	{ 0x86, 4, 64, 512*1024, l2_cache_str},
	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
	{ 0x83, 8, 32, 512*1024, l2_cache_str},
	{ 0x82, 8, 32, 256*1024, l2_cache_str},
	{ 0x80, 8, 64, 512*1024, l2_cache_str},
	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
	/*
	 * NOTE(review): the Intel SDM descriptor table, as I recall it,
	 * lists 78H as 4-way set associative rather than 8-way -- confirm.
	 */
	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
	{ 0x73, 8, 0, 64*1024, itrace_str},
	{ 0x72, 8, 0, 32*1024, itrace_str},
	{ 0x71, 8, 0, 16*1024, itrace_str},
	{ 0x70, 8, 0, 12*1024, itrace_str},
	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
	{ 0x5d, 0, 0, 256, dtlb44_str},
	{ 0x5c, 0, 0, 128, dtlb44_str},
	{ 0x5b, 0, 0, 64, dtlb44_str},
	{ 0x5a, 4, 0, 32, dtlb24_str},
	{ 0x59, 0, 0, 16, dtlb4k_str},
	{ 0x57, 4, 0, 16, dtlb4k_str},
	{ 0x56, 4, 0, 16, dtlb4M_str},
	{ 0x55, 0, 0, 7, itlb24_str},
	{ 0x52, 0, 0, 256, itlb424_str},
	{ 0x51, 0, 0, 128, itlb424_str},
	{ 0x50, 0, 0, 64, itlb424_str},
	{ 0x4f, 0, 0, 32, itlb4k_str},
	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
	/*
	 * 0x49 is overloaded; intel_walk_cacheinfo() prefers cpuid
	 * function 4 data for it and only falls back to this row.
	 */
	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
	{ 0x43, 4, 32, 512*1024, l2_cache_str},
	{ 0x42, 4, 32, 256*1024, l2_cache_str},
	{ 0x41, 4, 32, 128*1024, l2_cache_str},
	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
	{ 0x30, 8, 64, 32*1024, l1_icache_str},
	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
	/*
	 * NOTE(review): the Intel SDM descriptor table, as I recall it,
	 * gives 0DH a 64 byte line size (0CH is the 32 byte one) -- confirm.
	 */
	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
	{ 0x0b, 4, 0, 4, itlb4M_str},
	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
	{ 0x08, 4, 32, 16*1024, l1_icache_str},
	{ 0x06, 4, 32, 8*1024, l1_icache_str},
	{ 0x05, 4, 0, 32, dtlb4M_str},
	{ 0x04, 4, 0, 8, dtlb4M_str},
	{ 0x03, 4, 0, 64, dtlb4k_str},
	{ 0x02, 4, 0, 2, itlb4M_str},
	{ 0x01, 4, 0, 32, itlb4k_str},
	{ 0 }
};
6723 
6724 static const struct cachetab cyrix_ctab[] = {
6725 	{ 0x70, 4, 0, 32, "tlb-4K" },
6726 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6727 	{ 0 }
6728 };
6729 
6730 /*
6731  * Search a cache table for a matching entry
6732  */
6733 static const struct cachetab *
6734 find_cacheent(const struct cachetab *ct, uint_t code)
6735 {
6736 	if (code != 0) {
6737 		for (; ct->ct_code != 0; ct++)
6738 			if (ct->ct_code <= code)
6739 				break;
6740 		if (ct->ct_code == code)
6741 			return (ct);
6742 	}
6743 	return (NULL);
6744 }
6745 
6746 /*
6747  * Populate cachetab entry with L2 or L3 cache-information using
6748  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6749  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6750  * information is found.
6751  */
6752 static int
6753 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6754 {
6755 	uint32_t level, i;
6756 	int ret = 0;
6757 
6758 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6759 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6760 
6761 		if (level == 2 || level == 3) {
6762 			ct->ct_assoc =
6763 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6764 			ct->ct_line_size =
6765 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6766 			ct->ct_size = ct->ct_assoc *
6767 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6768 			    ct->ct_line_size *
6769 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6770 
6771 			if (level == 2) {
6772 				ct->ct_label = l2_cache_str;
6773 			} else if (level == 3) {
6774 				ct->ct_label = l3_cache_str;
6775 			}
6776 			ret = 1;
6777 		}
6778 	}
6779 
6780 	return (ret);
6781 }
6782 
6783 /*
6784  * Walk the cacheinfo descriptor, applying 'func' to every valid element
6785  * The walk is terminated if the walker returns non-zero.
6786  */
6787 static void
6788 intel_walk_cacheinfo(struct cpuid_info *cpi,
6789     void *arg, int (*func)(void *, const struct cachetab *))
6790 {
6791 	const struct cachetab *ct;
6792 	struct cachetab des_49_ct, des_b1_ct;
6793 	uint8_t *dp;
6794 	int i;
6795 
6796 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6797 		return;
6798 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6799 		/*
6800 		 * For overloaded descriptor 0x49 we use cpuid function 4
6801 		 * if supported by the current processor, to create
6802 		 * cache information.
6803 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
6804 		 * to disambiguate the cache information.
6805 		 */
6806 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6807 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6808 				ct = &des_49_ct;
6809 		} else if (*dp == 0xb1) {
6810 			des_b1_ct.ct_code = 0xb1;
6811 			des_b1_ct.ct_assoc = 4;
6812 			des_b1_ct.ct_line_size = 0;
6813 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6814 				des_b1_ct.ct_size = 8;
6815 				des_b1_ct.ct_label = itlb2M_str;
6816 			} else {
6817 				des_b1_ct.ct_size = 4;
6818 				des_b1_ct.ct_label = itlb4M_str;
6819 			}
6820 			ct = &des_b1_ct;
6821 		} else {
6822 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6823 				continue;
6824 			}
6825 		}
6826 
6827 		if (func(arg, ct) != 0) {
6828 			break;
6829 		}
6830 	}
6831 }
6832 
6833 /*
6834  * (Like the Intel one, except for Cyrix CPUs)
6835  */
6836 static void
6837 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6838     void *arg, int (*func)(void *, const struct cachetab *))
6839 {
6840 	const struct cachetab *ct;
6841 	uint8_t *dp;
6842 	int i;
6843 
6844 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6845 		return;
6846 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6847 		/*
6848 		 * Search Cyrix-specific descriptor table first ..
6849 		 */
6850 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6851 			if (func(arg, ct) != 0)
6852 				break;
6853 			continue;
6854 		}
6855 		/*
6856 		 * .. else fall back to the Intel one
6857 		 */
6858 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6859 			if (func(arg, ct) != 0)
6860 				break;
6861 			continue;
6862 		}
6863 	}
6864 }
6865 
6866 /*
6867  * A cacheinfo walker that adds associativity, line-size, and size properties
6868  * to the devinfo node it is passed as an argument.
6869  */
6870 static int
6871 add_cacheent_props(void *arg, const struct cachetab *ct)
6872 {
6873 	dev_info_t *devi = arg;
6874 
6875 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6876 	if (ct->ct_line_size != 0)
6877 		add_cache_prop(devi, ct->ct_label, line_str,
6878 		    ct->ct_line_size);
6879 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6880 	return (0);
6881 }
6882 
6883 
6884 static const char fully_assoc[] = "fully-associative?";
6885 
6886 /*
6887  * AMD style cache/tlb description
6888  *
6889  * Extended functions 5 and 6 directly describe properties of
6890  * tlbs and various cache levels.
6891  */
6892 static void
6893 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6894 {
6895 	switch (assoc) {
6896 	case 0:	/* reserved; ignore */
6897 		break;
6898 	default:
6899 		add_cache_prop(devi, label, assoc_str, assoc);
6900 		break;
6901 	case 0xff:
6902 		add_cache_prop(devi, label, fully_assoc, 1);
6903 		break;
6904 	}
6905 }
6906 
6907 static void
6908 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6909 {
6910 	if (size == 0)
6911 		return;
6912 	add_cache_prop(devi, label, size_str, size);
6913 	add_amd_assoc(devi, label, assoc);
6914 }
6915 
6916 static void
6917 add_amd_cache(dev_info_t *devi, const char *label,
6918     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6919 {
6920 	if (size == 0 || line_size == 0)
6921 		return;
6922 	add_amd_assoc(devi, label, assoc);
6923 	/*
6924 	 * Most AMD parts have a sectored cache. Multiple cache lines are
6925 	 * associated with each tag. A sector consists of all cache lines
6926 	 * associated with a tag. For example, the AMD K6-III has a sector
6927 	 * size of 2 cache lines per tag.
6928 	 */
6929 	if (lines_per_tag != 0)
6930 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6931 	add_cache_prop(devi, label, line_str, line_size);
6932 	add_cache_prop(devi, label, size_str, size * 1024);
6933 }
6934 
6935 static void
6936 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6937 {
6938 	switch (assoc) {
6939 	case 0:	/* off */
6940 		break;
6941 	case 1:
6942 	case 2:
6943 	case 4:
6944 		add_cache_prop(devi, label, assoc_str, assoc);
6945 		break;
6946 	case 6:
6947 		add_cache_prop(devi, label, assoc_str, 8);
6948 		break;
6949 	case 8:
6950 		add_cache_prop(devi, label, assoc_str, 16);
6951 		break;
6952 	case 0xf:
6953 		add_cache_prop(devi, label, fully_assoc, 1);
6954 		break;
6955 	default: /* reserved; ignore */
6956 		break;
6957 	}
6958 }
6959 
6960 static void
6961 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6962 {
6963 	if (size == 0 || assoc == 0)
6964 		return;
6965 	add_amd_l2_assoc(devi, label, assoc);
6966 	add_cache_prop(devi, label, size_str, size);
6967 }
6968 
6969 static void
6970 add_amd_l2_cache(dev_info_t *devi, const char *label,
6971     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6972 {
6973 	if (size == 0 || assoc == 0 || line_size == 0)
6974 		return;
6975 	add_amd_l2_assoc(devi, label, assoc);
6976 	if (lines_per_tag != 0)
6977 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6978 	add_cache_prop(devi, label, line_str, line_size);
6979 	add_cache_prop(devi, label, size_str, size * 1024);
6980 }
6981 
6982 static void
6983 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6984 {
6985 	struct cpuid_regs *cp;
6986 
6987 	if (cpi->cpi_xmaxeax < 0x80000005)
6988 		return;
6989 	cp = &cpi->cpi_extd[5];
6990 
6991 	/*
6992 	 * 4M/2M L1 TLB configuration
6993 	 *
6994 	 * We report the size for 2M pages because AMD uses two
6995 	 * TLB entries for one 4M page.
6996 	 */
6997 	add_amd_tlb(devi, "dtlb-2M",
6998 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6999 	add_amd_tlb(devi, "itlb-2M",
7000 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
7001 
7002 	/*
7003 	 * 4K L1 TLB configuration
7004 	 */
7005 
7006 	switch (cpi->cpi_vendor) {
7007 		uint_t nentries;
7008 	case X86_VENDOR_TM:
7009 		if (cpi->cpi_family >= 5) {
7010 			/*
7011 			 * Crusoe processors have 256 TLB entries, but
7012 			 * cpuid data format constrains them to only
7013 			 * reporting 255 of them.
7014 			 */
7015 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
7016 				nentries = 256;
7017 			/*
7018 			 * Crusoe processors also have a unified TLB
7019 			 */
7020 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
7021 			    nentries);
7022 			break;
7023 		}
7024 		/*FALLTHROUGH*/
7025 	default:
7026 		add_amd_tlb(devi, itlb4k_str,
7027 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
7028 		add_amd_tlb(devi, dtlb4k_str,
7029 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
7030 		break;
7031 	}
7032 
7033 	/*
7034 	 * data L1 cache configuration
7035 	 */
7036 
7037 	add_amd_cache(devi, l1_dcache_str,
7038 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
7039 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
7040 
7041 	/*
7042 	 * code L1 cache configuration
7043 	 */
7044 
7045 	add_amd_cache(devi, l1_icache_str,
7046 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
7047 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
7048 
7049 	if (cpi->cpi_xmaxeax < 0x80000006)
7050 		return;
7051 	cp = &cpi->cpi_extd[6];
7052 
7053 	/* Check for a unified L2 TLB for large pages */
7054 
7055 	if (BITX(cp->cp_eax, 31, 16) == 0)
7056 		add_amd_l2_tlb(devi, "l2-tlb-2M",
7057 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7058 	else {
7059 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
7060 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7061 		add_amd_l2_tlb(devi, "l2-itlb-2M",
7062 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7063 	}
7064 
7065 	/* Check for a unified L2 TLB for 4K pages */
7066 
7067 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
7068 		add_amd_l2_tlb(devi, "l2-tlb-4K",
7069 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7070 	} else {
7071 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
7072 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7073 		add_amd_l2_tlb(devi, "l2-itlb-4K",
7074 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7075 	}
7076 
7077 	add_amd_l2_cache(devi, l2_cache_str,
7078 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7079 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7080 }
7081 
7082 /*
7083  * There are two basic ways that the x86 world describes it cache
7084  * and tlb architecture - Intel's way and AMD's way.
7085  *
7086  * Return which flavor of cache architecture we should use
7087  */
7088 static int
7089 x86_which_cacheinfo(struct cpuid_info *cpi)
7090 {
7091 	switch (cpi->cpi_vendor) {
7092 	case X86_VENDOR_Intel:
7093 		if (cpi->cpi_maxeax >= 2)
7094 			return (X86_VENDOR_Intel);
7095 		break;
7096 	case X86_VENDOR_AMD:
7097 		/*
7098 		 * The K5 model 1 was the first part from AMD that reported
7099 		 * cache sizes via extended cpuid functions.
7100 		 */
7101 		if (cpi->cpi_family > 5 ||
7102 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7103 			return (X86_VENDOR_AMD);
7104 		break;
7105 	case X86_VENDOR_HYGON:
7106 		return (X86_VENDOR_AMD);
7107 	case X86_VENDOR_TM:
7108 		if (cpi->cpi_family >= 5)
7109 			return (X86_VENDOR_AMD);
7110 		/*FALLTHROUGH*/
7111 	default:
7112 		/*
7113 		 * If they have extended CPU data for 0x80000005
7114 		 * then we assume they have AMD-format cache
7115 		 * information.
7116 		 *
7117 		 * If not, and the vendor happens to be Cyrix,
7118 		 * then try our-Cyrix specific handler.
7119 		 *
7120 		 * If we're not Cyrix, then assume we're using Intel's
7121 		 * table-driven format instead.
7122 		 */
7123 		if (cpi->cpi_xmaxeax >= 0x80000005)
7124 			return (X86_VENDOR_AMD);
7125 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7126 			return (X86_VENDOR_Cyrix);
7127 		else if (cpi->cpi_maxeax >= 2)
7128 			return (X86_VENDOR_Intel);
7129 		break;
7130 	}
7131 	return (-1);
7132 }
7133 
7134 void
7135 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7136     struct cpuid_info *cpi)
7137 {
7138 	dev_info_t *cpu_devi;
7139 	int create;
7140 
7141 	cpu_devi = (dev_info_t *)dip;
7142 
7143 	/* device_type */
7144 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7145 	    "device_type", "cpu");
7146 
7147 	/* reg */
7148 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7149 	    "reg", cpu_id);
7150 
7151 	/* cpu-mhz, and clock-frequency */
7152 	if (cpu_freq > 0) {
7153 		long long mul;
7154 
7155 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7156 		    "cpu-mhz", cpu_freq);
7157 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7158 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7159 			    "clock-frequency", (int)mul);
7160 	}
7161 
7162 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7163 
7164 	/* vendor-id */
7165 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7166 	    "vendor-id", cpi->cpi_vendorstr);
7167 
7168 	if (cpi->cpi_maxeax == 0) {
7169 		return;
7170 	}
7171 
7172 	/*
7173 	 * family, model, and step
7174 	 */
7175 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7176 	    "family", CPI_FAMILY(cpi));
7177 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7178 	    "cpu-model", CPI_MODEL(cpi));
7179 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7180 	    "stepping-id", CPI_STEP(cpi));
7181 
7182 	/* type */
7183 	switch (cpi->cpi_vendor) {
7184 	case X86_VENDOR_Intel:
7185 		create = 1;
7186 		break;
7187 	default:
7188 		create = 0;
7189 		break;
7190 	}
7191 	if (create)
7192 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7193 		    "type", CPI_TYPE(cpi));
7194 
7195 	/* ext-family */
7196 	switch (cpi->cpi_vendor) {
7197 	case X86_VENDOR_Intel:
7198 	case X86_VENDOR_AMD:
7199 		create = cpi->cpi_family >= 0xf;
7200 		break;
7201 	case X86_VENDOR_HYGON:
7202 		create = 1;
7203 		break;
7204 	default:
7205 		create = 0;
7206 		break;
7207 	}
7208 	if (create)
7209 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7210 		    "ext-family", CPI_FAMILY_XTD(cpi));
7211 
7212 	/* ext-model */
7213 	switch (cpi->cpi_vendor) {
7214 	case X86_VENDOR_Intel:
7215 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7216 		break;
7217 	case X86_VENDOR_AMD:
7218 		create = CPI_FAMILY(cpi) == 0xf;
7219 		break;
7220 	case X86_VENDOR_HYGON:
7221 		create = 1;
7222 		break;
7223 	default:
7224 		create = 0;
7225 		break;
7226 	}
7227 	if (create)
7228 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7229 		    "ext-model", CPI_MODEL_XTD(cpi));
7230 
7231 	/* generation */
7232 	switch (cpi->cpi_vendor) {
7233 	case X86_VENDOR_AMD:
7234 	case X86_VENDOR_HYGON:
7235 		/*
7236 		 * AMD K5 model 1 was the first part to support this
7237 		 */
7238 		create = cpi->cpi_xmaxeax >= 0x80000001;
7239 		break;
7240 	default:
7241 		create = 0;
7242 		break;
7243 	}
7244 	if (create)
7245 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7246 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7247 
7248 	/* brand-id */
7249 	switch (cpi->cpi_vendor) {
7250 	case X86_VENDOR_Intel:
7251 		/*
7252 		 * brand id first appeared on Pentium III Xeon model 8,
7253 		 * and Celeron model 8 processors and Opteron
7254 		 */
7255 		create = cpi->cpi_family > 6 ||
7256 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7257 		break;
7258 	case X86_VENDOR_AMD:
7259 		create = cpi->cpi_family >= 0xf;
7260 		break;
7261 	case X86_VENDOR_HYGON:
7262 		create = 1;
7263 		break;
7264 	default:
7265 		create = 0;
7266 		break;
7267 	}
7268 	if (create && cpi->cpi_brandid != 0) {
7269 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7270 		    "brand-id", cpi->cpi_brandid);
7271 	}
7272 
7273 	/* chunks, and apic-id */
7274 	switch (cpi->cpi_vendor) {
7275 		/*
7276 		 * first available on Pentium IV and Opteron (K8)
7277 		 */
7278 	case X86_VENDOR_Intel:
7279 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7280 		break;
7281 	case X86_VENDOR_AMD:
7282 		create = cpi->cpi_family >= 0xf;
7283 		break;
7284 	case X86_VENDOR_HYGON:
7285 		create = 1;
7286 		break;
7287 	default:
7288 		create = 0;
7289 		break;
7290 	}
7291 	if (create) {
7292 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7293 		    "chunks", CPI_CHUNKS(cpi));
7294 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7295 		    "apic-id", cpi->cpi_apicid);
7296 		if (cpi->cpi_chipid >= 0) {
7297 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7298 			    "chip#", cpi->cpi_chipid);
7299 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7300 			    "clog#", cpi->cpi_clogid);
7301 		}
7302 	}
7303 
7304 	/* cpuid-features */
7305 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7306 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7307 
7308 
7309 	/* cpuid-features-ecx */
7310 	switch (cpi->cpi_vendor) {
7311 	case X86_VENDOR_Intel:
7312 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7313 		break;
7314 	case X86_VENDOR_AMD:
7315 		create = cpi->cpi_family >= 0xf;
7316 		break;
7317 	case X86_VENDOR_HYGON:
7318 		create = 1;
7319 		break;
7320 	default:
7321 		create = 0;
7322 		break;
7323 	}
7324 	if (create)
7325 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7326 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7327 
7328 	/* ext-cpuid-features */
7329 	switch (cpi->cpi_vendor) {
7330 	case X86_VENDOR_Intel:
7331 	case X86_VENDOR_AMD:
7332 	case X86_VENDOR_HYGON:
7333 	case X86_VENDOR_Cyrix:
7334 	case X86_VENDOR_TM:
7335 	case X86_VENDOR_Centaur:
7336 		create = cpi->cpi_xmaxeax >= 0x80000001;
7337 		break;
7338 	default:
7339 		create = 0;
7340 		break;
7341 	}
7342 	if (create) {
7343 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7344 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7345 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7346 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7347 	}
7348 
7349 	/*
7350 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7351 	 * model 1, and Cyrix GXm.  On earlier models we try and
7352 	 * simulate something similar .. so this string should always
7353 	 * same -something- about the processor, however lame.
7354 	 */
7355 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7356 	    "brand-string", cpi->cpi_brandstr);
7357 
7358 	/*
7359 	 * Finally, cache and tlb information
7360 	 */
7361 	switch (x86_which_cacheinfo(cpi)) {
7362 	case X86_VENDOR_Intel:
7363 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7364 		break;
7365 	case X86_VENDOR_Cyrix:
7366 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7367 		break;
7368 	case X86_VENDOR_AMD:
7369 		amd_cache_info(cpi, cpu_devi);
7370 		break;
7371 	default:
7372 		break;
7373 	}
7374 }
7375 
/*
 * Request/result block used by getl2cacheinfo() and its helpers.  The three
 * pointers are the caller's optional output locations; a helper stores
 * through each one only when it is non-NULL.
 */
struct l2info {
	int *l2i_csz;		/* where to store the cache size, or NULL */
	int *l2i_lsz;		/* where to store the line size, or NULL */
	int *l2i_assoc;		/* where to store the associativity, or NULL */
	int l2i_ret;		/* set to the cache size once an L2 is found */
};
7382 
7383 /*
7384  * A cacheinfo walker that fetches the size, line-size and associativity
7385  * of the L2 cache
7386  */
7387 static int
7388 intel_l2cinfo(void *arg, const struct cachetab *ct)
7389 {
7390 	struct l2info *l2i = arg;
7391 	int *ip;
7392 
7393 	if (ct->ct_label != l2_cache_str &&
7394 	    ct->ct_label != sl2_cache_str)
7395 		return (0);	/* not an L2 -- keep walking */
7396 
7397 	if ((ip = l2i->l2i_csz) != NULL)
7398 		*ip = ct->ct_size;
7399 	if ((ip = l2i->l2i_lsz) != NULL)
7400 		*ip = ct->ct_line_size;
7401 	if ((ip = l2i->l2i_assoc) != NULL)
7402 		*ip = ct->ct_assoc;
7403 	l2i->l2i_ret = ct->ct_size;
7404 	return (1);		/* was an L2 -- terminate walk */
7405 }
7406 
7407 /*
7408  * AMD L2/L3 Cache and TLB Associativity Field Definition:
7409  *
7410  *	Unlike the associativity for the L1 cache and tlb where the 8 bit
7411  *	value is the associativity, the associativity for the L2 cache and
7412  *	tlb is encoded in the following table. The 4 bit L2 value serves as
7413  *	an index into the amd_afd[] array to determine the associativity.
7414  *	-1 is undefined. 0 is fully associative.
7415  */
7416 
7417 static int amd_afd[] =
7418 	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
7419 
7420 static void
7421 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7422 {
7423 	struct cpuid_regs *cp;
7424 	uint_t size, assoc;
7425 	int i;
7426 	int *ip;
7427 
7428 	if (cpi->cpi_xmaxeax < 0x80000006)
7429 		return;
7430 	cp = &cpi->cpi_extd[6];
7431 
7432 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7433 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7434 		uint_t cachesz = size * 1024;
7435 		assoc = amd_afd[i];
7436 
7437 		ASSERT(assoc != -1);
7438 
7439 		if ((ip = l2i->l2i_csz) != NULL)
7440 			*ip = cachesz;
7441 		if ((ip = l2i->l2i_lsz) != NULL)
7442 			*ip = BITX(cp->cp_ecx, 7, 0);
7443 		if ((ip = l2i->l2i_assoc) != NULL)
7444 			*ip = assoc;
7445 		l2i->l2i_ret = cachesz;
7446 	}
7447 }
7448 
7449 int
7450 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7451 {
7452 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7453 	struct l2info __l2info, *l2i = &__l2info;
7454 
7455 	l2i->l2i_csz = csz;
7456 	l2i->l2i_lsz = lsz;
7457 	l2i->l2i_assoc = assoc;
7458 	l2i->l2i_ret = -1;
7459 
7460 	switch (x86_which_cacheinfo(cpi)) {
7461 	case X86_VENDOR_Intel:
7462 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7463 		break;
7464 	case X86_VENDOR_Cyrix:
7465 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7466 		break;
7467 	case X86_VENDOR_AMD:
7468 		amd_l2cacheinfo(cpi, l2i);
7469 		break;
7470 	default:
7471 		break;
7472 	}
7473 	return (l2i->l2i_ret);
7474 }
7475 
7476 #if !defined(__xpv)
7477 
7478 uint32_t *
7479 cpuid_mwait_alloc(cpu_t *cpu)
7480 {
7481 	uint32_t	*ret;
7482 	size_t		mwait_size;
7483 
7484 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7485 
7486 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7487 	if (mwait_size == 0)
7488 		return (NULL);
7489 
7490 	/*
7491 	 * kmem_alloc() returns cache line size aligned data for mwait_size
7492 	 * allocations.  mwait_size is currently cache line sized.  Neither
7493 	 * of these implementation details are guarantied to be true in the
7494 	 * future.
7495 	 *
7496 	 * First try allocating mwait_size as kmem_alloc() currently returns
7497 	 * correctly aligned memory.  If kmem_alloc() does not return
7498 	 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7499 	 *
7500 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7501 	 * decide to free this memory.
7502 	 */
7503 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7504 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7505 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7506 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7507 		*ret = MWAIT_RUNNING;
7508 		return (ret);
7509 	} else {
7510 		kmem_free(ret, mwait_size);
7511 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7512 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7513 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7514 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7515 		*ret = MWAIT_RUNNING;
7516 		return (ret);
7517 	}
7518 }
7519 
7520 void
7521 cpuid_mwait_free(cpu_t *cpu)
7522 {
7523 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7524 		return;
7525 	}
7526 
7527 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7528 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7529 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7530 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7531 	}
7532 
7533 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7534 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7535 }
7536 
/*
 * Overwrite the body of tsc_read() with the implementation selected by
 * 'flag' and record the choice in tsc_type.
 *
 * Each supported flag names a code template delimited by a pair of
 * start/end symbols; the span between the two symbols is copied over
 * tsc_read.  An unrecognized flag is reported with cmn_err(CE_PANIC).
 */
void
patch_tsc_read(int flag)
{
	size_t cnt;

	switch (flag) {
	case TSC_NONE:
		/* template between _no_rdtsc_start and _no_rdtsc_end */
		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
		break;
	case TSC_RDTSC_LFENCE:
		/* template between _tsc_lfence_start and _tsc_lfence_end */
		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
		(void) memcpy((void *)tsc_read,
		    (void *)&_tsc_lfence_start, cnt);
		break;
	case TSC_TSCP:
		/* template between _tscp_start and _tscp_end */
		cnt = &_tscp_end - &_tscp_start;
		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
		break;
	default:
		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
		cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag);
		break;
	}
	/* remember which variant is now installed */
	tsc_type = flag;
}
7563 
7564 int
7565 cpuid_deep_cstates_supported(void)
7566 {
7567 	struct cpuid_info *cpi;
7568 	struct cpuid_regs regs;
7569 
7570 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7571 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7572 
7573 	cpi = CPU->cpu_m.mcpu_cpi;
7574 
7575 	switch (cpi->cpi_vendor) {
7576 	case X86_VENDOR_Intel:
7577 		if (cpi->cpi_xmaxeax < 0x80000007)
7578 			return (0);
7579 
7580 		/*
7581 		 * Does TSC run at a constant rate in all C-states?
7582 		 */
7583 		regs.cp_eax = 0x80000007;
7584 		(void) __cpuid_insn(&regs);
7585 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7586 
7587 	default:
7588 		return (0);
7589 	}
7590 }
7591 
7592 #endif	/* !__xpv */
7593 
/*
 * Apply processor workarounds that have to wait until after startup.
 * Nothing is done when built for __xpv.
 */
void
post_startup_cpu_fixups(void)
{
#ifndef __xpv
	/*
	 * Some AMD processors support C1E state. Entering this state will
	 * cause the local APIC timer to stop, which we can't deal with at
	 * this time.
	 */
	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
		on_trap_data_t otd;
		uint64_t reg;

		/*
		 * The MSR accesses are bracketed by on_trap()/no_trap();
		 * presumably this keeps a fault on an unimplemented MSR from
		 * being fatal -- NOTE(review): confirm against on_trap(9F).
		 */
		if (!on_trap(&otd, OT_DATA_ACCESS)) {
			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
			/* Disable C1E state if it is enabled by BIOS */
			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
			    AMD_ACTONCMPHALT_MASK) {
				/* clear only the ActOnCmpHalt field */
				reg &= ~(AMD_ACTONCMPHALT_MASK <<
				    AMD_ACTONCMPHALT_SHIFT);
				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
			}
		}
		no_trap();
	}
#endif	/* !__xpv */
}
7621 
7622 void
7623 enable_pcid(void)
7624 {
7625 	if (x86_use_pcid == -1)
7626 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7627 
7628 	if (x86_use_invpcid == -1) {
7629 		x86_use_invpcid = is_x86_feature(x86_featureset,
7630 		    X86FSET_INVPCID);
7631 	}
7632 
7633 	if (!x86_use_pcid)
7634 		return;
7635 
7636 	/*
7637 	 * Intel say that on setting PCIDE, it immediately starts using the PCID
7638 	 * bits; better make sure there's nothing there.
7639 	 */
7640 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7641 
7642 	setcr4(getcr4() | CR4_PCIDE);
7643 }
7644 
7645 /*
7646  * Setup necessary registers to enable XSAVE feature on this processor.
7647  * This function needs to be called early enough, so that no xsave/xrstor
7648  * ops will execute on the processor before the MSRs are properly set up.
7649  *
7650  * Current implementation has the following assumption:
7651  * - cpuid_pass_basic() is done, so that X86 features are known.
7652  * - fpu_probe() is done, so that fp_save_mech is chosen.
7653  */
7654 void
7655 xsave_setup_msr(cpu_t *cpu)
7656 {
7657 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7658 	ASSERT(fp_save_mech == FP_XSAVE);
7659 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7660 
7661 	/* Enable OSXSAVE in CR4. */
7662 	setcr4(getcr4() | CR4_OSXSAVE);
7663 	/*
7664 	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7665 	 * correct value.
7666 	 */
7667 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7668 	setup_xfem();
7669 }
7670 
7671 /*
7672  * Starting with the Westmere processor the local
7673  * APIC timer will continue running in all C-states,
7674  * including the deepest C-states.
7675  */
7676 int
7677 cpuid_arat_supported(void)
7678 {
7679 	struct cpuid_info *cpi;
7680 	struct cpuid_regs regs;
7681 
7682 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7683 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7684 
7685 	cpi = CPU->cpu_m.mcpu_cpi;
7686 
7687 	switch (cpi->cpi_vendor) {
7688 	case X86_VENDOR_Intel:
7689 		/*
7690 		 * Always-running Local APIC Timer is
7691 		 * indicated by CPUID.6.EAX[2].
7692 		 */
7693 		if (cpi->cpi_maxeax >= 6) {
7694 			regs.cp_eax = 6;
7695 			(void) cpuid_insn(NULL, &regs);
7696 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7697 		} else {
7698 			return (0);
7699 		}
7700 	default:
7701 		return (0);
7702 	}
7703 }
7704 
7705 /*
7706  * Check support for Intel ENERGY_PERF_BIAS feature
7707  */
7708 int
7709 cpuid_iepb_supported(struct cpu *cp)
7710 {
7711 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7712 	struct cpuid_regs regs;
7713 
7714 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7715 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7716 
7717 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7718 		return (0);
7719 	}
7720 
7721 	/*
7722 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
7723 	 * capability bit CPUID.6.ECX.3
7724 	 */
7725 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7726 		return (0);
7727 
7728 	regs.cp_eax = 0x6;
7729 	(void) cpuid_insn(NULL, &regs);
7730 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7731 }
7732 
7733 /*
7734  * Check support for TSC deadline timer
7735  *
7736  * TSC deadline timer provides a superior software programming
7737  * model over local APIC timer that eliminates "time drifts".
7738  * Instead of specifying a relative time, software specifies an
7739  * absolute time as the target at which the processor should
7740  * generate a timer event.
7741  */
7742 int
7743 cpuid_deadline_tsc_supported(void)
7744 {
7745 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7746 	struct cpuid_regs regs;
7747 
7748 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7749 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7750 
7751 	switch (cpi->cpi_vendor) {
7752 	case X86_VENDOR_Intel:
7753 		if (cpi->cpi_maxeax >= 1) {
7754 			regs.cp_eax = 1;
7755 			(void) cpuid_insn(NULL, &regs);
7756 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7757 		} else {
7758 			return (0);
7759 		}
7760 	default:
7761 		return (0);
7762 	}
7763 }
7764 
7765 #if !defined(__xpv)
7766 /*
7767  * Patch in versions of bcopy for high performance Intel Nhm processors
7768  * and later...
7769  */
7770 void
7771 patch_memops(uint_t vendor)
7772 {
7773 	size_t cnt, i;
7774 	caddr_t to, from;
7775 
7776 	if ((vendor == X86_VENDOR_Intel) &&
7777 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7778 		cnt = &bcopy_patch_end - &bcopy_patch_start;
7779 		to = &bcopy_ck_size;
7780 		from = &bcopy_patch_start;
7781 		for (i = 0; i < cnt; i++) {
7782 			*to++ = *from++;
7783 		}
7784 	}
7785 }
7786 #endif  /*  !__xpv */
7787 
7788 /*
7789  * We're being asked to tell the system how many bits are required to represent
7790  * the various thread and strand IDs. While it's tempting to derive this based
7791  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7792  * correct. Instead, this needs to be based on the number of bits that the APIC
7793  * allows for these different configurations. We only update these to a larger
7794  * value if we find one.
7795  */
7796 void
7797 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7798 {
7799 	struct cpuid_info *cpi;
7800 
7801 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7802 	cpi = cpu->cpu_m.mcpu_cpi;
7803 
7804 	if (cpi->cpi_ncore_bits > *core_nbits) {
7805 		*core_nbits = cpi->cpi_ncore_bits;
7806 	}
7807 
7808 	if (cpi->cpi_nthread_bits > *strand_nbits) {
7809 		*strand_nbits = cpi->cpi_nthread_bits;
7810 	}
7811 }
7812 
7813 void
7814 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7815 {
7816 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7817 	struct cpuid_regs cp;
7818 
7819 	/*
7820 	 * Reread the CPUID portions that we need for various security
7821 	 * information.
7822 	 */
7823 	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7824 		/*
7825 		 * Check if we now have leaf 7 available to us.
7826 		 */
7827 		if (cpi->cpi_maxeax < 7) {
7828 			bzero(&cp, sizeof (cp));
7829 			cp.cp_eax = 0;
7830 			cpi->cpi_maxeax = __cpuid_insn(&cp);
7831 			if (cpi->cpi_maxeax < 7)
7832 				return;
7833 		}
7834 
7835 		bzero(&cp, sizeof (cp));
7836 		cp.cp_eax = 7;
7837 		cp.cp_ecx = 0;
7838 		(void) __cpuid_insn(&cp);
7839 		cpi->cpi_std[7] = cp;
7840 	} else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
7841 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
7842 		/* No xcpuid support */
7843 		if (cpi->cpi_family < 5 ||
7844 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7845 			return;
7846 
7847 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7848 			bzero(&cp, sizeof (cp));
7849 			cp.cp_eax = CPUID_LEAF_EXT_0;
7850 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7851 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7852 				return;
7853 			}
7854 		}
7855 
7856 		/*
7857 		 * Most AMD features are in leaf 8. Automatic IBRS was added in
7858 		 * leaf 0x21. So we also check that.
7859 		 */
7860 		bzero(&cp, sizeof (cp));
7861 		cp.cp_eax = CPUID_LEAF_EXT_8;
7862 		(void) __cpuid_insn(&cp);
7863 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7864 		cpi->cpi_extd[8] = cp;
7865 
7866 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21) {
7867 			return;
7868 		}
7869 
7870 		bzero(&cp, sizeof (cp));
7871 		cp.cp_eax = CPUID_LEAF_EXT_21;
7872 		(void) __cpuid_insn(&cp);
7873 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
7874 		cpi->cpi_extd[0x21] = cp;
7875 	} else {
7876 		/*
7877 		 * Nothing to do here. Return an empty set which has already
7878 		 * been zeroed for us.
7879 		 */
7880 		return;
7881 	}
7882 	cpuid_scan_security(cpu, fset);
7883 }
7884 
7885 /* ARGSUSED */
7886 static int
7887 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7888 {
7889 	uchar_t *fset;
7890 	boolean_t first_pass = (boolean_t)arg1;
7891 
7892 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7893 	if (first_pass && CPU->cpu_id != 0)
7894 		return (0);
7895 	if (!first_pass && CPU->cpu_id == 0)
7896 		return (0);
7897 	cpuid_pass_ucode(CPU, fset);
7898 
7899 	return (0);
7900 }
7901 
7902 /*
7903  * After a microcode update where the version has changed, then we need to
7904  * rescan CPUID. To do this we check every CPU to make sure that they have the
7905  * same microcode. Then we perform a cross call to all such CPUs. It's the
7906  * caller's job to make sure that no one else can end up doing an update while
7907  * this is going on.
7908  *
7909  * We assume that the system is microcode capable if we're called.
7910  */
7911 void
7912 cpuid_post_ucodeadm(void)
7913 {
7914 	uint32_t rev;
7915 	int i;
7916 	struct cpu *cpu;
7917 	cpuset_t cpuset;
7918 	void *argdata;
7919 	uchar_t *f0;
7920 
7921 	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7922 
7923 	mutex_enter(&cpu_lock);
7924 	cpu = cpu_get(0);
7925 	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7926 	CPUSET_ONLY(cpuset, 0);
7927 	for (i = 1; i < max_ncpus; i++) {
7928 		if ((cpu = cpu_get(i)) == NULL)
7929 			continue;
7930 
7931 		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7932 			panic("post microcode update CPU %d has differing "
7933 			    "microcode revision (%u) from CPU 0 (%u)",
7934 			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7935 		}
7936 		CPUSET_ADD(cpuset, i);
7937 	}
7938 
7939 	/*
7940 	 * We do the cross calls in two passes. The first pass is only for the
7941 	 * boot CPU. The second pass is for all of the other CPUs. This allows
7942 	 * the boot CPU to go through and change behavior related to patching or
7943 	 * whether or not Enhanced IBRS needs to be enabled and then allow all
7944 	 * other CPUs to follow suit.
7945 	 */
7946 	kpreempt_disable();
7947 	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7948 	    cpuid_post_ucodeadm_xc);
7949 	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7950 	    cpuid_post_ucodeadm_xc);
7951 	kpreempt_enable();
7952 
7953 	/*
7954 	 * OK, now look at each CPU and see if their feature sets are equal.
7955 	 */
7956 	f0 = argdata;
7957 	for (i = 1; i < max_ncpus; i++) {
7958 		uchar_t *fset;
7959 		if (!CPU_IN_SET(cpuset, i))
7960 			continue;
7961 
7962 		fset = (uchar_t *)((uintptr_t)argdata +
7963 		    sizeof (x86_featureset) * i);
7964 
7965 		if (!compare_x86_featureset(f0, fset)) {
7966 			panic("Post microcode update CPU %d has "
7967 			    "differing security feature (%p) set from CPU 0 "
7968 			    "(%p), not appending to feature set", i,
7969 			    (void *)fset, (void *)f0);
7970 		}
7971 	}
7972 
7973 	mutex_exit(&cpu_lock);
7974 
7975 	for (i = 0; i < NUM_X86_FEATURES; i++) {
7976 		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7977 		    x86_feature_names[i]);
7978 		if (is_x86_feature(f0, i)) {
7979 			add_x86_feature(x86_featureset, i);
7980 		}
7981 	}
7982 	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7983 }
7984 
7985 typedef void (*cpuid_pass_f)(cpu_t *, void *);
7986 
7987 typedef struct cpuid_pass_def {
7988 	cpuid_pass_t cpd_pass;
7989 	cpuid_pass_f cpd_func;
7990 } cpuid_pass_def_t;
7991 
7992 /*
7993  * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
7994  * normal sense and should not appear here.
7995  */
7996 static const cpuid_pass_def_t cpuid_pass_defs[] = {
7997 	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
7998 	{ CPUID_PASS_IDENT, cpuid_pass_ident },
7999 	{ CPUID_PASS_BASIC, cpuid_pass_basic },
8000 	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
8001 	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
8002 	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
8003 };
8004 
8005 void
8006 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
8007 {
8008 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
8009 
8010 	if (cp == NULL)
8011 		cp = CPU;
8012 
8013 	/*
8014 	 * Space statically allocated for BSP, ensure pointer is set
8015 	 */
8016 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
8017 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
8018 
8019 	ASSERT(cpuid_checkpass(cp, pass - 1));
8020 
8021 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
8022 		if (cpuid_pass_defs[i].cpd_pass == pass) {
8023 			cpuid_pass_defs[i].cpd_func(cp, arg);
8024 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
8025 			return;
8026 		}
8027 	}
8028 
8029 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
8030 	    pass, cp->cpu_id);
8031 }
8032 
8033 /*
8034  * Extract the processor family from a chiprev.  Processor families are not the
8035  * same as cpuid families; see comments above and in x86_archext.h.
8036  */
8037 x86_processor_family_t
8038 chiprev_family(const x86_chiprev_t cr)
8039 {
8040 	return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
8041 }
8042 
8043 /*
8044  * A chiprev matches its template if the vendor and family are identical and the
8045  * revision of the chiprev matches one of the bits set in the template.  Callers
8046  * may bitwise-OR together chiprevs of the same vendor and family to form the
8047  * template, or use the _ANY variant.  It is not possible to match chiprevs of
8048  * multiple vendors or processor families with a single call.  Note that this
8049  * function operates on processor families, not cpuid families.
8050  */
8051 boolean_t
8052 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
8053 {
8054 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
8055 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
8056 	    (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
8057 }
8058 
8059 /*
8060  * A chiprev is at least min if the vendor and family are identical and the
8061  * revision of the chiprev is at least as recent as that of min.  Processor
8062  * families are considered unordered and cannot be compared using this function.
8063  * Note that this function operates on processor families, not cpuid families.
8064  * Use of the _ANY chiprev variant with this function is not useful; it will
8065  * always return B_FALSE if the _ANY variant is supplied as the minimum
8066  * revision.  To determine only whether a chiprev is of a given processor
8067  * family, test the return value of chiprev_family() instead.
8068  */
8069 boolean_t
8070 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
8071 {
8072 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
8073 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
8074 	    _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
8075 }
8076 
8077 /*
8078  * The uarch functions operate in a manner similar to the chiprev functions
8079  * above.  While it is tempting to allow these to operate on microarchitectures
8080  * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
8081  * than ZEN2), we elect not to do so because a manufacturer may supply
8082  * processors of multiple different microarchitecture families each of which may
8083  * be internally ordered but unordered with respect to those of other families.
8084  */
8085 x86_uarch_t
8086 uarchrev_uarch(const x86_uarchrev_t ur)
8087 {
8088 	return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8089 }
8090 
8091 boolean_t
8092 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8093 {
8094 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8095 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8096 	    (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8097 }
8098 
8099 boolean_t
8100 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8101 {
8102 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8103 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8104 	    _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8105 }
8106 
8107 /*
8108  * Topology cache related information. This is yet another cache interface that
8109  * we're exposing out intended to be used when we have either Intel Leaf 4 or
8110  * AMD Leaf 8x1D (introduced with Zen 1).
8111  */
8112 static boolean_t
8113 cpuid_cache_topo_sup(const struct cpuid_info *cpi)
8114 {
8115 	switch (cpi->cpi_vendor) {
8116 	case X86_VENDOR_Intel:
8117 		if (cpi->cpi_maxeax >= 4) {
8118 			return (B_TRUE);
8119 		}
8120 		break;
8121 	case X86_VENDOR_AMD:
8122 	case X86_VENDOR_HYGON:
8123 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
8124 		    is_x86_feature(x86_featureset, X86FSET_TOPOEXT)) {
8125 			return (B_TRUE);
8126 		}
8127 		break;
8128 	default:
8129 		break;
8130 	}
8131 
8132 	return (B_FALSE);
8133 }
8134 
8135 int
8136 cpuid_getncaches(struct cpu *cpu, uint32_t *ncache)
8137 {
8138 	const struct cpuid_info *cpi;
8139 
8140 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8141 	cpi = cpu->cpu_m.mcpu_cpi;
8142 
8143 	if (!cpuid_cache_topo_sup(cpi)) {
8144 		return (ENOTSUP);
8145 	}
8146 
8147 	*ncache = cpi->cpi_cache_leaf_size;
8148 	return (0);
8149 }
8150 
8151 int
8152 cpuid_getcache(struct cpu *cpu, uint32_t cno, x86_cache_t *cache)
8153 {
8154 	const struct cpuid_info *cpi;
8155 	const struct cpuid_regs *cp;
8156 
8157 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8158 	cpi = cpu->cpu_m.mcpu_cpi;
8159 
8160 	if (!cpuid_cache_topo_sup(cpi)) {
8161 		return (ENOTSUP);
8162 	}
8163 
8164 	if (cno >= cpi->cpi_cache_leaf_size) {
8165 		return (EINVAL);
8166 	}
8167 
8168 	bzero(cache, sizeof (cache));
8169 	cp = cpi->cpi_cache_leaves[cno];
8170 	switch (CPI_CACHE_TYPE(cp)) {
8171 	case CPI_CACHE_TYPE_DATA:
8172 		cache->xc_type = X86_CACHE_TYPE_DATA;
8173 		break;
8174 	case CPI_CACHE_TYPE_INSTR:
8175 		cache->xc_type = X86_CACHE_TYPE_INST;
8176 		break;
8177 	case CPI_CACHE_TYPE_UNIFIED:
8178 		cache->xc_type = X86_CACHE_TYPE_UNIFIED;
8179 		break;
8180 	case CPI_CACHE_TYPE_DONE:
8181 	default:
8182 		return (EINVAL);
8183 	}
8184 	cache->xc_level = CPI_CACHE_LVL(cp);
8185 	if (CPI_FULL_ASSOC_CACHE(cp) != 0) {
8186 		cache->xc_flags |= X86_CACHE_F_FULL_ASSOC;
8187 	}
8188 	cache->xc_nparts = CPI_CACHE_PARTS(cp) + 1;
8189 	/*
8190 	 * The number of sets is reserved on AMD if the CPU is tagged as fully
8191 	 * associative, where as it is considered valid on Intel.
8192 	 */
8193 	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
8194 	    CPI_FULL_ASSOC_CACHE(cp) != 0) {
8195 		cache->xc_nsets = 1;
8196 	} else {
8197 		cache->xc_nsets = CPI_CACHE_SETS(cp) + 1;
8198 	}
8199 	cache->xc_nways = CPI_CACHE_WAYS(cp) + 1;
8200 	cache->xc_line_size = CPI_CACHE_COH_LN_SZ(cp) + 1;
8201 	cache->xc_size = cache->xc_nparts * cache->xc_nsets * cache->xc_nways *
8202 	    cache->xc_line_size;
8203 	/*
8204 	 * We're looking for the number of bits to cover the number of CPUs that
8205 	 * are being shared. Normally this would be the value - 1, but the CPUID
8206 	 * value is encoded as the actual value minus one, so we don't modify
8207 	 * this at all.
8208 	 */
8209 	cache->xc_apic_shift = highbit(CPI_NTHR_SHR_CACHE(cp));
8210 
8211 	/*
8212 	 * To construct a unique ID we construct a uint64_t that looks as
8213 	 * follows:
8214 	 *
8215 	 * [47:40] cache level
8216 	 * [39:32] CPUID cache type
8217 	 * [31:00] shifted APIC ID
8218 	 *
8219 	 * The shifted APIC ID gives us a guarantee that a given cache entry is
8220 	 * unique within its peers. The other two numbers give us something that
8221 	 * ensures that something is unique within the CPU. If we just had the
8222 	 * APIC ID shifted over by the indicated number of bits we'd end up with
8223 	 * an ID of zero for the L1I, L1D, L2, and L3.
8224 	 *
8225 	 * The format of this ID is private to the system and can change across
8226 	 * a reboot for the time being.
8227 	 */
8228 	cache->xc_id = (uint64_t)cache->xc_level << 40;
8229 	cache->xc_id |= (uint64_t)cache->xc_type << 32;
8230 	cache->xc_id |= (uint64_t)cpi->cpi_apicid >> cache->xc_apic_shift;
8231 
8232 	return (0);
8233 }
8234