xref: /illumos-gate/usr/src/uts/intel/os/cpuid.c (revision 22e4c3ac)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2022 Oxide Computer Company
28  * Copyright 2022 MNX Cloud, Inc.
29  */
30 /*
31  * Copyright (c) 2010, Intel Corporation.
32  * All rights reserved.
33  */
34 /*
35  * Portions Copyright 2009 Advanced Micro Devices, Inc.
36  */
37 
38 /*
39  * CPU Identification logic
40  *
41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42  * with the identification of CPUs, their features, and their topologies. More
43  * specifically, this file helps drive the following:
44  *
45  * 1. Enumeration of features of the processor which are used by the kernel to
46  *    determine what features to enable or disable. These may be instruction set
47  *    enhancements or features that we use.
48  *
49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
50  *    will be told about through the auxiliary vector.
51  *
52  * 3. Understanding the physical topology of the CPU such as the number of
53  *    caches, how many cores it has, whether or not it supports symmetric
54  *    multi-processing (SMT), etc.
55  *
56  * ------------------------
57  * CPUID History and Basics
58  * ------------------------
59  *
60  * The cpuid instruction was added by Intel roughly around the time that the
61  * original Pentium was introduced. The purpose of cpuid was to tell in a
62  * programmatic fashion information about the CPU that previously was guessed
63  * at. For example, an important part of cpuid is that we can know what
64  * extensions to the ISA exist. If you use an invalid opcode you would get a
65  * #UD, so this method allows a program (whether a user program or the kernel)
66  * to determine what exists without crashing or getting a SIGILL. Of course,
67  * this was also during the era of the clones and the AMD Am5x86. The vendor
68  * name shows up first in cpuid for a reason.
69  *
70  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72  * its own meaning. The different leaves are broken down into different regions:
73  *
74  *	[ 0, 7fffffff ]			This region is called the 'basic'
75  *					region. This region is generally defined
76  *					by Intel, though some of the original
77  *					portions have different meanings based
78  *					on the manufacturer. These days, Intel
79  *					adds most new features to this region.
80  *					AMD adds non-Intel compatible
81  *					information in the third, extended
82  *					region. Intel uses this for everything
83  *					including ISA extensions, CPU
84  *					features, cache information, topology,
85  *					and more.
86  *
87  *					There is a hole carved out of this
88  *					region which is reserved for
89  *					hypervisors.
90  *
91  *	[ 40000000, 4fffffff ]		This region, which is found in the
92  *					middle of the previous region, is
93  *					explicitly promised to never be used by
94  *					CPUs. Instead, it is used by hypervisors
95  *					to communicate information about
96  *					themselves to the operating system. The
97  *					values and details are unique for each
98  *					hypervisor.
99  *
100  *	[ 80000000, ffffffff ]		This region is called the 'extended'
101  *					region. Some of the low leaves mirror
102  *					parts of the basic leaves. This region
103  *					has generally been used by AMD for
104  *					various extensions. For example, AMD-
105  *					specific information about caches,
106  *					features, and topology are found in this
107  *					region.
108  *
109  * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111  * the ranges, one of the primary things returned is the maximum valid leaf in
112  * that range. This allows for discovery of what range of CPUID is valid.
113  *
114  * The CPUs have potentially surprising behavior when using an invalid leaf or
115  * unimplemented leaf. If the requested leaf is within the valid basic or
116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117  * set to zero. However, if you specify a leaf that is outside of a valid range,
118  * then instead it will be filled with the last valid _basic_ leaf. For example,
119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120  * an invalid extended leaf will return the information for leaf 3.
121  *
122  * Some leaves are broken down into sub-leaves. This means that the value
123  * depends on both the leaf asked for in %eax and a secondary register. For
124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125  * additional information. Or when getting topology information in leaf 0xb, the
126  * initial value in %ecx changes which level of the topology that you are
127  * getting information about.
128  *
129  * cpuid values are always kept to 32 bits regardless of whether or not the
130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
131  * 32 bits of the register are always set to zero so that way the values are the
132  * same regardless of execution mode.
133  *
134  * ----------------------
135  * Identifying Processors
136  * ----------------------
137  *
138  * We can identify a processor in two steps. The first step looks at cpuid leaf
139  * 0. Leaf 0 contains the processor's vendor information. This is done by
140  * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
141  * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
142  *
143  * From there, a processor is identified by a combination of three different
144  * values:
145  *
146  *  1. Family
147  *  2. Model
148  *  3. Stepping
149  *
150  * Each vendor uses the family and model to uniquely identify a processor. The
151  * way that family and model are changed depends on the vendor. For example,
152  * Intel has been using family 0x6 for almost all of their processor since the
153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
154  * identify the exact processor. Different models are often used for the client
155  * (consumer) and server parts. Even though each processor often has major
156  * architectural differences, they still are considered the same family by
157  * Intel.
158  *
159  * On the other hand, each major AMD architecture generally has its own family.
160  * For example, the K8 is family 0x10, Bulldozer 0x15, and Zen 0x17. Within it
161  * the model number is used to help identify specific processors.  As AMD's
162  * product lines have expanded, they have started putting a mixed bag of
163  * processors into the same family, with each processor under a single
164  * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
165  * refer to each such collection as a processor family, distinct from cpuid
166  * family.  Importantly, each processor family has a BIOS and Kernel Developer's
167  * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168  * defines the processor family's non-architectural features.  In general, we'll
169  * use "family" here to mean the family number reported by the cpuid instruction
170  * and distinguish the processor family from it where appropriate.
171  *
172  * The stepping is used to refer to a revision of a specific microprocessor. The
173  * term comes from equipment used to produce masks that are used to create
174  * integrated circuits.
175  *
176  * The information is present in leaf 1, %eax. In technical documentation you
177  * will see the terms extended model and extended family. The original family,
178  * model, and stepping fields were each 4 bits wide. If the values in either
179  * are 0xf, then one is to consult the extended model and extended family, which
180  * take previously reserved bits and allow for a larger number of models and add
181  * 0xf to them.
182  *
183  * When we process this information, we store the full family, model, and
184  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185  * cpi_step, respectively. Whenever you are performing comparisons with the
186  * family, model, and stepping, you should use these members and not the raw
187  * values from cpuid. If you must use the raw values from cpuid directly, you
188  * must make sure that you add the extended model and family to the base model
189  * and family.
190  *
191  * In general, we do not use information about the family, model, and stepping
192  * to determine whether or not a feature is present; that is generally driven by
193  * specific leaves. However, when something we care about on the processor is
194  * not considered 'architectural' meaning that it is specific to a set of
195  * processors and not promised in the architecture model to be consistent from
196  * generation to generation, then we will fall back on this information. The
197  * most common cases where this comes up is when we have to workaround errata in
198  * the processor, are dealing with processor-specific features such as CPU
199  * performance counters, or we want to provide additional information for things
200  * such as fault management.
201  *
202  * While processors also do have a brand string, which is the name that people
203  * are familiar with when buying the processor, they are not meant for
204  * programmatic consumption. That is what the family, model, and stepping are
205  * for.
206  *
207  * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208  * and stepping(s) that refer to a single or very closely related set of silicon
209  * implementations; while there are sometimes more specific ways to learn of the
210  * presence or absence of a particular erratum or workaround, one may generally
211  * assume that all processors of the same chiprev have the same errata and we
212  * have chosen to represent them this way precisely because that is how AMD
213  * groups them in their revision guides (errata documentation).  The processor
214  * family (x86_processor_family_t) may be extracted from the chiprev if that
215  * level of detail is not needed.  Processor families are considered unordered
216  * but revisions within a family may be compared for either an exact match or at
217  * least as recent as a reference revision.  See the chiprev_xxx() functions
218  * below.
219  *
220  * Similarly, each processor family implements a particular microarchitecture,
221  * which itself may have multiple revisions.  In general, non-architectural
222  * features are specific to a processor family, but some may exist across
223  * families containing cores that implement the same microarchitectural revision
224  * (and, such cores share common bugs, too).  We provide utility routines
225  * analogous to those for extracting and comparing chiprevs for
226  * microarchitectures as well; see the uarch_xxx() functions.
227  *
228  * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229  * present used and available only for AMD and AMD-like processors.
230  *
231  * ------------
232  * CPUID Passes
233  * ------------
234  *
235  * As part of performing feature detection, we break this into several different
236  * passes. There used to be a pass 0 that was done from assembly in locore.s to
237  * support processors that have a missing or broken cpuid instruction (notably
238  * certain Cyrix processors) but those were all 32-bit processors which are no
239  * longer supported. Passes are no longer numbered explicitly to make it easier
240  * to break them up or move them around as needed; however, they still have a
241  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242  * x86_archext.h. The external interface to execute a cpuid pass or determine
243  * whether a pass has been completed consists of cpuid_execpass() and
244  * cpuid_checkpass() respectively.  The passes now, in that execution order,
245  * are as follows:
246  *
247  *	PRELUDE		This pass does not have any dependencies on system
248  *			setup; in particular, unlike all subsequent passes it is
249  *			guaranteed not to require PCI config space access.  It
250  *			sets the flag indicating that the processor we are
251  *			running on supports the cpuid instruction, which all
252  *			64-bit processors do.  This would also be the place to
253  *			add any other basic state that is required later on and
254  *			can be learned without dependencies.
255  *
256  *	IDENT		Determine which vendor manufactured the CPU, the family,
257  *			model, and stepping information, and compute basic
258  *			identifying tags from those values.  This is done first
259  *			so that machine-dependent code can control the features
260  *			the cpuid instruction will report during subsequent
261  *			passes if needed, and so that any intervening
262  *			machine-dependent code that needs basic identity will
263  *			have it available.  This includes synthesised
264  *			identifiers such as chiprev and uarchrev as well as the
265  *			values obtained directly from cpuid.  Prior to executing
266  *			this pass, machine-depedent boot code is responsible for
267  *			ensuring that the PCI configuration space access
268  *			functions have been set up and, if necessary, that
269  *			determine_platform() has been called.
270  *
271  *	BASIC		This is the primary pass and is responsible for doing a
272  *			large number of different things:
273  *
274  *			1. Gathering a large number of feature flags to
275  *			determine which features the CPU support and which
276  *			indicate things that we need to do other work in the OS
277  *			to enable. Features detected this way are added to the
278  *			x86_featureset which can be queried to
279  *			determine what we should do. This includes processing
280  *			all of the basic and extended CPU features that we care
281  *			about.
282  *
283  *			2. Determining the CPU's topology. This includes
284  *			information about how many cores and threads are present
285  *			in the package. It also is responsible for figuring out
286  *			which logical CPUs are potentially part of the same core
287  *			and what other resources they might share. For more
288  *			information see the 'Topology' section.
289  *
290  *			3. Determining the set of CPU security-specific features
291  *			that we need to worry about and determine the
292  *			appropriate set of workarounds.
293  *
294  *			Pass 1 on the boot CPU occurs before KMDB is started.
295  *
296  *	EXTENDED	The second pass is done after startup(). Here, we check
297  *			other miscellaneous features. Most of this is gathering
298  *			additional basic and extended features that we'll use in
299  *			later passes or for debugging support.
300  *
301  *	DYNAMIC		The third pass occurs after the kernel memory allocator
302  *			has been fully initialized. This gathers information
303  *			where we might need dynamic memory available for our
304  *			uses. This includes several varying width leaves that
305  *			have cache information and the processor's brand string.
306  *
307  *	RESOLVE		The fourth and final normal pass is performed after the
308  *			kernel has brought most everything online. This is
309  *			invoked from post_startup(). In this pass, we go through
310  *			the set of features that we have enabled and turn that
311  *			into the hardware auxiliary vector features that
312  *			userland receives. This is used by userland, primarily
313  *			by the run-time link-editor (RTLD), though userland
314  *			software could also refer to it directly.
315  *
316  * The function that performs a pass is currently assumed to be infallible, and
317  * all existing implementation are.  This simplifies callers by allowing
318  * cpuid_execpass() to return void. Similarly, implementers do not need to check
319  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320  * Both of these assumptions can be relaxed if needed by future developments.
321  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322  * error to attempt to execute a pass before all previous passes have been
323  * completed on the specified CPU, or to request cpuid information before the
324  * pass that captures it has been executed.  These conditions can be tested
325  * using cpuid_checkpass().
326  *
327  * The Microcode Pass
328  *
329  * After a microcode update, we do a selective rescan of the cpuid leaves to
330  * determine what features have changed. Microcode updates can provide more
331  * details about security related features to deal with issues like Spectre and
332  * L1TF. On occasion, vendors have violated their contract and removed bits.
333  * However, we don't try to detect that because that puts us in a situation that
334  * we really can't deal with. As such, the only thing we rescan are security
335  * related features today. See cpuid_pass_ucode().  This pass may be run in a
336  * different sequence on APs and therefore is not part of the sequential order;
337  * It is invoked directly instead of by cpuid_execpass() and its completion
338  * status cannot be checked by cpuid_checkpass().  This could be integrated with
339  * a more complex dependency mechanism if warranted by future developments.
340  *
341  * All of the passes are run on all CPUs. However, for the most part we only
342  * care about what the boot CPU says about this information and use the other
343  * CPUs as a rough guide to sanity check that we have the same feature set.
344  *
345  * We do not support running multiple logical CPUs with disjoint, let alone
346  * different, feature sets.
347  *
348  * ------------------
349  * Processor Topology
350  * ------------------
351  *
352  * One of the important things that we need to do is to understand the topology
353  * of the underlying processor. When we say topology in this case, we're trying
354  * to understand the relationship between the logical CPUs that the operating
355  * system sees and the underlying physical layout. Different logical CPUs may
356  * share different resources which can have important consequences for the
357  * performance of the system. For example, they may share caches, execution
358  * units, and more.
359  *
360  * The topology of the processor changes from generation to generation and
361  * vendor to vendor.  Along with that, different vendors use different
362  * terminology, and the operating system itself uses occasionally overlapping
363  * terminology. It's important to understand what this topology looks like so
364  * one can understand the different things that we try to calculate and
365  * determine.
366  *
367  * To get started, let's talk about a little bit of terminology that we've used
368  * so far, is used throughout this file, and is fairly generic across multiple
369  * vendors:
370  *
371  * CPU
372  *	A central processing unit (CPU) refers to a logical and/or virtual
373  *	entity that the operating system can execute instructions on. The
374  *	underlying resources for this CPU may be shared between multiple
375  *	entities; however, to the operating system it is a discrete unit.
376  *
377  * PROCESSOR and PACKAGE
378  *
379  *	Generally, when we use the term 'processor' on its own, we are referring
380  *	to the physical entity that one buys and plugs into a board. However,
381  *	because processor has been overloaded and one might see it used to mean
382  *	multiple different levels, we will instead use the term 'package' for
383  *	the rest of this file. The term package comes from the electrical
384  *	engineering side and refers to the physical entity that encloses the
385  *	electronics inside. Strictly speaking the package can contain more than
386  *	just the CPU, for example, on many processors it may also have what's
387  *	called an 'integrated graphical processing unit (GPU)'. Because the
388  *	package can encapsulate multiple units, it is the largest physical unit
389  *	that we refer to.
390  *
391  * SOCKET
392  *
393  *	A socket refers to unit on a system board (generally the motherboard)
394  *	that can receive a package. A single package, or processor, is plugged
395  *	into a single socket. A system may have multiple sockets. Often times,
396  *	the term socket is used interchangeably with package and refers to the
397  *	electrical component that has plugged in, and not the receptacle itself.
398  *
399  * CORE
400  *
401  *	A core refers to the physical instantiation of a CPU, generally, with a
402  *	full set of hardware resources available to it. A package may contain
403  *	multiple cores inside of it or it may just have a single one. A
404  *	processor with more than one core is often referred to as 'multi-core'.
405  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
406  *	that has 'multi-core' processors.
407  *
408  *	A core may expose a single logical CPU to the operating system, or it
409  *	may expose multiple CPUs, which we call threads, defined below.
410  *
411  *	Some resources may still be shared by cores in the same package. For
412  *	example, many processors will share the level 3 cache between cores.
413  *	Some AMD generations share hardware resources between cores. For more
414  *	information on that see the section 'AMD Topology'.
415  *
416  * THREAD and STRAND
417  *
418  *	In this file, generally a thread refers to a hardware resources and not
419  *	the operating system's logical abstraction. A thread is always exposed
420  *	as an independent logical CPU to the operating system. A thread belongs
421  *	to a specific core. A core may have more than one thread. When that is
422  *	the case, the threads that are part of the same core are often referred
423  *	to as 'siblings'.
424  *
425  *	When multiple threads exist, this is generally referred to as
426  *	simultaneous multi-threading (SMT). When Intel introduced this in their
427  *	processors they called it hyper-threading (HT). When multiple threads
428  *	are active in a core, they split the resources of the core. For example,
429  *	two threads may share the same set of hardware execution units.
430  *
431  *	The operating system often uses the term 'strand' to refer to a thread.
432  *	This helps disambiguate it from the software concept.
433  *
434  * CHIP
435  *
436  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
437  *	base meaning, it is used to refer to a single integrated circuit, which
438  *	may or may not be the only thing in the package. In illumos, when you
439  *	see the term 'chip' it is almost always referring to the same thing as
440  *	the 'package'. However, many vendors may use chip to refer to one of
441  *	many integrated circuits that have been placed in the package. As an
442  *	example, see the subsequent definition.
443  *
444  *	To try and keep things consistent, we will only use chip when referring
445  *	to the entire integrated circuit package, with the exception of the
446  *	definition of multi-chip module (because it is in the name) and use the
447  *	term 'die' when we want the more general, potential sub-component
448  *	definition.
449  *
450  * DIE
451  *
452  *	A die refers to an integrated circuit. Inside of the package there may
453  *	be a single die or multiple dies. This is sometimes called a 'chip' in
454  *	vendor's parlance, but in this file, we use the term die to refer to a
455  *	subcomponent.
456  *
457  * MULTI-CHIP MODULE
458  *
459  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
460  *	are connected together in the same package. When a multi-chip design is
461  *	used, generally each chip is manufactured independently and then joined
462  *	together in the package. For example, on AMD's Zen microarchitecture
463  *	(family 0x17), the package contains several dies (the second meaning of
464  *	chip from above) that are connected together.
465  *
466  * CACHE
467  *
468  *	A cache is a part of the processor that maintains copies of recently
469  *	accessed memory. Caches are split into levels and then into types.
470  *	Commonly there are one to three levels, called level one, two, and
471  *	three. The lower the level, the smaller it is, the closer it is to the
472  *	execution units of the CPU, and the faster it is to access. The layout
473  *	and design of the cache come in many different flavors, consult other
474  *	resources for a discussion of those.
475  *
476  *	Caches are generally split into two types, the instruction and data
477  *	cache. The caches contain what their names suggest, the instruction
478  *	cache has executable program text, while the data cache has all other
479  *	memory that the processor accesses. As of this writing, data is kept
480  *	coherent between all of the caches on x86, so if one modifies program
481  *	text before it is executed, that will be in the data cache, and the
482  *	instruction cache will be synchronized with that change when the
483  *	processor actually executes those instructions. This coherency also
484  *	covers the fact that data could show up in multiple caches.
485  *
486  *	Generally, the lowest level caches are specific to a core. However, the
487  *	last layer cache is shared between some number of cores. The number of
488  *	CPUs sharing this last level cache is important. This has implications
489  *	for the choices that the scheduler makes, as accessing memory that might
490  *	be in a remote cache after thread migration can be quite expensive.
491  *
492  *	Sometimes, the word cache is abbreviated with a '$', because in US
493  *	English the word cache is pronounced the same as cash. So L1D$ refers to
494  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
495  *	in the rest of this theory statement for clarity.
496  *
497  * MEMORY CONTROLLER
498  *
499  *	The memory controller is a component that provides access to DRAM. Each
500  *	memory controller can access a set number of DRAM channels. Each channel
501  *	can have a number of DIMMs (sticks of memory) associated with it. A
502  *	given package may have more than one memory controller. The association
503  *	of the memory controller to a group of cores is important as it is
504  *	cheaper to access memory on the controller that you are associated with.
505  *
506  * NUMA
507  *
508  *	NUMA or non-uniform memory access, describes a way that systems are
509  *	built. On x86, any processor core can address all of the memory in the
510  *	system. However, When using multiple sockets or possibly within a
511  *	multi-chip module, some of that memory is physically closer and some of
512  *	it is further. Memory that is further away is more expensive to access.
513  *	Consider the following image of multiple sockets with memory:
514  *
515  *	+--------+                                                +--------+
516  *	| DIMM A |         +----------+      +----------+         | DIMM D |
517  *	+--------+-+       |          |      |          |       +-+------+-+
518  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
519  *	  +--------+-+     |          |      |          |     +-+------+-+
520  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
521  *	    +--------+                                        +--------+
522  *
523  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
524  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
525  *	access DIMMs A-C and more expensive to access D-F as it has to go
526  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
527  *	D-F are cheaper than A-C. While the socket form is the most common, when
528  *	using multi-chip modules, this can also sometimes occur. For another
529  *	example of this that's more involved, see the AMD topology section.
530  *
531  *
532  * Intel Topology
533  * --------------
534  *
535  * Most Intel processors since Nehalem, (as of this writing the current gen
536  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
537  * the package is a single monolithic die. MCMs currently aren't used. Most
538  * parts have three levels of caches, with the L3 cache being shared between
539  * all of the cores on the package. The L1/L2 cache is generally specific to
540  * an individual core. The following image shows at a simplified level what
541  * this looks like. The memory controller is commonly part of something called
542  * the 'Uncore', that used to be separate physical chips that were not a part of
543  * the package, but are now part of the same chip.
544  *
545  *  +-----------------------------------------------------------------------+
546  *  | Package                                                               |
547  *  |  +-------------------+  +-------------------+  +-------------------+  |
548  *  |  | Core              |  | Core              |  | Core              |  |
549  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
550  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
551  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
552  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
553  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
554  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
555  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
556  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
557  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
558  *  |  +-------------------+  +-------------------+  +-------------------+  |
559  *  | +-------------------------------------------------------------------+ |
560  *  | |                         Shared L3 Cache                           | |
561  *  | +-------------------------------------------------------------------+ |
562  *  | +-------------------------------------------------------------------+ |
563  *  | |                        Memory Controller                          | |
564  *  | +-------------------------------------------------------------------+ |
565  *  +-----------------------------------------------------------------------+
566  *
567  * A side effect of this current architecture is that what we care about from a
568  * scheduling and topology perspective, is simplified. In general we care about
569  * understanding which logical CPUs are part of the same core and socket.
570  *
571  * To determine the relationship between threads and cores, Intel initially used
572  * the identifier in the advanced programmable interrupt controller (APIC). They
573  * also added cpuid leaf 4 to give additional information about the number of
574  * threads and CPUs in the processor. With the addition of x2apic (which
575  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
576  * additional cpuid topology leaf 0xB was added.
577  *
578  * AMD Topology
579  * ------------
580  *
581  * When discussing AMD topology, we want to break this into three distinct
582  * generations of topology. There's the basic topology that has been used in
583  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
584  * with family 0x15 (Bulldozer), and there's the topology that was introduced
585  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
586  * 0x17), and tweaked slightly in Zen 3 (family 19h). AMD also has some
587  * additional terminology that's worth talking about.
588  *
589  * Until the introduction of family 0x17 (Zen), AMD did not implement something
590  * that they considered SMT. Whether or not the AMD processors have SMT
591  * influences many things including scheduling and reliability, availability,
592  * and serviceability (RAS) features.
593  *
594  * NODE
595  *
596  *	AMD uses the term node to refer to a die that contains a number of cores
597  *	and I/O resources. Depending on the processor family and model, more
598  *	than one node can be present in the package. When there is more than one
599  *	node this indicates a multi-chip module. Usually each node has its own
600  *	access to memory and I/O devices. This is important and generally
601  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
602  *	result, we track this relationship in the operating system.
603  *
604  *	In processors with an L3 cache, the L3 cache is generally shared across
605  *	the entire node, though the way this is carved up varies from generation
606  *	to generation.
607  *
608  * BULLDOZER
609  *
610  *	Starting with the Bulldozer family (0x15) and continuing until the
611  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
612  *	compute unit. In a compute unit, two traditional cores share a number of
613  *	hardware resources. Critically, they share the FPU, L1 instruction
614  *	cache, and the L2 cache. Several compute units were then combined inside
615  *	of a single node.  Because the integer execution units, L1 data cache,
616  *	and some other resources were not shared between the cores, AMD never
617  *	considered this to be SMT.
618  *
619  * ZEN
620  *
621  *	The Zen family (0x17) uses a multi-chip module (MCM) design, the module
622  *	is called Zeppelin. These modules are similar to the idea of nodes used
623  *	previously. Each of these nodes has two DRAM channels which all of the
624  *	cores in the node can access uniformly. These nodes are linked together
625  *	in the package, creating a NUMA environment.
626  *
627  *	The Zeppelin die itself contains two different 'core complexes'. Each
628  *	core complex consists of four cores which each have two threads, for a
629  *	total of 8 logical CPUs per complex. Unlike other generations,
630  *	where all the logical CPUs in a given node share the L3 cache, here each
631  *	core complex has its own shared L3 cache.
632  *
633  *	A further thing that we need to consider is that in some configurations,
634  *	particularly with the Threadripper line of processors, not every die
635  *	actually has its memory controllers wired up to actual memory channels.
636  *	This means that some cores have memory attached to them and others
637  *	don't.
638  *
639  *	To put Zen in perspective, consider the following images:
640  *
641  *      +--------------------------------------------------------+
642  *      | Core Complex                                           |
643  *      | +-------------------+    +-------------------+  +---+  |
644  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
645  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
646  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
647  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
648  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
649  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
650  *      | +-------------------+    +-------------------+  | C |  |
651  *      | +-------------------+    +-------------------+  | a |  |
652  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
653  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
654  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
655  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
656  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
657  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
658  *      | +-------------------+    +-------------------+  +---+  |
659  *      |                                                        |
660  *	+--------------------------------------------------------+
661  *
662  *  This first image represents a single Zen core complex that consists of four
663  *  cores.
664  *
665  *
666  *	+--------------------------------------------------------+
667  *	| Zeppelin Die                                           |
668  *	|  +--------------------------------------------------+  |
669  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
670  *	|  +--------------------------------------------------+  |
671  *      |                           HH                           |
672  *	|          +-----------+    HH    +-----------+          |
673  *	|          |           |    HH    |           |          |
674  *	|          |    Core   |==========|    Core   |          |
675  *	|          |  Complex  |==========|  Complex  |          |
676  *	|          |           |    HH    |           |          |
677  *	|          +-----------+    HH    +-----------+          |
678  *      |                           HH                           |
679  *	|  +--------------------------------------------------+  |
680  *	|  |                Memory Controller                 |  |
681  *	|  +--------------------------------------------------+  |
682  *      |                                                        |
683  *	+--------------------------------------------------------+
684  *
685  *  This image represents a single Zeppelin Die. Note how both cores are
686  *  connected to the same memory controller and I/O units. While each core
687  *  complex has its own L3 cache as seen in the first image, they both have
688  *  uniform access to memory.
689  *
690  *
691  *                      PP                     PP
692  *                      PP                     PP
693  *           +----------PP---------------------PP---------+
694  *           |          PP                     PP         |
695  *           |    +-----------+          +-----------+    |
696  *           |    |           |          |           |    |
697  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
698  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
699  *           |    |           |          |           |    |
700  *           |    +-----------+ooo    ...+-----------+    |
701  *           |          HH      ooo  ...       HH         |
702  *           |          HH        oo..         HH         |
703  *           |          HH        ..oo         HH         |
704  *           |          HH      ...  ooo       HH         |
705  *           |    +-----------+...    ooo+-----------+    |
706  *           |    |           |          |           |    |
707  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
708  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
709  *           |    |           |          |           |    |
710  *           |    +-----------+          +-----------+    |
711  *           |          PP                     PP         |
712  *           +----------PP---------------------PP---------+
713  *                      PP                     PP
714  *                      PP                     PP
715  *
716  *  This image represents a single Zen package. In this example, it has four
717  *  Zeppelin dies, though some configurations only have a single one. In this
718  *  example, each die is directly connected to the next. Also, each die is
719  *  represented as being connected to memory by the 'M' character and connected
720  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
721  *  die is made up of two core complexes, we have multiple different NUMA
722  *  domains that we care about for these systems.
723  *
724  * ZEN 2
725  *
726  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
727  *	each Zeppelin Die had its own I/O die, that has been moved out of the
728  *	core complex in Zen 2. The actual core complex looks pretty similar, but
729  *	now the die actually looks much simpler:
730  *
731  *      +--------------------------------------------------------+
732  *      | Zen 2 Core Complex Die    HH                           |
733  *      |                           HH                           |
734  *      |          +-----------+    HH    +-----------+          |
735  *      |          |           |    HH    |           |          |
736  *      |          |    Core   |==========|    Core   |          |
737  *      |          |  Complex  |==========|  Complex  |          |
738  *      |          |           |    HH    |           |          |
739  *      |          +-----------+    HH    +-----------+          |
740  *      |                           HH                           |
741  *      |                           HH                           |
742  *      +--------------------------------------------------------+
743  *
744  *	From here, when we add the central I/O die, this changes things a bit.
745  *	Each die is connected to the I/O die, rather than trying to interconnect
746  *	them directly. The following image takes the same Zen 1 image that we
747  *	had earlier and shows what it looks like with the I/O die instead:
748  *
749  *                                 PP    PP
750  *                                 PP    PP
751  *           +---------------------PP----PP---------------------+
752  *           |                     PP    PP                     |
753  *           |  +-----------+      PP    PP      +-----------+  |
754  *           |  |           |      PP    PP      |           |  |
755  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
756  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
757  *           |  |         |o|oooo|          |oooo|o|         |  |
758  *           |  +-----------+    |          |    +-----------+  |
759  *           |                   |   I/O    |                   |
760  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
761  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
762  *           |                   |          |                   |
763  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
764  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
765  *           |                   |          |                   |
766  *           |  +-----------+    |          |    +-----------+  |
767  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
768  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
769  *           |  |    Die    |      PP    PP      |    Die    |  |
770  *           |  |           |      PP    PP      |           |  |
771  *           |  +-----------+      PP    PP      +-----------+  |
772  *           |                     PP    PP                     |
773  *           +---------------------PP----PP---------------------+
774  *                                 PP    PP
775  *                                 PP    PP
776  *
777  *	The above has four core complex dies installed, though the Zen 2 EPYC
778  *	and ThreadRipper parts allow for up to eight, while the Ryzen parts
779  *	generally only have one to two. The more notable difference here is how
780  *	everything communicates. Note that memory and PCIe come out of the
781  *	central die. This changes the way that one die accesses a resource. It
782  *	basically always has to go to the I/O die, where as in Zen 1 it may have
783  *	satisfied it locally. In general, this ends up being a better strategy
784  *	for most things, though it is possible to still treat everything in four
785  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
786  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
787  *	now there is only one 'node' present.
788  *
789  * ZEN 3
790  *
791  *	From an architectural perspective, Zen 3 is a much smaller change from
792  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
793  *	its microarchitectural changes. The biggest thing for us is how the die
794  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
795  *	cache. However, in Zen 3, the L3 is now shared between the entire core
796  *	complex die and is no longer partitioned between each core complex. This
797  *	means that all cores on the die can share the same L3 cache. Otherwise,
798  *	the general layout of the overall package with various core complexes
799  *	and an I/O die stays the same. Here's what the Core Complex Die looks
800  *	like in a bit more detail:
801  *
802  *               +-------------------------------------------------+
803  *               | Zen 3 Core Complex Die                          |
804  *               | +-------------------+    +-------------------+  |
805  *               | | Core       +----+ |    | Core       +----+ |  |
806  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
807  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
808  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
809  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
810  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
811  *               | +-------------------+    +-------------------+  |
812  *               | +-------------------+    +-------------------+  |
813  *               | | Core       +----+ |    | Core       +----+ |  |
814  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
815  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
816  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
817  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
818  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
819  *               | +-------------------+    +-------------------+  |
820  *               |                                                 |
821  *               | +--------------------------------------------+  |
822  *               | |                 L3 Cache                   |  |
823  *               | +--------------------------------------------+  |
824  *               |                                                 |
825  *               | +-------------------+    +-------------------+  |
826  *               | | Core       +----+ |    | Core       +----+ |  |
827  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
828  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
829  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
830  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
831  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
832  *               | +-------------------+    +-------------------+  |
833  *               | +-------------------+    +-------------------+  |
834  *               | | Core       +----+ |    | Core       +----+ |  |
835  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
836  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
837  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
838  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
839  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
840  *               | +-------------------+    +-------------------+  |
841  *               +-------------------------------------------------+
842  *
843  *	While it is not pictured, there are connections from the die to the
844  *	broader data fabric and additional functional blocks to support that
845  *	communication and coherency.
846  *
847  * CPUID LEAVES
848  *
849  * There are a few different CPUID leaves that we can use to try and understand
850  * the actual state of the world. As part of the introduction of family 0xf, AMD
851  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
852  * processors that are in the system. Because families before Zen didn't have
853  * SMT, this was always the number of cores that were in the system. However, it
854  * should always be thought of as the number of logical threads to be consistent
855  * between generations. In addition we also get the size of the APIC ID that is
856  * used to represent the number of logical processors. This is important for
857  * deriving topology information.
858  *
859  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
860  * bit between Bulldozer and later families, but it is quite useful in
861  * determining the topology information. Because this information has changed
862  * across family generations, it's worth calling out what these mean
863  * explicitly. The registers have the following meanings:
864  *
865  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
866  *		APIC ID, even though on systems without x2apic support, it will
867  *		be limited to 8 bits.
868  *
869  *	%ebx	On Bulldozer-era systems this contains information about the
870  *		number of cores that are in a compute unit (cores that share
871  *		resources). It also contains a per-package compute unit ID that
872  *		identifies which compute unit the logical CPU is a part of.
873  *
874  *		On Zen-era systems this instead contains the number of threads
875  *		per core and the ID of the core that the logical CPU is a part
876  *		of. Note, this ID is unique only to the package, it is not
877  *		globally unique across the entire system.
878  *
879  *	%ecx	This contains the number of nodes that exist in the package. It
880  *		also contains an ID that identifies which node the logical CPU
881  *		is a part of.
882  *
883  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
884  * cache layout to determine which logical CPUs are sharing which caches.
885  *
886  * illumos Topology
887  * ----------------
888  *
889  * Based on the above we synthesize the information into several different
890  * variables that we store in the 'struct cpuid_info'. We'll go into the details
891  * of what each member is supposed to represent and their uniqueness. In
892  * general, there are two levels of uniqueness that we care about. We care about
893  * an ID that is globally unique. That means that it will be unique across all
894  * entities in the system. For example, the default logical CPU ID is globally
895  * unique. On the other hand, there is some information that we only care about
896  * being unique within the context of a single package / socket. Here are the
897  * variables that we keep track of and their meaning.
898  *
899  * Several of the values that are asking for an identifier, with the exception
900  * of cpi_apicid, are allowed to be synthetic.
901  *
902  *
903  * cpi_apicid
904  *
905  *	This is the value of the CPU's APIC id. This should be the full 32-bit
906  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
907  *	APIC ID. This value is globally unique between all logical CPUs across
908  *	all packages. This is usually required by the APIC.
909  *
910  * cpi_chipid
911  *
912  *	This value indicates the ID of the package that the logical CPU is a
913  *	part of. This value is allowed to be synthetic. It is usually derived by
914  *	taking the CPU's APIC ID and determining how many bits are used to
915  *	represent CPU cores in the package. All logical CPUs that are part of
916  *	the same package must have the same value.
917  *
918  * cpi_coreid
919  *
920  *	This represents the ID of a CPU core. Two logical CPUs should only have
921  *	the same cpi_coreid value if they are part of the same core. These
922  *	values may be synthetic. On systems that support SMT, this value is
923  *	usually derived from the APIC ID, otherwise it is often synthetic and
924  *	just set to the value of the cpu_id in the cpu_t.
925  *
926  * cpi_pkgcoreid
927  *
928  *	This is similar to the cpi_coreid in that logical CPUs that are part of
929  *	the same core should have the same ID. The main difference is that these
930  *	values are only required to be unique to a given socket.
931  *
932  * cpi_clogid
933  *
934  *	This represents the logical ID of a logical CPU. This value should be
935  *	unique within a given socket for each logical CPU. This is allowed to be
936  *	synthetic, though it is usually based off of the CPU's apic ID. The
937  *	broader system expects that logical CPUs that have are part of the same
938  *	core have contiguous numbers. For example, if there were two threads per
939  *	core, then the core IDs divided by two should be the same and the first
940  *	modulus two should be zero and the second one. For example, IDs 4 and 5
941  *	indicate two logical CPUs that are part of the same core. But IDs 5 and
942  *	6 represent two logical CPUs that are part of different cores.
943  *
944  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
945  *	from the same source, strictly speaking, they don't have to be and the
946  *	two values should be considered logically independent. One should not
947  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
948  *	some kind of relationship. While this is tempting, we've seen cases on
949  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
950  *
951  * cpi_ncpu_per_chip
952  *
953  *	This value indicates the total number of logical CPUs that exist in the
954  *	physical package. Critically, this is not the number of logical CPUs
955  *	that exist for just the single core.
956  *
957  *	This value should be the same for all logical CPUs in the same package.
958  *
959  * cpi_ncore_per_chip
960  *
961  *	This value indicates the total number of physical CPU cores that exist
962  *	in the package. The system compares this value with cpi_ncpu_per_chip to
963  *	determine if simultaneous multi-threading (SMT) is enabled. When
964  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
965  *	the X86FSET_HTT feature is not set. If this value is greater than one,
966  *	than we consider the processor to have the feature X86FSET_CMP, to
967  *	indicate that there is support for more than one core.
968  *
969  *	This value should be the same for all logical CPUs in the same package.
970  *
971  * cpi_procnodes_per_pkg
972  *
973  *	This value indicates the number of 'nodes' that exist in the package.
974  *	When processors are actually a multi-chip module, this represents the
975  *	number of such modules that exist in the package. Currently, on Intel
976  *	based systems this member is always set to 1.
977  *
978  *	This value should be the same for all logical CPUs in the same package.
979  *
980  * cpi_procnodeid
981  *
982  *	This value indicates the ID of the node that the logical CPU is a part
983  *	of. All logical CPUs that are in the same node must have the same value
984  *	here. This value must be unique across all of the packages in the
985  *	system.  On Intel based systems, this is currently set to the value in
986  *	cpi_chipid because there is only one node.
987  *
988  * cpi_cores_per_compunit
989  *
990  *	This value indicates the number of cores that are part of a compute
991  *	unit. See the AMD topology section for this. This member only has real
992  *	meaning currently for AMD Bulldozer family processors. For all other
993  *	processors, this should currently be set to 1.
994  *
995  * cpi_compunitid
996  *
997  *	This indicates the compute unit that the logical CPU belongs to. For
998  *	processors without AMD Bulldozer-style compute units this should be set
999  *	to the value of cpi_coreid.
1000  *
1001  * cpi_ncpu_shr_last_cache
1002  *
1003  *	This indicates the number of logical CPUs that are sharing the same last
1004  *	level cache. This value should be the same for all CPUs that are sharing
1005  *	that cache. The last cache refers to the cache that is closest to memory
1006  *	and furthest away from the CPU.
1007  *
1008  * cpi_last_lvl_cacheid
1009  *
1010  *	This indicates the ID of the last cache that the logical CPU uses. This
1011  *	cache is often shared between multiple logical CPUs and is the cache
1012  *	that is closest to memory and furthest away from the CPU. This value
1013  *	should be the same for a group of logical CPUs only if they actually
1014  *	share the same last level cache. IDs should not overlap between
1015  *	packages.
1016  *
1017  * cpi_ncore_bits
1018  *
1019  *	This indicates the number of bits that are required to represent all of
1020  *	the cores in the system. As cores are derived based on their APIC IDs,
1021  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1022  *	this value to be larger than the actual number of IDs that are present
1023  *	in the system. This is used to size tables by the CMI framework. It is
1024  *	only filled in for Intel and AMD CPUs.
1025  *
1026  * cpi_nthread_bits
1027  *
1028  *	This indicates the number of bits required to represent all of the IDs
1029  *	that cover the logical CPUs that exist on a given core. It's OK for this
1030  *	value to be larger than the actual number of IDs that are present in the
1031  *	system.  This is used to size tables by the CMI framework. It is
1032  *	only filled in for Intel and AMD CPUs.
1033  *
1034  * -----------
1035  * Hypervisors
1036  * -----------
1037  *
1038  * If trying to manage the differences between vendors wasn't bad enough, it can
1039  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1040  * the ability to interpose on all cpuid instructions and change them to suit
1041  * their purposes. In general, this is necessary as the hypervisor wants to be
1042  * able to present a more uniform set of features or not necessarily give the
1043  * guest operating system kernel knowledge of all features so it can be
1044  * more easily migrated between systems.
1045  *
1046  * When it comes to trying to determine topology information, this can be a
1047  * double edged sword. When a hypervisor doesn't actually implement a cpuid
1048  * leaf, it'll often return all zeros. Because of that, you'll often see various
1049  * checks scattered about fields being non-zero before we assume we can use
1050  * them.
1051  *
1052  * When it comes to topology information, the hypervisor is often incentivized
1053  * to lie to you about topology. This is because it doesn't always actually
1054  * guarantee that topology at all. The topology path we take in the system
1055  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1056  * or AMD CPU, then we basically do our normal path. However, when they don't
1057  * use an actual vendor, then that usually turns into multiple one-core CPUs
1058  * that we enumerate that are often on different sockets. The actual behavior
1059  * depends greatly on what the hypervisor actually exposes to us.
1060  *
1061  * --------------------
1062  * Exposing Information
1063  * --------------------
1064  *
1065  * We expose CPUID information in three different forms in the system.
1066  *
1067  * The first is through the x86_featureset variable. This is used in conjunction
1068  * with the is_x86_feature() function. This is queried by x86-specific functions
1069  * to determine which features are or aren't present in the system and to make
1070  * decisions based upon them. For example, users of this include everything from
1071  * parts of the system dedicated to reliability, availability, and
1072  * serviceability (RAS), to making decisions about how to handle security
1073  * mitigations, to various x86-specific drivers. General purpose or
1074  * architecture independent drivers should never be calling this function.
1075  *
1076  * The second means is through the auxiliary vector. The auxiliary vector is a
1077  * series of tagged data that the kernel passes down to a user program when it
1078  * begins executing. This information is used to indicate to programs what
1079  * instruction set extensions are present. For example, information about the
1080  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1081  * since user programs cannot make use of it. However, things like the AVX
1082  * instruction sets are. Programs use this information to make run-time
1083  * decisions about what features they should use. As an example, the run-time
1084  * link-editor (rtld) can relocate different functions depending on the hardware
1085  * support available.
1086  *
1087  * The final form is through a series of accessor functions that all have the
1088  * form cpuid_get*. This is used by a number of different subsystems in the
1089  * kernel to determine more detailed information about what we're running on,
1090  * topology information, etc. Some of these subsystems include processor groups
1091  * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1092  * microcode, and performance monitoring. These functions all ASSERT that the
1093  * CPU they're being called on has reached a certain cpuid pass. If the passes
1094  * are rearranged, then this needs to be adjusted.
1095  *
1096  * -----------------------------------------------
1097  * Speculative Execution CPU Side Channel Security
1098  * -----------------------------------------------
1099  *
1100  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1101  * execution in the CPU to create side channels there have been a number of
1102  * different attacks and corresponding issues that the operating system needs to
1103  * mitigate against. The following list is some of the common, but not
1104  * exhaustive, set of issues that we know about and have done some or need to do
1105  * more work in the system to mitigate against:
1106  *
1107  *   - Spectre v1
1108  *   - swapgs (Spectre v1 variant)
1109  *   - Spectre v2
1110  *   - Meltdown (Spectre v3)
1111  *   - Rogue Register Read (Spectre v3a)
1112  *   - Speculative Store Bypass (Spectre v4)
1113  *   - ret2spec, SpectreRSB
1114  *   - L1 Terminal Fault (L1TF)
1115  *   - Microarchitectural Data Sampling (MDS)
1116  *
1117  * Each of these requires different sets of mitigations and has different attack
1118  * surfaces. For the most part, this discussion is about protecting the kernel
1119  * from non-kernel executing environments such as user processes and hardware
1120  * virtual machines. Unfortunately, there are a number of user vs. user
1121  * scenarios that exist with these. The rest of this section will describe the
1122  * overall approach that the system has taken to address these as well as their
1123  * shortcomings. Unfortunately, not all of the above have been handled today.
1124  *
1125  * SPECTRE v2, ret2spec, SpectreRSB
1126  *
1127  * The second variant of the spectre attack focuses on performing branch target
1128  * injection. This generally impacts indirect call instructions in the system.
1129  * There are three different ways to mitigate this issue that are commonly
1130  * described today:
1131  *
1132  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1133  *  2. Using Retpolines and RSB Stuffing
1134  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1135  *
1136  * IBRS uses a feature added to microcode to restrict speculation, among other
1137  * things. This form of mitigation has not been used as it has been generally
1138  * seen as too expensive and requires reactivation upon various transitions in
1139  * the system.
1140  *
1141  * As a less impactful alternative to IBRS, retpolines were developed by
1142  * Google. These basically require one to replace indirect calls with a specific
1143  * trampoline that will cause speculation to fail and break the attack.
1144  * Retpolines require compiler support. We always build with retpolines in the
1145  * external thunk mode. This means that a traditional indirect call is replaced
1146  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1147  * of this is that all indirect function calls are performed through a register.
1148  *
1149  * We have to use a common external location of the thunk and not inline it into
1150  * the callsite so that way we can have a single place to patch these functions.
1151  * As it turns out, we currently have two different forms of retpolines that
1152  * exist in the system:
1153  *
1154  *  1. A full retpoline
1155  *  2. A no-op version
1156  *
1157  * The first one is used in the general case. Historically, there was an
1158  * AMD-specific optimized retopoline variant that was based around using a
1159  * serializing lfence instruction; however, in March 2022 it was announced that
1160  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1161  * use it and it is no longer available in the system.
1162  *
1163  * The third form described above is the most curious. It turns out that the way
1164  * that retpolines are implemented is that they rely on how speculation is
1165  * performed on a 'ret' instruction. Intel has continued to optimize this
1166  * process (which is partly why we need to have return stack buffer stuffing,
1167  * but more on that in a bit) and in processors starting with Cascade Lake
1168  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1169  * mechanism has been introduced called Enhanced IBRS (eIBRS).
1170  *
1171  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1172  * physical core. However, if this is the case, we don't want to use retpolines
1173  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1174  * function (called a thunk) into a jmp instruction. This means that we're still
1175  * paying the cost of an extra jump to the external thunk, but it gives us
1176  * flexibility and the ability to have a single kernel image that works across a
1177  * wide variety of systems and hardware features.
1178  *
1179  * Unfortunately, this alone is insufficient. First, Skylake systems have
1180  * additional speculation for the Return Stack Buffer (RSB) which is used to
1181  * return from call instructions which retpolines take advantage of. However,
1182  * this problem is not just limited to Skylake and is actually more pernicious.
1183  * The SpectreRSB paper introduces several more problems that can arise with
1184  * dealing with this. The RSB can be poisoned just like the indirect branch
1185  * predictor. This means that one needs to clear the RSB when transitioning
1186  * between two different privilege domains. Some examples include:
1187  *
1188  *  - Switching between two different user processes
1189  *  - Going between user land and the kernel
1190  *  - Returning to the kernel from a hardware virtual machine
1191  *
1192  * Mitigating this involves combining a couple of different things. The first is
1193  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1194  * Bridge. When an RSB entry refers to a user address and we're executing in the
1195  * kernel, speculation through it will be stopped when SMEP is enabled. This
1196  * protects against a number of the different cases that we would normally be
1197  * worried about such as when we enter the kernel from user land.
1198  *
1199  * To prevent against additional manipulation of the RSB from other contexts
1200  * such as a non-root VMX context attacking the kernel we first look to
1201  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1202  * nothing else that we need to do to protect the kernel at this time.
1203  *
1204  * Unfortunately, eIBRS or not, we need to manually overwrite the contents of
1205  * the return stack buffer. We do this through the x86_rsb_stuff() function.
1206  * Currently this is employed on context switch and vmx_exit. The
1207  * x86_rsb_stuff() function is disabled only when mitigations in general are.
1208  *
1209  * If SMEP is not present, then we would have to stuff the RSB every time we
1210  * transitioned from user mode to the kernel, which isn't very practical right
1211  * now.
1212  *
1213  * To fully protect user to user and vmx to vmx attacks from these classes of
1214  * issues, we would also need to allow them to opt into performing an Indirect
1215  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1216  *
1217  * By default, the system will enable RSB stuffing and the required variant of
1218  * retpolines and store that information in the x86_spectrev2_mitigation value.
1219  * This will be evaluated after a microcode update as well, though it is
1220  * expected that microcode updates will not take away features. This may mean
1221  * that a late loaded microcode may not end up in the optimal configuration
1222  * (though this should be rare).
1223  *
1224  * Currently we do not build kmdb with retpolines or perform any additional side
1225  * channel security mitigations for it. One complication with kmdb is that it
1226  * requires its own retpoline thunks and it would need to adjust itself based on
1227  * what the kernel does. The threat model of kmdb is more limited and therefore
1228  * it may make more sense to investigate using prediction barriers as the whole
1229  * system is only executing a single instruction at a time while in kmdb.
1230  *
1231  * SPECTRE v1, v4
1232  *
1233  * The v1 and v4 variants of spectre are not currently mitigated in the
1234  * system and require other classes of changes to occur in the code.
1235  *
1236  * SPECTRE v1 (SWAPGS VARIANT)
1237  *
1238  * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1239  * can generally affect any branch-dependent code. The swapgs issue is one
1240  * variant of this. If we are coming in from userspace, we can have code like
1241  * this:
1242  *
1243  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1244  *	je	1f
1245  *	movq	$0, REGOFF_SAVFP(%rsp)
1246  *	swapgs
1247  *	1:
1248  *	movq	%gs:CPU_THREAD, %rax
1249  *
1250  * If an attacker can cause a mis-speculation of the branch here, we could skip
1251  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1252  * load. If subsequent code can act as the usual Spectre cache gadget, this
1253  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1254  * any use of the %gs override.
1255  *
1256  * The other case is also an issue: if we're coming into a trap from kernel
1257  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1258  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1259  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1260  * case, and the fix is the same in both cases (an lfence at the branch target
1261  * 1: in this example), we'll just do it unconditionally.
1262  *
1263  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1264  * harder for user-space to actually set a useful %gsbase value: although it's
1265  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1266  * mitigate anyway.
1267  *
1268  * MELTDOWN
1269  *
1270  * Meltdown, or spectre v3, allowed a user process to read any data in their
1271  * address space regardless of whether or not the page tables in question
1272  * allowed the user to have the ability to read them. The solution to meltdown
1273  * is kernel page table isolation. In this world, there are two page tables that
1274  * are used for a process, one in user land and one in the kernel. To implement
1275  * this we use per-CPU page tables and switch between the user and kernel
1276  * variants when entering and exiting the kernel.  For more information about
1277  * this process and how the trampolines work, please see the big theory
1278  * statements and additional comments in:
1279  *
1280  *  - uts/i86pc/ml/kpti_trampolines.s
1281  *  - uts/i86pc/vm/hat_i86.c
1282  *
1283  * While Meltdown only impacted Intel systems and there are also Intel systems
1284  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1285  * kernel page table isolation enabled. While this may at first seem weird, an
1286  * important thing to remember is that you can't speculatively read an address
1287  * if it's never in your page table at all. Having user processes without kernel
1288  * pages present provides us with an important layer of defense in the kernel
1289  * against any other side channel attacks that exist and have yet to be
1290  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1291  * default, no matter the x86 system.
1292  *
1293  * L1 TERMINAL FAULT
1294  *
1295  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1296  * execution uses page table entries. Effectively, it is two different problems.
1297  * The first is that it ignores the not present bit in the page table entries
1298  * when performing speculative execution. This means that something can
1299  * speculatively read the listed physical address if it's present in the L1
1300  * cache under certain conditions (see Intel's documentation for the full set of
1301  * conditions). Secondly, this can be used to bypass hardware virtualization
1302  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1303  * instructions.
1304  *
1305  * For the non-hardware virtualized case, this is relatively easy to deal with.
1306  * We must make sure that all unmapped pages have an address of zero. This means
1307  * that they could read the first 4k of physical memory; however, we never use
1308  * that first page in the operating system and always skip putting it in our
1309  * memory map, even if firmware tells us we can use it in our memory map. While
1310  * other systems try to put extra metadata in the address and reserved bits,
1311  * which led to this being problematic in those cases, we do not.
1312  *
1313  * For hardware virtual machines things are more complicated. Because they can
1314  * construct their own page tables, it isn't hard for them to perform this
1315  * attack against any physical address. The one wrinkle is that this physical
1316  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1317  * to flush the L1 data cache. We wrap this up in the function
1318  * spec_uarch_flush(). This function is also used in the mitigation of
1319  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1320  * hypervisors such as KVM or bhyve are responsible for performing this before
1321  * entering the guest.
1322  *
1323  * Because this attack takes place in the L1 cache, there's another wrinkle
1324  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1325  * designs. This means that when a thread enters a hardware virtualized context
1326  * and flushes the L1 data cache, the other thread on the processor may then go
1327  * ahead and put new data in it that can be potentially attacked. While one
1328  * solution is to disable SMT on the system, another option that is available is
1329  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1330  * goes through and makes sure that if a HVM is being scheduled on one thread,
1331  * then the thing on the other thread is from the same hardware virtual machine.
1332  * If an interrupt comes in or the guest exits to the broader system, then the
1333  * other SMT thread will be kicked out.
1334  *
1335  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1336  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1337  * perform L1TF related mitigations.
1338  *
1339  * MICROARCHITECTURAL DATA SAMPLING
1340  *
1341  * Microarchitectural data sampling (MDS) is a combination of four discrete
1342  * vulnerabilities that are similar issues affecting various parts of the CPU's
1343  * microarchitectural implementation around load, store, and fill buffers.
1344  * Specifically it is made up of the following subcomponents:
1345  *
1346  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1347  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1348  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1349  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1350  *
1351  * To begin addressing these, Intel has introduced another feature in microcode
1352  * called MD_CLEAR. This changes the verw instruction to operate in a different
1353  * way. This allows us to execute the verw instruction in a particular way to
1354  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1355  * updated when this microcode is present to flush this state.
1356  *
1357  * Primarily we need to flush this state whenever we transition from the kernel
1358  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1359  * little bit different. Here the structures are statically sized when a logical
1360  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1361  * flush the microarchitectural state before the CPU goes idles by calling hlt,
1362  * mwait, or another ACPI method. To perform these flushes, we call
1363  * x86_md_clear() at all of these transition points.
1364  *
1365  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1366  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1367  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1368  * a no-op.
1369  *
1370  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1371  * particular, everything we've discussed above is only valid for a single
1372  * thread executing on a core. In the case where you have hyper-threading
1373  * present, this attack can be performed between threads. The theoretical fix
1374  * for this is to ensure that both threads are always in the same security
1375  * domain. This means that they are executing in the same ring and mutually
1376  * trust each other. Practically speaking, this would mean that a system call
1377  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1378  * Rather than implement this, we recommend that one disables hyper-threading
1379  * through the use of psradm -aS.
1380  *
1381  * TSX ASYNCHRONOUS ABORT
1382  *
1383  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1384  * behaves like MDS, but leverages Intel's transactional instructions as another
1385  * vector. Effectively, when a transaction hits one of these cases (unmapped
1386  * page, various cache snoop activity, etc.) then the same data can be exposed
1387  * as in the case of MDS. This means that you can attack your twin.
1388  *
1389  * Intel has described that there are two different ways that we can mitigate
1390  * this problem on affected processors:
1391  *
1392  *   1) We can use the same techniques used to deal with MDS. Flushing the
1393  *      microarchitectural buffers and disabling hyperthreading will mitigate
1394  *      this in the same way.
1395  *
1396  *   2) Using microcode to disable TSX.
1397  *
1398  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1399  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1400  * That's OK as we're already doing all such mitigations. On the other hand,
1401  * processors with MDS_NO are all supposed to receive microcode updates that
1402  * enumerate support for disabling TSX. In general, we'd rather use this method
1403  * when available as it doesn't require disabling hyperthreading to be
1404  * effective. Currently we basically are relying on microcode for processors
1405  * that enumerate MDS_NO.
1406  *
1407  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1408  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1409  * different powers. The first allows us to cause all transactions to
1410  * immediately abort. The second gives us a means of disabling TSX completely,
1411  * which includes removing it from cpuid. If we have support for this in
1412  * microcode during the first cpuid pass, then we'll disable TSX completely such
1413  * that user land never has a chance to observe the bit. However, if we are late
1414  * loading the microcode, then we must use the functionality to cause
1415  * transactions to automatically abort. This is necessary for user land's sake.
1416  * Once a program sees a cpuid bit, it must not be taken away.
1417  *
1418  * We track whether or not we should do this based on what cpuid pass we're in.
1419  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1420  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1421  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1422  * second time after we do the initial microcode update.  As a result we need to
1423  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1424  * suitable microcode on the current CPU (which happens prior to
1425  * cpuid_pass_ucode()).
1426  *
1427  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1428  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1429  * unfortunate feature in a number of ways, and taking the opportunity to
1430  * finally be able to turn it off is likely to be of benefit in the future.
1431  *
1432  * SUMMARY
1433  *
1434  * The following table attempts to summarize the mitigations for various issues
1435  * and what's done in various places:
1436  *
1437  *  - Spectre v1: Not currently mitigated
1438  *  - swapgs: lfences after swapgs paths
1439  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS if HW support
1440  *  - Meltdown: Kernel Page Table Isolation
1441  *  - Spectre v3a: Updated CPU microcode
1442  *  - Spectre v4: Not currently mitigated
1443  *  - SpectreRSB: SMEP and RSB Stuffing
1444  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1445  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1446  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1447  *
1448  * The following table indicates the x86 feature set bits that indicate that a
1449  * given problem has been solved or a notable feature is present:
1450  *
1451  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1452  *  - MDS_NO: All forms of MDS
1453  *  - TAA_NO: TAA
1454  */
1455 
1456 #include <sys/types.h>
1457 #include <sys/archsystm.h>
1458 #include <sys/x86_archext.h>
1459 #include <sys/kmem.h>
1460 #include <sys/systm.h>
1461 #include <sys/cmn_err.h>
1462 #include <sys/sunddi.h>
1463 #include <sys/sunndi.h>
1464 #include <sys/cpuvar.h>
1465 #include <sys/processor.h>
1466 #include <sys/sysmacros.h>
1467 #include <sys/pg.h>
1468 #include <sys/fp.h>
1469 #include <sys/controlregs.h>
1470 #include <sys/bitmap.h>
1471 #include <sys/auxv_386.h>
1472 #include <sys/memnode.h>
1473 #include <sys/pci_cfgspace.h>
1474 #include <sys/comm_page.h>
1475 #include <sys/mach_mmu.h>
1476 #include <sys/ucode.h>
1477 #include <sys/tsc.h>
1478 #include <sys/kobj.h>
1479 #include <sys/asm_misc.h>
1480 
1481 #ifdef __xpv
1482 #include <sys/hypervisor.h>
1483 #else
1484 #include <sys/ontrap.h>
1485 #endif
1486 
1487 uint_t x86_vendor = X86_VENDOR_IntelClone;
1488 uint_t x86_type = X86_TYPE_OTHER;
1489 uint_t x86_clflush_size = 0;
1490 
1491 #if defined(__xpv)
1492 int x86_use_pcid = 0;
1493 int x86_use_invpcid = 0;
1494 #else
1495 int x86_use_pcid = -1;
1496 int x86_use_invpcid = -1;
1497 #endif
1498 
1499 typedef enum {
1500 	X86_SPECTREV2_RETPOLINE,
1501 	X86_SPECTREV2_ENHANCED_IBRS,
1502 	X86_SPECTREV2_DISABLED
1503 } x86_spectrev2_mitigation_t;
1504 
1505 uint_t x86_disable_spectrev2 = 0;
1506 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1507     X86_SPECTREV2_RETPOLINE;
1508 
1509 /*
1510  * The mitigation status for TAA:
1511  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1512  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1513  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1514  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1515  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1516  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1517  */
1518 typedef enum {
1519 	X86_TAA_NOTHING,
1520 	X86_TAA_DISABLED,
1521 	X86_TAA_MD_CLEAR,
1522 	X86_TAA_TSX_FORCE_ABORT,
1523 	X86_TAA_TSX_DISABLE,
1524 	X86_TAA_HW_MITIGATED
1525 } x86_taa_mitigation_t;
1526 
1527 uint_t x86_disable_taa = 0;
1528 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1529 
1530 uint_t pentiumpro_bug4046376;
1531 
1532 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1533 
1534 static char *x86_feature_names[NUM_X86_FEATURES] = {
1535 	"lgpg",
1536 	"tsc",
1537 	"msr",
1538 	"mtrr",
1539 	"pge",
1540 	"de",
1541 	"cmov",
1542 	"mmx",
1543 	"mca",
1544 	"pae",
1545 	"cv8",
1546 	"pat",
1547 	"sep",
1548 	"sse",
1549 	"sse2",
1550 	"htt",
1551 	"asysc",
1552 	"nx",
1553 	"sse3",
1554 	"cx16",
1555 	"cmp",
1556 	"tscp",
1557 	"mwait",
1558 	"sse4a",
1559 	"cpuid",
1560 	"ssse3",
1561 	"sse4_1",
1562 	"sse4_2",
1563 	"1gpg",
1564 	"clfsh",
1565 	"64",
1566 	"aes",
1567 	"pclmulqdq",
1568 	"xsave",
1569 	"avx",
1570 	"vmx",
1571 	"svm",
1572 	"topoext",
1573 	"f16c",
1574 	"rdrand",
1575 	"x2apic",
1576 	"avx2",
1577 	"bmi1",
1578 	"bmi2",
1579 	"fma",
1580 	"smep",
1581 	"smap",
1582 	"adx",
1583 	"rdseed",
1584 	"mpx",
1585 	"avx512f",
1586 	"avx512dq",
1587 	"avx512pf",
1588 	"avx512er",
1589 	"avx512cd",
1590 	"avx512bw",
1591 	"avx512vl",
1592 	"avx512fma",
1593 	"avx512vbmi",
1594 	"avx512_vpopcntdq",
1595 	"avx512_4vnniw",
1596 	"avx512_4fmaps",
1597 	"xsaveopt",
1598 	"xsavec",
1599 	"xsaves",
1600 	"sha",
1601 	"umip",
1602 	"pku",
1603 	"ospke",
1604 	"pcid",
1605 	"invpcid",
1606 	"ibrs",
1607 	"ibpb",
1608 	"stibp",
1609 	"ssbd",
1610 	"ssbd_virt",
1611 	"rdcl_no",
1612 	"ibrs_all",
1613 	"rsba",
1614 	"ssb_no",
1615 	"stibp_all",
1616 	"flush_cmd",
1617 	"l1d_vmentry_no",
1618 	"fsgsbase",
1619 	"clflushopt",
1620 	"clwb",
1621 	"monitorx",
1622 	"clzero",
1623 	"xop",
1624 	"fma4",
1625 	"tbm",
1626 	"avx512_vnni",
1627 	"amd_pcec",
1628 	"md_clear",
1629 	"mds_no",
1630 	"core_thermal",
1631 	"pkg_thermal",
1632 	"tsx_ctrl",
1633 	"taa_no",
1634 	"ppin",
1635 	"vaes",
1636 	"vpclmulqdq",
1637 	"lfence_serializing"
1638 };
1639 
1640 boolean_t
1641 is_x86_feature(void *featureset, uint_t feature)
1642 {
1643 	ASSERT(feature < NUM_X86_FEATURES);
1644 	return (BT_TEST((ulong_t *)featureset, feature));
1645 }
1646 
1647 void
1648 add_x86_feature(void *featureset, uint_t feature)
1649 {
1650 	ASSERT(feature < NUM_X86_FEATURES);
1651 	BT_SET((ulong_t *)featureset, feature);
1652 }
1653 
1654 void
1655 remove_x86_feature(void *featureset, uint_t feature)
1656 {
1657 	ASSERT(feature < NUM_X86_FEATURES);
1658 	BT_CLEAR((ulong_t *)featureset, feature);
1659 }
1660 
1661 boolean_t
1662 compare_x86_featureset(void *setA, void *setB)
1663 {
1664 	/*
1665 	 * We assume that the unused bits of the bitmap are always zero.
1666 	 */
1667 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1668 		return (B_TRUE);
1669 	} else {
1670 		return (B_FALSE);
1671 	}
1672 }
1673 
1674 void
1675 print_x86_featureset(void *featureset)
1676 {
1677 	uint_t i;
1678 
1679 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1680 		if (is_x86_feature(featureset, i)) {
1681 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1682 			    x86_feature_names[i]);
1683 		}
1684 	}
1685 }
1686 
1687 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1688 static size_t xsave_state_size = 0;
1689 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1690 boolean_t xsave_force_disable = B_FALSE;
1691 extern int disable_smap;
1692 
1693 /*
1694  * This is set to platform type we are running on.
1695  */
1696 static int platform_type = -1;
1697 
1698 #if !defined(__xpv)
1699 /*
1700  * Variable to patch if hypervisor platform detection needs to be
1701  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1702  */
1703 int enable_platform_detection = 1;
1704 #endif
1705 
1706 /*
1707  * monitor/mwait info.
1708  *
1709  * size_actual and buf_actual are the real address and size allocated to get
1710  * proper mwait_buf alignement.  buf_actual and size_actual should be passed
1711  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1712  * processor cache-line alignment, but this is not guarantied in the furture.
1713  */
1714 struct mwait_info {
1715 	size_t		mon_min;	/* min size to avoid missed wakeups */
1716 	size_t		mon_max;	/* size to avoid false wakeups */
1717 	size_t		size_actual;	/* size actually allocated */
1718 	void		*buf_actual;	/* memory actually allocated */
1719 	uint32_t	support;	/* processor support of monitor/mwait */
1720 };
1721 
1722 /*
1723  * xsave/xrestor info.
1724  *
1725  * This structure contains HW feature bits and the size of the xsave save area.
1726  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1727  * (xsave_state) to describe the xsave layout. However, at runtime the
1728  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1729  * xsave_state structure simply represents the legacy layout of the beginning
1730  * of the xsave area.
1731  */
1732 struct xsave_info {
1733 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1734 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1735 	size_t		xsav_max_size;  /* max size save area for HW features */
1736 	size_t		ymm_size;	/* AVX: size of ymm save area */
1737 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1738 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1739 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1740 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1741 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1742 	size_t		opmask_size;	/* AVX512: size of opmask save */
1743 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1744 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1745 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1746 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1747 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1748 };
1749 
1750 
1751 /*
1752  * These constants determine how many of the elements of the
1753  * cpuid we cache in the cpuid_info data structure; the
1754  * remaining elements are accessible via the cpuid instruction.
1755  */
1756 
1757 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1758 #define	NMAX_CPI_EXTD	0x1f		/* eax = 0x80000000 .. 0x8000001e */
1759 
1760 /*
1761  * See the big theory statement for a more detailed explanation of what some of
1762  * these members mean.
1763  */
1764 struct cpuid_info {
1765 	uint_t cpi_pass;		/* last pass completed */
1766 	/*
1767 	 * standard function information
1768 	 */
1769 	uint_t cpi_maxeax;		/* fn 0: %eax */
1770 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1771 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1772 
1773 	uint_t cpi_family;		/* fn 1: extended family */
1774 	uint_t cpi_model;		/* fn 1: extended model */
1775 	uint_t cpi_step;		/* fn 1: stepping */
1776 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1777 					/*		AMD: package/socket # */
1778 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1779 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1780 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1781 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1782 	uint_t cpi_ncache;		/* fn 2: number of elements */
1783 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1784 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1785 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1786 					/* Intel fn: 4, AMD fn: 8000001d */
1787 	struct cpuid_regs **cpi_cache_leaves;	/* Acual leaves from above */
1788 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1789 	/*
1790 	 * extended function information
1791 	 */
1792 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1793 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1794 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1795 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1796 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1797 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1798 
1799 	id_t cpi_coreid;		/* same coreid => strands share core */
1800 	int cpi_pkgcoreid;		/* core number within single package */
1801 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1802 					/* Intel: fn 4: %eax[31-26] */
1803 
1804 	/*
1805 	 * These values represent the number of bits that are required to store
1806 	 * information about the number of cores and threads.
1807 	 */
1808 	uint_t cpi_ncore_bits;
1809 	uint_t cpi_nthread_bits;
1810 	/*
1811 	 * supported feature information
1812 	 */
1813 	uint32_t cpi_support[6];
1814 #define	STD_EDX_FEATURES	0
1815 #define	AMD_EDX_FEATURES	1
1816 #define	TM_EDX_FEATURES		2
1817 #define	STD_ECX_FEATURES	3
1818 #define	AMD_ECX_FEATURES	4
1819 #define	STD_EBX_FEATURES	5
1820 	/*
1821 	 * Synthesized information, where known.
1822 	 */
1823 	x86_chiprev_t cpi_chiprev;	/* See X86_CHIPREV_* in x86_archext.h */
1824 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1825 	uint32_t cpi_socket;		/* Chip package/socket type */
1826 	x86_uarchrev_t cpi_uarchrev;	/* Microarchitecture and revision */
1827 
1828 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1829 	uint32_t cpi_apicid;
1830 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1831 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1832 					/* Intel: 1 */
1833 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1834 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1835 
1836 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1837 };
1838 
1839 
1840 static struct cpuid_info cpuid_info0;
1841 
1842 /*
1843  * These bit fields are defined by the Intel Application Note AP-485
1844  * "Intel Processor Identification and the CPUID Instruction"
1845  */
1846 #define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1847 #define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1848 #define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1849 #define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1850 #define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1851 #define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1852 
1853 #define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1854 #define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1855 #define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1856 #define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1857 #define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1858 #define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1859 #define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1860 
1861 #define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1862 #define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1863 #define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1864 #define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1865 
1866 #define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1867 #define	CPI_XMAXEAX_MAX		0x80000100
1868 #define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1869 #define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1870 
1871 /*
1872  * Function 4 (Deterministic Cache Parameters) macros
1873  * Defined by Intel Application Note AP-485
1874  */
1875 #define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1876 #define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1877 #define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1878 #define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1879 #define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1880 #define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1881 #define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1882 
1883 #define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1884 #define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1885 #define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1886 
1887 #define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1888 
1889 #define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1890 
1891 
1892 /*
1893  * A couple of shorthand macros to identify "later" P6-family chips
1894  * like the Pentium M and Core.  First, the "older" P6-based stuff
1895  * (loosely defined as "pre-Pentium-4"):
1896  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1897  */
1898 #define	IS_LEGACY_P6(cpi) (			\
1899 	cpi->cpi_family == 6 &&			\
1900 		(cpi->cpi_model == 1 ||		\
1901 		cpi->cpi_model == 3 ||		\
1902 		cpi->cpi_model == 5 ||		\
1903 		cpi->cpi_model == 6 ||		\
1904 		cpi->cpi_model == 7 ||		\
1905 		cpi->cpi_model == 8 ||		\
1906 		cpi->cpi_model == 0xA ||	\
1907 		cpi->cpi_model == 0xB)		\
1908 )
1909 
1910 /* A "new F6" is everything with family 6 that's not the above */
1911 #define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1912 
1913 /* Extended family/model support */
1914 #define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1915 	cpi->cpi_family >= 0xf)
1916 
1917 /*
1918  * Info for monitor/mwait idle loop.
1919  *
1920  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1921  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1922  * 2006.
1923  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1924  * Documentation Updates" #33633, Rev 2.05, December 2006.
1925  */
1926 #define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
1927 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extenstion supported */
1928 #define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1929 #define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1930 #define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1931 #define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1932 #define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1933 #define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1934 /*
1935  * Number of sub-cstates for a given c-state.
1936  */
1937 #define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1938 	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1939 
1940 /*
1941  * XSAVE leaf 0xD enumeration
1942  */
1943 #define	CPUID_LEAFD_2_YMM_OFFSET	576
1944 #define	CPUID_LEAFD_2_YMM_SIZE		256
1945 
1946 /*
1947  * Common extended leaf names to cut down on typos.
1948  */
1949 #define	CPUID_LEAF_EXT_0		0x80000000
1950 #define	CPUID_LEAF_EXT_8		0x80000008
1951 #define	CPUID_LEAF_EXT_1d		0x8000001d
1952 #define	CPUID_LEAF_EXT_1e		0x8000001e
1953 
1954 /*
1955  * Functions we consune from cpuid_subr.c;  don't publish these in a header
1956  * file to try and keep people using the expected cpuid_* interfaces.
1957  */
1958 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1959 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1960 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1961 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1962 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
1963 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1964 
1965 /*
1966  * Apply up various platform-dependent restrictions where the
1967  * underlying platform restrictions mean the CPU can be marked
1968  * as less capable than its cpuid instruction would imply.
1969  */
1970 #if defined(__xpv)
1971 static void
1972 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1973 {
1974 	switch (eax) {
1975 	case 1: {
1976 		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1977 		    0 : CPUID_INTC_EDX_MCA;
1978 		cp->cp_edx &=
1979 		    ~(mcamask |
1980 		    CPUID_INTC_EDX_PSE |
1981 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1982 		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1983 		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1984 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1985 		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1986 		break;
1987 	}
1988 
1989 	case 0x80000001:
1990 		cp->cp_edx &=
1991 		    ~(CPUID_AMD_EDX_PSE |
1992 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1993 		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1994 		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1995 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1996 		    CPUID_AMD_EDX_TSCP);
1997 		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1998 		break;
1999 	default:
2000 		break;
2001 	}
2002 
2003 	switch (vendor) {
2004 	case X86_VENDOR_Intel:
2005 		switch (eax) {
2006 		case 4:
2007 			/*
2008 			 * Zero out the (ncores-per-chip - 1) field
2009 			 */
2010 			cp->cp_eax &= 0x03fffffff;
2011 			break;
2012 		default:
2013 			break;
2014 		}
2015 		break;
2016 	case X86_VENDOR_AMD:
2017 	case X86_VENDOR_HYGON:
2018 		switch (eax) {
2019 
2020 		case 0x80000001:
2021 			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2022 			break;
2023 
2024 		case CPUID_LEAF_EXT_8:
2025 			/*
2026 			 * Zero out the (ncores-per-chip - 1) field
2027 			 */
2028 			cp->cp_ecx &= 0xffffff00;
2029 			break;
2030 		default:
2031 			break;
2032 		}
2033 		break;
2034 	default:
2035 		break;
2036 	}
2037 }
2038 #else
2039 #define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
2040 #endif
2041 
2042 /*
2043  *  Some undocumented ways of patching the results of the cpuid
2044  *  instruction to permit running Solaris 10 on future cpus that
2045  *  we don't currently support.  Could be set to non-zero values
2046  *  via settings in eeprom.
2047  */
2048 
2049 uint32_t cpuid_feature_ecx_include;
2050 uint32_t cpuid_feature_ecx_exclude;
2051 uint32_t cpuid_feature_edx_include;
2052 uint32_t cpuid_feature_edx_exclude;
2053 
2054 /*
2055  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2056  */
2057 void
2058 cpuid_alloc_space(cpu_t *cpu)
2059 {
2060 	/*
2061 	 * By convention, cpu0 is the boot cpu, which is set up
2062 	 * before memory allocation is available.  All other cpus get
2063 	 * their cpuid_info struct allocated here.
2064 	 */
2065 	ASSERT(cpu->cpu_id != 0);
2066 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2067 	cpu->cpu_m.mcpu_cpi =
2068 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2069 }
2070 
2071 void
2072 cpuid_free_space(cpu_t *cpu)
2073 {
2074 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2075 	int i;
2076 
2077 	ASSERT(cpi != NULL);
2078 	ASSERT(cpi != &cpuid_info0);
2079 
2080 	/*
2081 	 * Free up any cache leaf related dynamic storage. The first entry was
2082 	 * cached from the standard cpuid storage, so we should not free it.
2083 	 */
2084 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2085 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2086 	if (cpi->cpi_cache_leaf_size > 0)
2087 		kmem_free(cpi->cpi_cache_leaves,
2088 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2089 
2090 	kmem_free(cpi, sizeof (*cpi));
2091 	cpu->cpu_m.mcpu_cpi = NULL;
2092 }
2093 
2094 #if !defined(__xpv)
2095 /*
2096  * Determine the type of the underlying platform. This is used to customize
2097  * initialization of various subsystems (e.g. TSC). determine_platform() must
2098  * only ever be called once to prevent two processors from seeing different
2099  * values of platform_type. Must be called before cpuid_pass_ident(), the
2100  * earliest consumer to execute; the identification pass will call
2101  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2102  */
2103 void
2104 determine_platform(void)
2105 {
2106 	struct cpuid_regs cp;
2107 	uint32_t base;
2108 	uint32_t regs[4];
2109 	char *hvstr = (char *)regs;
2110 
2111 	ASSERT(platform_type == -1);
2112 
2113 	platform_type = HW_NATIVE;
2114 
2115 	if (!enable_platform_detection)
2116 		return;
2117 
2118 	/*
2119 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2120 	 * vendor signature, and set platform type accordingly.
2121 	 *
2122 	 * References:
2123 	 * http://lkml.org/lkml/2008/10/1/246
2124 	 * http://kb.vmware.com/kb/1009458
2125 	 */
2126 	cp.cp_eax = 0x1;
2127 	(void) __cpuid_insn(&cp);
2128 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2129 		cp.cp_eax = 0x40000000;
2130 		(void) __cpuid_insn(&cp);
2131 		regs[0] = cp.cp_ebx;
2132 		regs[1] = cp.cp_ecx;
2133 		regs[2] = cp.cp_edx;
2134 		regs[3] = 0;
2135 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2136 			platform_type = HW_XEN_HVM;
2137 			return;
2138 		}
2139 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2140 			platform_type = HW_VMWARE;
2141 			return;
2142 		}
2143 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2144 			platform_type = HW_KVM;
2145 			return;
2146 		}
2147 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2148 			platform_type = HW_BHYVE;
2149 			return;
2150 		}
2151 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
2152 			platform_type = HW_MICROSOFT;
2153 	} else {
2154 		/*
2155 		 * Check older VMware hardware versions. VMware hypervisor is
2156 		 * detected by performing an IN operation to VMware hypervisor
2157 		 * port and checking that value returned in %ebx is VMware
2158 		 * hypervisor magic value.
2159 		 *
2160 		 * References: http://kb.vmware.com/kb/1009458
2161 		 */
2162 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2163 		if (regs[1] == VMWARE_HVMAGIC) {
2164 			platform_type = HW_VMWARE;
2165 			return;
2166 		}
2167 	}
2168 
2169 	/*
2170 	 * Check Xen hypervisor. In a fully virtualized domain,
2171 	 * Xen's pseudo-cpuid function returns a string representing the
2172 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2173 	 * supported cpuid function. We need at least a (base + 2) leaf value
2174 	 * to do what we want to do. Try different base values, since the
2175 	 * hypervisor might use a different one depending on whether Hyper-V
2176 	 * emulation is switched on by default or not.
2177 	 */
2178 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2179 		cp.cp_eax = base;
2180 		(void) __cpuid_insn(&cp);
2181 		regs[0] = cp.cp_ebx;
2182 		regs[1] = cp.cp_ecx;
2183 		regs[2] = cp.cp_edx;
2184 		regs[3] = 0;
2185 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2186 		    cp.cp_eax >= (base + 2)) {
2187 			platform_type &= ~HW_NATIVE;
2188 			platform_type |= HW_XEN_HVM;
2189 			return;
2190 		}
2191 	}
2192 }
2193 
2194 int
2195 get_hwenv(void)
2196 {
2197 	ASSERT(platform_type != -1);
2198 	return (platform_type);
2199 }
2200 
2201 int
2202 is_controldom(void)
2203 {
2204 	return (0);
2205 }
2206 
2207 #else
2208 
2209 int
2210 get_hwenv(void)
2211 {
2212 	return (HW_XEN_PV);
2213 }
2214 
2215 int
2216 is_controldom(void)
2217 {
2218 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2219 }
2220 
2221 #endif	/* __xpv */
2222 
2223 /*
2224  * Make sure that we have gathered all of the CPUID leaves that we might need to
2225  * determine topology. We assume that the standard leaf 1 has already been done
2226  * and that xmaxeax has already been calculated.
2227  */
2228 static void
2229 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2230 {
2231 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2232 
2233 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2234 		struct cpuid_regs *cp;
2235 
2236 		cp = &cpi->cpi_extd[8];
2237 		cp->cp_eax = CPUID_LEAF_EXT_8;
2238 		(void) __cpuid_insn(cp);
2239 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2240 	}
2241 
2242 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2243 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2244 		struct cpuid_regs *cp;
2245 
2246 		cp = &cpi->cpi_extd[0x1e];
2247 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2248 		(void) __cpuid_insn(cp);
2249 	}
2250 }
2251 
2252 /*
2253  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2254  * it to everything else. If not, and we're on an AMD system where 8000001e is
2255  * valid, then we use that. Othewrise, we fall back to the default value for the
2256  * APIC ID in leaf 1.
2257  */
2258 static uint32_t
2259 cpuid_gather_apicid(struct cpuid_info *cpi)
2260 {
2261 	/*
2262 	 * Leaf B changes based on the arguments to it. Beacuse we don't cache
2263 	 * it, we need to gather it again.
2264 	 */
2265 	if (cpi->cpi_maxeax >= 0xB) {
2266 		struct cpuid_regs regs;
2267 		struct cpuid_regs *cp;
2268 
2269 		cp = &regs;
2270 		cp->cp_eax = 0xB;
2271 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2272 		(void) __cpuid_insn(cp);
2273 
2274 		if (cp->cp_ebx != 0) {
2275 			return (cp->cp_edx);
2276 		}
2277 	}
2278 
2279 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2280 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2281 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2282 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2283 		return (cpi->cpi_extd[0x1e].cp_eax);
2284 	}
2285 
2286 	return (CPI_APIC_ID(cpi));
2287 }
2288 
2289 /*
2290  * For AMD processors, attempt to calculate the number of chips and cores that
2291  * exist. The way that we do this varies based on the generation, because the
2292  * generations themselves have changed dramatically.
2293  *
2294  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2295  * However, with the advent of family 17h (Zen) it actually tells us the number
2296  * of threads, so we need to look at leaf 0x8000001e if available to determine
2297  * its value. Otherwise, for all prior families, the number of enabled cores is
2298  * the same as threads.
2299  *
2300  * If we do not have leaf 0x80000008, then we assume that this processor does
2301  * not have anything. AMD's older CPUID specification says there's no reason to
2302  * fall back to leaf 1.
2303  *
2304  * In some virtualization cases we will not have leaf 8000001e or it will be
2305  * zero. When that happens we assume the number of threads is one.
2306  */
2307 static void
2308 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2309 {
2310 	uint_t nthreads, nthread_per_core;
2311 
2312 	nthreads = nthread_per_core = 1;
2313 
2314 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2315 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2316 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2317 		nthreads = CPI_CPU_COUNT(cpi);
2318 	}
2319 
2320 	/*
2321 	 * For us to have threads, and know about it, we have to be at least at
2322 	 * family 17h and have the cpuid bit that says we have extended
2323 	 * topology.
2324 	 */
2325 	if (cpi->cpi_family >= 0x17 &&
2326 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2327 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2328 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2329 	}
2330 
2331 	*ncpus = nthreads;
2332 	*ncores = nthreads / nthread_per_core;
2333 }
2334 
2335 /*
2336  * Seed the initial values for the cores and threads for an Intel based
2337  * processor. These values will be overwritten if we detect that the processor
2338  * supports CPUID leaf 0xb.
2339  */
2340 static void
2341 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2342 {
2343 	/*
2344 	 * Only seed the number of physical cores from the first level leaf 4
2345 	 * information. The number of threads there indicate how many share the
2346 	 * L1 cache, which may or may not have anything to do with the number of
2347 	 * logical CPUs per core.
2348 	 */
2349 	if (cpi->cpi_maxeax >= 4) {
2350 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2351 	} else {
2352 		*ncores = 1;
2353 	}
2354 
2355 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2356 		*ncpus = CPI_CPU_COUNT(cpi);
2357 	} else {
2358 		*ncpus = *ncores;
2359 	}
2360 }
2361 
2362 static boolean_t
2363 cpuid_leafB_getids(cpu_t *cpu)
2364 {
2365 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2366 	struct cpuid_regs regs;
2367 	struct cpuid_regs *cp;
2368 
2369 	if (cpi->cpi_maxeax < 0xB)
2370 		return (B_FALSE);
2371 
2372 	cp = &regs;
2373 	cp->cp_eax = 0xB;
2374 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2375 
2376 	(void) __cpuid_insn(cp);
2377 
2378 	/*
2379 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2380 	 * indicates that the extended topology enumeration leaf is
2381 	 * available.
2382 	 */
2383 	if (cp->cp_ebx != 0) {
2384 		uint32_t x2apic_id = 0;
2385 		uint_t coreid_shift = 0;
2386 		uint_t ncpu_per_core = 1;
2387 		uint_t chipid_shift = 0;
2388 		uint_t ncpu_per_chip = 1;
2389 		uint_t i;
2390 		uint_t level;
2391 
2392 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2393 			cp->cp_eax = 0xB;
2394 			cp->cp_ecx = i;
2395 
2396 			(void) __cpuid_insn(cp);
2397 			level = CPI_CPU_LEVEL_TYPE(cp);
2398 
2399 			if (level == 1) {
2400 				x2apic_id = cp->cp_edx;
2401 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2402 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2403 			} else if (level == 2) {
2404 				x2apic_id = cp->cp_edx;
2405 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2406 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2407 			}
2408 		}
2409 
2410 		/*
2411 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2412 		 */
2413 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2414 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2415 		    ncpu_per_core;
2416 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2417 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2418 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2419 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2420 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2421 		cpi->cpi_compunitid = cpi->cpi_coreid;
2422 
2423 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2424 			cpi->cpi_nthread_bits = coreid_shift;
2425 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2426 		}
2427 
2428 		return (B_TRUE);
2429 	} else {
2430 		return (B_FALSE);
2431 	}
2432 }
2433 
2434 static void
2435 cpuid_intel_getids(cpu_t *cpu, void *feature)
2436 {
2437 	uint_t i;
2438 	uint_t chipid_shift = 0;
2439 	uint_t coreid_shift = 0;
2440 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2441 
2442 	/*
2443 	 * There are no compute units or processor nodes currently on Intel.
2444 	 * Always set these to one.
2445 	 */
2446 	cpi->cpi_procnodes_per_pkg = 1;
2447 	cpi->cpi_cores_per_compunit = 1;
2448 
2449 	/*
2450 	 * If cpuid Leaf B is present, use that to try and get this information.
2451 	 * It will be the most accurate for Intel CPUs.
2452 	 */
2453 	if (cpuid_leafB_getids(cpu))
2454 		return;
2455 
2456 	/*
2457 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2458 	 * and ncore_per_chip. These represent the largest power of two values
2459 	 * that we need to cover all of the IDs in the system. Therefore, we use
2460 	 * those values to seed the number of bits needed to cover information
2461 	 * in the case when leaf B is not available. These values will probably
2462 	 * be larger than required, but that's OK.
2463 	 */
2464 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2465 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2466 
2467 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2468 		chipid_shift++;
2469 
2470 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2471 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2472 
2473 	if (is_x86_feature(feature, X86FSET_CMP)) {
2474 		/*
2475 		 * Multi-core (and possibly multi-threaded)
2476 		 * processors.
2477 		 */
2478 		uint_t ncpu_per_core = 0;
2479 
2480 		if (cpi->cpi_ncore_per_chip == 1)
2481 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2482 		else if (cpi->cpi_ncore_per_chip > 1)
2483 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2484 			    cpi->cpi_ncore_per_chip;
2485 		/*
2486 		 * 8bit APIC IDs on dual core Pentiums
2487 		 * look like this:
2488 		 *
2489 		 * +-----------------------+------+------+
2490 		 * | Physical Package ID   |  MC  |  HT  |
2491 		 * +-----------------------+------+------+
2492 		 * <------- chipid -------->
2493 		 * <------- coreid --------------->
2494 		 *			   <--- clogid -->
2495 		 *			   <------>
2496 		 *			   pkgcoreid
2497 		 *
2498 		 * Where the number of bits necessary to
2499 		 * represent MC and HT fields together equals
2500 		 * to the minimum number of bits necessary to
2501 		 * store the value of cpi->cpi_ncpu_per_chip.
2502 		 * Of those bits, the MC part uses the number
2503 		 * of bits necessary to store the value of
2504 		 * cpi->cpi_ncore_per_chip.
2505 		 */
2506 		for (i = 1; i < ncpu_per_core; i <<= 1)
2507 			coreid_shift++;
2508 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2509 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2510 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2511 		/*
2512 		 * Single-core multi-threaded processors.
2513 		 */
2514 		cpi->cpi_coreid = cpi->cpi_chipid;
2515 		cpi->cpi_pkgcoreid = 0;
2516 	} else {
2517 		/*
2518 		 * Single-core single-thread processors.
2519 		 */
2520 		cpi->cpi_coreid = cpu->cpu_id;
2521 		cpi->cpi_pkgcoreid = 0;
2522 	}
2523 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2524 	cpi->cpi_compunitid = cpi->cpi_coreid;
2525 }
2526 
2527 /*
2528  * Historically, AMD has had CMP chips with only a single thread per core.
2529  * However, starting in family 17h (Zen), this has changed and they now have
2530  * multiple threads. Our internal core id needs to be a unique value.
2531  *
2532  * To determine the core id of an AMD system, if we're from a family before 17h,
2533  * then we just use the cpu id, as that gives us a good value that will be
2534  * unique for each core. If instead, we're on family 17h or later, then we need
2535  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2536  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2537  * We can't use the normal core id in that leaf as it's only unique within the
2538  * socket, which is perfect for cpi_pkgcoreid, but not us.
2539  */
2540 static id_t
2541 cpuid_amd_get_coreid(cpu_t *cpu)
2542 {
2543 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2544 
2545 	if (cpi->cpi_family >= 0x17 &&
2546 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2547 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2548 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2549 		if (nthreads > 1) {
2550 			VERIFY3U(nthreads, ==, 2);
2551 			return (cpi->cpi_apicid >> 1);
2552 		}
2553 	}
2554 
2555 	return (cpu->cpu_id);
2556 }
2557 
2558 /*
2559  * IDs on AMD is a more challenging task. This is notable because of the
2560  * following two facts:
2561  *
2562  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2563  *     also no way to get an actual unique core id from the system. As such, we
2564  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2565  *     however, guarantee that sibling cores of a chip will have sequential
2566  *     coreids starting at a multiple of the number of cores per chip - that is
2567  *     usually the case, but if the APIC IDs have been set up in a different
2568  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2569  *
2570  *  2. In families 0x15 and 16x (Bulldozer and co.) the cores came in groups
2571  *     called compute units. These compute units share the L1I cache, L2 cache,
2572  *     and the FPU. To deal with this, a new topology leaf was added in
2573  *     0x8000001e. However, parts of this leaf have different meanings
2574  *     once we get to family 0x17.
2575  */
2576 
2577 static void
2578 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2579 {
2580 	int i, first_half, coreidsz;
2581 	uint32_t nb_caps_reg;
2582 	uint_t node2_1;
2583 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2584 	struct cpuid_regs *cp;
2585 
2586 	/*
2587 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2588 	 * hasn't been stripped by virtualization). We always set the compute
2589 	 * unit id to the same value. Also, initialize the default number of
2590 	 * cores per compute unit and nodes per package. This will be
2591 	 * overwritten when we know information about a particular family.
2592 	 */
2593 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2594 	cpi->cpi_compunitid = cpi->cpi_coreid;
2595 	cpi->cpi_cores_per_compunit = 1;
2596 	cpi->cpi_procnodes_per_pkg = 1;
2597 
2598 	/*
2599 	 * To construct the logical ID, we need to determine how many APIC IDs
2600 	 * are dedicated to the cores and threads. This is provided for us in
2601 	 * 0x80000008. However, if it's not present (say due to virtualization),
2602 	 * then we assume it's one. This should be present on all 64-bit AMD
2603 	 * processors.  It was added in family 0xf (Hammer).
2604 	 */
2605 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2606 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2607 
2608 		/*
2609 		 * In AMD parlance chip is really a node while illumos
2610 		 * uses chip as equivalent to socket/package.
2611 		 */
2612 		if (coreidsz == 0) {
2613 			/* Use legacy method */
2614 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2615 				coreidsz++;
2616 			if (coreidsz == 0)
2617 				coreidsz = 1;
2618 		}
2619 	} else {
2620 		/* Assume single-core part */
2621 		coreidsz = 1;
2622 	}
2623 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2624 
2625 	/*
2626 	 * The package core ID varies depending on the family. While it may be
2627 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2628 	 * this value is the core id in the given node. For non-virtualized
2629 	 * family 17h, we need to take the logical core id and shift off the
2630 	 * threads like we do when getting the core id.  Otherwise, we can use
2631 	 * the clogid as is. When family 17h is virtualized, the clogid should
2632 	 * be sufficient as if we don't have valid data in the leaf, then we
2633 	 * won't think we have SMT, in which case the cpi_clogid should be
2634 	 * sufficient.
2635 	 */
2636 	if (cpi->cpi_family >= 0x17 &&
2637 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2638 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2639 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2640 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2641 		if (nthreads > 1) {
2642 			VERIFY3U(nthreads, ==, 2);
2643 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2644 		} else {
2645 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2646 		}
2647 	} else {
2648 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2649 	}
2650 
2651 	/*
2652 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2653 	 * (bulldozer) or newer, then we can derive all of this from leaf
2654 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2655 	 */
2656 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2657 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2658 		cp = &cpi->cpi_extd[0x1e];
2659 
2660 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2661 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2662 
2663 		/*
2664 		 * For Bulldozer-era CPUs, recalculate the compute unit
2665 		 * information.
2666 		 */
2667 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2668 			cpi->cpi_cores_per_compunit =
2669 			    BITX(cp->cp_ebx, 15, 8) + 1;
2670 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2671 			    (cpi->cpi_ncore_per_chip /
2672 			    cpi->cpi_cores_per_compunit) *
2673 			    (cpi->cpi_procnodeid /
2674 			    cpi->cpi_procnodes_per_pkg);
2675 		}
2676 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2677 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2678 	} else if (cpi->cpi_family == 0x10) {
2679 		/*
2680 		 * See if we are a multi-node processor.
2681 		 * All processors in the system have the same number of nodes
2682 		 */
2683 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2684 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2685 			/* Single-node */
2686 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2687 			    coreidsz);
2688 		} else {
2689 
2690 			/*
2691 			 * Multi-node revision D (2 nodes per package
2692 			 * are supported)
2693 			 */
2694 			cpi->cpi_procnodes_per_pkg = 2;
2695 
2696 			first_half = (cpi->cpi_pkgcoreid <=
2697 			    (cpi->cpi_ncore_per_chip/2 - 1));
2698 
2699 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2700 				/* We are BSP */
2701 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2702 			} else {
2703 
2704 				/* We are AP */
2705 				/* NodeId[2:1] bits to use for reading F3xe8 */
2706 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2707 
2708 				nb_caps_reg =
2709 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2710 
2711 				/*
2712 				 * Check IntNodeNum bit (31:30, but bit 31 is
2713 				 * always 0 on dual-node processors)
2714 				 */
2715 				if (BITX(nb_caps_reg, 30, 30) == 0)
2716 					cpi->cpi_procnodeid = node2_1 +
2717 					    !first_half;
2718 				else
2719 					cpi->cpi_procnodeid = node2_1 +
2720 					    first_half;
2721 			}
2722 		}
2723 	} else {
2724 		cpi->cpi_procnodeid = 0;
2725 	}
2726 
2727 	cpi->cpi_chipid =
2728 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2729 
2730 	cpi->cpi_ncore_bits = coreidsz;
2731 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2732 	    cpi->cpi_ncore_per_chip);
2733 }
2734 
2735 static void
2736 spec_uarch_flush_noop(void)
2737 {
2738 }
2739 
2740 /*
2741  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2742  * MDS-related micro-architectural state that would normally happen by calling
2743  * x86_md_clear().
2744  */
2745 static void
2746 spec_uarch_flush_msr(void)
2747 {
2748 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2749 }
2750 
2751 /*
2752  * This function points to a function that will flush certain
2753  * micro-architectural state on the processor. This flush is used to mitigate
2754  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2755  * function can point to one of three functions:
2756  *
2757  * - A noop which is done because we either are vulnerable, but do not have
2758  *   microcode available to help deal with a fix, or because we aren't
2759  *   vulnerable.
2760  *
2761  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2762  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2763  *   however, it only flushes the MDS related micro-architectural state on the
2764  *   current hyperthread, it does not do anything for the twin.
2765  *
2766  * - x86_md_clear which will flush the MDS related state. This is done when we
2767  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2768  *   (RDCL_NO is set).
2769  */
2770 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2771 
2772 static void
2773 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2774 {
2775 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2776 
2777 	/*
2778 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2779 	 * has been fixed in hardware, it doesn't cover everything related to
2780 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2781 	 * need to mitigate this.
2782 	 */
2783 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2784 	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2785 		return;
2786 	}
2787 
2788 	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2789 		const uint8_t nop = NOP_INSTR;
2790 		uint8_t *md = (uint8_t *)x86_md_clear;
2791 
2792 		*md = nop;
2793 	}
2794 
2795 	membar_producer();
2796 }
2797 
2798 static void
2799 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2800 {
2801 	boolean_t need_l1d, need_mds;
2802 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2803 
2804 	/*
2805 	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2806 	 * hardware, then there's nothing left for us to do for enabling the
2807 	 * flush. We can also go ahead and say that SMT exclusion is
2808 	 * unnecessary.
2809 	 */
2810 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2811 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2812 	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2813 		extern int smt_exclusion;
2814 		smt_exclusion = 0;
2815 		spec_uarch_flush = spec_uarch_flush_noop;
2816 		membar_producer();
2817 		return;
2818 	}
2819 
2820 	/*
2821 	 * The locations where we need to perform an L1D flush are required both
2822 	 * for mitigating L1TF and MDS. When verw support is present in
2823 	 * microcode, then the L1D flush will take care of doing that as well.
2824 	 * However, if we have a system where RDCL_NO is present, but we don't
2825 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2826 	 * L1D flush.
2827 	 */
2828 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2829 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2830 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2831 		need_l1d = B_TRUE;
2832 	} else {
2833 		need_l1d = B_FALSE;
2834 	}
2835 
2836 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2837 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2838 		need_mds = B_TRUE;
2839 	} else {
2840 		need_mds = B_FALSE;
2841 	}
2842 
2843 	if (need_l1d) {
2844 		spec_uarch_flush = spec_uarch_flush_msr;
2845 	} else if (need_mds) {
2846 		spec_uarch_flush = x86_md_clear;
2847 	} else {
2848 		/*
2849 		 * We have no hardware mitigations available to us.
2850 		 */
2851 		spec_uarch_flush = spec_uarch_flush_noop;
2852 	}
2853 	membar_producer();
2854 }
2855 
2856 /*
2857  * We default to enabling RSB mitigations.
2858  *
2859  * NOTE: We used to skip RSB mitigations with eIBRS, but developments around
2860  * post-barrier RSB guessing suggests we should enable RSB mitigations always
2861  * unless specifically instructed not to.
2862  */
2863 static void
2864 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2865 {
2866 	const uint8_t ret = RET_INSTR;
2867 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2868 
2869 	switch (mit) {
2870 	case X86_SPECTREV2_DISABLED:
2871 		*stuff = ret;
2872 		break;
2873 	default:
2874 		break;
2875 	}
2876 }
2877 
2878 static void
2879 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2880 {
2881 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2882 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2883 	    "_r14", "_r15" };
2884 	const uint_t nthunks = ARRAY_SIZE(thunks);
2885 	const char *type;
2886 	uint_t i;
2887 
2888 	if (mit == x86_spectrev2_mitigation)
2889 		return;
2890 
2891 	switch (mit) {
2892 	case X86_SPECTREV2_RETPOLINE:
2893 		type = "gen";
2894 		break;
2895 	case X86_SPECTREV2_ENHANCED_IBRS:
2896 	case X86_SPECTREV2_DISABLED:
2897 		type = "jmp";
2898 		break;
2899 	default:
2900 		panic("asked to updated retpoline state with unknown state!");
2901 	}
2902 
2903 	for (i = 0; i < nthunks; i++) {
2904 		uintptr_t source, dest;
2905 		int ssize, dsize;
2906 		char sourcebuf[64], destbuf[64];
2907 
2908 		(void) snprintf(destbuf, sizeof (destbuf),
2909 		    "__x86_indirect_thunk%s", thunks[i]);
2910 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
2911 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
2912 
2913 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2914 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
2915 		VERIFY3U(source, !=, 0);
2916 		VERIFY3U(dest, !=, 0);
2917 		VERIFY3S(dsize, >=, ssize);
2918 		bcopy((void *)source, (void *)dest, ssize);
2919 	}
2920 }
2921 
2922 static void
2923 cpuid_enable_enhanced_ibrs(void)
2924 {
2925 	uint64_t val;
2926 
2927 	val = rdmsr(MSR_IA32_SPEC_CTRL);
2928 	val |= IA32_SPEC_CTRL_IBRS;
2929 	wrmsr(MSR_IA32_SPEC_CTRL, val);
2930 }
2931 
2932 /*
2933  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2934  * we can disable TSX, we do so.
2935  *
2936  * This determination is done only on the boot CPU, potentially after loading
2937  * updated microcode.
2938  */
2939 static void
2940 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2941 {
2942 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2943 
2944 	VERIFY(cpu->cpu_id == 0);
2945 
2946 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2947 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2948 		return;
2949 	}
2950 
2951 	if (x86_disable_taa) {
2952 		x86_taa_mitigation = X86_TAA_DISABLED;
2953 		return;
2954 	}
2955 
2956 	/*
2957 	 * If we do not have the ability to disable TSX, then our only
2958 	 * mitigation options are in hardware (TAA_NO), or by using our existing
2959 	 * MDS mitigation as described above.  The latter relies upon us having
2960 	 * configured MDS mitigations correctly! This includes disabling SMT if
2961 	 * we want to cross-CPU-thread protection.
2962 	 */
2963 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2964 		/*
2965 		 * It's not clear whether any parts will enumerate TAA_NO
2966 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
2967 		 */
2968 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2969 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2970 			return;
2971 		}
2972 
2973 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2974 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2975 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
2976 		} else {
2977 			x86_taa_mitigation = X86_TAA_NOTHING;
2978 		}
2979 		return;
2980 	}
2981 
2982 	/*
2983 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
2984 	 * enough in boot.
2985 	 *
2986 	 * Otherwise, we'll fall back to causing transactions to abort as our
2987 	 * mitigation. TSX-using code will always take the fallback path.
2988 	 */
2989 	if (cpi->cpi_pass < 4) {
2990 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2991 	} else {
2992 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2993 	}
2994 }
2995 
2996 /*
2997  * As mentioned, we should only touch the MSR when we've got a suitable
2998  * microcode loaded on this CPU.
2999  */
3000 static void
3001 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3002 {
3003 	uint64_t val;
3004 
3005 	switch (taa) {
3006 	case X86_TAA_TSX_DISABLE:
3007 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3008 			return;
3009 		val = rdmsr(MSR_IA32_TSX_CTRL);
3010 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3011 		wrmsr(MSR_IA32_TSX_CTRL, val);
3012 		break;
3013 	case X86_TAA_TSX_FORCE_ABORT:
3014 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3015 			return;
3016 		val = rdmsr(MSR_IA32_TSX_CTRL);
3017 		val |= IA32_TSX_CTRL_RTM_DISABLE;
3018 		wrmsr(MSR_IA32_TSX_CTRL, val);
3019 		break;
3020 	case X86_TAA_HW_MITIGATED:
3021 	case X86_TAA_MD_CLEAR:
3022 	case X86_TAA_DISABLED:
3023 	case X86_TAA_NOTHING:
3024 		break;
3025 	}
3026 }
3027 
3028 static void
3029 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3030 {
3031 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3032 	x86_spectrev2_mitigation_t v2mit;
3033 
3034 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3035 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3036 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3037 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3038 			add_x86_feature(featureset, X86FSET_IBPB);
3039 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3040 			add_x86_feature(featureset, X86FSET_IBRS);
3041 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3042 			add_x86_feature(featureset, X86FSET_STIBP);
3043 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3044 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3045 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3046 			add_x86_feature(featureset, X86FSET_SSBD);
3047 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3048 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3049 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3050 			add_x86_feature(featureset, X86FSET_SSB_NO);
3051 		/*
3052 		 * Don't enable enhanced IBRS unless we're told that we should
3053 		 * prefer it and it has the same semantics as Intel. This is
3054 		 * split into two bits rather than a single one.
3055 		 */
3056 		if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3057 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
3058 			add_x86_feature(featureset, X86FSET_IBRS_ALL);
3059 		}
3060 
3061 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3062 	    cpi->cpi_maxeax >= 7) {
3063 		struct cpuid_regs *ecp;
3064 		ecp = &cpi->cpi_std[7];
3065 
3066 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3067 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3068 		}
3069 
3070 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3071 			add_x86_feature(featureset, X86FSET_IBRS);
3072 			add_x86_feature(featureset, X86FSET_IBPB);
3073 		}
3074 
3075 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3076 			add_x86_feature(featureset, X86FSET_STIBP);
3077 		}
3078 
3079 		/*
3080 		 * Don't read the arch caps MSR on xpv where we lack the
3081 		 * on_trap().
3082 		 */
3083 #ifndef __xpv
3084 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3085 			on_trap_data_t otd;
3086 
3087 			/*
3088 			 * Be paranoid and assume we'll get a #GP.
3089 			 */
3090 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3091 				uint64_t reg;
3092 
3093 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3094 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3095 					add_x86_feature(featureset,
3096 					    X86FSET_RDCL_NO);
3097 				}
3098 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3099 					add_x86_feature(featureset,
3100 					    X86FSET_IBRS_ALL);
3101 				}
3102 				if (reg & IA32_ARCH_CAP_RSBA) {
3103 					add_x86_feature(featureset,
3104 					    X86FSET_RSBA);
3105 				}
3106 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3107 					add_x86_feature(featureset,
3108 					    X86FSET_L1D_VM_NO);
3109 				}
3110 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3111 					add_x86_feature(featureset,
3112 					    X86FSET_SSB_NO);
3113 				}
3114 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3115 					add_x86_feature(featureset,
3116 					    X86FSET_MDS_NO);
3117 				}
3118 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3119 					add_x86_feature(featureset,
3120 					    X86FSET_TSX_CTRL);
3121 				}
3122 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3123 					add_x86_feature(featureset,
3124 					    X86FSET_TAA_NO);
3125 				}
3126 			}
3127 			no_trap();
3128 		}
3129 #endif	/* !__xpv */
3130 
3131 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3132 			add_x86_feature(featureset, X86FSET_SSBD);
3133 
3134 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3135 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3136 	}
3137 
3138 	/*
3139 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3140 	 * will have already run this function and determined what we need to
3141 	 * do. This gives us a hook for per-HW thread mitigations such as
3142 	 * enhanced IBRS, or disabling TSX.
3143 	 */
3144 	if (cpu->cpu_id != 0) {
3145 		if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
3146 			cpuid_enable_enhanced_ibrs();
3147 		}
3148 
3149 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3150 		return;
3151 	}
3152 
3153 	/*
3154 	 * Go through and initialize various security mechanisms that we should
3155 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3156 	 * TAA.
3157 	 */
3158 
3159 	/*
3160 	 * By default we've come in with retpolines enabled. Check whether we
3161 	 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
3162 	 * by default, but disabled if we are using enhanced IBRS. Note, we do
3163 	 * not allow the use of AMD optimized retpolines as it was disclosed by
3164 	 * AMD in March 2022 that they were still vulnerable. Prior to that
3165 	 * point, we used them.
3166 	 */
3167 	if (x86_disable_spectrev2 != 0) {
3168 		v2mit = X86_SPECTREV2_DISABLED;
3169 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3170 		cpuid_enable_enhanced_ibrs();
3171 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3172 	} else {
3173 		v2mit = X86_SPECTREV2_RETPOLINE;
3174 	}
3175 
3176 	cpuid_patch_retpolines(v2mit);
3177 	cpuid_patch_rsb(v2mit);
3178 	x86_spectrev2_mitigation = v2mit;
3179 	membar_producer();
3180 
3181 	/*
3182 	 * We need to determine what changes are required for mitigating L1TF
3183 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3184 	 * is required.
3185 	 *
3186 	 * If any of these are present, then we need to flush u-arch state at
3187 	 * various points. For MDS, we need to do so whenever we change to a
3188 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3189 	 * flush the L1D cache at VM entry. When we have microcode that handles
3190 	 * MDS, the L1D flush also clears the other u-arch state that the
3191 	 * md_clear does.
3192 	 */
3193 
3194 	/*
3195 	 * Update whether or not we need to be taking explicit action against
3196 	 * MDS.
3197 	 */
3198 	cpuid_update_md_clear(cpu, featureset);
3199 
3200 	/*
3201 	 * Determine whether SMT exclusion is required and whether or not we
3202 	 * need to perform an l1d flush.
3203 	 */
3204 	cpuid_update_l1d_flush(cpu, featureset);
3205 
3206 	/*
3207 	 * Determine what our mitigation strategy should be for TAA and then
3208 	 * also apply TAA mitigations.
3209 	 */
3210 	cpuid_update_tsx(cpu, featureset);
3211 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3212 }
3213 
3214 /*
3215  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3216  */
3217 void
3218 setup_xfem(void)
3219 {
3220 	uint64_t flags = XFEATURE_LEGACY_FP;
3221 
3222 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3223 
3224 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3225 		flags |= XFEATURE_SSE;
3226 
3227 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3228 		flags |= XFEATURE_AVX;
3229 
3230 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3231 		flags |= XFEATURE_AVX512;
3232 
3233 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3234 
3235 	xsave_bv_all = flags;
3236 }
3237 
3238 static void
3239 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3240 {
3241 	struct cpuid_info *cpi;
3242 
3243 	cpi = cpu->cpu_m.mcpu_cpi;
3244 
3245 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3246 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3247 		cpuid_gather_amd_topology_leaves(cpu);
3248 	}
3249 
3250 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3251 
3252 	/*
3253 	 * Before we can calculate the IDs that we should assign to this
3254 	 * processor, we need to understand how many cores and threads it has.
3255 	 */
3256 	switch (cpi->cpi_vendor) {
3257 	case X86_VENDOR_Intel:
3258 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3259 		    &cpi->cpi_ncore_per_chip);
3260 		break;
3261 	case X86_VENDOR_AMD:
3262 	case X86_VENDOR_HYGON:
3263 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3264 		    &cpi->cpi_ncore_per_chip);
3265 		break;
3266 	default:
3267 		/*
3268 		 * If we have some other x86 compatible chip, it's not clear how
3269 		 * they would behave. The most common case is virtualization
3270 		 * today, though there are also 64-bit VIA chips. Assume that
3271 		 * all we can get is the basic Leaf 1 HTT information.
3272 		 */
3273 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3274 			cpi->cpi_ncore_per_chip = 1;
3275 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3276 		}
3277 		break;
3278 	}
3279 
3280 	/*
3281 	 * Based on the calculated number of threads and cores, potentially
3282 	 * assign the HTT and CMT features.
3283 	 */
3284 	if (cpi->cpi_ncore_per_chip > 1) {
3285 		add_x86_feature(featureset, X86FSET_CMP);
3286 	}
3287 
3288 	if (cpi->cpi_ncpu_per_chip > 1 &&
3289 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3290 		add_x86_feature(featureset, X86FSET_HTT);
3291 	}
3292 
3293 	/*
3294 	 * Now that has been set up, we need to go through and calculate all of
3295 	 * the rest of the parameters that exist. If we think the CPU doesn't
3296 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3297 	 * up information in some way. The most likely case for this is
3298 	 * virtualization where we have a lot of partial topology information.
3299 	 */
3300 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3301 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3302 		/*
3303 		 * This is a single core, single-threaded processor.
3304 		 */
3305 		cpi->cpi_procnodes_per_pkg = 1;
3306 		cpi->cpi_cores_per_compunit = 1;
3307 		cpi->cpi_compunitid = 0;
3308 		cpi->cpi_chipid = -1;
3309 		cpi->cpi_clogid = 0;
3310 		cpi->cpi_coreid = cpu->cpu_id;
3311 		cpi->cpi_pkgcoreid = 0;
3312 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3313 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3314 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3315 		} else {
3316 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3317 		}
3318 	} else {
3319 		switch (cpi->cpi_vendor) {
3320 		case X86_VENDOR_Intel:
3321 			cpuid_intel_getids(cpu, featureset);
3322 			break;
3323 		case X86_VENDOR_AMD:
3324 		case X86_VENDOR_HYGON:
3325 			cpuid_amd_getids(cpu, featureset);
3326 			break;
3327 		default:
3328 			/*
3329 			 * In this case, it's hard to say what we should do.
3330 			 * We're going to model them to the OS as single core
3331 			 * threads. We don't have a good identifier for them, so
3332 			 * we're just going to use the cpu id all on a single
3333 			 * chip.
3334 			 *
3335 			 * This case has historically been different from the
3336 			 * case above where we don't have HTT or CMP. While they
3337 			 * could be combined, we've opted to keep it separate to
3338 			 * minimize the risk of topology changes in weird cases.
3339 			 */
3340 			cpi->cpi_procnodes_per_pkg = 1;
3341 			cpi->cpi_cores_per_compunit = 1;
3342 			cpi->cpi_chipid = 0;
3343 			cpi->cpi_coreid = cpu->cpu_id;
3344 			cpi->cpi_clogid = cpu->cpu_id;
3345 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3346 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3347 			cpi->cpi_compunitid = cpi->cpi_coreid;
3348 			break;
3349 		}
3350 	}
3351 }
3352 
3353 /*
3354  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3355  * always gather leaf 6 if it's supported; however, we only look for features on
3356  * Intel systems as AMD does not currently define any of the features we look
3357  * for below.
3358  */
3359 static void
3360 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3361 {
3362 	struct cpuid_regs *cp;
3363 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3364 
3365 	if (cpi->cpi_maxeax < 6) {
3366 		return;
3367 	}
3368 
3369 	cp = &cpi->cpi_std[6];
3370 	cp->cp_eax = 6;
3371 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3372 	(void) __cpuid_insn(cp);
3373 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3374 
3375 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3376 		return;
3377 	}
3378 
3379 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3380 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3381 	}
3382 
3383 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3384 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3385 	}
3386 }
3387 
3388 /*
3389  * PPIN is the protected processor inventory number. On AMD this is an actual
3390  * feature bit. However, on Intel systems we need to read the platform
3391  * information MSR if we're on a specific model.
3392  */
3393 #if !defined(__xpv)
3394 static void
3395 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3396 {
3397 	on_trap_data_t otd;
3398 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3399 
3400 	switch (cpi->cpi_vendor) {
3401 	case X86_VENDOR_AMD:
3402 		/*
3403 		 * This leaf will have already been gathered in the topology
3404 		 * functions.
3405 		 */
3406 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3407 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3408 				add_x86_feature(featureset, X86FSET_PPIN);
3409 			}
3410 		}
3411 		break;
3412 	case X86_VENDOR_Intel:
3413 		if (cpi->cpi_family != 6)
3414 			break;
3415 		switch (cpi->cpi_model) {
3416 		case INTC_MODEL_IVYBRIDGE_XEON:
3417 		case INTC_MODEL_HASWELL_XEON:
3418 		case INTC_MODEL_BROADWELL_XEON:
3419 		case INTC_MODEL_BROADWELL_XEON_D:
3420 		case INTC_MODEL_SKYLAKE_XEON:
3421 		case INTC_MODEL_ICELAKE_XEON:
3422 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3423 				uint64_t value;
3424 
3425 				value = rdmsr(MSR_PLATFORM_INFO);
3426 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3427 					add_x86_feature(featureset,
3428 					    X86FSET_PPIN);
3429 				}
3430 			}
3431 			no_trap();
3432 			break;
3433 		default:
3434 			break;
3435 		}
3436 		break;
3437 	default:
3438 		break;
3439 	}
3440 }
3441 #endif	/* ! __xpv */
3442 
3443 static void
3444 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3445 {
3446 	uchar_t *featureset = (uchar_t *)arg;
3447 
3448 	/*
3449 	 * We don't run on any processor that doesn't have cpuid, and could not
3450 	 * possibly have arrived here.
3451 	 */
3452 	add_x86_feature(featureset, X86FSET_CPUID);
3453 }
3454 
3455 static void
3456 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3457 {
3458 	struct cpuid_info *cpi;
3459 	struct cpuid_regs *cp;
3460 
3461 	/*
3462 	 * We require that virtual/native detection be complete and that PCI
3463 	 * config space access has been set up; at present there is no reliable
3464 	 * way to determine the latter.
3465 	 */
3466 #if !defined(__xpv)
3467 	ASSERT3S(platform_type, !=, -1);
3468 #endif	/* !__xpv */
3469 
3470 	cpi = cpu->cpu_m.mcpu_cpi;
3471 	ASSERT(cpi != NULL);
3472 
3473 	cp = &cpi->cpi_std[0];
3474 	cp->cp_eax = 0;
3475 	cpi->cpi_maxeax = __cpuid_insn(cp);
3476 	{
3477 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3478 		*iptr++ = cp->cp_ebx;
3479 		*iptr++ = cp->cp_edx;
3480 		*iptr++ = cp->cp_ecx;
3481 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3482 	}
3483 
3484 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3485 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3486 
3487 	/*
3488 	 * Limit the range in case of weird hardware
3489 	 */
3490 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3491 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3492 	if (cpi->cpi_maxeax < 1)
3493 		return;
3494 
3495 	cp = &cpi->cpi_std[1];
3496 	cp->cp_eax = 1;
3497 	(void) __cpuid_insn(cp);
3498 
3499 	/*
3500 	 * Extract identifying constants for easy access.
3501 	 */
3502 	cpi->cpi_model = CPI_MODEL(cpi);
3503 	cpi->cpi_family = CPI_FAMILY(cpi);
3504 
3505 	if (cpi->cpi_family == 0xf)
3506 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3507 
3508 	/*
3509 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3510 	 * Intel, and presumably everyone else, uses model == 0xf, as
3511 	 * one would expect (max value means possible overflow).  Sigh.
3512 	 */
3513 
3514 	switch (cpi->cpi_vendor) {
3515 	case X86_VENDOR_Intel:
3516 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3517 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3518 		break;
3519 	case X86_VENDOR_AMD:
3520 		if (CPI_FAMILY(cpi) == 0xf)
3521 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3522 		break;
3523 	case X86_VENDOR_HYGON:
3524 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3525 		break;
3526 	default:
3527 		if (cpi->cpi_model == 0xf)
3528 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3529 		break;
3530 	}
3531 
3532 	cpi->cpi_step = CPI_STEP(cpi);
3533 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3534 
3535 	/*
3536 	 * Synthesize chip "revision" and socket type
3537 	 */
3538 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3539 	    cpi->cpi_model, cpi->cpi_step);
3540 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3541 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3542 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3543 	    cpi->cpi_model, cpi->cpi_step);
3544 	cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
3545 	    cpi->cpi_model, cpi->cpi_step);
3546 }
3547 
3548 static void
3549 cpuid_pass_basic(cpu_t *cpu, void *arg)
3550 {
3551 	uchar_t *featureset = (uchar_t *)arg;
3552 	uint32_t mask_ecx, mask_edx;
3553 	struct cpuid_info *cpi;
3554 	struct cpuid_regs *cp;
3555 	int xcpuid;
3556 #if !defined(__xpv)
3557 	extern int idle_cpu_prefer_mwait;
3558 #endif
3559 
3560 	cpi = cpu->cpu_m.mcpu_cpi;
3561 	ASSERT(cpi != NULL);
3562 
3563 	if (cpi->cpi_maxeax < 1)
3564 		return;
3565 
3566 	/*
3567 	 * This was filled during the identification pass.
3568 	 */
3569 	cp = &cpi->cpi_std[1];
3570 
3571 	/*
3572 	 * *default* assumptions:
3573 	 * - believe %edx feature word
3574 	 * - ignore %ecx feature word
3575 	 * - 32-bit virtual and physical addressing
3576 	 */
3577 	mask_edx = 0xffffffff;
3578 	mask_ecx = 0;
3579 
3580 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3581 
3582 	switch (cpi->cpi_vendor) {
3583 	case X86_VENDOR_Intel:
3584 		if (cpi->cpi_family == 5)
3585 			x86_type = X86_TYPE_P5;
3586 		else if (IS_LEGACY_P6(cpi)) {
3587 			x86_type = X86_TYPE_P6;
3588 			pentiumpro_bug4046376 = 1;
3589 			/*
3590 			 * Clear the SEP bit when it was set erroneously
3591 			 */
3592 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3593 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3594 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3595 			x86_type = X86_TYPE_P4;
3596 			/*
3597 			 * We don't currently depend on any of the %ecx
3598 			 * features until Prescott, so we'll only check
3599 			 * this from P4 onwards.  We might want to revisit
3600 			 * that idea later.
3601 			 */
3602 			mask_ecx = 0xffffffff;
3603 		} else if (cpi->cpi_family > 0xf)
3604 			mask_ecx = 0xffffffff;
3605 		/*
3606 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3607 		 * to obtain the monitor linesize.
3608 		 */
3609 		if (cpi->cpi_maxeax < 5)
3610 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3611 		break;
3612 	case X86_VENDOR_IntelClone:
3613 	default:
3614 		break;
3615 	case X86_VENDOR_AMD:
3616 #if defined(OPTERON_ERRATUM_108)
3617 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3618 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3619 			cpi->cpi_model = 0xc;
3620 		} else
3621 #endif
3622 		if (cpi->cpi_family == 5) {
3623 			/*
3624 			 * AMD K5 and K6
3625 			 *
3626 			 * These CPUs have an incomplete implementation
3627 			 * of MCA/MCE which we mask away.
3628 			 */
3629 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3630 
3631 			/*
3632 			 * Model 0 uses the wrong (APIC) bit
3633 			 * to indicate PGE.  Fix it here.
3634 			 */
3635 			if (cpi->cpi_model == 0) {
3636 				if (cp->cp_edx & 0x200) {
3637 					cp->cp_edx &= ~0x200;
3638 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3639 				}
3640 			}
3641 
3642 			/*
3643 			 * Early models had problems w/ MMX; disable.
3644 			 */
3645 			if (cpi->cpi_model < 6)
3646 				mask_edx &= ~CPUID_INTC_EDX_MMX;
3647 		}
3648 
3649 		/*
3650 		 * For newer families, SSE3 and CX16, at least, are valid;
3651 		 * enable all
3652 		 */
3653 		if (cpi->cpi_family >= 0xf)
3654 			mask_ecx = 0xffffffff;
3655 		/*
3656 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3657 		 * to obtain the monitor linesize.
3658 		 */
3659 		if (cpi->cpi_maxeax < 5)
3660 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3661 
3662 #if !defined(__xpv)
3663 		/*
3664 		 * AMD has not historically used MWAIT in the CPU's idle loop.
3665 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3666 		 * know for certain that in at least family 17h, per AMD, mwait
3667 		 * is preferred. Families in-between are less certain.
3668 		 */
3669 		if (cpi->cpi_family < 0x17) {
3670 			idle_cpu_prefer_mwait = 0;
3671 		}
3672 #endif
3673 
3674 		break;
3675 	case X86_VENDOR_HYGON:
3676 		/* Enable all for Hygon Dhyana CPU */
3677 		mask_ecx = 0xffffffff;
3678 		break;
3679 	case X86_VENDOR_TM:
3680 		/*
3681 		 * workaround the NT workaround in CMS 4.1
3682 		 */
3683 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3684 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3685 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3686 		break;
3687 	case X86_VENDOR_Centaur:
3688 		/*
3689 		 * workaround the NT workarounds again
3690 		 */
3691 		if (cpi->cpi_family == 6)
3692 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3693 		break;
3694 	case X86_VENDOR_Cyrix:
3695 		/*
3696 		 * We rely heavily on the probing in locore
3697 		 * to actually figure out what parts, if any,
3698 		 * of the Cyrix cpuid instruction to believe.
3699 		 */
3700 		switch (x86_type) {
3701 		case X86_TYPE_CYRIX_486:
3702 			mask_edx = 0;
3703 			break;
3704 		case X86_TYPE_CYRIX_6x86:
3705 			mask_edx = 0;
3706 			break;
3707 		case X86_TYPE_CYRIX_6x86L:
3708 			mask_edx =
3709 			    CPUID_INTC_EDX_DE |
3710 			    CPUID_INTC_EDX_CX8;
3711 			break;
3712 		case X86_TYPE_CYRIX_6x86MX:
3713 			mask_edx =
3714 			    CPUID_INTC_EDX_DE |
3715 			    CPUID_INTC_EDX_MSR |
3716 			    CPUID_INTC_EDX_CX8 |
3717 			    CPUID_INTC_EDX_PGE |
3718 			    CPUID_INTC_EDX_CMOV |
3719 			    CPUID_INTC_EDX_MMX;
3720 			break;
3721 		case X86_TYPE_CYRIX_GXm:
3722 			mask_edx =
3723 			    CPUID_INTC_EDX_MSR |
3724 			    CPUID_INTC_EDX_CX8 |
3725 			    CPUID_INTC_EDX_CMOV |
3726 			    CPUID_INTC_EDX_MMX;
3727 			break;
3728 		case X86_TYPE_CYRIX_MediaGX:
3729 			break;
3730 		case X86_TYPE_CYRIX_MII:
3731 		case X86_TYPE_VIA_CYRIX_III:
3732 			mask_edx =
3733 			    CPUID_INTC_EDX_DE |
3734 			    CPUID_INTC_EDX_TSC |
3735 			    CPUID_INTC_EDX_MSR |
3736 			    CPUID_INTC_EDX_CX8 |
3737 			    CPUID_INTC_EDX_PGE |
3738 			    CPUID_INTC_EDX_CMOV |
3739 			    CPUID_INTC_EDX_MMX;
3740 			break;
3741 		default:
3742 			break;
3743 		}
3744 		break;
3745 	}
3746 
3747 #if defined(__xpv)
3748 	/*
3749 	 * Do not support MONITOR/MWAIT under a hypervisor
3750 	 */
3751 	mask_ecx &= ~CPUID_INTC_ECX_MON;
3752 	/*
3753 	 * Do not support XSAVE under a hypervisor for now
3754 	 */
3755 	xsave_force_disable = B_TRUE;
3756 
3757 #endif	/* __xpv */
3758 
3759 	if (xsave_force_disable) {
3760 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3761 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
3762 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
3763 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
3764 	}
3765 
3766 	/*
3767 	 * Now we've figured out the masks that determine
3768 	 * which bits we choose to believe, apply the masks
3769 	 * to the feature words, then map the kernel's view
3770 	 * of these feature words into its feature word.
3771 	 */
3772 	cp->cp_edx &= mask_edx;
3773 	cp->cp_ecx &= mask_ecx;
3774 
3775 	/*
3776 	 * apply any platform restrictions (we don't call this
3777 	 * immediately after __cpuid_insn here, because we need the
3778 	 * workarounds applied above first)
3779 	 */
3780 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3781 
3782 	/*
3783 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3784 	 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3785 	 */
3786 	if (cpi->cpi_maxeax >= 7) {
3787 		struct cpuid_regs *ecp;
3788 		ecp = &cpi->cpi_std[7];
3789 		ecp->cp_eax = 7;
3790 		ecp->cp_ecx = 0;
3791 		(void) __cpuid_insn(ecp);
3792 
3793 		/*
3794 		 * If XSAVE has been disabled, just ignore all of the
3795 		 * extended-save-area dependent flags here.
3796 		 */
3797 		if (xsave_force_disable) {
3798 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3799 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3800 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3801 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3802 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3803 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3804 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3805 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
3806 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
3807 		}
3808 
3809 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3810 			add_x86_feature(featureset, X86FSET_SMEP);
3811 
3812 		/*
3813 		 * We check disable_smap here in addition to in startup_smap()
3814 		 * to ensure CPUs that aren't the boot CPU don't accidentally
3815 		 * include it in the feature set and thus generate a mismatched
3816 		 * x86 feature set across CPUs.
3817 		 */
3818 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3819 		    disable_smap == 0)
3820 			add_x86_feature(featureset, X86FSET_SMAP);
3821 
3822 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3823 			add_x86_feature(featureset, X86FSET_RDSEED);
3824 
3825 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3826 			add_x86_feature(featureset, X86FSET_ADX);
3827 
3828 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3829 			add_x86_feature(featureset, X86FSET_FSGSBASE);
3830 
3831 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3832 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3833 
3834 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3835 			add_x86_feature(featureset, X86FSET_INVPCID);
3836 
3837 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3838 			add_x86_feature(featureset, X86FSET_UMIP);
3839 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3840 			add_x86_feature(featureset, X86FSET_PKU);
3841 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3842 			add_x86_feature(featureset, X86FSET_OSPKE);
3843 
3844 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3845 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3846 				add_x86_feature(featureset, X86FSET_MPX);
3847 
3848 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3849 				add_x86_feature(featureset, X86FSET_CLWB);
3850 		}
3851 	}
3852 
3853 	/*
3854 	 * fold in overrides from the "eeprom" mechanism
3855 	 */
3856 	cp->cp_edx |= cpuid_feature_edx_include;
3857 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
3858 
3859 	cp->cp_ecx |= cpuid_feature_ecx_include;
3860 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3861 
3862 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3863 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
3864 	}
3865 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3866 		add_x86_feature(featureset, X86FSET_TSC);
3867 	}
3868 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3869 		add_x86_feature(featureset, X86FSET_MSR);
3870 	}
3871 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3872 		add_x86_feature(featureset, X86FSET_MTRR);
3873 	}
3874 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3875 		add_x86_feature(featureset, X86FSET_PGE);
3876 	}
3877 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3878 		add_x86_feature(featureset, X86FSET_CMOV);
3879 	}
3880 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3881 		add_x86_feature(featureset, X86FSET_MMX);
3882 	}
3883 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3884 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3885 		add_x86_feature(featureset, X86FSET_MCA);
3886 	}
3887 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3888 		add_x86_feature(featureset, X86FSET_PAE);
3889 	}
3890 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3891 		add_x86_feature(featureset, X86FSET_CX8);
3892 	}
3893 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3894 		add_x86_feature(featureset, X86FSET_CX16);
3895 	}
3896 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3897 		add_x86_feature(featureset, X86FSET_PAT);
3898 	}
3899 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3900 		add_x86_feature(featureset, X86FSET_SEP);
3901 	}
3902 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3903 		/*
3904 		 * In our implementation, fxsave/fxrstor
3905 		 * are prerequisites before we'll even
3906 		 * try and do SSE things.
3907 		 */
3908 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3909 			add_x86_feature(featureset, X86FSET_SSE);
3910 		}
3911 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3912 			add_x86_feature(featureset, X86FSET_SSE2);
3913 		}
3914 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3915 			add_x86_feature(featureset, X86FSET_SSE3);
3916 		}
3917 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3918 			add_x86_feature(featureset, X86FSET_SSSE3);
3919 		}
3920 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3921 			add_x86_feature(featureset, X86FSET_SSE4_1);
3922 		}
3923 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3924 			add_x86_feature(featureset, X86FSET_SSE4_2);
3925 		}
3926 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3927 			add_x86_feature(featureset, X86FSET_AES);
3928 		}
3929 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3930 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3931 		}
3932 
3933 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3934 			add_x86_feature(featureset, X86FSET_SHA);
3935 
3936 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3937 			add_x86_feature(featureset, X86FSET_XSAVE);
3938 
3939 			/* We only test AVX & AVX512 when there is XSAVE */
3940 
3941 			if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3942 				add_x86_feature(featureset,
3943 				    X86FSET_AVX);
3944 
3945 				/*
3946 				 * Intel says we can't check these without also
3947 				 * checking AVX.
3948 				 */
3949 				if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3950 					add_x86_feature(featureset,
3951 					    X86FSET_F16C);
3952 
3953 				if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3954 					add_x86_feature(featureset,
3955 					    X86FSET_FMA);
3956 
3957 				if (cpi->cpi_std[7].cp_ebx &
3958 				    CPUID_INTC_EBX_7_0_BMI1)
3959 					add_x86_feature(featureset,
3960 					    X86FSET_BMI1);
3961 
3962 				if (cpi->cpi_std[7].cp_ebx &
3963 				    CPUID_INTC_EBX_7_0_BMI2)
3964 					add_x86_feature(featureset,
3965 					    X86FSET_BMI2);
3966 
3967 				if (cpi->cpi_std[7].cp_ebx &
3968 				    CPUID_INTC_EBX_7_0_AVX2)
3969 					add_x86_feature(featureset,
3970 					    X86FSET_AVX2);
3971 
3972 				if (cpi->cpi_std[7].cp_ecx &
3973 				    CPUID_INTC_ECX_7_0_VAES)
3974 					add_x86_feature(featureset,
3975 					    X86FSET_VAES);
3976 
3977 				if (cpi->cpi_std[7].cp_ecx &
3978 				    CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3979 					add_x86_feature(featureset,
3980 					    X86FSET_VPCLMULQDQ);
3981 			}
3982 
3983 			if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3984 			    (cpi->cpi_std[7].cp_ebx &
3985 			    CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3986 				add_x86_feature(featureset, X86FSET_AVX512F);
3987 
3988 				if (cpi->cpi_std[7].cp_ebx &
3989 				    CPUID_INTC_EBX_7_0_AVX512DQ)
3990 					add_x86_feature(featureset,
3991 					    X86FSET_AVX512DQ);
3992 				if (cpi->cpi_std[7].cp_ebx &
3993 				    CPUID_INTC_EBX_7_0_AVX512IFMA)
3994 					add_x86_feature(featureset,
3995 					    X86FSET_AVX512FMA);
3996 				if (cpi->cpi_std[7].cp_ebx &
3997 				    CPUID_INTC_EBX_7_0_AVX512PF)
3998 					add_x86_feature(featureset,
3999 					    X86FSET_AVX512PF);
4000 				if (cpi->cpi_std[7].cp_ebx &
4001 				    CPUID_INTC_EBX_7_0_AVX512ER)
4002 					add_x86_feature(featureset,
4003 					    X86FSET_AVX512ER);
4004 				if (cpi->cpi_std[7].cp_ebx &
4005 				    CPUID_INTC_EBX_7_0_AVX512CD)
4006 					add_x86_feature(featureset,
4007 					    X86FSET_AVX512CD);
4008 				if (cpi->cpi_std[7].cp_ebx &
4009 				    CPUID_INTC_EBX_7_0_AVX512BW)
4010 					add_x86_feature(featureset,
4011 					    X86FSET_AVX512BW);
4012 				if (cpi->cpi_std[7].cp_ebx &
4013 				    CPUID_INTC_EBX_7_0_AVX512VL)
4014 					add_x86_feature(featureset,
4015 					    X86FSET_AVX512VL);
4016 
4017 				if (cpi->cpi_std[7].cp_ecx &
4018 				    CPUID_INTC_ECX_7_0_AVX512VBMI)
4019 					add_x86_feature(featureset,
4020 					    X86FSET_AVX512VBMI);
4021 				if (cpi->cpi_std[7].cp_ecx &
4022 				    CPUID_INTC_ECX_7_0_AVX512VNNI)
4023 					add_x86_feature(featureset,
4024 					    X86FSET_AVX512VNNI);
4025 				if (cpi->cpi_std[7].cp_ecx &
4026 				    CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
4027 					add_x86_feature(featureset,
4028 					    X86FSET_AVX512VPOPCDQ);
4029 
4030 				if (cpi->cpi_std[7].cp_edx &
4031 				    CPUID_INTC_EDX_7_0_AVX5124NNIW)
4032 					add_x86_feature(featureset,
4033 					    X86FSET_AVX512NNIW);
4034 				if (cpi->cpi_std[7].cp_edx &
4035 				    CPUID_INTC_EDX_7_0_AVX5124FMAPS)
4036 					add_x86_feature(featureset,
4037 					    X86FSET_AVX512FMAPS);
4038 			}
4039 		}
4040 	}
4041 
4042 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4043 		add_x86_feature(featureset, X86FSET_PCID);
4044 	}
4045 
4046 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4047 		add_x86_feature(featureset, X86FSET_X2APIC);
4048 	}
4049 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4050 		add_x86_feature(featureset, X86FSET_DE);
4051 	}
4052 #if !defined(__xpv)
4053 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4054 
4055 		/*
4056 		 * We require the CLFLUSH instruction for erratum workaround
4057 		 * to use MONITOR/MWAIT.
4058 		 */
4059 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4060 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4061 			add_x86_feature(featureset, X86FSET_MWAIT);
4062 		} else {
4063 			extern int idle_cpu_assert_cflush_monitor;
4064 
4065 			/*
4066 			 * All processors we are aware of which have
4067 			 * MONITOR/MWAIT also have CLFLUSH.
4068 			 */
4069 			if (idle_cpu_assert_cflush_monitor) {
4070 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4071 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4072 			}
4073 		}
4074 	}
4075 #endif	/* __xpv */
4076 
4077 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4078 		add_x86_feature(featureset, X86FSET_VMX);
4079 	}
4080 
4081 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4082 		add_x86_feature(featureset, X86FSET_RDRAND);
4083 
4084 	/*
4085 	 * Only need it first time, rest of the cpus would follow suit.
4086 	 * we only capture this for the bootcpu.
4087 	 */
4088 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4089 		add_x86_feature(featureset, X86FSET_CLFSH);
4090 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4091 	}
4092 	if (is_x86_feature(featureset, X86FSET_PAE))
4093 		cpi->cpi_pabits = 36;
4094 
4095 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4096 		struct cpuid_regs r, *ecp;
4097 
4098 		ecp = &r;
4099 		ecp->cp_eax = 0xD;
4100 		ecp->cp_ecx = 1;
4101 		ecp->cp_edx = ecp->cp_ebx = 0;
4102 		(void) __cpuid_insn(ecp);
4103 
4104 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4105 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4106 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4107 			add_x86_feature(featureset, X86FSET_XSAVEC);
4108 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4109 			add_x86_feature(featureset, X86FSET_XSAVES);
4110 	}
4111 
4112 	/*
4113 	 * Work on the "extended" feature information, doing
4114 	 * some basic initialization to be used in the extended pass.
4115 	 */
4116 	xcpuid = 0;
4117 	switch (cpi->cpi_vendor) {
4118 	case X86_VENDOR_Intel:
4119 		/*
4120 		 * On KVM we know we will have proper support for extended
4121 		 * cpuid.
4122 		 */
4123 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4124 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4125 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4126 			xcpuid++;
4127 		break;
4128 	case X86_VENDOR_AMD:
4129 		if (cpi->cpi_family > 5 ||
4130 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4131 			xcpuid++;
4132 		break;
4133 	case X86_VENDOR_Cyrix:
4134 		/*
4135 		 * Only these Cyrix CPUs are -known- to support
4136 		 * extended cpuid operations.
4137 		 */
4138 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4139 		    x86_type == X86_TYPE_CYRIX_GXm)
4140 			xcpuid++;
4141 		break;
4142 	case X86_VENDOR_HYGON:
4143 	case X86_VENDOR_Centaur:
4144 	case X86_VENDOR_TM:
4145 	default:
4146 		xcpuid++;
4147 		break;
4148 	}
4149 
4150 	if (xcpuid) {
4151 		cp = &cpi->cpi_extd[0];
4152 		cp->cp_eax = CPUID_LEAF_EXT_0;
4153 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4154 	}
4155 
4156 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4157 
4158 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4159 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4160 
4161 		switch (cpi->cpi_vendor) {
4162 		case X86_VENDOR_Intel:
4163 		case X86_VENDOR_AMD:
4164 		case X86_VENDOR_HYGON:
4165 			if (cpi->cpi_xmaxeax < 0x80000001)
4166 				break;
4167 			cp = &cpi->cpi_extd[1];
4168 			cp->cp_eax = 0x80000001;
4169 			(void) __cpuid_insn(cp);
4170 
4171 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4172 			    cpi->cpi_family == 5 &&
4173 			    cpi->cpi_model == 6 &&
4174 			    cpi->cpi_step == 6) {
4175 				/*
4176 				 * K6 model 6 uses bit 10 to indicate SYSC
4177 				 * Later models use bit 11. Fix it here.
4178 				 */
4179 				if (cp->cp_edx & 0x400) {
4180 					cp->cp_edx &= ~0x400;
4181 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4182 				}
4183 			}
4184 
4185 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4186 
4187 			/*
4188 			 * Compute the additions to the kernel's feature word.
4189 			 */
4190 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4191 				add_x86_feature(featureset, X86FSET_NX);
4192 			}
4193 
4194 			/*
4195 			 * Regardless whether or not we boot 64-bit,
4196 			 * we should have a way to identify whether
4197 			 * the CPU is capable of running 64-bit.
4198 			 */
4199 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4200 				add_x86_feature(featureset, X86FSET_64);
4201 			}
4202 
4203 			/* 1 GB large page - enable only for 64 bit kernel */
4204 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4205 				add_x86_feature(featureset, X86FSET_1GPG);
4206 			}
4207 
4208 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4209 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4210 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4211 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4212 				add_x86_feature(featureset, X86FSET_SSE4A);
4213 			}
4214 
4215 			/*
4216 			 * It's really tricky to support syscall/sysret in
4217 			 * the i386 kernel; we rely on sysenter/sysexit
4218 			 * instead.  In the amd64 kernel, things are -way-
4219 			 * better.
4220 			 */
4221 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4222 				add_x86_feature(featureset, X86FSET_ASYSC);
4223 			}
4224 
4225 			/*
4226 			 * While we're thinking about system calls, note
4227 			 * that AMD processors don't support sysenter
4228 			 * in long mode at all, so don't try to program them.
4229 			 */
4230 			if (x86_vendor == X86_VENDOR_AMD ||
4231 			    x86_vendor == X86_VENDOR_HYGON) {
4232 				remove_x86_feature(featureset, X86FSET_SEP);
4233 			}
4234 
4235 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4236 				add_x86_feature(featureset, X86FSET_TSCP);
4237 			}
4238 
4239 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4240 				add_x86_feature(featureset, X86FSET_SVM);
4241 			}
4242 
4243 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4244 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4245 			}
4246 
4247 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4248 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4249 			}
4250 
4251 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4252 				add_x86_feature(featureset, X86FSET_XOP);
4253 			}
4254 
4255 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4256 				add_x86_feature(featureset, X86FSET_FMA4);
4257 			}
4258 
4259 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4260 				add_x86_feature(featureset, X86FSET_TBM);
4261 			}
4262 
4263 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4264 				add_x86_feature(featureset, X86FSET_MONITORX);
4265 			}
4266 			break;
4267 		default:
4268 			break;
4269 		}
4270 
4271 		/*
4272 		 * Get CPUID data about processor cores and hyperthreads.
4273 		 */
4274 		switch (cpi->cpi_vendor) {
4275 		case X86_VENDOR_Intel:
4276 			if (cpi->cpi_maxeax >= 4) {
4277 				cp = &cpi->cpi_std[4];
4278 				cp->cp_eax = 4;
4279 				cp->cp_ecx = 0;
4280 				(void) __cpuid_insn(cp);
4281 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4282 			}
4283 			/*FALLTHROUGH*/
4284 		case X86_VENDOR_AMD:
4285 		case X86_VENDOR_HYGON:
4286 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4287 				break;
4288 			cp = &cpi->cpi_extd[8];
4289 			cp->cp_eax = CPUID_LEAF_EXT_8;
4290 			(void) __cpuid_insn(cp);
4291 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4292 			    cp);
4293 
4294 			/*
4295 			 * AMD uses ebx for some extended functions.
4296 			 */
4297 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4298 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4299 				/*
4300 				 * While we're here, check for the AMD "Error
4301 				 * Pointer Zero/Restore" feature. This can be
4302 				 * used to setup the FP save handlers
4303 				 * appropriately.
4304 				 */
4305 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4306 					cpi->cpi_fp_amd_save = 0;
4307 				} else {
4308 					cpi->cpi_fp_amd_save = 1;
4309 				}
4310 
4311 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4312 					add_x86_feature(featureset,
4313 					    X86FSET_CLZERO);
4314 				}
4315 			}
4316 
4317 			/*
4318 			 * Virtual and physical address limits from
4319 			 * cpuid override previously guessed values.
4320 			 */
4321 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4322 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4323 			break;
4324 		default:
4325 			break;
4326 		}
4327 
4328 		/*
4329 		 * Get CPUID data about TSC Invariance in Deep C-State.
4330 		 */
4331 		switch (cpi->cpi_vendor) {
4332 		case X86_VENDOR_Intel:
4333 		case X86_VENDOR_AMD:
4334 		case X86_VENDOR_HYGON:
4335 			if (cpi->cpi_maxeax >= 7) {
4336 				cp = &cpi->cpi_extd[7];
4337 				cp->cp_eax = 0x80000007;
4338 				cp->cp_ecx = 0;
4339 				(void) __cpuid_insn(cp);
4340 			}
4341 			break;
4342 		default:
4343 			break;
4344 		}
4345 	}
4346 
4347 	/*
4348 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4349 	 * run and thus gathered some of its dependent leaves.
4350 	 */
4351 	cpuid_basic_topology(cpu, featureset);
4352 	cpuid_basic_thermal(cpu, featureset);
4353 #if !defined(__xpv)
4354 	cpuid_basic_ppin(cpu, featureset);
4355 #endif
4356 
4357 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4358 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4359 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4360 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4361 			/* Special handling for AMD FP not necessary. */
4362 			cpi->cpi_fp_amd_save = 0;
4363 		} else {
4364 			cpi->cpi_fp_amd_save = 1;
4365 		}
4366 	}
4367 
4368 	/*
4369 	 * Check (and potentially set) if lfence is serializing.
4370 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4371 	 */
4372 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4373 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4374 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4375 		/*
4376 		 * The AMD white paper Software Techniques For Managing
4377 		 * Speculation on AMD Processors details circumstances for when
4378 		 * lfence instructions are serializing.
4379 		 *
4380 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4381 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4382 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4383 		 * committed to supporting that MSR on all later CPUs.
4384 		 */
4385 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4386 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4387 		} else if (cpi->cpi_family >= 0x10) {
4388 #if !defined(__xpv)
4389 			uint64_t val;
4390 
4391 			/*
4392 			 * Be careful when attempting to enable the bit, and
4393 			 * verify that it was actually set in case we are
4394 			 * running in a hypervisor which is less than faithful
4395 			 * about its emulation of this feature.
4396 			 */
4397 			on_trap_data_t otd;
4398 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4399 				val = rdmsr(MSR_AMD_DE_CFG);
4400 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4401 				wrmsr(MSR_AMD_DE_CFG, val);
4402 				val = rdmsr(MSR_AMD_DE_CFG);
4403 			} else {
4404 				val = 0;
4405 			}
4406 			no_trap();
4407 
4408 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4409 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4410 			}
4411 #endif
4412 		}
4413 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4414 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4415 		/*
4416 		 * Documentation and other OSes indicate that lfence is always
4417 		 * serializing on Intel CPUs.
4418 		 */
4419 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4420 	}
4421 
4422 
4423 	/*
4424 	 * Check the processor leaves that are used for security features.
4425 	 */
4426 	cpuid_scan_security(cpu, featureset);
4427 }
4428 
4429 /*
4430  * Make copies of the cpuid table entries we depend on, in
4431  * part for ease of parsing now, in part so that we have only
4432  * one place to correct any of it, in part for ease of
4433  * later export to userland, and in part so we can look at
4434  * this stuff in a crash dump.
4435  */
4436 
4437 static void
4438 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4439 {
4440 	uint_t n, nmax;
4441 	int i;
4442 	struct cpuid_regs *cp;
4443 	uint8_t *dp;
4444 	uint32_t *iptr;
4445 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4446 
4447 	if (cpi->cpi_maxeax < 1)
4448 		return;
4449 
4450 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4451 		nmax = NMAX_CPI_STD;
4452 	/*
4453 	 * (We already handled n == 0 and n == 1 in the basic pass)
4454 	 */
4455 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4456 		/*
4457 		 * leaves 6 and 7 were handled in the basic pass
4458 		 */
4459 		if (n == 6 || n == 7)
4460 			continue;
4461 
4462 		cp->cp_eax = n;
4463 
4464 		/*
4465 		 * CPUID function 4 expects %ecx to be initialized
4466 		 * with an index which indicates which cache to return
4467 		 * information about. The OS is expected to call function 4
4468 		 * with %ecx set to 0, 1, 2, ... until it returns with
4469 		 * EAX[4:0] set to 0, which indicates there are no more
4470 		 * caches.
4471 		 *
4472 		 * Here, populate cpi_std[4] with the information returned by
4473 		 * function 4 when %ecx == 0, and do the rest in a later pass
4474 		 * when dynamic memory allocation becomes available.
4475 		 *
4476 		 * Note: we need to explicitly initialize %ecx here, since
4477 		 * function 4 may have been previously invoked.
4478 		 */
4479 		if (n == 4)
4480 			cp->cp_ecx = 0;
4481 
4482 		(void) __cpuid_insn(cp);
4483 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4484 		switch (n) {
4485 		case 2:
4486 			/*
4487 			 * "the lower 8 bits of the %eax register
4488 			 * contain a value that identifies the number
4489 			 * of times the cpuid [instruction] has to be
4490 			 * executed to obtain a complete image of the
4491 			 * processor's caching systems."
4492 			 *
4493 			 * How *do* they make this stuff up?
4494 			 */
4495 			cpi->cpi_ncache = sizeof (*cp) *
4496 			    BITX(cp->cp_eax, 7, 0);
4497 			if (cpi->cpi_ncache == 0)
4498 				break;
4499 			cpi->cpi_ncache--;	/* skip count byte */
4500 
4501 			/*
4502 			 * Well, for now, rather than attempt to implement
4503 			 * this slightly dubious algorithm, we just look
4504 			 * at the first 15 ..
4505 			 */
4506 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4507 				cpi->cpi_ncache = sizeof (*cp) - 1;
4508 
4509 			dp = cpi->cpi_cacheinfo;
4510 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4511 				uint8_t *p = (void *)&cp->cp_eax;
4512 				for (i = 1; i < 4; i++)
4513 					if (p[i] != 0)
4514 						*dp++ = p[i];
4515 			}
4516 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4517 				uint8_t *p = (void *)&cp->cp_ebx;
4518 				for (i = 0; i < 4; i++)
4519 					if (p[i] != 0)
4520 						*dp++ = p[i];
4521 			}
4522 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4523 				uint8_t *p = (void *)&cp->cp_ecx;
4524 				for (i = 0; i < 4; i++)
4525 					if (p[i] != 0)
4526 						*dp++ = p[i];
4527 			}
4528 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4529 				uint8_t *p = (void *)&cp->cp_edx;
4530 				for (i = 0; i < 4; i++)
4531 					if (p[i] != 0)
4532 						*dp++ = p[i];
4533 			}
4534 			break;
4535 
4536 		case 3:	/* Processor serial number, if PSN supported */
4537 			break;
4538 
4539 		case 4:	/* Deterministic cache parameters */
4540 			break;
4541 
4542 		case 5:	/* Monitor/Mwait parameters */
4543 		{
4544 			size_t mwait_size;
4545 
4546 			/*
4547 			 * check cpi_mwait.support which was set in
4548 			 * cpuid_pass_basic()
4549 			 */
4550 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4551 				break;
4552 
4553 			/*
4554 			 * Protect ourself from insane mwait line size.
4555 			 * Workaround for incomplete hardware emulator(s).
4556 			 */
4557 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4558 			if (mwait_size < sizeof (uint32_t) ||
4559 			    !ISP2(mwait_size)) {
4560 #if DEBUG
4561 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4562 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4563 #endif
4564 				break;
4565 			}
4566 
4567 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4568 			cpi->cpi_mwait.mon_max = mwait_size;
4569 			if (MWAIT_EXTENSION(cpi)) {
4570 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4571 				if (MWAIT_INT_ENABLE(cpi))
4572 					cpi->cpi_mwait.support |=
4573 					    MWAIT_ECX_INT_ENABLE;
4574 			}
4575 			break;
4576 		}
4577 		default:
4578 			break;
4579 		}
4580 	}
4581 
4582 	/*
4583 	 * XSAVE enumeration
4584 	 */
4585 	if (cpi->cpi_maxeax >= 0xD) {
4586 		struct cpuid_regs regs;
4587 		boolean_t cpuid_d_valid = B_TRUE;
4588 
4589 		cp = &regs;
4590 		cp->cp_eax = 0xD;
4591 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4592 
4593 		(void) __cpuid_insn(cp);
4594 
4595 		/*
4596 		 * Sanity checks for debug
4597 		 */
4598 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4599 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4600 			cpuid_d_valid = B_FALSE;
4601 		}
4602 
4603 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4604 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4605 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4606 
4607 		/*
4608 		 * If the hw supports AVX, get the size and offset in the save
4609 		 * area for the ymm state.
4610 		 */
4611 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4612 			cp->cp_eax = 0xD;
4613 			cp->cp_ecx = 2;
4614 			cp->cp_edx = cp->cp_ebx = 0;
4615 
4616 			(void) __cpuid_insn(cp);
4617 
4618 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4619 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4620 				cpuid_d_valid = B_FALSE;
4621 			}
4622 
4623 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4624 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4625 		}
4626 
4627 		/*
4628 		 * If the hw supports MPX, get the size and offset in the
4629 		 * save area for BNDREGS and BNDCSR.
4630 		 */
4631 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4632 			cp->cp_eax = 0xD;
4633 			cp->cp_ecx = 3;
4634 			cp->cp_edx = cp->cp_ebx = 0;
4635 
4636 			(void) __cpuid_insn(cp);
4637 
4638 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4639 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4640 
4641 			cp->cp_eax = 0xD;
4642 			cp->cp_ecx = 4;
4643 			cp->cp_edx = cp->cp_ebx = 0;
4644 
4645 			(void) __cpuid_insn(cp);
4646 
4647 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4648 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4649 		}
4650 
4651 		/*
4652 		 * If the hw supports AVX512, get the size and offset in the
4653 		 * save area for the opmask registers and zmm state.
4654 		 */
4655 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4656 			cp->cp_eax = 0xD;
4657 			cp->cp_ecx = 5;
4658 			cp->cp_edx = cp->cp_ebx = 0;
4659 
4660 			(void) __cpuid_insn(cp);
4661 
4662 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
4663 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4664 
4665 			cp->cp_eax = 0xD;
4666 			cp->cp_ecx = 6;
4667 			cp->cp_edx = cp->cp_ebx = 0;
4668 
4669 			(void) __cpuid_insn(cp);
4670 
4671 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4672 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4673 
4674 			cp->cp_eax = 0xD;
4675 			cp->cp_ecx = 7;
4676 			cp->cp_edx = cp->cp_ebx = 0;
4677 
4678 			(void) __cpuid_insn(cp);
4679 
4680 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4681 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4682 		}
4683 
4684 		if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4685 			xsave_state_size = 0;
4686 		} else if (cpuid_d_valid) {
4687 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4688 		} else {
4689 			/* Broken CPUID 0xD, probably in HVM */
4690 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4691 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4692 			    ", ymm_size = %d, ymm_offset = %d\n",
4693 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4694 			    cpi->cpi_xsave.xsav_hw_features_high,
4695 			    (int)cpi->cpi_xsave.xsav_max_size,
4696 			    (int)cpi->cpi_xsave.ymm_size,
4697 			    (int)cpi->cpi_xsave.ymm_offset);
4698 
4699 			if (xsave_state_size != 0) {
4700 				/*
4701 				 * This must be a non-boot CPU. We cannot
4702 				 * continue, because boot cpu has already
4703 				 * enabled XSAVE.
4704 				 */
4705 				ASSERT(cpu->cpu_id != 0);
4706 				cmn_err(CE_PANIC, "cpu%d: we have already "
4707 				    "enabled XSAVE on boot cpu, cannot "
4708 				    "continue.", cpu->cpu_id);
4709 			} else {
4710 				/*
4711 				 * If we reached here on the boot CPU, it's also
4712 				 * almost certain that we'll reach here on the
4713 				 * non-boot CPUs. When we're here on a boot CPU
4714 				 * we should disable the feature, on a non-boot
4715 				 * CPU we need to confirm that we have.
4716 				 */
4717 				if (cpu->cpu_id == 0) {
4718 					remove_x86_feature(x86_featureset,
4719 					    X86FSET_XSAVE);
4720 					remove_x86_feature(x86_featureset,
4721 					    X86FSET_AVX);
4722 					remove_x86_feature(x86_featureset,
4723 					    X86FSET_F16C);
4724 					remove_x86_feature(x86_featureset,
4725 					    X86FSET_BMI1);
4726 					remove_x86_feature(x86_featureset,
4727 					    X86FSET_BMI2);
4728 					remove_x86_feature(x86_featureset,
4729 					    X86FSET_FMA);
4730 					remove_x86_feature(x86_featureset,
4731 					    X86FSET_AVX2);
4732 					remove_x86_feature(x86_featureset,
4733 					    X86FSET_MPX);
4734 					remove_x86_feature(x86_featureset,
4735 					    X86FSET_AVX512F);
4736 					remove_x86_feature(x86_featureset,
4737 					    X86FSET_AVX512DQ);
4738 					remove_x86_feature(x86_featureset,
4739 					    X86FSET_AVX512PF);
4740 					remove_x86_feature(x86_featureset,
4741 					    X86FSET_AVX512ER);
4742 					remove_x86_feature(x86_featureset,
4743 					    X86FSET_AVX512CD);
4744 					remove_x86_feature(x86_featureset,
4745 					    X86FSET_AVX512BW);
4746 					remove_x86_feature(x86_featureset,
4747 					    X86FSET_AVX512VL);
4748 					remove_x86_feature(x86_featureset,
4749 					    X86FSET_AVX512FMA);
4750 					remove_x86_feature(x86_featureset,
4751 					    X86FSET_AVX512VBMI);
4752 					remove_x86_feature(x86_featureset,
4753 					    X86FSET_AVX512VNNI);
4754 					remove_x86_feature(x86_featureset,
4755 					    X86FSET_AVX512VPOPCDQ);
4756 					remove_x86_feature(x86_featureset,
4757 					    X86FSET_AVX512NNIW);
4758 					remove_x86_feature(x86_featureset,
4759 					    X86FSET_AVX512FMAPS);
4760 					remove_x86_feature(x86_featureset,
4761 					    X86FSET_VAES);
4762 					remove_x86_feature(x86_featureset,
4763 					    X86FSET_VPCLMULQDQ);
4764 
4765 					CPI_FEATURES_ECX(cpi) &=
4766 					    ~CPUID_INTC_ECX_XSAVE;
4767 					CPI_FEATURES_ECX(cpi) &=
4768 					    ~CPUID_INTC_ECX_AVX;
4769 					CPI_FEATURES_ECX(cpi) &=
4770 					    ~CPUID_INTC_ECX_F16C;
4771 					CPI_FEATURES_ECX(cpi) &=
4772 					    ~CPUID_INTC_ECX_FMA;
4773 					CPI_FEATURES_7_0_EBX(cpi) &=
4774 					    ~CPUID_INTC_EBX_7_0_BMI1;
4775 					CPI_FEATURES_7_0_EBX(cpi) &=
4776 					    ~CPUID_INTC_EBX_7_0_BMI2;
4777 					CPI_FEATURES_7_0_EBX(cpi) &=
4778 					    ~CPUID_INTC_EBX_7_0_AVX2;
4779 					CPI_FEATURES_7_0_EBX(cpi) &=
4780 					    ~CPUID_INTC_EBX_7_0_MPX;
4781 					CPI_FEATURES_7_0_EBX(cpi) &=
4782 					    ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4783 
4784 					CPI_FEATURES_7_0_ECX(cpi) &=
4785 					    ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4786 
4787 					CPI_FEATURES_7_0_ECX(cpi) &=
4788 					    ~CPUID_INTC_ECX_7_0_VAES;
4789 					CPI_FEATURES_7_0_ECX(cpi) &=
4790 					    ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4791 
4792 					CPI_FEATURES_7_0_EDX(cpi) &=
4793 					    ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4794 
4795 					xsave_force_disable = B_TRUE;
4796 				} else {
4797 					VERIFY(is_x86_feature(x86_featureset,
4798 					    X86FSET_XSAVE) == B_FALSE);
4799 				}
4800 			}
4801 		}
4802 	}
4803 
4804 
4805 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4806 		return;
4807 
4808 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4809 		nmax = NMAX_CPI_EXTD;
4810 	/*
4811 	 * Copy the extended properties, fixing them as we go.
4812 	 * (We already handled n == 0 and n == 1 in the basic pass)
4813 	 */
4814 	iptr = (void *)cpi->cpi_brandstr;
4815 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4816 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4817 		(void) __cpuid_insn(cp);
4818 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4819 		    cp);
4820 		switch (n) {
4821 		case 2:
4822 		case 3:
4823 		case 4:
4824 			/*
4825 			 * Extract the brand string
4826 			 */
4827 			*iptr++ = cp->cp_eax;
4828 			*iptr++ = cp->cp_ebx;
4829 			*iptr++ = cp->cp_ecx;
4830 			*iptr++ = cp->cp_edx;
4831 			break;
4832 		case 5:
4833 			switch (cpi->cpi_vendor) {
4834 			case X86_VENDOR_AMD:
4835 				/*
4836 				 * The Athlon and Duron were the first
4837 				 * parts to report the sizes of the
4838 				 * TLB for large pages. Before then,
4839 				 * we don't trust the data.
4840 				 */
4841 				if (cpi->cpi_family < 6 ||
4842 				    (cpi->cpi_family == 6 &&
4843 				    cpi->cpi_model < 1))
4844 					cp->cp_eax = 0;
4845 				break;
4846 			default:
4847 				break;
4848 			}
4849 			break;
4850 		case 6:
4851 			switch (cpi->cpi_vendor) {
4852 			case X86_VENDOR_AMD:
4853 				/*
4854 				 * The Athlon and Duron were the first
4855 				 * AMD parts with L2 TLB's.
4856 				 * Before then, don't trust the data.
4857 				 */
4858 				if (cpi->cpi_family < 6 ||
4859 				    (cpi->cpi_family == 6 &&
4860 				    cpi->cpi_model < 1))
4861 					cp->cp_eax = cp->cp_ebx = 0;
4862 				/*
4863 				 * AMD Duron rev A0 reports L2
4864 				 * cache size incorrectly as 1K
4865 				 * when it is really 64K
4866 				 */
4867 				if (cpi->cpi_family == 6 &&
4868 				    cpi->cpi_model == 3 &&
4869 				    cpi->cpi_step == 0) {
4870 					cp->cp_ecx &= 0xffff;
4871 					cp->cp_ecx |= 0x400000;
4872 				}
4873 				break;
4874 			case X86_VENDOR_Cyrix:	/* VIA C3 */
4875 				/*
4876 				 * VIA C3 processors are a bit messed
4877 				 * up w.r.t. encoding cache sizes in %ecx
4878 				 */
4879 				if (cpi->cpi_family != 6)
4880 					break;
4881 				/*
4882 				 * model 7 and 8 were incorrectly encoded
4883 				 *
4884 				 * xxx is model 8 really broken?
4885 				 */
4886 				if (cpi->cpi_model == 7 ||
4887 				    cpi->cpi_model == 8)
4888 					cp->cp_ecx =
4889 					    BITX(cp->cp_ecx, 31, 24) << 16 |
4890 					    BITX(cp->cp_ecx, 23, 16) << 12 |
4891 					    BITX(cp->cp_ecx, 15, 8) << 8 |
4892 					    BITX(cp->cp_ecx, 7, 0);
4893 				/*
4894 				 * model 9 stepping 1 has wrong associativity
4895 				 */
4896 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4897 					cp->cp_ecx |= 8 << 12;
4898 				break;
4899 			case X86_VENDOR_Intel:
4900 				/*
4901 				 * Extended L2 Cache features function.
4902 				 * First appeared on Prescott.
4903 				 */
4904 			default:
4905 				break;
4906 			}
4907 			break;
4908 		default:
4909 			break;
4910 		}
4911 	}
4912 }
4913 
4914 static const char *
4915 intel_cpubrand(const struct cpuid_info *cpi)
4916 {
4917 	int i;
4918 
4919 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
4920 
4921 	switch (cpi->cpi_family) {
4922 	case 5:
4923 		return ("Intel Pentium(r)");
4924 	case 6:
4925 		switch (cpi->cpi_model) {
4926 			uint_t celeron, xeon;
4927 			const struct cpuid_regs *cp;
4928 		case 0:
4929 		case 1:
4930 		case 2:
4931 			return ("Intel Pentium(r) Pro");
4932 		case 3:
4933 		case 4:
4934 			return ("Intel Pentium(r) II");
4935 		case 6:
4936 			return ("Intel Celeron(r)");
4937 		case 5:
4938 		case 7:
4939 			celeron = xeon = 0;
4940 			cp = &cpi->cpi_std[2];	/* cache info */
4941 
4942 			for (i = 1; i < 4; i++) {
4943 				uint_t tmp;
4944 
4945 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4946 				if (tmp == 0x40)
4947 					celeron++;
4948 				if (tmp >= 0x44 && tmp <= 0x45)
4949 					xeon++;
4950 			}
4951 
4952 			for (i = 0; i < 2; i++) {
4953 				uint_t tmp;
4954 
4955 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4956 				if (tmp == 0x40)
4957 					celeron++;
4958 				else if (tmp >= 0x44 && tmp <= 0x45)
4959 					xeon++;
4960 			}
4961 
4962 			for (i = 0; i < 4; i++) {
4963 				uint_t tmp;
4964 
4965 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4966 				if (tmp == 0x40)
4967 					celeron++;
4968 				else if (tmp >= 0x44 && tmp <= 0x45)
4969 					xeon++;
4970 			}
4971 
4972 			for (i = 0; i < 4; i++) {
4973 				uint_t tmp;
4974 
4975 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4976 				if (tmp == 0x40)
4977 					celeron++;
4978 				else if (tmp >= 0x44 && tmp <= 0x45)
4979 					xeon++;
4980 			}
4981 
4982 			if (celeron)
4983 				return ("Intel Celeron(r)");
4984 			if (xeon)
4985 				return (cpi->cpi_model == 5 ?
4986 				    "Intel Pentium(r) II Xeon(tm)" :
4987 				    "Intel Pentium(r) III Xeon(tm)");
4988 			return (cpi->cpi_model == 5 ?
4989 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4990 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4991 		default:
4992 			break;
4993 		}
4994 	default:
4995 		break;
4996 	}
4997 
4998 	/* BrandID is present if the field is nonzero */
4999 	if (cpi->cpi_brandid != 0) {
5000 		static const struct {
5001 			uint_t bt_bid;
5002 			const char *bt_str;
5003 		} brand_tbl[] = {
5004 			{ 0x1,	"Intel(r) Celeron(r)" },
5005 			{ 0x2,	"Intel(r) Pentium(r) III" },
5006 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
5007 			{ 0x4,	"Intel(r) Pentium(r) III" },
5008 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
5009 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
5010 			{ 0x8,	"Intel(r) Pentium(r) 4" },
5011 			{ 0x9,	"Intel(r) Pentium(r) 4" },
5012 			{ 0xa,	"Intel(r) Celeron(r)" },
5013 			{ 0xb,	"Intel(r) Xeon(tm)" },
5014 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
5015 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
5016 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
5017 			{ 0x11, "Mobile Genuine Intel(r)" },
5018 			{ 0x12, "Intel(r) Celeron(r) M" },
5019 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
5020 			{ 0x14, "Intel(r) Celeron(r)" },
5021 			{ 0x15, "Mobile Genuine Intel(r)" },
5022 			{ 0x16,	"Intel(r) Pentium(r) M" },
5023 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
5024 		};
5025 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5026 		uint_t sgn;
5027 
5028 		sgn = (cpi->cpi_family << 8) |
5029 		    (cpi->cpi_model << 4) | cpi->cpi_step;
5030 
5031 		for (i = 0; i < btblmax; i++)
5032 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5033 				break;
5034 		if (i < btblmax) {
5035 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5036 				return ("Intel(r) Celeron(r)");
5037 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5038 				return ("Intel(r) Xeon(tm) MP");
5039 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5040 				return ("Intel(r) Xeon(tm)");
5041 			return (brand_tbl[i].bt_str);
5042 		}
5043 	}
5044 
5045 	return (NULL);
5046 }
5047 
5048 static const char *
5049 amd_cpubrand(const struct cpuid_info *cpi)
5050 {
5051 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5052 
5053 	switch (cpi->cpi_family) {
5054 	case 5:
5055 		switch (cpi->cpi_model) {
5056 		case 0:
5057 		case 1:
5058 		case 2:
5059 		case 3:
5060 		case 4:
5061 		case 5:
5062 			return ("AMD-K5(r)");
5063 		case 6:
5064 		case 7:
5065 			return ("AMD-K6(r)");
5066 		case 8:
5067 			return ("AMD-K6(r)-2");
5068 		case 9:
5069 			return ("AMD-K6(r)-III");
5070 		default:
5071 			return ("AMD (family 5)");
5072 		}
5073 	case 6:
5074 		switch (cpi->cpi_model) {
5075 		case 1:
5076 			return ("AMD-K7(tm)");
5077 		case 0:
5078 		case 2:
5079 		case 4:
5080 			return ("AMD Athlon(tm)");
5081 		case 3:
5082 		case 7:
5083 			return ("AMD Duron(tm)");
5084 		case 6:
5085 		case 8:
5086 		case 10:
5087 			/*
5088 			 * Use the L2 cache size to distinguish
5089 			 */
5090 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5091 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5092 		default:
5093 			return ("AMD (family 6)");
5094 		}
5095 	default:
5096 		break;
5097 	}
5098 
5099 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5100 	    cpi->cpi_brandid != 0) {
5101 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5102 		case 3:
5103 			return ("AMD Opteron(tm) UP 1xx");
5104 		case 4:
5105 			return ("AMD Opteron(tm) DP 2xx");
5106 		case 5:
5107 			return ("AMD Opteron(tm) MP 8xx");
5108 		default:
5109 			return ("AMD Opteron(tm)");
5110 		}
5111 	}
5112 
5113 	return (NULL);
5114 }
5115 
5116 static const char *
5117 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5118 {
5119 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5120 
5121 	switch (type) {
5122 	case X86_TYPE_CYRIX_6x86:
5123 		return ("Cyrix 6x86");
5124 	case X86_TYPE_CYRIX_6x86L:
5125 		return ("Cyrix 6x86L");
5126 	case X86_TYPE_CYRIX_6x86MX:
5127 		return ("Cyrix 6x86MX");
5128 	case X86_TYPE_CYRIX_GXm:
5129 		return ("Cyrix GXm");
5130 	case X86_TYPE_CYRIX_MediaGX:
5131 		return ("Cyrix MediaGX");
5132 	case X86_TYPE_CYRIX_MII:
5133 		return ("Cyrix M2");
5134 	case X86_TYPE_VIA_CYRIX_III:
5135 		return ("VIA Cyrix M3");
5136 	default:
5137 		/*
5138 		 * Have another wild guess ..
5139 		 */
5140 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5141 			return ("Cyrix 5x86");
5142 		else if (cpi->cpi_family == 5) {
5143 			switch (cpi->cpi_model) {
5144 			case 2:
5145 				return ("Cyrix 6x86");	/* Cyrix M1 */
5146 			case 4:
5147 				return ("Cyrix MediaGX");
5148 			default:
5149 				break;
5150 			}
5151 		} else if (cpi->cpi_family == 6) {
5152 			switch (cpi->cpi_model) {
5153 			case 0:
5154 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5155 			case 5:
5156 			case 6:
5157 			case 7:
5158 			case 8:
5159 			case 9:
5160 				return ("VIA C3");
5161 			default:
5162 				break;
5163 			}
5164 		}
5165 		break;
5166 	}
5167 	return (NULL);
5168 }
5169 
5170 /*
5171  * This only gets called in the case that the CPU extended
5172  * feature brand string (0x80000002, 0x80000003, 0x80000004)
5173  * aren't available, or contain null bytes for some reason.
5174  */
5175 static void
5176 fabricate_brandstr(struct cpuid_info *cpi)
5177 {
5178 	const char *brand = NULL;
5179 
5180 	switch (cpi->cpi_vendor) {
5181 	case X86_VENDOR_Intel:
5182 		brand = intel_cpubrand(cpi);
5183 		break;
5184 	case X86_VENDOR_AMD:
5185 		brand = amd_cpubrand(cpi);
5186 		break;
5187 	case X86_VENDOR_Cyrix:
5188 		brand = cyrix_cpubrand(cpi, x86_type);
5189 		break;
5190 	case X86_VENDOR_NexGen:
5191 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5192 			brand = "NexGen Nx586";
5193 		break;
5194 	case X86_VENDOR_Centaur:
5195 		if (cpi->cpi_family == 5)
5196 			switch (cpi->cpi_model) {
5197 			case 4:
5198 				brand = "Centaur C6";
5199 				break;
5200 			case 8:
5201 				brand = "Centaur C2";
5202 				break;
5203 			case 9:
5204 				brand = "Centaur C3";
5205 				break;
5206 			default:
5207 				break;
5208 			}
5209 		break;
5210 	case X86_VENDOR_Rise:
5211 		if (cpi->cpi_family == 5 &&
5212 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5213 			brand = "Rise mP6";
5214 		break;
5215 	case X86_VENDOR_SiS:
5216 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5217 			brand = "SiS 55x";
5218 		break;
5219 	case X86_VENDOR_TM:
5220 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5221 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5222 		break;
5223 	case X86_VENDOR_NSC:
5224 	case X86_VENDOR_UMC:
5225 	default:
5226 		break;
5227 	}
5228 	if (brand) {
5229 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5230 		return;
5231 	}
5232 
5233 	/*
5234 	 * If all else fails ...
5235 	 */
5236 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5237 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5238 	    cpi->cpi_model, cpi->cpi_step);
5239 }
5240 
5241 /*
5242  * This routine is called just after kernel memory allocation
5243  * becomes available on cpu0, and as part of mp_startup() on
5244  * the other cpus.
5245  *
5246  * Fixup the brand string, and collect any information from cpuid
5247  * that requires dynamically allocated storage to represent.
5248  */
5249 
5250 static void
5251 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5252 {
5253 	int	i, max, shft, level, size;
5254 	struct cpuid_regs regs;
5255 	struct cpuid_regs *cp;
5256 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5257 
5258 	/*
5259 	 * Deterministic cache parameters
5260 	 *
5261 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5262 	 * values that are present are currently defined to be the same. This
5263 	 * means we can use the same logic to parse it as long as we use the
5264 	 * appropriate leaf to get the data. If you're updating this, make sure
5265 	 * you're careful about which vendor supports which aspect.
5266 	 *
5267 	 * Take this opportunity to detect the number of threads sharing the
5268 	 * last level cache, and construct a corresponding cache id. The
5269 	 * respective cpuid_info members are initialized to the default case of
5270 	 * "no last level cache sharing".
5271 	 */
5272 	cpi->cpi_ncpu_shr_last_cache = 1;
5273 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5274 
5275 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5276 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5277 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5278 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5279 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5280 		uint32_t leaf;
5281 
5282 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5283 			leaf = 4;
5284 		} else {
5285 			leaf = CPUID_LEAF_EXT_1d;
5286 		}
5287 
5288 		/*
5289 		 * Find the # of elements (size) returned by the leaf and along
5290 		 * the way detect last level cache sharing details.
5291 		 */
5292 		bzero(&regs, sizeof (regs));
5293 		cp = &regs;
5294 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5295 			cp->cp_eax = leaf;
5296 			cp->cp_ecx = i;
5297 
5298 			(void) __cpuid_insn(cp);
5299 
5300 			if (CPI_CACHE_TYPE(cp) == 0)
5301 				break;
5302 			level = CPI_CACHE_LVL(cp);
5303 			if (level > max) {
5304 				max = level;
5305 				cpi->cpi_ncpu_shr_last_cache =
5306 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5307 			}
5308 		}
5309 		cpi->cpi_cache_leaf_size = size = i;
5310 
5311 		/*
5312 		 * Allocate the cpi_cache_leaves array. The first element
5313 		 * references the regs for the corresponding leaf with %ecx set
5314 		 * to 0. This was gathered in cpuid_pass_extended().
5315 		 */
5316 		if (size > 0) {
5317 			cpi->cpi_cache_leaves =
5318 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5319 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5320 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5321 			} else {
5322 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5323 			}
5324 
5325 			/*
5326 			 * Allocate storage to hold the additional regs
5327 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5328 			 *
5329 			 * The regs for the leaf, %ecx == 0 has already
5330 			 * been allocated as indicated above.
5331 			 */
5332 			for (i = 1; i < size; i++) {
5333 				cp = cpi->cpi_cache_leaves[i] =
5334 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5335 				cp->cp_eax = leaf;
5336 				cp->cp_ecx = i;
5337 
5338 				(void) __cpuid_insn(cp);
5339 			}
5340 		}
5341 		/*
5342 		 * Determine the number of bits needed to represent
5343 		 * the number of CPUs sharing the last level cache.
5344 		 *
5345 		 * Shift off that number of bits from the APIC id to
5346 		 * derive the cache id.
5347 		 */
5348 		shft = 0;
5349 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5350 			shft++;
5351 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5352 	}
5353 
5354 	/*
5355 	 * Now fixup the brand string
5356 	 */
5357 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5358 		fabricate_brandstr(cpi);
5359 	} else {
5360 
5361 		/*
5362 		 * If we successfully extracted a brand string from the cpuid
5363 		 * instruction, clean it up by removing leading spaces and
5364 		 * similar junk.
5365 		 */
5366 		if (cpi->cpi_brandstr[0]) {
5367 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5368 			char *src, *dst;
5369 
5370 			dst = src = (char *)cpi->cpi_brandstr;
5371 			src[maxlen - 1] = '\0';
5372 			/*
5373 			 * strip leading spaces
5374 			 */
5375 			while (*src == ' ')
5376 				src++;
5377 			/*
5378 			 * Remove any 'Genuine' or "Authentic" prefixes
5379 			 */
5380 			if (strncmp(src, "Genuine ", 8) == 0)
5381 				src += 8;
5382 			if (strncmp(src, "Authentic ", 10) == 0)
5383 				src += 10;
5384 
5385 			/*
5386 			 * Now do an in-place copy.
5387 			 * Map (R) to (r) and (TM) to (tm).
5388 			 * The era of teletypes is long gone, and there's
5389 			 * -really- no need to shout.
5390 			 */
5391 			while (*src != '\0') {
5392 				if (src[0] == '(') {
5393 					if (strncmp(src + 1, "R)", 2) == 0) {
5394 						(void) strncpy(dst, "(r)", 3);
5395 						src += 3;
5396 						dst += 3;
5397 						continue;
5398 					}
5399 					if (strncmp(src + 1, "TM)", 3) == 0) {
5400 						(void) strncpy(dst, "(tm)", 4);
5401 						src += 4;
5402 						dst += 4;
5403 						continue;
5404 					}
5405 				}
5406 				*dst++ = *src++;
5407 			}
5408 			*dst = '\0';
5409 
5410 			/*
5411 			 * Finally, remove any trailing spaces
5412 			 */
5413 			while (--dst > cpi->cpi_brandstr)
5414 				if (*dst == ' ')
5415 					*dst = '\0';
5416 				else
5417 					break;
5418 		} else
5419 			fabricate_brandstr(cpi);
5420 	}
5421 }
5422 
5423 /*
5424  * This routine is called out of bind_hwcap() much later in the life
5425  * of the kernel (post_startup()).  The job of this routine is to resolve
5426  * the hardware feature support and kernel support for those features into
5427  * what we're actually going to tell applications via the aux vector.
5428  */
5429 
5430 static void
5431 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5432 {
5433 	uint_t *hwcap_out = (uint_t *)arg;
5434 	struct cpuid_info *cpi;
5435 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
5436 
5437 	cpi = cpu->cpu_m.mcpu_cpi;
5438 
5439 	if (cpi->cpi_maxeax >= 1) {
5440 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5441 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5442 		uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
5443 
5444 		*edx = CPI_FEATURES_EDX(cpi);
5445 		*ecx = CPI_FEATURES_ECX(cpi);
5446 		*ebx = CPI_FEATURES_7_0_EBX(cpi);
5447 
5448 		/*
5449 		 * [these require explicit kernel support]
5450 		 */
5451 		if (!is_x86_feature(x86_featureset, X86FSET_SEP))
5452 			*edx &= ~CPUID_INTC_EDX_SEP;
5453 
5454 		if (!is_x86_feature(x86_featureset, X86FSET_SSE))
5455 			*edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
5456 		if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
5457 			*edx &= ~CPUID_INTC_EDX_SSE2;
5458 
5459 		if (!is_x86_feature(x86_featureset, X86FSET_HTT))
5460 			*edx &= ~CPUID_INTC_EDX_HTT;
5461 
5462 		if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
5463 			*ecx &= ~CPUID_INTC_ECX_SSE3;
5464 
5465 		if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
5466 			*ecx &= ~CPUID_INTC_ECX_SSSE3;
5467 		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
5468 			*ecx &= ~CPUID_INTC_ECX_SSE4_1;
5469 		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
5470 			*ecx &= ~CPUID_INTC_ECX_SSE4_2;
5471 		if (!is_x86_feature(x86_featureset, X86FSET_AES))
5472 			*ecx &= ~CPUID_INTC_ECX_AES;
5473 		if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
5474 			*ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
5475 		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
5476 			*ecx &= ~(CPUID_INTC_ECX_XSAVE |
5477 			    CPUID_INTC_ECX_OSXSAVE);
5478 		if (!is_x86_feature(x86_featureset, X86FSET_AVX))
5479 			*ecx &= ~CPUID_INTC_ECX_AVX;
5480 		if (!is_x86_feature(x86_featureset, X86FSET_F16C))
5481 			*ecx &= ~CPUID_INTC_ECX_F16C;
5482 		if (!is_x86_feature(x86_featureset, X86FSET_FMA))
5483 			*ecx &= ~CPUID_INTC_ECX_FMA;
5484 		if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
5485 			*ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
5486 		if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
5487 			*ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
5488 		if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
5489 			*ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
5490 		if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
5491 			*ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
5492 		if (!is_x86_feature(x86_featureset, X86FSET_ADX))
5493 			*ebx &= ~CPUID_INTC_EBX_7_0_ADX;
5494 
5495 		/*
5496 		 * [no explicit support required beyond x87 fp context]
5497 		 */
5498 		if (!fpu_exists)
5499 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5500 
5501 		/*
5502 		 * Now map the supported feature vector to things that we
5503 		 * think userland will care about.
5504 		 */
5505 		if (*edx & CPUID_INTC_EDX_SEP)
5506 			hwcap_flags |= AV_386_SEP;
5507 		if (*edx & CPUID_INTC_EDX_SSE)
5508 			hwcap_flags |= AV_386_FXSR | AV_386_SSE;
5509 		if (*edx & CPUID_INTC_EDX_SSE2)
5510 			hwcap_flags |= AV_386_SSE2;
5511 		if (*ecx & CPUID_INTC_ECX_SSE3)
5512 			hwcap_flags |= AV_386_SSE3;
5513 		if (*ecx & CPUID_INTC_ECX_SSSE3)
5514 			hwcap_flags |= AV_386_SSSE3;
5515 		if (*ecx & CPUID_INTC_ECX_SSE4_1)
5516 			hwcap_flags |= AV_386_SSE4_1;
5517 		if (*ecx & CPUID_INTC_ECX_SSE4_2)
5518 			hwcap_flags |= AV_386_SSE4_2;
5519 		if (*ecx & CPUID_INTC_ECX_MOVBE)
5520 			hwcap_flags |= AV_386_MOVBE;
5521 		if (*ecx & CPUID_INTC_ECX_AES)
5522 			hwcap_flags |= AV_386_AES;
5523 		if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5524 			hwcap_flags |= AV_386_PCLMULQDQ;
5525 		if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5526 		    (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5527 			hwcap_flags |= AV_386_XSAVE;
5528 
5529 			if (*ecx & CPUID_INTC_ECX_AVX) {
5530 				uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5531 				uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5532 
5533 				hwcap_flags |= AV_386_AVX;
5534 				if (*ecx & CPUID_INTC_ECX_F16C)
5535 					hwcap_flags_2 |= AV_386_2_F16C;
5536 				if (*ecx & CPUID_INTC_ECX_FMA)
5537 					hwcap_flags_2 |= AV_386_2_FMA;
5538 
5539 				if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5540 					hwcap_flags_2 |= AV_386_2_BMI1;
5541 				if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5542 					hwcap_flags_2 |= AV_386_2_BMI2;
5543 				if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5544 					hwcap_flags_2 |= AV_386_2_AVX2;
5545 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5546 					hwcap_flags_2 |= AV_386_2_AVX512F;
5547 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5548 					hwcap_flags_2 |= AV_386_2_AVX512DQ;
5549 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5550 					hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5551 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5552 					hwcap_flags_2 |= AV_386_2_AVX512PF;
5553 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5554 					hwcap_flags_2 |= AV_386_2_AVX512ER;
5555 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5556 					hwcap_flags_2 |= AV_386_2_AVX512CD;
5557 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5558 					hwcap_flags_2 |= AV_386_2_AVX512BW;
5559 				if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5560 					hwcap_flags_2 |= AV_386_2_AVX512VL;
5561 
5562 				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5563 					hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5564 				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5565 					hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5566 				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5567 					hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5568 				if (*ecx_7 & CPUID_INTC_ECX_7_0_VAES)
5569 					hwcap_flags_2 |= AV_386_2_VAES;
5570 				if (*ecx_7 & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
5571 					hwcap_flags_2 |= AV_386_2_VPCLMULQDQ;
5572 
5573 				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5574 					hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5575 				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5576 					hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5577 			}
5578 		}
5579 		if (*ecx & CPUID_INTC_ECX_VMX)
5580 			hwcap_flags |= AV_386_VMX;
5581 		if (*ecx & CPUID_INTC_ECX_POPCNT)
5582 			hwcap_flags |= AV_386_POPCNT;
5583 		if (*edx & CPUID_INTC_EDX_FPU)
5584 			hwcap_flags |= AV_386_FPU;
5585 		if (*edx & CPUID_INTC_EDX_MMX)
5586 			hwcap_flags |= AV_386_MMX;
5587 
5588 		if (*edx & CPUID_INTC_EDX_TSC)
5589 			hwcap_flags |= AV_386_TSC;
5590 		if (*edx & CPUID_INTC_EDX_CX8)
5591 			hwcap_flags |= AV_386_CX8;
5592 		if (*edx & CPUID_INTC_EDX_CMOV)
5593 			hwcap_flags |= AV_386_CMOV;
5594 		if (*ecx & CPUID_INTC_ECX_CX16)
5595 			hwcap_flags |= AV_386_CX16;
5596 
5597 		if (*ecx & CPUID_INTC_ECX_RDRAND)
5598 			hwcap_flags_2 |= AV_386_2_RDRAND;
5599 		if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5600 			hwcap_flags_2 |= AV_386_2_ADX;
5601 		if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5602 			hwcap_flags_2 |= AV_386_2_RDSEED;
5603 		if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5604 			hwcap_flags_2 |= AV_386_2_SHA;
5605 		if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5606 			hwcap_flags_2 |= AV_386_2_FSGSBASE;
5607 		if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5608 			hwcap_flags_2 |= AV_386_2_CLWB;
5609 		if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5610 			hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5611 
5612 	}
5613 	/*
5614 	 * Check a few miscilaneous features.
5615 	 */
5616 	if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5617 		hwcap_flags_2 |= AV_386_2_CLZERO;
5618 
5619 	if (cpi->cpi_xmaxeax < 0x80000001)
5620 		goto resolve_done;
5621 
5622 	switch (cpi->cpi_vendor) {
5623 		struct cpuid_regs cp;
5624 		uint32_t *edx, *ecx;
5625 
5626 	case X86_VENDOR_Intel:
5627 		/*
5628 		 * Seems like Intel duplicated what we necessary
5629 		 * here to make the initial crop of 64-bit OS's work.
5630 		 * Hopefully, those are the only "extended" bits
5631 		 * they'll add.
5632 		 */
5633 		/*FALLTHROUGH*/
5634 
5635 	case X86_VENDOR_AMD:
5636 	case X86_VENDOR_HYGON:
5637 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5638 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5639 
5640 		*edx = CPI_FEATURES_XTD_EDX(cpi);
5641 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5642 
5643 		/*
5644 		 * [these features require explicit kernel support]
5645 		 */
5646 		switch (cpi->cpi_vendor) {
5647 		case X86_VENDOR_Intel:
5648 			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5649 				*edx &= ~CPUID_AMD_EDX_TSCP;
5650 			break;
5651 
5652 		case X86_VENDOR_AMD:
5653 		case X86_VENDOR_HYGON:
5654 			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5655 				*edx &= ~CPUID_AMD_EDX_TSCP;
5656 			if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5657 				*ecx &= ~CPUID_AMD_ECX_SSE4A;
5658 			break;
5659 
5660 		default:
5661 			break;
5662 		}
5663 
5664 		/*
5665 		 * [no explicit support required beyond
5666 		 * x87 fp context and exception handlers]
5667 		 */
5668 		if (!fpu_exists)
5669 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
5670 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5671 
5672 		if (!is_x86_feature(x86_featureset, X86FSET_NX))
5673 			*edx &= ~CPUID_AMD_EDX_NX;
5674 		/*
5675 		 * Now map the supported feature vector to
5676 		 * things that we think userland will care about.
5677 		 */
5678 		if (*edx & CPUID_AMD_EDX_SYSC)
5679 			hwcap_flags |= AV_386_AMD_SYSC;
5680 		if (*edx & CPUID_AMD_EDX_MMXamd)
5681 			hwcap_flags |= AV_386_AMD_MMX;
5682 		if (*edx & CPUID_AMD_EDX_3DNow)
5683 			hwcap_flags |= AV_386_AMD_3DNow;
5684 		if (*edx & CPUID_AMD_EDX_3DNowx)
5685 			hwcap_flags |= AV_386_AMD_3DNowx;
5686 		if (*ecx & CPUID_AMD_ECX_SVM)
5687 			hwcap_flags |= AV_386_AMD_SVM;
5688 
5689 		switch (cpi->cpi_vendor) {
5690 		case X86_VENDOR_AMD:
5691 		case X86_VENDOR_HYGON:
5692 			if (*edx & CPUID_AMD_EDX_TSCP)
5693 				hwcap_flags |= AV_386_TSCP;
5694 			if (*ecx & CPUID_AMD_ECX_AHF64)
5695 				hwcap_flags |= AV_386_AHF;
5696 			if (*ecx & CPUID_AMD_ECX_SSE4A)
5697 				hwcap_flags |= AV_386_AMD_SSE4A;
5698 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5699 				hwcap_flags |= AV_386_AMD_LZCNT;
5700 			if (*ecx & CPUID_AMD_ECX_MONITORX)
5701 				hwcap_flags_2 |= AV_386_2_MONITORX;
5702 			break;
5703 
5704 		case X86_VENDOR_Intel:
5705 			if (*edx & CPUID_AMD_EDX_TSCP)
5706 				hwcap_flags |= AV_386_TSCP;
5707 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5708 				hwcap_flags |= AV_386_AMD_LZCNT;
5709 			/*
5710 			 * Aarrgh.
5711 			 * Intel uses a different bit in the same word.
5712 			 */
5713 			if (*ecx & CPUID_INTC_ECX_AHF64)
5714 				hwcap_flags |= AV_386_AHF;
5715 			break;
5716 
5717 		default:
5718 			break;
5719 		}
5720 		break;
5721 
5722 	case X86_VENDOR_TM:
5723 		cp.cp_eax = 0x80860001;
5724 		(void) __cpuid_insn(&cp);
5725 		cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5726 		break;
5727 
5728 	default:
5729 		break;
5730 	}
5731 
5732 resolve_done:
5733 	if (hwcap_out != NULL) {
5734 		hwcap_out[0] = hwcap_flags;
5735 		hwcap_out[1] = hwcap_flags_2;
5736 	}
5737 }
5738 
5739 
5740 /*
5741  * Simulate the cpuid instruction using the data we previously
5742  * captured about this CPU.  We try our best to return the truth
5743  * about the hardware, independently of kernel support.
5744  */
5745 uint32_t
5746 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5747 {
5748 	struct cpuid_info *cpi;
5749 	struct cpuid_regs *xcp;
5750 
5751 	if (cpu == NULL)
5752 		cpu = CPU;
5753 	cpi = cpu->cpu_m.mcpu_cpi;
5754 
5755 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5756 
5757 	/*
5758 	 * CPUID data is cached in two separate places: cpi_std for standard
5759 	 * CPUID leaves , and cpi_extd for extended CPUID leaves.
5760 	 */
5761 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5762 		xcp = &cpi->cpi_std[cp->cp_eax];
5763 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5764 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
5765 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5766 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5767 	} else {
5768 		/*
5769 		 * The caller is asking for data from an input parameter which
5770 		 * the kernel has not cached.  In this case we go fetch from
5771 		 * the hardware and return the data directly to the user.
5772 		 */
5773 		return (__cpuid_insn(cp));
5774 	}
5775 
5776 	cp->cp_eax = xcp->cp_eax;
5777 	cp->cp_ebx = xcp->cp_ebx;
5778 	cp->cp_ecx = xcp->cp_ecx;
5779 	cp->cp_edx = xcp->cp_edx;
5780 	return (cp->cp_eax);
5781 }
5782 
5783 boolean_t
5784 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
5785 {
5786 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5787 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5788 }
5789 
5790 int
5791 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5792 {
5793 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5794 
5795 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5796 }
5797 
5798 int
5799 cpuid_is_cmt(cpu_t *cpu)
5800 {
5801 	if (cpu == NULL)
5802 		cpu = CPU;
5803 
5804 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5805 
5806 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5807 }
5808 
5809 /*
5810  * AMD and Intel both implement the 64-bit variant of the syscall
5811  * instruction (syscallq), so if there's -any- support for syscall,
5812  * cpuid currently says "yes, we support this".
5813  *
5814  * However, Intel decided to -not- implement the 32-bit variant of the
5815  * syscall instruction, so we provide a predicate to allow our caller
5816  * to test that subtlety here.
5817  *
5818  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
5819  *	even in the case where the hardware would in fact support it.
5820  */
5821 /*ARGSUSED*/
5822 int
5823 cpuid_syscall32_insn(cpu_t *cpu)
5824 {
5825 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
5826 
5827 #if !defined(__xpv)
5828 	if (cpu == NULL)
5829 		cpu = CPU;
5830 
5831 	/*CSTYLED*/
5832 	{
5833 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5834 
5835 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5836 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5837 		    cpi->cpi_xmaxeax >= 0x80000001 &&
5838 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5839 			return (1);
5840 	}
5841 #endif
5842 	return (0);
5843 }
5844 
5845 int
5846 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5847 {
5848 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5849 
5850 	static const char fmt[] =
5851 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
5852 	static const char fmt_ht[] =
5853 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5854 
5855 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5856 
5857 	if (cpuid_is_cmt(cpu))
5858 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5859 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5860 		    cpi->cpi_family, cpi->cpi_model,
5861 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5862 	return (snprintf(s, n, fmt,
5863 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5864 	    cpi->cpi_family, cpi->cpi_model,
5865 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5866 }
5867 
5868 const char *
5869 cpuid_getvendorstr(cpu_t *cpu)
5870 {
5871 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5872 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5873 }
5874 
5875 uint_t
5876 cpuid_getvendor(cpu_t *cpu)
5877 {
5878 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5879 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5880 }
5881 
5882 uint_t
5883 cpuid_getfamily(cpu_t *cpu)
5884 {
5885 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5886 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
5887 }
5888 
5889 uint_t
5890 cpuid_getmodel(cpu_t *cpu)
5891 {
5892 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5893 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
5894 }
5895 
5896 uint_t
5897 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5898 {
5899 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5900 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5901 }
5902 
5903 uint_t
5904 cpuid_get_ncore_per_chip(cpu_t *cpu)
5905 {
5906 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5907 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5908 }
5909 
5910 uint_t
5911 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5912 {
5913 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5914 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5915 }
5916 
5917 id_t
5918 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5919 {
5920 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5921 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5922 }
5923 
5924 uint_t
5925 cpuid_getstep(cpu_t *cpu)
5926 {
5927 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5928 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
5929 }
5930 
5931 uint_t
5932 cpuid_getsig(struct cpu *cpu)
5933 {
5934 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5935 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5936 }
5937 
5938 uint32_t
5939 cpuid_getchiprev(struct cpu *cpu)
5940 {
5941 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5942 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5943 }
5944 
5945 const char *
5946 cpuid_getchiprevstr(struct cpu *cpu)
5947 {
5948 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5949 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5950 }
5951 
5952 uint32_t
5953 cpuid_getsockettype(struct cpu *cpu)
5954 {
5955 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5956 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5957 }
5958 
5959 const char *
5960 cpuid_getsocketstr(cpu_t *cpu)
5961 {
5962 	static const char *socketstr = NULL;
5963 	struct cpuid_info *cpi;
5964 
5965 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5966 	cpi = cpu->cpu_m.mcpu_cpi;
5967 
5968 	/* Assume that socket types are the same across the system */
5969 	if (socketstr == NULL)
5970 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5971 		    cpi->cpi_model, cpi->cpi_step);
5972 
5973 
5974 	return (socketstr);
5975 }
5976 
5977 x86_uarchrev_t
5978 cpuid_getuarchrev(cpu_t *cpu)
5979 {
5980 	return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
5981 }
5982 
5983 int
5984 cpuid_get_chipid(cpu_t *cpu)
5985 {
5986 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5987 
5988 	if (cpuid_is_cmt(cpu))
5989 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5990 	return (cpu->cpu_id);
5991 }
5992 
5993 id_t
5994 cpuid_get_coreid(cpu_t *cpu)
5995 {
5996 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5997 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5998 }
5999 
6000 int
6001 cpuid_get_pkgcoreid(cpu_t *cpu)
6002 {
6003 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6004 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6005 }
6006 
6007 int
6008 cpuid_get_clogid(cpu_t *cpu)
6009 {
6010 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6011 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6012 }
6013 
6014 int
6015 cpuid_get_cacheid(cpu_t *cpu)
6016 {
6017 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6018 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6019 }
6020 
6021 uint_t
6022 cpuid_get_procnodeid(cpu_t *cpu)
6023 {
6024 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6025 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6026 }
6027 
6028 uint_t
6029 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6030 {
6031 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6032 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6033 }
6034 
6035 uint_t
6036 cpuid_get_compunitid(cpu_t *cpu)
6037 {
6038 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6039 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6040 }
6041 
6042 uint_t
6043 cpuid_get_cores_per_compunit(cpu_t *cpu)
6044 {
6045 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6046 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6047 }
6048 
6049 uint32_t
6050 cpuid_get_apicid(cpu_t *cpu)
6051 {
6052 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6053 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6054 		return (UINT32_MAX);
6055 	} else {
6056 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6057 	}
6058 }
6059 
6060 void
6061 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6062 {
6063 	struct cpuid_info *cpi;
6064 
6065 	if (cpu == NULL)
6066 		cpu = CPU;
6067 	cpi = cpu->cpu_m.mcpu_cpi;
6068 
6069 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6070 
6071 	if (pabits)
6072 		*pabits = cpi->cpi_pabits;
6073 	if (vabits)
6074 		*vabits = cpi->cpi_vabits;
6075 }
6076 
6077 size_t
6078 cpuid_get_xsave_size()
6079 {
6080 	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6081 	    sizeof (struct xsave_state)));
6082 }
6083 
6084 /*
6085  * Return true if the CPUs on this system require 'pointer clearing' for the
6086  * floating point error pointer exception handling. In the past, this has been
6087  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6088  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6089  * feature bit and is reflected in the cpi_fp_amd_save member.
6090  */
6091 boolean_t
6092 cpuid_need_fp_excp_handling()
6093 {
6094 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6095 	    cpuid_info0.cpi_fp_amd_save != 0);
6096 }
6097 
6098 /*
6099  * Returns the number of data TLB entries for a corresponding
6100  * pagesize.  If it can't be computed, or isn't known, the
6101  * routine returns zero.  If you ask about an architecturally
6102  * impossible pagesize, the routine will panic (so that the
6103  * hat implementor knows that things are inconsistent.)
6104  */
6105 uint_t
6106 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6107 {
6108 	struct cpuid_info *cpi;
6109 	uint_t dtlb_nent = 0;
6110 
6111 	if (cpu == NULL)
6112 		cpu = CPU;
6113 	cpi = cpu->cpu_m.mcpu_cpi;
6114 
6115 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6116 
6117 	/*
6118 	 * Check the L2 TLB info
6119 	 */
6120 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6121 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6122 
6123 		switch (pagesize) {
6124 
6125 		case 4 * 1024:
6126 			/*
6127 			 * All zero in the top 16 bits of the register
6128 			 * indicates a unified TLB. Size is in low 16 bits.
6129 			 */
6130 			if ((cp->cp_ebx & 0xffff0000) == 0)
6131 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6132 			else
6133 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6134 			break;
6135 
6136 		case 2 * 1024 * 1024:
6137 			if ((cp->cp_eax & 0xffff0000) == 0)
6138 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6139 			else
6140 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6141 			break;
6142 
6143 		default:
6144 			panic("unknown L2 pagesize");
6145 			/*NOTREACHED*/
6146 		}
6147 	}
6148 
6149 	if (dtlb_nent != 0)
6150 		return (dtlb_nent);
6151 
6152 	/*
6153 	 * No L2 TLB support for this size, try L1.
6154 	 */
6155 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6156 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6157 
6158 		switch (pagesize) {
6159 		case 4 * 1024:
6160 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6161 			break;
6162 		case 2 * 1024 * 1024:
6163 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6164 			break;
6165 		default:
6166 			panic("unknown L1 d-TLB pagesize");
6167 			/*NOTREACHED*/
6168 		}
6169 	}
6170 
6171 	return (dtlb_nent);
6172 }
6173 
6174 /*
6175  * Return 0 if the erratum is not present or not applicable, positive
6176  * if it is, and negative if the status of the erratum is unknown.
6177  *
6178  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6179  * Processors" #25759, Rev 3.57, August 2005
6180  */
6181 int
6182 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6183 {
6184 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6185 	uint_t eax;
6186 
6187 	/*
6188 	 * Bail out if this CPU isn't an AMD CPU, or if it's
6189 	 * a legacy (32-bit) AMD CPU.
6190 	 */
6191 	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6192 	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6193 	    cpi->cpi_family == 6) {
6194 		return (0);
6195 	}
6196 
6197 	eax = cpi->cpi_std[1].cp_eax;
6198 
6199 #define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
6200 #define	SH_B3(eax)	(eax == 0xf51)
6201 #define	B(eax)		(SH_B0(eax) || SH_B3(eax))
6202 
6203 #define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
6204 
6205 #define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6206 #define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6207 #define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
6208 #define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6209 
6210 #define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6211 #define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
6212 #define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
6213 #define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6214 
6215 #define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6216 #define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
6217 #define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
6218 #define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
6219 #define	BH_E4(eax)	(eax == 0x20fb1)
6220 #define	SH_E5(eax)	(eax == 0x20f42)
6221 #define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
6222 #define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
6223 #define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6224 			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6225 			    DH_E6(eax) || JH_E6(eax))
6226 
6227 #define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6228 #define	DR_B0(eax)	(eax == 0x100f20)
6229 #define	DR_B1(eax)	(eax == 0x100f21)
6230 #define	DR_BA(eax)	(eax == 0x100f2a)
6231 #define	DR_B2(eax)	(eax == 0x100f22)
6232 #define	DR_B3(eax)	(eax == 0x100f23)
6233 #define	RB_C0(eax)	(eax == 0x100f40)
6234 
6235 	switch (erratum) {
6236 	case 1:
6237 		return (cpi->cpi_family < 0x10);
6238 	case 51:	/* what does the asterisk mean? */
6239 		return (B(eax) || SH_C0(eax) || CG(eax));
6240 	case 52:
6241 		return (B(eax));
6242 	case 57:
6243 		return (cpi->cpi_family <= 0x11);
6244 	case 58:
6245 		return (B(eax));
6246 	case 60:
6247 		return (cpi->cpi_family <= 0x11);
6248 	case 61:
6249 	case 62:
6250 	case 63:
6251 	case 64:
6252 	case 65:
6253 	case 66:
6254 	case 68:
6255 	case 69:
6256 	case 70:
6257 	case 71:
6258 		return (B(eax));
6259 	case 72:
6260 		return (SH_B0(eax));
6261 	case 74:
6262 		return (B(eax));
6263 	case 75:
6264 		return (cpi->cpi_family < 0x10);
6265 	case 76:
6266 		return (B(eax));
6267 	case 77:
6268 		return (cpi->cpi_family <= 0x11);
6269 	case 78:
6270 		return (B(eax) || SH_C0(eax));
6271 	case 79:
6272 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6273 	case 80:
6274 	case 81:
6275 	case 82:
6276 		return (B(eax));
6277 	case 83:
6278 		return (B(eax) || SH_C0(eax) || CG(eax));
6279 	case 85:
6280 		return (cpi->cpi_family < 0x10);
6281 	case 86:
6282 		return (SH_C0(eax) || CG(eax));
6283 	case 88:
6284 		return (B(eax) || SH_C0(eax));
6285 	case 89:
6286 		return (cpi->cpi_family < 0x10);
6287 	case 90:
6288 		return (B(eax) || SH_C0(eax) || CG(eax));
6289 	case 91:
6290 	case 92:
6291 		return (B(eax) || SH_C0(eax));
6292 	case 93:
6293 		return (SH_C0(eax));
6294 	case 94:
6295 		return (B(eax) || SH_C0(eax) || CG(eax));
6296 	case 95:
6297 		return (B(eax) || SH_C0(eax));
6298 	case 96:
6299 		return (B(eax) || SH_C0(eax) || CG(eax));
6300 	case 97:
6301 	case 98:
6302 		return (SH_C0(eax) || CG(eax));
6303 	case 99:
6304 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6305 	case 100:
6306 		return (B(eax) || SH_C0(eax));
6307 	case 101:
6308 	case 103:
6309 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6310 	case 104:
6311 		return (SH_C0(eax) || CG(eax) || D0(eax));
6312 	case 105:
6313 	case 106:
6314 	case 107:
6315 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6316 	case 108:
6317 		return (DH_CG(eax));
6318 	case 109:
6319 		return (SH_C0(eax) || CG(eax) || D0(eax));
6320 	case 110:
6321 		return (D0(eax) || EX(eax));
6322 	case 111:
6323 		return (CG(eax));
6324 	case 112:
6325 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6326 	case 113:
6327 		return (eax == 0x20fc0);
6328 	case 114:
6329 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6330 	case 115:
6331 		return (SH_E0(eax) || JH_E1(eax));
6332 	case 116:
6333 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6334 	case 117:
6335 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6336 	case 118:
6337 		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6338 		    JH_E6(eax));
6339 	case 121:
6340 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6341 	case 122:
6342 		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6343 	case 123:
6344 		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6345 	case 131:
6346 		return (cpi->cpi_family < 0x10);
6347 	case 6336786:
6348 
6349 		/*
6350 		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6351 		 * if this is a K8 family or newer processor. We're testing for
6352 		 * this 'erratum' to determine whether or not we have a constant
6353 		 * TSC.
6354 		 *
6355 		 * Our current fix for this is to disable the C1-Clock ramping.
6356 		 * However, this doesn't work on newer processor families nor
6357 		 * does it work when virtualized as those devices don't exist.
6358 		 */
6359 		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6360 			return (0);
6361 		}
6362 
6363 		if (CPI_FAMILY(cpi) == 0xf) {
6364 			struct cpuid_regs regs;
6365 			regs.cp_eax = 0x80000007;
6366 			(void) __cpuid_insn(&regs);
6367 			return (!(regs.cp_edx & 0x100));
6368 		}
6369 		return (0);
6370 	case 147:
6371 		/*
6372 		 * This erratum (K8 #147) is not present on family 10 and newer.
6373 		 */
6374 		if (cpi->cpi_family >= 0x10) {
6375 			return (0);
6376 		}
6377 		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6378 		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6379 
6380 	case 6671130:
6381 		/*
6382 		 * check for processors (pre-Shanghai) that do not provide
6383 		 * optimal management of 1gb ptes in its tlb.
6384 		 */
6385 		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6386 
6387 	case 298:
6388 		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6389 		    DR_B2(eax) || RB_C0(eax));
6390 
6391 	case 721:
6392 		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6393 
6394 	default:
6395 		return (-1);
6396 
6397 	}
6398 }
6399 
6400 /*
6401  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6402  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6403  */
6404 int
6405 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6406 {
6407 	struct cpuid_info	*cpi;
6408 	uint_t			osvwid;
6409 	static int		osvwfeature = -1;
6410 	uint64_t		osvwlength;
6411 
6412 
6413 	cpi = cpu->cpu_m.mcpu_cpi;
6414 
6415 	/* confirm OSVW supported */
6416 	if (osvwfeature == -1) {
6417 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6418 	} else {
6419 		/* assert that osvw feature setting is consistent on all cpus */
6420 		ASSERT(osvwfeature ==
6421 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6422 	}
6423 	if (!osvwfeature)
6424 		return (-1);
6425 
6426 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6427 
6428 	switch (erratum) {
6429 	case 298:	/* osvwid is 0 */
6430 		osvwid = 0;
6431 		if (osvwlength <= (uint64_t)osvwid) {
6432 			/* osvwid 0 is unknown */
6433 			return (-1);
6434 		}
6435 
6436 		/*
6437 		 * Check the OSVW STATUS MSR to determine the state
6438 		 * of the erratum where:
6439 		 *   0 - fixed by HW
6440 		 *   1 - BIOS has applied the workaround when BIOS
6441 		 *   workaround is available. (Or for other errata,
6442 		 *   OS workaround is required.)
6443 		 * For a value of 1, caller will confirm that the
6444 		 * erratum 298 workaround has indeed been applied by BIOS.
6445 		 *
6446 		 * A 1 may be set in cpus that have a HW fix
6447 		 * in a mixed cpu system. Regarding erratum 298:
6448 		 *   In a multiprocessor platform, the workaround above
6449 		 *   should be applied to all processors regardless of
6450 		 *   silicon revision when an affected processor is
6451 		 *   present.
6452 		 */
6453 
6454 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6455 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6456 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6457 
6458 	default:
6459 		return (-1);
6460 	}
6461 }
6462 
6463 static const char assoc_str[] = "associativity";
6464 static const char line_str[] = "line-size";
6465 static const char size_str[] = "size";
6466 
6467 static void
6468 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6469     uint32_t val)
6470 {
6471 	char buf[128];
6472 
6473 	/*
6474 	 * ndi_prop_update_int() is used because it is desirable for
6475 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6476 	 */
6477 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6478 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6479 }
6480 
6481 /*
6482  * Intel-style cache/tlb description
6483  *
6484  * Standard cpuid level 2 gives a randomly ordered
6485  * selection of tags that index into a table that describes
6486  * cache and tlb properties.
6487  */
6488 
6489 static const char l1_icache_str[] = "l1-icache";
6490 static const char l1_dcache_str[] = "l1-dcache";
6491 static const char l2_cache_str[] = "l2-cache";
6492 static const char l3_cache_str[] = "l3-cache";
6493 static const char itlb4k_str[] = "itlb-4K";
6494 static const char dtlb4k_str[] = "dtlb-4K";
6495 static const char itlb2M_str[] = "itlb-2M";
6496 static const char itlb4M_str[] = "itlb-4M";
6497 static const char dtlb4M_str[] = "dtlb-4M";
6498 static const char dtlb24_str[] = "dtlb0-2M-4M";
6499 static const char itlb424_str[] = "itlb-4K-2M-4M";
6500 static const char itlb24_str[] = "itlb-2M-4M";
6501 static const char dtlb44_str[] = "dtlb-4K-4M";
6502 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6503 static const char sl2_cache_str[] = "sectored-l2-cache";
6504 static const char itrace_str[] = "itrace-cache";
6505 static const char sl3_cache_str[] = "sectored-l3-cache";
6506 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6507 
6508 static const struct cachetab {
6509 	uint8_t		ct_code;
6510 	uint8_t		ct_assoc;
6511 	uint16_t	ct_line_size;
6512 	size_t		ct_size;
6513 	const char	*ct_label;
6514 } intel_ctab[] = {
6515 	/*
6516 	 * maintain descending order!
6517 	 *
6518 	 * Codes ignored - Reason
6519 	 * ----------------------
6520 	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6521 	 * f0H/f1H - Currently we do not interpret prefetch size by design
6522 	 */
6523 	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6524 	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6525 	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6526 	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6527 	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6528 	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6529 	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6530 	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6531 	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6532 	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6533 	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6534 	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6535 	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6536 	{ 0xc0, 4, 0, 8, dtlb44_str },
6537 	{ 0xba, 4, 0, 64, dtlb4k_str },
6538 	{ 0xb4, 4, 0, 256, dtlb4k_str },
6539 	{ 0xb3, 4, 0, 128, dtlb4k_str },
6540 	{ 0xb2, 4, 0, 64, itlb4k_str },
6541 	{ 0xb0, 4, 0, 128, itlb4k_str },
6542 	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6543 	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6544 	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6545 	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6546 	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6547 	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6548 	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6549 	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6550 	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6551 	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6552 	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6553 	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6554 	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6555 	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6556 	{ 0x73, 8, 0, 64*1024, itrace_str},
6557 	{ 0x72, 8, 0, 32*1024, itrace_str},
6558 	{ 0x71, 8, 0, 16*1024, itrace_str},
6559 	{ 0x70, 8, 0, 12*1024, itrace_str},
6560 	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6561 	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6562 	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6563 	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6564 	{ 0x5d, 0, 0, 256, dtlb44_str},
6565 	{ 0x5c, 0, 0, 128, dtlb44_str},
6566 	{ 0x5b, 0, 0, 64, dtlb44_str},
6567 	{ 0x5a, 4, 0, 32, dtlb24_str},
6568 	{ 0x59, 0, 0, 16, dtlb4k_str},
6569 	{ 0x57, 4, 0, 16, dtlb4k_str},
6570 	{ 0x56, 4, 0, 16, dtlb4M_str},
6571 	{ 0x55, 0, 0, 7, itlb24_str},
6572 	{ 0x52, 0, 0, 256, itlb424_str},
6573 	{ 0x51, 0, 0, 128, itlb424_str},
6574 	{ 0x50, 0, 0, 64, itlb424_str},
6575 	{ 0x4f, 0, 0, 32, itlb4k_str},
6576 	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6577 	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6578 	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6579 	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6580 	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6581 	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6582 	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6583 	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6584 	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6585 	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6586 	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
6587 	{ 0x43, 4, 32, 512*1024, l2_cache_str},
6588 	{ 0x42, 4, 32, 256*1024, l2_cache_str},
6589 	{ 0x41, 4, 32, 128*1024, l2_cache_str},
6590 	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
6591 	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
6592 	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
6593 	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
6594 	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
6595 	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
6596 	{ 0x30, 8, 64, 32*1024, l1_icache_str},
6597 	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
6598 	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
6599 	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
6600 	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
6601 	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
6602 	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
6603 	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
6604 	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
6605 	{ 0x0b, 4, 0, 4, itlb4M_str},
6606 	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
6607 	{ 0x08, 4, 32, 16*1024, l1_icache_str},
6608 	{ 0x06, 4, 32, 8*1024, l1_icache_str},
6609 	{ 0x05, 4, 0, 32, dtlb4M_str},
6610 	{ 0x04, 4, 0, 8, dtlb4M_str},
6611 	{ 0x03, 4, 0, 64, dtlb4k_str},
6612 	{ 0x02, 4, 0, 2, itlb4M_str},
6613 	{ 0x01, 4, 0, 32, itlb4k_str},
6614 	{ 0 }
6615 };
6616 
6617 static const struct cachetab cyrix_ctab[] = {
6618 	{ 0x70, 4, 0, 32, "tlb-4K" },
6619 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6620 	{ 0 }
6621 };
6622 
6623 /*
6624  * Search a cache table for a matching entry
6625  */
6626 static const struct cachetab *
6627 find_cacheent(const struct cachetab *ct, uint_t code)
6628 {
6629 	if (code != 0) {
6630 		for (; ct->ct_code != 0; ct++)
6631 			if (ct->ct_code <= code)
6632 				break;
6633 		if (ct->ct_code == code)
6634 			return (ct);
6635 	}
6636 	return (NULL);
6637 }
6638 
6639 /*
6640  * Populate cachetab entry with L2 or L3 cache-information using
6641  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6642  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6643  * information is found.
6644  */
6645 static int
6646 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6647 {
6648 	uint32_t level, i;
6649 	int ret = 0;
6650 
6651 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6652 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6653 
6654 		if (level == 2 || level == 3) {
6655 			ct->ct_assoc =
6656 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6657 			ct->ct_line_size =
6658 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6659 			ct->ct_size = ct->ct_assoc *
6660 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6661 			    ct->ct_line_size *
6662 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6663 
6664 			if (level == 2) {
6665 				ct->ct_label = l2_cache_str;
6666 			} else if (level == 3) {
6667 				ct->ct_label = l3_cache_str;
6668 			}
6669 			ret = 1;
6670 		}
6671 	}
6672 
6673 	return (ret);
6674 }
6675 
6676 /*
6677  * Walk the cacheinfo descriptor, applying 'func' to every valid element
6678  * The walk is terminated if the walker returns non-zero.
6679  */
6680 static void
6681 intel_walk_cacheinfo(struct cpuid_info *cpi,
6682     void *arg, int (*func)(void *, const struct cachetab *))
6683 {
6684 	const struct cachetab *ct;
6685 	struct cachetab des_49_ct, des_b1_ct;
6686 	uint8_t *dp;
6687 	int i;
6688 
6689 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6690 		return;
6691 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6692 		/*
6693 		 * For overloaded descriptor 0x49 we use cpuid function 4
6694 		 * if supported by the current processor, to create
6695 		 * cache information.
6696 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
6697 		 * to disambiguate the cache information.
6698 		 */
6699 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6700 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6701 				ct = &des_49_ct;
6702 		} else if (*dp == 0xb1) {
6703 			des_b1_ct.ct_code = 0xb1;
6704 			des_b1_ct.ct_assoc = 4;
6705 			des_b1_ct.ct_line_size = 0;
6706 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6707 				des_b1_ct.ct_size = 8;
6708 				des_b1_ct.ct_label = itlb2M_str;
6709 			} else {
6710 				des_b1_ct.ct_size = 4;
6711 				des_b1_ct.ct_label = itlb4M_str;
6712 			}
6713 			ct = &des_b1_ct;
6714 		} else {
6715 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6716 				continue;
6717 			}
6718 		}
6719 
6720 		if (func(arg, ct) != 0) {
6721 			break;
6722 		}
6723 	}
6724 }
6725 
6726 /*
6727  * (Like the Intel one, except for Cyrix CPUs)
6728  */
6729 static void
6730 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6731     void *arg, int (*func)(void *, const struct cachetab *))
6732 {
6733 	const struct cachetab *ct;
6734 	uint8_t *dp;
6735 	int i;
6736 
6737 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6738 		return;
6739 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6740 		/*
6741 		 * Search Cyrix-specific descriptor table first ..
6742 		 */
6743 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6744 			if (func(arg, ct) != 0)
6745 				break;
6746 			continue;
6747 		}
6748 		/*
6749 		 * .. else fall back to the Intel one
6750 		 */
6751 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6752 			if (func(arg, ct) != 0)
6753 				break;
6754 			continue;
6755 		}
6756 	}
6757 }
6758 
6759 /*
6760  * A cacheinfo walker that adds associativity, line-size, and size properties
6761  * to the devinfo node it is passed as an argument.
6762  */
6763 static int
6764 add_cacheent_props(void *arg, const struct cachetab *ct)
6765 {
6766 	dev_info_t *devi = arg;
6767 
6768 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6769 	if (ct->ct_line_size != 0)
6770 		add_cache_prop(devi, ct->ct_label, line_str,
6771 		    ct->ct_line_size);
6772 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6773 	return (0);
6774 }
6775 
6776 
6777 static const char fully_assoc[] = "fully-associative?";
6778 
6779 /*
6780  * AMD style cache/tlb description
6781  *
6782  * Extended functions 5 and 6 directly describe properties of
6783  * tlbs and various cache levels.
6784  */
6785 static void
6786 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6787 {
6788 	switch (assoc) {
6789 	case 0:	/* reserved; ignore */
6790 		break;
6791 	default:
6792 		add_cache_prop(devi, label, assoc_str, assoc);
6793 		break;
6794 	case 0xff:
6795 		add_cache_prop(devi, label, fully_assoc, 1);
6796 		break;
6797 	}
6798 }
6799 
6800 static void
6801 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6802 {
6803 	if (size == 0)
6804 		return;
6805 	add_cache_prop(devi, label, size_str, size);
6806 	add_amd_assoc(devi, label, assoc);
6807 }
6808 
6809 static void
6810 add_amd_cache(dev_info_t *devi, const char *label,
6811     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6812 {
6813 	if (size == 0 || line_size == 0)
6814 		return;
6815 	add_amd_assoc(devi, label, assoc);
6816 	/*
6817 	 * Most AMD parts have a sectored cache. Multiple cache lines are
6818 	 * associated with each tag. A sector consists of all cache lines
6819 	 * associated with a tag. For example, the AMD K6-III has a sector
6820 	 * size of 2 cache lines per tag.
6821 	 */
6822 	if (lines_per_tag != 0)
6823 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6824 	add_cache_prop(devi, label, line_str, line_size);
6825 	add_cache_prop(devi, label, size_str, size * 1024);
6826 }
6827 
6828 static void
6829 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6830 {
6831 	switch (assoc) {
6832 	case 0:	/* off */
6833 		break;
6834 	case 1:
6835 	case 2:
6836 	case 4:
6837 		add_cache_prop(devi, label, assoc_str, assoc);
6838 		break;
6839 	case 6:
6840 		add_cache_prop(devi, label, assoc_str, 8);
6841 		break;
6842 	case 8:
6843 		add_cache_prop(devi, label, assoc_str, 16);
6844 		break;
6845 	case 0xf:
6846 		add_cache_prop(devi, label, fully_assoc, 1);
6847 		break;
6848 	default: /* reserved; ignore */
6849 		break;
6850 	}
6851 }
6852 
6853 static void
6854 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6855 {
6856 	if (size == 0 || assoc == 0)
6857 		return;
6858 	add_amd_l2_assoc(devi, label, assoc);
6859 	add_cache_prop(devi, label, size_str, size);
6860 }
6861 
6862 static void
6863 add_amd_l2_cache(dev_info_t *devi, const char *label,
6864     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6865 {
6866 	if (size == 0 || assoc == 0 || line_size == 0)
6867 		return;
6868 	add_amd_l2_assoc(devi, label, assoc);
6869 	if (lines_per_tag != 0)
6870 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6871 	add_cache_prop(devi, label, line_str, line_size);
6872 	add_cache_prop(devi, label, size_str, size * 1024);
6873 }
6874 
6875 static void
6876 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6877 {
6878 	struct cpuid_regs *cp;
6879 
6880 	if (cpi->cpi_xmaxeax < 0x80000005)
6881 		return;
6882 	cp = &cpi->cpi_extd[5];
6883 
6884 	/*
6885 	 * 4M/2M L1 TLB configuration
6886 	 *
6887 	 * We report the size for 2M pages because AMD uses two
6888 	 * TLB entries for one 4M page.
6889 	 */
6890 	add_amd_tlb(devi, "dtlb-2M",
6891 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6892 	add_amd_tlb(devi, "itlb-2M",
6893 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6894 
6895 	/*
6896 	 * 4K L1 TLB configuration
6897 	 */
6898 
6899 	switch (cpi->cpi_vendor) {
6900 		uint_t nentries;
6901 	case X86_VENDOR_TM:
6902 		if (cpi->cpi_family >= 5) {
6903 			/*
6904 			 * Crusoe processors have 256 TLB entries, but
6905 			 * cpuid data format constrains them to only
6906 			 * reporting 255 of them.
6907 			 */
6908 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6909 				nentries = 256;
6910 			/*
6911 			 * Crusoe processors also have a unified TLB
6912 			 */
6913 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6914 			    nentries);
6915 			break;
6916 		}
6917 		/*FALLTHROUGH*/
6918 	default:
6919 		add_amd_tlb(devi, itlb4k_str,
6920 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6921 		add_amd_tlb(devi, dtlb4k_str,
6922 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6923 		break;
6924 	}
6925 
6926 	/*
6927 	 * data L1 cache configuration
6928 	 */
6929 
6930 	add_amd_cache(devi, l1_dcache_str,
6931 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6932 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6933 
6934 	/*
6935 	 * code L1 cache configuration
6936 	 */
6937 
6938 	add_amd_cache(devi, l1_icache_str,
6939 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6940 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6941 
6942 	if (cpi->cpi_xmaxeax < 0x80000006)
6943 		return;
6944 	cp = &cpi->cpi_extd[6];
6945 
6946 	/* Check for a unified L2 TLB for large pages */
6947 
6948 	if (BITX(cp->cp_eax, 31, 16) == 0)
6949 		add_amd_l2_tlb(devi, "l2-tlb-2M",
6950 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6951 	else {
6952 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
6953 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6954 		add_amd_l2_tlb(devi, "l2-itlb-2M",
6955 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6956 	}
6957 
6958 	/* Check for a unified L2 TLB for 4K pages */
6959 
6960 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
6961 		add_amd_l2_tlb(devi, "l2-tlb-4K",
6962 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6963 	} else {
6964 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
6965 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6966 		add_amd_l2_tlb(devi, "l2-itlb-4K",
6967 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6968 	}
6969 
6970 	add_amd_l2_cache(devi, l2_cache_str,
6971 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6972 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6973 }
6974 
6975 /*
6976  * There are two basic ways that the x86 world describes it cache
6977  * and tlb architecture - Intel's way and AMD's way.
6978  *
6979  * Return which flavor of cache architecture we should use
6980  */
6981 static int
6982 x86_which_cacheinfo(struct cpuid_info *cpi)
6983 {
6984 	switch (cpi->cpi_vendor) {
6985 	case X86_VENDOR_Intel:
6986 		if (cpi->cpi_maxeax >= 2)
6987 			return (X86_VENDOR_Intel);
6988 		break;
6989 	case X86_VENDOR_AMD:
6990 		/*
6991 		 * The K5 model 1 was the first part from AMD that reported
6992 		 * cache sizes via extended cpuid functions.
6993 		 */
6994 		if (cpi->cpi_family > 5 ||
6995 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6996 			return (X86_VENDOR_AMD);
6997 		break;
6998 	case X86_VENDOR_HYGON:
6999 		return (X86_VENDOR_AMD);
7000 	case X86_VENDOR_TM:
7001 		if (cpi->cpi_family >= 5)
7002 			return (X86_VENDOR_AMD);
7003 		/*FALLTHROUGH*/
7004 	default:
7005 		/*
7006 		 * If they have extended CPU data for 0x80000005
7007 		 * then we assume they have AMD-format cache
7008 		 * information.
7009 		 *
7010 		 * If not, and the vendor happens to be Cyrix,
7011 		 * then try our-Cyrix specific handler.
7012 		 *
7013 		 * If we're not Cyrix, then assume we're using Intel's
7014 		 * table-driven format instead.
7015 		 */
7016 		if (cpi->cpi_xmaxeax >= 0x80000005)
7017 			return (X86_VENDOR_AMD);
7018 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7019 			return (X86_VENDOR_Cyrix);
7020 		else if (cpi->cpi_maxeax >= 2)
7021 			return (X86_VENDOR_Intel);
7022 		break;
7023 	}
7024 	return (-1);
7025 }
7026 
7027 void
7028 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7029     struct cpuid_info *cpi)
7030 {
7031 	dev_info_t *cpu_devi;
7032 	int create;
7033 
7034 	cpu_devi = (dev_info_t *)dip;
7035 
7036 	/* device_type */
7037 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7038 	    "device_type", "cpu");
7039 
7040 	/* reg */
7041 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7042 	    "reg", cpu_id);
7043 
7044 	/* cpu-mhz, and clock-frequency */
7045 	if (cpu_freq > 0) {
7046 		long long mul;
7047 
7048 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7049 		    "cpu-mhz", cpu_freq);
7050 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7051 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7052 			    "clock-frequency", (int)mul);
7053 	}
7054 
7055 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7056 
7057 	/* vendor-id */
7058 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7059 	    "vendor-id", cpi->cpi_vendorstr);
7060 
7061 	if (cpi->cpi_maxeax == 0) {
7062 		return;
7063 	}
7064 
7065 	/*
7066 	 * family, model, and step
7067 	 */
7068 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7069 	    "family", CPI_FAMILY(cpi));
7070 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7071 	    "cpu-model", CPI_MODEL(cpi));
7072 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7073 	    "stepping-id", CPI_STEP(cpi));
7074 
7075 	/* type */
7076 	switch (cpi->cpi_vendor) {
7077 	case X86_VENDOR_Intel:
7078 		create = 1;
7079 		break;
7080 	default:
7081 		create = 0;
7082 		break;
7083 	}
7084 	if (create)
7085 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7086 		    "type", CPI_TYPE(cpi));
7087 
7088 	/* ext-family */
7089 	switch (cpi->cpi_vendor) {
7090 	case X86_VENDOR_Intel:
7091 	case X86_VENDOR_AMD:
7092 		create = cpi->cpi_family >= 0xf;
7093 		break;
7094 	case X86_VENDOR_HYGON:
7095 		create = 1;
7096 		break;
7097 	default:
7098 		create = 0;
7099 		break;
7100 	}
7101 	if (create)
7102 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7103 		    "ext-family", CPI_FAMILY_XTD(cpi));
7104 
7105 	/* ext-model */
7106 	switch (cpi->cpi_vendor) {
7107 	case X86_VENDOR_Intel:
7108 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7109 		break;
7110 	case X86_VENDOR_AMD:
7111 		create = CPI_FAMILY(cpi) == 0xf;
7112 		break;
7113 	case X86_VENDOR_HYGON:
7114 		create = 1;
7115 		break;
7116 	default:
7117 		create = 0;
7118 		break;
7119 	}
7120 	if (create)
7121 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7122 		    "ext-model", CPI_MODEL_XTD(cpi));
7123 
7124 	/* generation */
7125 	switch (cpi->cpi_vendor) {
7126 	case X86_VENDOR_AMD:
7127 	case X86_VENDOR_HYGON:
7128 		/*
7129 		 * AMD K5 model 1 was the first part to support this
7130 		 */
7131 		create = cpi->cpi_xmaxeax >= 0x80000001;
7132 		break;
7133 	default:
7134 		create = 0;
7135 		break;
7136 	}
7137 	if (create)
7138 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7139 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7140 
7141 	/* brand-id */
7142 	switch (cpi->cpi_vendor) {
7143 	case X86_VENDOR_Intel:
7144 		/*
7145 		 * brand id first appeared on Pentium III Xeon model 8,
7146 		 * and Celeron model 8 processors and Opteron
7147 		 */
7148 		create = cpi->cpi_family > 6 ||
7149 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7150 		break;
7151 	case X86_VENDOR_AMD:
7152 		create = cpi->cpi_family >= 0xf;
7153 		break;
7154 	case X86_VENDOR_HYGON:
7155 		create = 1;
7156 		break;
7157 	default:
7158 		create = 0;
7159 		break;
7160 	}
7161 	if (create && cpi->cpi_brandid != 0) {
7162 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7163 		    "brand-id", cpi->cpi_brandid);
7164 	}
7165 
7166 	/* chunks, and apic-id */
7167 	switch (cpi->cpi_vendor) {
7168 		/*
7169 		 * first available on Pentium IV and Opteron (K8)
7170 		 */
7171 	case X86_VENDOR_Intel:
7172 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7173 		break;
7174 	case X86_VENDOR_AMD:
7175 		create = cpi->cpi_family >= 0xf;
7176 		break;
7177 	case X86_VENDOR_HYGON:
7178 		create = 1;
7179 		break;
7180 	default:
7181 		create = 0;
7182 		break;
7183 	}
7184 	if (create) {
7185 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7186 		    "chunks", CPI_CHUNKS(cpi));
7187 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7188 		    "apic-id", cpi->cpi_apicid);
7189 		if (cpi->cpi_chipid >= 0) {
7190 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7191 			    "chip#", cpi->cpi_chipid);
7192 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7193 			    "clog#", cpi->cpi_clogid);
7194 		}
7195 	}
7196 
7197 	/* cpuid-features */
7198 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7199 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7200 
7201 
7202 	/* cpuid-features-ecx */
7203 	switch (cpi->cpi_vendor) {
7204 	case X86_VENDOR_Intel:
7205 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7206 		break;
7207 	case X86_VENDOR_AMD:
7208 		create = cpi->cpi_family >= 0xf;
7209 		break;
7210 	case X86_VENDOR_HYGON:
7211 		create = 1;
7212 		break;
7213 	default:
7214 		create = 0;
7215 		break;
7216 	}
7217 	if (create)
7218 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7219 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7220 
7221 	/* ext-cpuid-features */
7222 	switch (cpi->cpi_vendor) {
7223 	case X86_VENDOR_Intel:
7224 	case X86_VENDOR_AMD:
7225 	case X86_VENDOR_HYGON:
7226 	case X86_VENDOR_Cyrix:
7227 	case X86_VENDOR_TM:
7228 	case X86_VENDOR_Centaur:
7229 		create = cpi->cpi_xmaxeax >= 0x80000001;
7230 		break;
7231 	default:
7232 		create = 0;
7233 		break;
7234 	}
7235 	if (create) {
7236 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7237 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7238 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7239 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7240 	}
7241 
7242 	/*
7243 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7244 	 * model 1, and Cyrix GXm.  On earlier models we try and
7245 	 * simulate something similar .. so this string should always
7246 	 * same -something- about the processor, however lame.
7247 	 */
7248 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7249 	    "brand-string", cpi->cpi_brandstr);
7250 
7251 	/*
7252 	 * Finally, cache and tlb information
7253 	 */
7254 	switch (x86_which_cacheinfo(cpi)) {
7255 	case X86_VENDOR_Intel:
7256 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7257 		break;
7258 	case X86_VENDOR_Cyrix:
7259 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7260 		break;
7261 	case X86_VENDOR_AMD:
7262 		amd_cache_info(cpi, cpu_devi);
7263 		break;
7264 	default:
7265 		break;
7266 	}
7267 }
7268 
7269 struct l2info {
7270 	int *l2i_csz;
7271 	int *l2i_lsz;
7272 	int *l2i_assoc;
7273 	int l2i_ret;
7274 };
7275 
7276 /*
7277  * A cacheinfo walker that fetches the size, line-size and associativity
7278  * of the L2 cache
7279  */
7280 static int
7281 intel_l2cinfo(void *arg, const struct cachetab *ct)
7282 {
7283 	struct l2info *l2i = arg;
7284 	int *ip;
7285 
7286 	if (ct->ct_label != l2_cache_str &&
7287 	    ct->ct_label != sl2_cache_str)
7288 		return (0);	/* not an L2 -- keep walking */
7289 
7290 	if ((ip = l2i->l2i_csz) != NULL)
7291 		*ip = ct->ct_size;
7292 	if ((ip = l2i->l2i_lsz) != NULL)
7293 		*ip = ct->ct_line_size;
7294 	if ((ip = l2i->l2i_assoc) != NULL)
7295 		*ip = ct->ct_assoc;
7296 	l2i->l2i_ret = ct->ct_size;
7297 	return (1);		/* was an L2 -- terminate walk */
7298 }
7299 
7300 /*
7301  * AMD L2/L3 Cache and TLB Associativity Field Definition:
7302  *
7303  *	Unlike the associativity for the L1 cache and tlb where the 8 bit
7304  *	value is the associativity, the associativity for the L2 cache and
7305  *	tlb is encoded in the following table. The 4 bit L2 value serves as
7306  *	an index into the amd_afd[] array to determine the associativity.
7307  *	-1 is undefined. 0 is fully associative.
7308  */
7309 
7310 static int amd_afd[] =
7311 	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
7312 
7313 static void
7314 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7315 {
7316 	struct cpuid_regs *cp;
7317 	uint_t size, assoc;
7318 	int i;
7319 	int *ip;
7320 
7321 	if (cpi->cpi_xmaxeax < 0x80000006)
7322 		return;
7323 	cp = &cpi->cpi_extd[6];
7324 
7325 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7326 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7327 		uint_t cachesz = size * 1024;
7328 		assoc = amd_afd[i];
7329 
7330 		ASSERT(assoc != -1);
7331 
7332 		if ((ip = l2i->l2i_csz) != NULL)
7333 			*ip = cachesz;
7334 		if ((ip = l2i->l2i_lsz) != NULL)
7335 			*ip = BITX(cp->cp_ecx, 7, 0);
7336 		if ((ip = l2i->l2i_assoc) != NULL)
7337 			*ip = assoc;
7338 		l2i->l2i_ret = cachesz;
7339 	}
7340 }
7341 
7342 int
7343 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7344 {
7345 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7346 	struct l2info __l2info, *l2i = &__l2info;
7347 
7348 	l2i->l2i_csz = csz;
7349 	l2i->l2i_lsz = lsz;
7350 	l2i->l2i_assoc = assoc;
7351 	l2i->l2i_ret = -1;
7352 
7353 	switch (x86_which_cacheinfo(cpi)) {
7354 	case X86_VENDOR_Intel:
7355 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7356 		break;
7357 	case X86_VENDOR_Cyrix:
7358 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7359 		break;
7360 	case X86_VENDOR_AMD:
7361 		amd_l2cacheinfo(cpi, l2i);
7362 		break;
7363 	default:
7364 		break;
7365 	}
7366 	return (l2i->l2i_ret);
7367 }
7368 
7369 #if !defined(__xpv)
7370 
7371 uint32_t *
7372 cpuid_mwait_alloc(cpu_t *cpu)
7373 {
7374 	uint32_t	*ret;
7375 	size_t		mwait_size;
7376 
7377 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7378 
7379 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7380 	if (mwait_size == 0)
7381 		return (NULL);
7382 
7383 	/*
7384 	 * kmem_alloc() returns cache line size aligned data for mwait_size
7385 	 * allocations.  mwait_size is currently cache line sized.  Neither
7386 	 * of these implementation details are guarantied to be true in the
7387 	 * future.
7388 	 *
7389 	 * First try allocating mwait_size as kmem_alloc() currently returns
7390 	 * correctly aligned memory.  If kmem_alloc() does not return
7391 	 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7392 	 *
7393 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7394 	 * decide to free this memory.
7395 	 */
7396 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7397 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7398 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7399 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7400 		*ret = MWAIT_RUNNING;
7401 		return (ret);
7402 	} else {
7403 		kmem_free(ret, mwait_size);
7404 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7405 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7406 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7407 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7408 		*ret = MWAIT_RUNNING;
7409 		return (ret);
7410 	}
7411 }
7412 
7413 void
7414 cpuid_mwait_free(cpu_t *cpu)
7415 {
7416 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7417 		return;
7418 	}
7419 
7420 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7421 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7422 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7423 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7424 	}
7425 
7426 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7427 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7428 }
7429 
7430 void
7431 patch_tsc_read(int flag)
7432 {
7433 	size_t cnt;
7434 
7435 	switch (flag) {
7436 	case TSC_NONE:
7437 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7438 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7439 		break;
7440 	case TSC_RDTSC_LFENCE:
7441 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7442 		(void) memcpy((void *)tsc_read,
7443 		    (void *)&_tsc_lfence_start, cnt);
7444 		break;
7445 	case TSC_TSCP:
7446 		cnt = &_tscp_end - &_tscp_start;
7447 		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7448 		break;
7449 	default:
7450 		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7451 		cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag);
7452 		break;
7453 	}
7454 	tsc_type = flag;
7455 }
7456 
7457 int
7458 cpuid_deep_cstates_supported(void)
7459 {
7460 	struct cpuid_info *cpi;
7461 	struct cpuid_regs regs;
7462 
7463 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7464 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7465 
7466 	cpi = CPU->cpu_m.mcpu_cpi;
7467 
7468 	switch (cpi->cpi_vendor) {
7469 	case X86_VENDOR_Intel:
7470 		if (cpi->cpi_xmaxeax < 0x80000007)
7471 			return (0);
7472 
7473 		/*
7474 		 * Does TSC run at a constant rate in all C-states?
7475 		 */
7476 		regs.cp_eax = 0x80000007;
7477 		(void) __cpuid_insn(&regs);
7478 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7479 
7480 	default:
7481 		return (0);
7482 	}
7483 }
7484 
7485 #endif	/* !__xpv */
7486 
7487 void
7488 post_startup_cpu_fixups(void)
7489 {
7490 #ifndef __xpv
7491 	/*
7492 	 * Some AMD processors support C1E state. Entering this state will
7493 	 * cause the local APIC timer to stop, which we can't deal with at
7494 	 * this time.
7495 	 */
7496 	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7497 		on_trap_data_t otd;
7498 		uint64_t reg;
7499 
7500 		if (!on_trap(&otd, OT_DATA_ACCESS)) {
7501 			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7502 			/* Disable C1E state if it is enabled by BIOS */
7503 			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7504 			    AMD_ACTONCMPHALT_MASK) {
7505 				reg &= ~(AMD_ACTONCMPHALT_MASK <<
7506 				    AMD_ACTONCMPHALT_SHIFT);
7507 				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7508 			}
7509 		}
7510 		no_trap();
7511 	}
7512 #endif	/* !__xpv */
7513 }
7514 
7515 void
7516 enable_pcid(void)
7517 {
7518 	if (x86_use_pcid == -1)
7519 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7520 
7521 	if (x86_use_invpcid == -1) {
7522 		x86_use_invpcid = is_x86_feature(x86_featureset,
7523 		    X86FSET_INVPCID);
7524 	}
7525 
7526 	if (!x86_use_pcid)
7527 		return;
7528 
7529 	/*
7530 	 * Intel say that on setting PCIDE, it immediately starts using the PCID
7531 	 * bits; better make sure there's nothing there.
7532 	 */
7533 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7534 
7535 	setcr4(getcr4() | CR4_PCIDE);
7536 }
7537 
7538 /*
7539  * Setup necessary registers to enable XSAVE feature on this processor.
7540  * This function needs to be called early enough, so that no xsave/xrstor
7541  * ops will execute on the processor before the MSRs are properly set up.
7542  *
7543  * Current implementation has the following assumption:
7544  * - cpuid_pass_basic() is done, so that X86 features are known.
7545  * - fpu_probe() is done, so that fp_save_mech is chosen.
7546  */
7547 void
7548 xsave_setup_msr(cpu_t *cpu)
7549 {
7550 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7551 	ASSERT(fp_save_mech == FP_XSAVE);
7552 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7553 
7554 	/* Enable OSXSAVE in CR4. */
7555 	setcr4(getcr4() | CR4_OSXSAVE);
7556 	/*
7557 	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7558 	 * correct value.
7559 	 */
7560 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7561 	setup_xfem();
7562 }
7563 
7564 /*
7565  * Starting with the Westmere processor the local
7566  * APIC timer will continue running in all C-states,
7567  * including the deepest C-states.
7568  */
7569 int
7570 cpuid_arat_supported(void)
7571 {
7572 	struct cpuid_info *cpi;
7573 	struct cpuid_regs regs;
7574 
7575 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7576 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7577 
7578 	cpi = CPU->cpu_m.mcpu_cpi;
7579 
7580 	switch (cpi->cpi_vendor) {
7581 	case X86_VENDOR_Intel:
7582 		/*
7583 		 * Always-running Local APIC Timer is
7584 		 * indicated by CPUID.6.EAX[2].
7585 		 */
7586 		if (cpi->cpi_maxeax >= 6) {
7587 			regs.cp_eax = 6;
7588 			(void) cpuid_insn(NULL, &regs);
7589 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7590 		} else {
7591 			return (0);
7592 		}
7593 	default:
7594 		return (0);
7595 	}
7596 }
7597 
7598 /*
7599  * Check support for Intel ENERGY_PERF_BIAS feature
7600  */
7601 int
7602 cpuid_iepb_supported(struct cpu *cp)
7603 {
7604 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7605 	struct cpuid_regs regs;
7606 
7607 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7608 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7609 
7610 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7611 		return (0);
7612 	}
7613 
7614 	/*
7615 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
7616 	 * capability bit CPUID.6.ECX.3
7617 	 */
7618 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7619 		return (0);
7620 
7621 	regs.cp_eax = 0x6;
7622 	(void) cpuid_insn(NULL, &regs);
7623 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7624 }
7625 
7626 /*
7627  * Check support for TSC deadline timer
7628  *
7629  * TSC deadline timer provides a superior software programming
7630  * model over local APIC timer that eliminates "time drifts".
7631  * Instead of specifying a relative time, software specifies an
7632  * absolute time as the target at which the processor should
7633  * generate a timer event.
7634  */
7635 int
7636 cpuid_deadline_tsc_supported(void)
7637 {
7638 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7639 	struct cpuid_regs regs;
7640 
7641 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7642 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7643 
7644 	switch (cpi->cpi_vendor) {
7645 	case X86_VENDOR_Intel:
7646 		if (cpi->cpi_maxeax >= 1) {
7647 			regs.cp_eax = 1;
7648 			(void) cpuid_insn(NULL, &regs);
7649 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7650 		} else {
7651 			return (0);
7652 		}
7653 	default:
7654 		return (0);
7655 	}
7656 }
7657 
7658 #if !defined(__xpv)
7659 /*
7660  * Patch in versions of bcopy for high performance Intel Nhm processors
7661  * and later...
7662  */
7663 void
7664 patch_memops(uint_t vendor)
7665 {
7666 	size_t cnt, i;
7667 	caddr_t to, from;
7668 
7669 	if ((vendor == X86_VENDOR_Intel) &&
7670 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7671 		cnt = &bcopy_patch_end - &bcopy_patch_start;
7672 		to = &bcopy_ck_size;
7673 		from = &bcopy_patch_start;
7674 		for (i = 0; i < cnt; i++) {
7675 			*to++ = *from++;
7676 		}
7677 	}
7678 }
7679 #endif  /*  !__xpv */
7680 
7681 /*
7682  * We're being asked to tell the system how many bits are required to represent
7683  * the various thread and strand IDs. While it's tempting to derive this based
7684  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7685  * correct. Instead, this needs to be based on the number of bits that the APIC
7686  * allows for these different configurations. We only update these to a larger
7687  * value if we find one.
7688  */
7689 void
7690 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7691 {
7692 	struct cpuid_info *cpi;
7693 
7694 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7695 	cpi = cpu->cpu_m.mcpu_cpi;
7696 
7697 	if (cpi->cpi_ncore_bits > *core_nbits) {
7698 		*core_nbits = cpi->cpi_ncore_bits;
7699 	}
7700 
7701 	if (cpi->cpi_nthread_bits > *strand_nbits) {
7702 		*strand_nbits = cpi->cpi_nthread_bits;
7703 	}
7704 }
7705 
7706 void
7707 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7708 {
7709 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7710 	struct cpuid_regs cp;
7711 
7712 	/*
7713 	 * Reread the CPUID portions that we need for various security
7714 	 * information.
7715 	 */
7716 	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7717 		/*
7718 		 * Check if we now have leaf 7 available to us.
7719 		 */
7720 		if (cpi->cpi_maxeax < 7) {
7721 			bzero(&cp, sizeof (cp));
7722 			cp.cp_eax = 0;
7723 			cpi->cpi_maxeax = __cpuid_insn(&cp);
7724 			if (cpi->cpi_maxeax < 7)
7725 				return;
7726 		}
7727 
7728 		bzero(&cp, sizeof (cp));
7729 		cp.cp_eax = 7;
7730 		cp.cp_ecx = 0;
7731 		(void) __cpuid_insn(&cp);
7732 		cpi->cpi_std[7] = cp;
7733 	} else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
7734 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
7735 		/* No xcpuid support */
7736 		if (cpi->cpi_family < 5 ||
7737 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7738 			return;
7739 
7740 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7741 			bzero(&cp, sizeof (cp));
7742 			cp.cp_eax = CPUID_LEAF_EXT_0;
7743 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7744 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7745 				return;
7746 			}
7747 		}
7748 
7749 		bzero(&cp, sizeof (cp));
7750 		cp.cp_eax = CPUID_LEAF_EXT_8;
7751 		(void) __cpuid_insn(&cp);
7752 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7753 		cpi->cpi_extd[8] = cp;
7754 	} else {
7755 		/*
7756 		 * Nothing to do here. Return an empty set which has already
7757 		 * been zeroed for us.
7758 		 */
7759 		return;
7760 	}
7761 	cpuid_scan_security(cpu, fset);
7762 }
7763 
7764 /* ARGSUSED */
7765 static int
7766 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7767 {
7768 	uchar_t *fset;
7769 	boolean_t first_pass = (boolean_t)arg1;
7770 
7771 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7772 	if (first_pass && CPU->cpu_id != 0)
7773 		return (0);
7774 	if (!first_pass && CPU->cpu_id == 0)
7775 		return (0);
7776 	cpuid_pass_ucode(CPU, fset);
7777 
7778 	return (0);
7779 }
7780 
7781 /*
7782  * After a microcode update where the version has changed, then we need to
7783  * rescan CPUID. To do this we check every CPU to make sure that they have the
7784  * same microcode. Then we perform a cross call to all such CPUs. It's the
7785  * caller's job to make sure that no one else can end up doing an update while
7786  * this is going on.
7787  *
7788  * We assume that the system is microcode capable if we're called.
7789  */
7790 void
7791 cpuid_post_ucodeadm(void)
7792 {
7793 	uint32_t rev;
7794 	int i;
7795 	struct cpu *cpu;
7796 	cpuset_t cpuset;
7797 	void *argdata;
7798 	uchar_t *f0;
7799 
7800 	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7801 
7802 	mutex_enter(&cpu_lock);
7803 	cpu = cpu_get(0);
7804 	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7805 	CPUSET_ONLY(cpuset, 0);
7806 	for (i = 1; i < max_ncpus; i++) {
7807 		if ((cpu = cpu_get(i)) == NULL)
7808 			continue;
7809 
7810 		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7811 			panic("post microcode update CPU %d has differing "
7812 			    "microcode revision (%u) from CPU 0 (%u)",
7813 			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7814 		}
7815 		CPUSET_ADD(cpuset, i);
7816 	}
7817 
7818 	/*
7819 	 * We do the cross calls in two passes. The first pass is only for the
7820 	 * boot CPU. The second pass is for all of the other CPUs. This allows
7821 	 * the boot CPU to go through and change behavior related to patching or
7822 	 * whether or not Enhanced IBRS needs to be enabled and then allow all
7823 	 * other CPUs to follow suit.
7824 	 */
7825 	kpreempt_disable();
7826 	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7827 	    cpuid_post_ucodeadm_xc);
7828 	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7829 	    cpuid_post_ucodeadm_xc);
7830 	kpreempt_enable();
7831 
7832 	/*
7833 	 * OK, now look at each CPU and see if their feature sets are equal.
7834 	 */
7835 	f0 = argdata;
7836 	for (i = 1; i < max_ncpus; i++) {
7837 		uchar_t *fset;
7838 		if (!CPU_IN_SET(cpuset, i))
7839 			continue;
7840 
7841 		fset = (uchar_t *)((uintptr_t)argdata +
7842 		    sizeof (x86_featureset) * i);
7843 
7844 		if (!compare_x86_featureset(f0, fset)) {
7845 			panic("Post microcode update CPU %d has "
7846 			    "differing security feature (%p) set from CPU 0 "
7847 			    "(%p), not appending to feature set", i,
7848 			    (void *)fset, (void *)f0);
7849 		}
7850 	}
7851 
7852 	mutex_exit(&cpu_lock);
7853 
7854 	for (i = 0; i < NUM_X86_FEATURES; i++) {
7855 		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7856 		    x86_feature_names[i]);
7857 		if (is_x86_feature(f0, i)) {
7858 			add_x86_feature(x86_featureset, i);
7859 		}
7860 	}
7861 	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7862 }
7863 
7864 typedef void (*cpuid_pass_f)(cpu_t *, void *);
7865 
7866 typedef struct cpuid_pass_def {
7867 	cpuid_pass_t cpd_pass;
7868 	cpuid_pass_f cpd_func;
7869 } cpuid_pass_def_t;
7870 
7871 /*
7872  * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
7873  * normal sense and should not appear here.
7874  */
7875 static const cpuid_pass_def_t cpuid_pass_defs[] = {
7876 	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
7877 	{ CPUID_PASS_IDENT, cpuid_pass_ident },
7878 	{ CPUID_PASS_BASIC, cpuid_pass_basic },
7879 	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
7880 	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
7881 	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
7882 };
7883 
7884 void
7885 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
7886 {
7887 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
7888 
7889 	if (cp == NULL)
7890 		cp = CPU;
7891 
7892 	/*
7893 	 * Space statically allocated for BSP, ensure pointer is set
7894 	 */
7895 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
7896 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
7897 
7898 	ASSERT(cpuid_checkpass(cp, pass - 1));
7899 
7900 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
7901 		if (cpuid_pass_defs[i].cpd_pass == pass) {
7902 			cpuid_pass_defs[i].cpd_func(cp, arg);
7903 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
7904 			return;
7905 		}
7906 	}
7907 
7908 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
7909 	    pass, cp->cpu_id);
7910 }
7911 
7912 /*
7913  * Extract the processor family from a chiprev.  Processor families are not the
7914  * same as cpuid families; see comments above and in x86_archext.h.
7915  */
7916 x86_processor_family_t
7917 chiprev_family(const x86_chiprev_t cr)
7918 {
7919 	return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
7920 }
7921 
7922 /*
7923  * A chiprev matches its template if the vendor and family are identical and the
7924  * revision of the chiprev matches one of the bits set in the template.  Callers
7925  * may bitwise-OR together chiprevs of the same vendor and family to form the
7926  * template, or use the _ANY variant.  It is not possible to match chiprevs of
7927  * multiple vendors or processor families with a single call.  Note that this
7928  * function operates on processor families, not cpuid families.
7929  */
7930 boolean_t
7931 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
7932 {
7933 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
7934 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
7935 	    (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
7936 }
7937 
7938 /*
7939  * A chiprev is at least min if the vendor and family are identical and the
7940  * revision of the chiprev is at least as recent as that of min.  Processor
7941  * families are considered unordered and cannot be compared using this function.
7942  * Note that this function operates on processor families, not cpuid families.
7943  * Use of the _ANY chiprev variant with this function is not useful; it will
7944  * always return B_FALSE if the _ANY variant is supplied as the minimum
7945  * revision.  To determine only whether a chiprev is of a given processor
7946  * family, test the return value of chiprev_family() instead.
7947  */
7948 boolean_t
7949 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
7950 {
7951 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
7952 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
7953 	    _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
7954 }
7955 
7956 /*
7957  * The uarch functions operate in a manner similar to the chiprev functions
7958  * above.  While it is tempting to allow these to operate on microarchitectures
7959  * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
7960  * than ZEN2), we elect not to do so because a manufacturer may supply
7961  * processors of multiple different microarchitecture families each of which may
7962  * be internally ordered but unordered with respect to those of other families.
7963  */
7964 x86_uarch_t
7965 uarchrev_uarch(const x86_uarchrev_t ur)
7966 {
7967 	return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
7968 }
7969 
7970 boolean_t
7971 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
7972 {
7973 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
7974 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
7975 	    (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
7976 }
7977 
7978 boolean_t
7979 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
7980 {
7981 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
7982 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
7983 	    _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
7984 }
7985