xref: /illumos-gate/usr/src/uts/intel/os/cpuid.c (revision ed093b41)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2023 Oxide Computer Company
28  * Copyright 2022 MNX Cloud, Inc.
29  */
30 /*
31  * Copyright (c) 2010, Intel Corporation.
32  * All rights reserved.
33  */
34 /*
35  * Portions Copyright 2009 Advanced Micro Devices, Inc.
36  */
37 
38 /*
39  * CPU Identification logic
40  *
41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42  * with the identification of CPUs, their features, and their topologies. More
43  * specifically, this file helps drive the following:
44  *
45  * 1. Enumeration of features of the processor which are used by the kernel to
46  *    determine what features to enable or disable. These may be instruction set
47  *    enhancements or features that we use.
48  *
49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
50  *    will be told about through the auxiliary vector.
51  *
52  * 3. Understanding the physical topology of the CPU such as the number of
53  *    caches, how many cores it has, whether or not it supports symmetric
54  *    multi-processing (SMT), etc.
55  *
56  * ------------------------
57  * CPUID History and Basics
58  * ------------------------
59  *
60  * The cpuid instruction was added by Intel roughly around the time that the
61  * original Pentium was introduced. The purpose of cpuid was to report, in a
62  * programmatic fashion, information about the CPU that previously had to be guessed
63  * at. For example, an important part of cpuid is that we can know what
64  * extensions to the ISA exist. If you used an invalid opcode you would get a
65  * #UD, so this method allows a program (whether a user program or the kernel)
66  * to determine what exists without crashing or getting a SIGILL. Of course,
67  * this was also during the era of the clones and the AMD Am5x86. The vendor
68  * name shows up first in cpuid for a reason.
69  *
70  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72  * its own meaning. The different leaves are broken down into different regions:
73  *
74  *	[ 0, 7fffffff ]			This region is called the 'basic'
75  *					region. This region is generally defined
76  *					by Intel, though some of the original
77  *					portions have different meanings based
78  *					on the manufacturer. These days, Intel
79  *					adds most new features to this region.
80  *					AMD adds non-Intel compatible
81  *					information in the third, extended
82  *					region. Intel uses this for everything
83  *					including ISA extensions, CPU
84  *					features, cache information, topology,
85  *					and more.
86  *
87  *					There is a hole carved out of this
88  *					region which is reserved for
89  *					hypervisors.
90  *
91  *	[ 40000000, 4fffffff ]		This region, which is found in the
92  *					middle of the previous region, is
93  *					explicitly promised to never be used by
94  *					CPUs. Instead, it is used by hypervisors
95  *					to communicate information about
96  *					themselves to the operating system. The
97  *					values and details are unique for each
98  *					hypervisor.
99  *
100  *	[ 80000000, ffffffff ]		This region is called the 'extended'
101  *					region. Some of the low leaves mirror
102  *					parts of the basic leaves. This region
103  *					has generally been used by AMD for
104  *					various extensions. For example, AMD-
105  *					specific information about caches,
106  *					features, and topology are found in this
107  *					region.
108  *
109  * To read a leaf, you place the desired leaf number into %eax, zero %ebx, %ecx,
110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111  * the ranges, one of the primary things returned is the maximum valid leaf in
112  * that range. This allows for discovery of what range of CPUID is valid.
113  *
114  * The CPUs have potentially surprising behavior when using an invalid leaf or
115  * unimplemented leaf. If the requested leaf is within the valid basic or
116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117  * set to zero. However, if you specify a leaf that is outside of a valid range,
118  * then instead it will be filled with the last valid _basic_ leaf. For example,
119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120  * an invalid extended leaf will return the information for leaf 3.
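 *
 * As a concrete illustration of the above, the following hedged userland
 * sketch (not kernel code; it assumes GCC/Clang's <cpuid.h> wrappers rather
 * than the raw instruction) discovers the maximum basic and extended leaves:
 *
 *	#include <stdio.h>
 *	#include <cpuid.h>
 *
 *	int
 *	main(void)
 *	{
 *		unsigned int eax, ebx, ecx, edx;
 *
 *		// Leaf 0: %eax returns the maximum valid basic leaf.
 *		__cpuid(0, eax, ebx, ecx, edx);
 *		printf("max basic leaf: 0x%x\n", eax);
 *
 *		// Leaf 0x80000000: %eax returns the maximum extended leaf.
 *		__cpuid(0x80000000, eax, ebx, ecx, edx);
 *		printf("max extended leaf: 0x%x\n", eax);
 *		return (0);
 *	}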
121  *
122  * Some leaves are broken down into sub-leaves. This means that the value
123  * depends on both the leaf asked for in %eax and a secondary register. For
124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125  * additional information. Or when getting topology information in leaf 0xb, the
126  * initial value in %ecx changes which level of the topology you are
127  * getting information about.
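 *
 * A sub-leaf query simply places the sub-leaf number in %ecx before issuing
 * cpuid. A brief hedged sketch, again using the <cpuid.h> wrappers (an
 * assumption; the kernel uses its own accessors), for leaf 7 sub-leaf 0:
 *
 *	#include <stdio.h>
 *	#include <cpuid.h>
 *
 *	int
 *	main(void)
 *	{
 *		unsigned int eax, ebx, ecx, edx;
 *
 *		// Leaf 7, sub-leaf 0: structured extended feature flags.
 *		__cpuid_count(7, 0, eax, ebx, ecx, edx);
 *		printf("leaf 7.0 %%ebx feature bits: 0x%x\n", ebx);
 *		return (0);
 *	}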
128  *
129  * cpuid values are always kept to 32 bits regardless of whether or not the
130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
131  * 32 bits of the register are always set to zero so that the values are the
132  * same regardless of execution mode.
133  *
134  * ----------------------
135  * Identifying Processors
136  * ----------------------
137  *
138  * We can identify a processor in two steps. The first step looks at cpuid leaf
139  * 0. Leaf 0 contains the processor's vendor information, returned as a
140  * 12 character string spread across %ebx, %edx, and %ecx. On AMD, it is
141  * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
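 *
 * For example, the vendor string can be reassembled as follows (an
 * illustrative userland sketch using the <cpuid.h> wrappers; note that the
 * pieces come back in %ebx, %edx, %ecx order):
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <cpuid.h>
 *
 *	int
 *	main(void)
 *	{
 *		unsigned int eax, ebx, ecx, edx;
 *		char vendor[13];
 *
 *		__cpuid(0, eax, ebx, ecx, edx);
 *		(void) memcpy(vendor, &ebx, 4);
 *		(void) memcpy(vendor + 4, &edx, 4);
 *		(void) memcpy(vendor + 8, &ecx, 4);
 *		vendor[12] = '\0';
 *		printf("%s\n", vendor);		// e.g. "GenuineIntel"
 *		return (0);
 *	}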
142  *
143  * From there, a processor is identified by a combination of three different
144  * values:
145  *
146  *  1. Family
147  *  2. Model
148  *  3. Stepping
149  *
150  * Each vendor uses the family and model to uniquely identify a processor. The
151  * way that family and model are changed depends on the vendor. For example,
152  * Intel has been using family 0x6 for almost all of their processors since the
153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
154  * identify the exact processor. Different models are often used for the client
155  * (consumer) and server parts. Even though each processor often has major
156  * architectural differences, they still are considered the same family by
157  * Intel.
158  *
159  * On the other hand, each major AMD architecture generally has its own family.
160  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within a
161  * family, the model number is used to help identify specific processors.  As AMD's
162  * product lines have expanded, they have started putting a mixed bag of
163  * processors into the same family, with each processor under a single
164  * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
165  * refer to each such collection as a processor family, distinct from cpuid
166  * family.  Importantly, each processor family has a BIOS and Kernel Developer's
167  * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168  * defines the processor family's non-architectural features.  In general, we'll
169  * use "family" here to mean the family number reported by the cpuid instruction
170  * and distinguish the processor family from it where appropriate.
171  *
172  * The stepping is used to refer to a revision of a specific microprocessor. The
173  * term comes from equipment used to produce masks that are used to create
174  * integrated circuits.
175  *
176  * The information is present in leaf 1, %eax. In technical documentation you
177  * will see the terms extended model and extended family. The original family,
178  * model, and stepping fields were each 4 bits wide. If the base family field is
179  * 0xf, then one must also consult the extended family and extended model
180  * fields, which occupy previously reserved bits: the extended family is added
181  * to the base family, and the extended model supplies the upper four bits of the model.
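 *
 * A sketch of the common decoding of leaf 1 %eax follows. The macro names are
 * illustrative, not the kernel's; this covers the shared rule for a base
 * family of 0xf (Intel additionally applies the extended model when the base
 * family is 0x6):
 *
 *	#include <inttypes.h>
 *
 *	#define	CPUID_STEPPING(eax)	((eax) & 0xf)
 *	#define	CPUID_BASE_MODEL(eax)	(((eax) >> 4) & 0xf)
 *	#define	CPUID_BASE_FAMILY(eax)	(((eax) >> 8) & 0xf)
 *	#define	CPUID_EXT_MODEL(eax)	(((eax) >> 16) & 0xf)
 *	#define	CPUID_EXT_FAMILY(eax)	(((eax) >> 20) & 0xff)
 *
 *	static void
 *	decode_fms(uint32_t eax, uint32_t *family, uint32_t *model,
 *	    uint32_t *step)
 *	{
 *		*family = CPUID_BASE_FAMILY(eax);
 *		*model = CPUID_BASE_MODEL(eax);
 *		*step = CPUID_STEPPING(eax);
 *		if (*family == 0xf) {
 *			*family += CPUID_EXT_FAMILY(eax);
 *			*model |= CPUID_EXT_MODEL(eax) << 4;
 *		}
 *	}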
182  *
183  * When we process this information, we store the full family, model, and
184  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185  * cpi_step, respectively. Whenever you are performing comparisons with the
186  * family, model, and stepping, you should use these members and not the raw
187  * values from cpuid. If you must use the raw values from cpuid directly, you
188  * must make sure that you add the extended model and family to the base model
189  * and family.
190  *
191  * In general, we do not use information about the family, model, and stepping
192  * to determine whether or not a feature is present; that is generally driven by
193  * specific leaves. However, when something we care about on the processor is
194  * not considered 'architectural' meaning that it is specific to a set of
195  * processors and not promised in the architecture model to be consistent from
196  * generation to generation, then we will fall back on this information. The
197  * most common cases where this comes up are when we have to work around errata
198  * in the processor, when we are dealing with processor-specific features such as
199  * CPU performance counters, or when we want to provide additional information for things
200  * such as fault management.
201  *
202  * While processors also do have a brand string, which is the name that people
203  * are familiar with when buying the processor, it is not meant for
204  * programmatic consumption. That is what the family, model, and stepping are
205  * for.
206  *
207  * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208  * and stepping(s) that refer to a single or very closely related set of silicon
209  * implementations; while there are sometimes more specific ways to learn of the
210  * presence or absence of a particular erratum or workaround, one may generally
211  * assume that all processors of the same chiprev have the same errata and we
212  * have chosen to represent them this way precisely because that is how AMD
213  * groups them in their revision guides (errata documentation).  The processor
214  * family (x86_processor_family_t) may be extracted from the chiprev if that
215  * level of detail is not needed.  Processor families are considered unordered
216  * but revisions within a family may be compared for either an exact match or at
217  * least as recent as a reference revision.  See the chiprev_xxx() functions
218  * below.
219  *
220  * Similarly, each processor family implements a particular microarchitecture,
221  * which itself may have multiple revisions.  In general, non-architectural
222  * features are specific to a processor family, but some may exist across
223  * families containing cores that implement the same microarchitectural revision
224  * (and, such cores share common bugs, too).  We provide utility routines
225  * analogous to those for extracting and comparing chiprevs for
226  * microarchitectures as well; see the uarch_xxx() functions.
227  *
228  * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229  * present used and available only for AMD and AMD-like processors.
230  *
231  * ------------
232  * CPUID Passes
233  * ------------
234  *
235  * As part of performing feature detection, we break this into several different
236  * passes. There used to be a pass 0 that was done from assembly in locore.s to
237  * support processors that have a missing or broken cpuid instruction (notably
238  * certain Cyrix processors) but those were all 32-bit processors which are no
239  * longer supported. Passes are no longer numbered explicitly to make it easier
240  * to break them up or move them around as needed; however, they still have a
241  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242  * x86_archext.h. The external interface to execute a cpuid pass or determine
243  * whether a pass has been completed consists of cpuid_execpass() and
244  * cpuid_checkpass() respectively.  The passes now, in that execution order,
245  * are as follows:
246  *
247  *	PRELUDE		This pass does not have any dependencies on system
248  *			setup; in particular, unlike all subsequent passes it is
249  *			guaranteed not to require PCI config space access.  It
250  *			sets the flag indicating that the processor we are
251  *			running on supports the cpuid instruction, which all
252  *			64-bit processors do.  This would also be the place to
253  *			add any other basic state that is required later on and
254  *			can be learned without dependencies.
255  *
256  *	IDENT		Determine which vendor manufactured the CPU, the family,
257  *			model, and stepping information, and compute basic
258  *			identifying tags from those values.  This is done first
259  *			so that machine-dependent code can control the features
260  *			the cpuid instruction will report during subsequent
261  *			passes if needed, and so that any intervening
262  *			machine-dependent code that needs basic identity will
263  *			have it available.  This includes synthesised
264  *			identifiers such as chiprev and uarchrev as well as the
265  *			values obtained directly from cpuid.  Prior to executing
266  * this pass, machine-dependent boot code is responsible for
267  *			ensuring that the PCI configuration space access
268  *			functions have been set up and, if necessary, that
269  *			determine_platform() has been called.
270  *
271  *	BASIC		This is the primary pass and is responsible for doing a
272  *			large number of different things:
273  *
274  *			1. Gathering a large number of feature flags to
275  *			determine which features the CPU supports and which
276  *			indicate things that we need to do other work in the OS
277  *			to enable. Features detected this way are added to the
278  *			x86_featureset which can be queried to
279  *			determine what we should do. This includes processing
280  *			all of the basic and extended CPU features that we care
281  *			about.
282  *
283  *			2. Determining the CPU's topology. This includes
284  *			information about how many cores and threads are present
285  *			in the package. It also is responsible for figuring out
286  *			which logical CPUs are potentially part of the same core
287  *			and what other resources they might share. For more
288  *			information see the 'Topology' section.
289  *
290  *			3. Determining the set of CPU security-specific features
291  *			that we need to worry about and determining the
292  *			appropriate set of workarounds.
293  *
294  *			On the boot CPU, this pass is run before KMDB is started.
295  *
296  *	EXTENDED	This pass is done after startup(). Here, we check
297  *			other miscellaneous features. Most of this is gathering
298  *			additional basic and extended features that we'll use in
299  *			later passes or for debugging support.
300  *
301  *	DYNAMIC		This pass occurs after the kernel memory allocator
302  *			has been fully initialized. This gathers information
303  *			where we might need dynamic memory available for our
304  *			uses. This includes several varying width leaves that
305  *			have cache information and the processor's brand string.
306  *
307  *	RESOLVE		The final normal pass is performed after the
308  *			kernel has brought most everything online. This is
309  *			invoked from post_startup(). In this pass, we go through
310  *			the set of features that we have enabled and turn that
311  *			into the hardware auxiliary vector features that
312  *			userland receives. This is used by userland, primarily
313  *			by the run-time link-editor (RTLD), though userland
314  *			software could also refer to it directly.
315  *
316  * The function that performs a pass is currently assumed to be infallible, and
317  * all existing implementations are.  This simplifies callers by allowing
318  * cpuid_execpass() to return void. Similarly, implementers do not need to check
319  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320  * Both of these assumptions can be relaxed if needed by future developments.
321  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322  * error to attempt to execute a pass before all previous passes have been
323  * completed on the specified CPU, or to request cpuid information before the
324  * pass that captures it has been executed.  These conditions can be tested
325  * using cpuid_checkpass().
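 *
 * Purely to illustrate the ordering contract described above, a simplified,
 * hypothetical sketch of per-CPU pass tracking might look like the following
 * (this is not the kernel's actual mechanism and the names are invented):
 *
 *	#include <assert.h>
 *	#include <stdbool.h>
 *
 *	typedef enum {
 *		PASS_NONE = 0,
 *		PASS_PRELUDE,
 *		PASS_IDENT,
 *		PASS_BASIC,
 *		PASS_EXTENDED,
 *		PASS_DYNAMIC,
 *		PASS_RESOLVE
 *	} pass_t;
 *
 *	static pass_t completed = PASS_NONE;
 *
 *	static bool
 *	pass_done(pass_t p)
 *	{
 *		return (completed >= p);
 *	}
 *
 *	static void
 *	pass_exec(pass_t p, void (*func)(void))
 *	{
 *		// Programmer error to skip ahead in the ordering.
 *		assert(completed == p - 1);
 *		func();
 *		completed = p;
 *	}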
326  *
327  * The Microcode Pass
328  *
329  * After a microcode update, we do a selective rescan of the cpuid leaves to
330  * determine what features have changed. Microcode updates can provide more
331  * details about security related features to deal with issues like Spectre and
332  * L1TF. On occasion, vendors have violated their contract and removed bits.
333  * However, we don't try to detect that because that puts us in a situation that
334  * we really can't deal with. As such, the only things we rescan are security
335  * related features today. See cpuid_pass_ucode().  This pass may be run in a
336  * different sequence on APs and therefore is not part of the sequential order;
337  * it is invoked directly instead of by cpuid_execpass() and its completion
338  * status cannot be checked by cpuid_checkpass().  This could be integrated with
339  * a more complex dependency mechanism if warranted by future developments.
340  *
341  * All of the passes are run on all CPUs. However, for the most part we only
342  * care about what the boot CPU says about this information and use the other
343  * CPUs as a rough guide to sanity check that we have the same feature set.
344  *
345  * We do not support running multiple logical CPUs with different, let alone
346  * disjoint, feature sets.
347  *
348  * ------------------
349  * Processor Topology
350  * ------------------
351  *
352  * One of the important things that we need to do is to understand the topology
353  * of the underlying processor. When we say topology in this case, we're trying
354  * to understand the relationship between the logical CPUs that the operating
355  * system sees and the underlying physical layout. Different logical CPUs may
356  * share different resources which can have important consequences for the
357  * performance of the system. For example, they may share caches, execution
358  * units, and more.
359  *
360  * The topology of the processor changes from generation to generation and
361  * vendor to vendor.  Along with that, different vendors use different
362  * terminology, and the operating system itself uses occasionally overlapping
363  * terminology. It's important to understand what this topology looks like so
364  * one can understand the different things that we try to calculate and
365  * determine.
366  *
367  * To get started, let's talk about a little bit of terminology that we've used
368  * so far, is used throughout this file, and is fairly generic across multiple
369  * vendors:
370  *
371  * CPU
372  *	A central processing unit (CPU) refers to a logical and/or virtual
373  *	entity that the operating system can execute instructions on. The
374  *	underlying resources for this CPU may be shared between multiple
375  *	entities; however, to the operating system it is a discrete unit.
376  *
377  * PROCESSOR and PACKAGE
378  *
379  *	Generally, when we use the term 'processor' on its own, we are referring
380  *	to the physical entity that one buys and plugs into a board. However,
381  *	because processor has been overloaded and one might see it used to mean
382  *	multiple different levels, we will instead use the term 'package' for
383  *	the rest of this file. The term package comes from the electrical
384  *	engineering side and refers to the physical entity that encloses the
385  *	electronics inside. Strictly speaking the package can contain more than
386  *	just the CPU, for example, on many processors it may also have what's
387  *	called an 'integrated graphics processing unit (GPU)'. Because the
388  *	package can encapsulate multiple units, it is the largest physical unit
389  *	that we refer to.
390  *
391  * SOCKET
392  *
393  *	A socket refers to a unit on a system board (generally the motherboard)
394  *	that can receive a package. A single package, or processor, is plugged
395  *	into a single socket. A system may have multiple sockets. Often times,
396  *	the term socket is used interchangeably with package and refers to the
397  *	electrical component that has been plugged in, and not the receptacle itself.
398  *
399  * CORE
400  *
401  *	A core refers to the physical instantiation of a CPU, generally, with a
402  *	full set of hardware resources available to it. A package may contain
403  *	multiple cores inside of it or it may just have a single one. A
404  *	processor with more than one core is often referred to as 'multi-core'.
405  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
406  *	that has 'multi-core' processors.
407  *
408  *	A core may expose a single logical CPU to the operating system, or it
409  *	may expose multiple CPUs, which we call threads, defined below.
410  *
411  *	Some resources may still be shared by cores in the same package. For
412  *	example, many processors will share the level 3 cache between cores.
413  *	Some AMD generations share hardware resources between cores. For more
414  *	information on that see the section 'AMD Topology'.
415  *
416  * THREAD and STRAND
417  *
418  *	In this file, generally a thread refers to a hardware resource and not
419  *	the operating system's logical abstraction. A thread is always exposed
420  *	as an independent logical CPU to the operating system. A thread belongs
421  *	to a specific core. A core may have more than one thread. When that is
422  *	the case, the threads that are part of the same core are often referred
423  *	to as 'siblings'.
424  *
425  *	When multiple threads exist, this is generally referred to as
426  *	simultaneous multi-threading (SMT). When Intel introduced this in their
427  *	processors they called it hyper-threading (HT). When multiple threads
428  *	are active in a core, they split the resources of the core. For example,
429  *	two threads may share the same set of hardware execution units.
430  *
431  *	The operating system often uses the term 'strand' to refer to a thread.
432  *	This helps disambiguate it from the software concept.
433  *
434  * CHIP
435  *
436  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
437  *	base meaning, it is used to refer to a single integrated circuit, which
438  *	may or may not be the only thing in the package. In illumos, when you
439  *	see the term 'chip' it is almost always referring to the same thing as
440  *	the 'package'. However, many vendors may use chip to refer to one of
441  *	many integrated circuits that have been placed in the package. As an
442  *	example, see the subsequent definition.
443  *
444  *	To try and keep things consistent, we will only use chip when referring
445  *	to the entire integrated circuit package, with the exception of the
446  *	definition of multi-chip module (because it is in the name) and use the
447  *	term 'die' when we want the more general, potential sub-component
448  *	definition.
449  *
450  * DIE
451  *
452  *	A die refers to an integrated circuit. Inside of the package there may
453  *	be a single die or multiple dies. This is sometimes called a 'chip' in
454  *	vendor's parlance, but in this file, we use the term die to refer to a
455  *	subcomponent.
456  *
457  * MULTI-CHIP MODULE
458  *
459  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
460  *	are connected together in the same package. When a multi-chip design is
461  *	used, generally each chip is manufactured independently and then joined
462  *	together in the package. For example, on AMD's Zen microarchitecture
463  *	(family 0x17), the package contains several dies (the second meaning of
464  *	chip from above) that are connected together.
465  *
466  * CACHE
467  *
468  *	A cache is a part of the processor that maintains copies of recently
469  *	accessed memory. Caches are split into levels and then into types.
470  *	Commonly there are one to three levels, called level one, two, and
471  *	three. The lower the level, the smaller it is, the closer it is to the
472  *	execution units of the CPU, and the faster it is to access. The layout
473  *	and design of the cache come in many different flavors, consult other
474  *	resources for a discussion of those.
475  *
476  *	Caches are generally split into two types, the instruction and data
477  *	cache. The caches contain what their names suggest, the instruction
478  *	cache has executable program text, while the data cache has all other
479  *	memory that the processor accesses. As of this writing, data is kept
480  *	coherent between all of the caches on x86, so if one modifies program
481  *	text before it is executed, that will be in the data cache, and the
482  *	instruction cache will be synchronized with that change when the
483  *	processor actually executes those instructions. This coherency also
484  *	covers the fact that data could show up in multiple caches.
485  *
486  *	Generally, the lowest level caches are specific to a core. However, the
487  *	last level cache is shared between some number of cores. The number of
488  *	CPUs sharing this last level cache is important. This has implications
489  *	for the choices that the scheduler makes, as accessing memory that might
490  *	be in a remote cache after thread migration can be quite expensive.
491  *
492  *	Sometimes, the word cache is abbreviated with a '$', because in US
493  *	English the word cache is pronounced the same as cash. So L1D$ refers to
494  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
495  *	in the rest of this theory statement for clarity.
496  *
497  * MEMORY CONTROLLER
498  *
499  *	The memory controller is a component that provides access to DRAM. Each
500  *	memory controller can access a set number of DRAM channels. Each channel
501  *	can have a number of DIMMs (sticks of memory) associated with it. A
502  *	given package may have more than one memory controller. The association
503  *	of the memory controller to a group of cores is important as it is
504  *	cheaper to access memory on the controller that you are associated with.
505  *
506  * NUMA
507  *
508  *	NUMA, or non-uniform memory access, describes a way that systems are
509  *	built. On x86, any processor core can address all of the memory in the
510  *	system. However, when using multiple sockets or possibly within a
511  *	multi-chip module, some of that memory is physically closer and some of
512  *	it is further. Memory that is further away is more expensive to access.
513  *	Consider the following image of multiple sockets with memory:
514  *
515  *	+--------+                                                +--------+
516  *	| DIMM A |         +----------+      +----------+         | DIMM D |
517  *	+--------+-+       |          |      |          |       +-+------+-+
518  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
519  *	  +--------+-+     |          |      |          |     +-+------+-+
520  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
521  *	    +--------+                                        +--------+
522  *
523  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
524  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
525  *	access DIMMs A-C and more expensive to access D-F as it has to go
526  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
527  *	D-F are cheaper than A-C. While the socket form is the most common, when
528  *	using multi-chip modules, this can also sometimes occur. For another
529  *	example of this that's more involved, see the AMD topology section.
530  *
531  *
532  * Intel Topology
533  * --------------
534  *
535  * Most Intel processors since Nehalem (as of this writing the current gen
536  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
537  * the package is a single monolithic die. MCMs currently aren't used. Most
538  * parts have three levels of caches, with the L3 cache being shared between
539  * all of the cores on the package. The L1/L2 cache is generally specific to
540  * an individual core. The following image shows at a simplified level what
541  * this looks like. The memory controller is commonly part of something called
542  * the 'Uncore', which used to be separate physical chips that were not a part of
543  * the package, but are now part of the same chip.
544  *
545  *  +-----------------------------------------------------------------------+
546  *  | Package                                                               |
547  *  |  +-------------------+  +-------------------+  +-------------------+  |
548  *  |  | Core              |  | Core              |  | Core              |  |
549  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
550  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
551  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
552  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
553  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
554  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
555  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
556  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
557  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
558  *  |  +-------------------+  +-------------------+  +-------------------+  |
559  *  | +-------------------------------------------------------------------+ |
560  *  | |                         Shared L3 Cache                           | |
561  *  | +-------------------------------------------------------------------+ |
562  *  | +-------------------------------------------------------------------+ |
563  *  | |                        Memory Controller                          | |
564  *  | +-------------------------------------------------------------------+ |
565  *  +-----------------------------------------------------------------------+
566  *
567  * A side effect of this current architecture is that what we care about from a
568  * scheduling and topology perspective is simplified. In general we care about
569  * understanding which logical CPUs are part of the same core and socket.
570  *
571  * To determine the relationship between threads and cores, Intel initially used
572  * the identifier in the advanced programmable interrupt controller (APIC). They
573  * also added cpuid leaf 4 to give additional information about the number of
574  * threads and CPUs in the processor. With the addition of x2apic (which
575  * increased the number of addressable logical CPUs from 8 bits to 32 bits), an
576  * additional cpuid topology leaf 0xB was added.
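 *
 * As a hedged userland sketch of walking leaf 0xB (using the <cpuid.h>
 * wrappers): each sub-leaf reports a level type in %ecx[15:8] (1 = SMT,
 * 2 = core, 0 = no more levels) and an APIC ID shift width in %eax[4:0]:
 *
 *	#include <stdio.h>
 *	#include <cpuid.h>
 *
 *	int
 *	main(void)
 *	{
 *		unsigned int eax, ebx, ecx, edx, lvl, type;
 *
 *		for (lvl = 0; ; lvl++) {
 *			__cpuid_count(0xb, lvl, eax, ebx, ecx, edx);
 *			type = (ecx >> 8) & 0xff;
 *			if (type == 0)
 *				break;
 *			printf("level %u: type %u, shift %u, x2apic 0x%x\n",
 *			    lvl, type, eax & 0x1f, edx);
 *		}
 *		return (0);
 *	}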
577  *
578  * AMD Topology
579  * ------------
580  *
581  * When discussing AMD topology, we want to break this into three distinct
582  * generations of topology. There's the basic topology that has been used in
583  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
584  * with family 0x15 (Bulldozer), and there's the topology that was introduced
585  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
586  * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
587  * additional terminology that's worth talking about.
588  *
589  * Until the introduction of family 0x17 (Zen), AMD did not implement something
590  * that they considered SMT. Whether or not the AMD processors have SMT
591  * influences many things including scheduling and reliability, availability,
592  * and serviceability (RAS) features.
593  *
594  * NODE
595  *
596  *	AMD uses the term node to refer to a die that contains a number of cores
597  *	and I/O resources. Depending on the processor family and model, more
598  *	than one node can be present in the package. When there is more than one
599  *	node this indicates a multi-chip module. Usually each node has its own
600  *	access to memory and I/O devices. This is important and generally
601  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
602  *	result, we track this relationship in the operating system.
603  *
604  *	In processors with an L3 cache, the L3 cache is generally shared across
605  *	the entire node, though the way this is carved up varies from generation
606  *	to generation.
607  *
608  * BULLDOZER
609  *
610  *	Starting with the Bulldozer family (0x15) and continuing until the
611  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
612  *	compute unit. In a compute unit, two traditional cores share a number of
613  *	hardware resources. Critically, they share the FPU, L1 instruction
614  *	cache, and the L2 cache. Several compute units were then combined inside
615  *	of a single node.  Because the integer execution units, L1 data cache,
616  *	and some other resources were not shared between the cores, AMD never
617  *	considered this to be SMT.
618  *
619  * ZEN
620  *
621  *	The Zen family (0x17) uses a multi-chip module (MCM) design; the module
622  *	is called Zeppelin. These modules are similar to the idea of nodes used
623  *	previously. Each of these nodes has two DRAM channels which all of the
624  *	cores in the node can access uniformly. These nodes are linked together
625  *	in the package, creating a NUMA environment.
626  *
627  *	The Zeppelin die itself contains two different 'core complexes'. Each
628  *	core complex consists of four cores which each have two threads, for a
629  *	total of 8 logical CPUs per complex. Unlike other generations,
630  *	where all the logical CPUs in a given node share the L3 cache, here each
631  *	core complex has its own shared L3 cache.
632  *
633  *	A further thing that we need to consider is that in some configurations,
634  *	particularly with the Threadripper line of processors, not every die
635  *	actually has its memory controllers wired up to actual memory channels.
636  *	This means that some cores have memory attached to them and others
637  *	don't.
638  *
639  *	To put Zen in perspective, consider the following images:
640  *
641  *      +--------------------------------------------------------+
642  *      | Core Complex                                           |
643  *      | +-------------------+    +-------------------+  +---+  |
644  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
645  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
646  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
647  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
648  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
649  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
650  *      | +-------------------+    +-------------------+  | C |  |
651  *      | +-------------------+    +-------------------+  | a |  |
652  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
653  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
654  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
655  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
656  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
657  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
658  *      | +-------------------+    +-------------------+  +---+  |
659  *      |                                                        |
660  *	+--------------------------------------------------------+
661  *
662  *  This first image represents a single Zen core complex that consists of four
663  *  cores.
664  *
665  *
666  *	+--------------------------------------------------------+
667  *	| Zeppelin Die                                           |
668  *	|  +--------------------------------------------------+  |
669  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
670  *	|  +--------------------------------------------------+  |
671  *      |                           HH                           |
672  *	|          +-----------+    HH    +-----------+          |
673  *	|          |           |    HH    |           |          |
674  *	|          |    Core   |==========|    Core   |          |
675  *	|          |  Complex  |==========|  Complex  |          |
676  *	|          |           |    HH    |           |          |
677  *	|          +-----------+    HH    +-----------+          |
678  *      |                           HH                           |
679  *	|  +--------------------------------------------------+  |
680  *	|  |                Memory Controller                 |  |
681  *	|  +--------------------------------------------------+  |
682  *      |                                                        |
683  *	+--------------------------------------------------------+
684  *
685  *  This image represents a single Zeppelin Die. Note how both core complexes are
686  *  connected to the same memory controller and I/O units. While each core
687  *  complex has its own L3 cache as seen in the first image, they both have
688  *  uniform access to memory.
689  *
690  *
691  *                      PP                     PP
692  *                      PP                     PP
693  *           +----------PP---------------------PP---------+
694  *           |          PP                     PP         |
695  *           |    +-----------+          +-----------+    |
696  *           |    |           |          |           |    |
697  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
698  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
699  *           |    |           |          |           |    |
700  *           |    +-----------+ooo    ...+-----------+    |
701  *           |          HH      ooo  ...       HH         |
702  *           |          HH        oo..         HH         |
703  *           |          HH        ..oo         HH         |
704  *           |          HH      ...  ooo       HH         |
705  *           |    +-----------+...    ooo+-----------+    |
706  *           |    |           |          |           |    |
707  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
708  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
709  *           |    |           |          |           |    |
710  *           |    +-----------+          +-----------+    |
711  *           |          PP                     PP         |
712  *           +----------PP---------------------PP---------+
713  *                      PP                     PP
714  *                      PP                     PP
715  *
716  *  This image represents a single Zen package. In this example, it has four
717  *  Zeppelin dies, though some configurations only have a single one. In this
718  *  example, each die is directly connected to the next. Also, each die is
719  *  represented as being connected to memory by the 'M' character and connected
720  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
721  *  die is made up of two core complexes, we have multiple different NUMA
722  *  domains that we care about for these systems.
723  *
724  * ZEN 2
725  *
726  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
727  *	each Zeppelin die contained its own I/O units and memory controller, those have
728  *	been moved out into a separate I/O die in Zen 2. The actual core complex looks pretty similar, but
729  *	now the die actually looks much simpler:
730  *
731  *      +--------------------------------------------------------+
732  *      | Zen 2 Core Complex Die    HH                           |
733  *      |                           HH                           |
734  *      |          +-----------+    HH    +-----------+          |
735  *      |          |           |    HH    |           |          |
736  *      |          |    Core   |==========|    Core   |          |
737  *      |          |  Complex  |==========|  Complex  |          |
738  *      |          |           |    HH    |           |          |
739  *      |          +-----------+    HH    +-----------+          |
740  *      |                           HH                           |
741  *      |                           HH                           |
742  *      +--------------------------------------------------------+
743  *
744  *	From here, when we add the central I/O die, this changes things a bit.
745  *	Each die is connected to the I/O die, rather than trying to interconnect
746  *	them directly. The following image takes the same Zen 1 image that we
747  *	had earlier and shows what it looks like with the I/O die instead:
748  *
749  *                                 PP    PP
750  *                                 PP    PP
751  *           +---------------------PP----PP---------------------+
752  *           |                     PP    PP                     |
753  *           |  +-----------+      PP    PP      +-----------+  |
754  *           |  |           |      PP    PP      |           |  |
755  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
756  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
757  *           |  |         |o|oooo|          |oooo|o|         |  |
758  *           |  +-----------+    |          |    +-----------+  |
759  *           |                   |   I/O    |                   |
760  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
761  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
762  *           |                   |          |                   |
763  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
764  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
765  *           |                   |          |                   |
766  *           |  +-----------+    |          |    +-----------+  |
767  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
768  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
769  *           |  |    Die    |      PP    PP      |    Die    |  |
770  *           |  |           |      PP    PP      |           |  |
771  *           |  +-----------+      PP    PP      +-----------+  |
772  *           |                     PP    PP                     |
773  *           +---------------------PP----PP---------------------+
774  *                                 PP    PP
775  *                                 PP    PP
776  *
777  *	The above has four core complex dies installed, though the Zen 2 EPYC
778  *	and Threadripper parts allow for up to eight, while the Ryzen parts
779  *	generally only have one to two. The more notable difference here is how
780  *	everything communicates. Note that memory and PCIe come out of the
781  *	central die. This changes the way that one die accesses a resource. It
782  *	basically always has to go to the I/O die, whereas in Zen 1 it may have
783  *	satisfied it locally. In general, this ends up being a better strategy
784  *	for most things, though it is possible to still treat everything as four
785  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
786  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
787  *	now there is only one 'node' present.
788  *
789  * ZEN 3
790  *
791  *	From an architectural perspective, Zen 3 is a much smaller change from
792  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
793  *	its microarchitectural changes. The biggest thing for us is how the die
794  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
795  *	cache. However, in Zen 3, the L3 is now shared between the entire core
796  *	complex die and is no longer partitioned between each core complex. This
797  *	means that all cores on the die can share the same L3 cache. Otherwise,
798  *	the general layout of the overall package with various core complexes
799  *	and an I/O die stays the same. Here's what the Core Complex Die looks
800  *	like in a bit more detail:
801  *
802  *               +-------------------------------------------------+
803  *               | Zen 3 Core Complex Die                          |
804  *               | +-------------------+    +-------------------+  |
805  *               | | Core       +----+ |    | Core       +----+ |  |
806  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
807  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
808  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
809  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
810  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
811  *               | +-------------------+    +-------------------+  |
812  *               | +-------------------+    +-------------------+  |
813  *               | | Core       +----+ |    | Core       +----+ |  |
814  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
815  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
816  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
817  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
818  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
819  *               | +-------------------+    +-------------------+  |
820  *               |                                                 |
821  *               | +--------------------------------------------+  |
822  *               | |                 L3 Cache                   |  |
823  *               | +--------------------------------------------+  |
824  *               |                                                 |
825  *               | +-------------------+    +-------------------+  |
826  *               | | Core       +----+ |    | Core       +----+ |  |
827  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
828  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
829  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
830  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
831  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
832  *               | +-------------------+    +-------------------+  |
833  *               | +-------------------+    +-------------------+  |
834  *               | | Core       +----+ |    | Core       +----+ |  |
835  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
836  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
837  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
838  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
839  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
840  *               | +-------------------+    +-------------------+  |
841  *               +-------------------------------------------------+
842  *
843  *	While it is not pictured, there are connections from the die to the
844  *	broader data fabric and additional functional blocks to support that
845  *	communication and coherency.
846  *
847  * CPUID LEAVES
848  *
849  * There are a few different CPUID leaves that we can use to try and understand
850  * the actual state of the world. As part of the introduction of family 0xf, AMD
851  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
852  * processors that are in the package. Because families before Zen didn't have
853  * SMT, this was always the number of cores that were in the package. However, it
854  * should always be thought of as the number of logical threads to be consistent
855  * between generations. In addition we also get the size of the APIC ID that is
856  * used to represent the number of logical processors. This is important for
857  * deriving topology information.
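 *
 * A hedged sketch of reading leaf 0x80000008 from userland (bit positions
 * here follow AMD's documentation as described above and should be treated
 * as illustrative): %ecx[7:0] is the thread count minus one and %ecx[15:12]
 * is the number of APIC ID bits used to represent them within a package.
 *
 *	#include <stdio.h>
 *	#include <cpuid.h>
 *
 *	int
 *	main(void)
 *	{
 *		unsigned int eax, ebx, ecx, edx;
 *
 *		__cpuid(0x80000008, eax, ebx, ecx, edx);
 *		printf("logical processors: %u, APIC ID size: %u bits\n",
 *		    (ecx & 0xff) + 1, (ecx >> 12) & 0xf);
 *		return (0);
 *	}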
858  *
859  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
860  * bit between Bulldozer and later families, but it is quite useful in
861  * determining the topology information. Because this information has changed
862  * across family generations, it's worth calling out what these mean
863  * explicitly. The registers have the following meanings:
864  *
865  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
866  *		APIC ID, even though on systems without x2apic support, it will
867  *		be limited to 8 bits.
868  *
869  *	%ebx	On Bulldozer-era systems this contains information about the
870  *		number of cores that are in a compute unit (cores that share
871  *		resources). It also contains a per-package compute unit ID that
872  *		identifies which compute unit the logical CPU is a part of.
873  *
874  *		On Zen-era systems this instead contains the number of threads
875  *		per core and the ID of the core that the logical CPU is a part
876  *		of. Note, this ID is unique only to the package, it is not
877  *		globally unique across the entire system.
878  *
879  *	%ecx	This contains the number of nodes that exist in the package. It
880  *		also contains an ID that identifies which node the logical CPU
881  *		is a part of.
882  *
883  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
884  * cache layout to determine which logical CPUs are sharing which caches.
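 *
 * To make the register descriptions above concrete, here is a hedged sketch
 * of decoding leaf 0x8000001E on a Zen-era part (bit positions per AMD's
 * documentation as described above; Bulldozer-era parts report compute unit
 * information in %ebx instead):
 *
 *	#include <stdio.h>
 *	#include <cpuid.h>
 *
 *	int
 *	main(void)
 *	{
 *		unsigned int eax, ebx, ecx, edx;
 *
 *		__cpuid(0x8000001e, eax, ebx, ecx, edx);
 *		printf("APIC id: 0x%x\n", eax);
 *		printf("core id: %u, threads per core: %u\n",
 *		    ebx & 0xff, ((ebx >> 8) & 0xff) + 1);
 *		printf("node id: %u, nodes per package: %u\n",
 *		    ecx & 0xff, ((ecx >> 8) & 0x7) + 1);
 *		return (0);
 *	}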
885  *
886  * illumos Topology
887  * ----------------
888  *
889  * Based on the above we synthesize the information into several different
890  * variables that we store in the 'struct cpuid_info'. We'll go into the details
891  * of what each member is supposed to represent and their uniqueness. In
892  * general, there are two levels of uniqueness that we care about. We care about
893  * an ID that is globally unique. That means that it will be unique across all
894  * entities in the system. For example, the default logical CPU ID is globally
895  * unique. On the other hand, there is some information that we only care about
896  * being unique within the context of a single package / socket. Here are the
897  * variables that we keep track of and their meaning.
898  *
899  * Several of the values that serve as identifiers, with the exception
900  * of cpi_apicid, are allowed to be synthetic.
901  *
902  *
903  * cpi_apicid
904  *
905  *	This is the value of the CPU's APIC id. This should be the full 32-bit
906  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
907  *	APIC ID. This value is globally unique between all logical CPUs across
908  *	all packages. This is usually required by the APIC.
909  *
910  * cpi_chipid
911  *
912  *	This value indicates the ID of the package that the logical CPU is a
913  *	part of. This value is allowed to be synthetic. It is usually derived by
914  *	taking the CPU's APIC ID and determining how many bits are used to
915  *	represent CPU cores in the package. All logical CPUs that are part of
916  *	the same package must have the same value.
917  *
918  * cpi_coreid
919  *
920  *	This represents the ID of a CPU core. Two logical CPUs should only have
921  *	the same cpi_coreid value if they are part of the same core. These
922  *	values may be synthetic. On systems that support SMT, this value is
923  *	usually derived from the APIC ID, otherwise it is often synthetic and
924  *	just set to the value of the cpu_id in the cpu_t.
925  *
926  * cpi_pkgcoreid
927  *
928  *	This is similar to the cpi_coreid in that logical CPUs that are part of
929  *	the same core should have the same ID. The main difference is that these
930  *	values are only required to be unique to a given socket.
931  *
932  * cpi_clogid
933  *
934  *	This represents the logical ID of a logical CPU. This value should be
935  *	unique within a given socket for each logical CPU. This is allowed to be
936  *	synthetic, though it is usually based off of the CPU's APIC ID. The
937  *	broader system expects that logical CPUs that are part of the same
938  *	core have contiguous numbers. For example, if there were two threads per
939  *	core, then the two siblings' IDs divided by two should be equal, the first
940  *	ID modulo two should be zero, and the second should be one. Thus, IDs 4 and 5
941  *	indicate two logical CPUs that are part of the same core, but IDs 5 and
942  *	6 represent two logical CPUs that are part of different cores.
943  *
944  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
945  *	from the same source, strictly speaking, they don't have to be and the
946  *	two values should be considered logically independent. One should not
947  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
948  *	some kind of relationship. While this is tempting, we've seen cases on
949  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
950  *
951  * cpi_ncpu_per_chip
952  *
953  *	This value indicates the total number of logical CPUs that exist in the
954  *	physical package. Critically, this is not the number of logical CPUs
955  *	that exist for just the single core.
956  *
957  *	This value should be the same for all logical CPUs in the same package.
958  *
959  * cpi_ncore_per_chip
960  *
961  *	This value indicates the total number of physical CPU cores that exist
962  *	in the package. The system compares this value with cpi_ncpu_per_chip to
963  *	determine if simultaneous multi-threading (SMT) is enabled. When
964  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
965  *	the X86FSET_HTT feature is not set. If this value is greater than one,
966  *	then we consider the processor to have the feature X86FSET_CMP, to
967  *	indicate that there is support for more than one core.
968  *
969  *	This value should be the same for all logical CPUs in the same package.
970  *
971  * cpi_procnodes_per_pkg
972  *
973  *	This value indicates the number of 'nodes' that exist in the package.
974  *	When processors are actually a multi-chip module, this represents the
975  *	number of such modules that exist in the package. Currently, on Intel
976  *	based systems this member is always set to 1.
977  *
978  *	This value should be the same for all logical CPUs in the same package.
979  *
980  * cpi_procnodeid
981  *
982  *	This value indicates the ID of the node that the logical CPU is a part
983  *	of. All logical CPUs that are in the same node must have the same value
984  *	here. This value must be unique across all of the packages in the
985  *	system.  On Intel based systems, this is currently set to the value in
986  *	cpi_chipid because there is only one node.
987  *
988  * cpi_cores_per_compunit
989  *
990  *	This value indicates the number of cores that are part of a compute
991  *	unit. See the AMD topology section for this. This member only has real
992  *	meaning currently for AMD Bulldozer family processors. For all other
993  *	processors, this should currently be set to 1.
994  *
995  * cpi_compunitid
996  *
997  *	This indicates the compute unit that the logical CPU belongs to. For
998  *	processors without AMD Bulldozer-style compute units this should be set
999  *	to the value of cpi_coreid.
1000  *
1001  * cpi_ncpu_shr_last_cache
1002  *
1003  *	This indicates the number of logical CPUs that are sharing the same last
1004  *	level cache. This value should be the same for all CPUs that are sharing
1005  *	that cache. The last cache refers to the cache that is closest to memory
1006  *	and furthest away from the CPU.
1007  *
1008  * cpi_last_lvl_cacheid
1009  *
1010  *	This indicates the ID of the last cache that the logical CPU uses. This
1011  *	cache is often shared between multiple logical CPUs and is the cache
1012  *	that is closest to memory and furthest away from the CPU. This value
1013  *	should be the same for a group of logical CPUs only if they actually
1014  *	share the same last level cache. IDs should not overlap between
1015  *	packages.
1016  *
1017  * cpi_ncore_bits
1018  *
1019  *	This indicates the number of bits that are required to represent all of
1020  *	the cores in the system. As cores are derived based on their APIC IDs,
1021  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1022  *	this value to be larger than the actual number of IDs that are present
1023  *	in the system. This is used to size tables by the CMI framework. It is
1024  *	only filled in for Intel and AMD CPUs.
1025  *
1026  * cpi_nthread_bits
1027  *
1028  *	This indicates the number of bits required to represent all of the IDs
1029  *	that cover the logical CPUs that exist on a given core. It's OK for this
1030  *	value to be larger than the actual number of IDs that are present in the
1031  *	system.  This is used to size tables by the CMI framework. It is
1032  *	only filled in for Intel and AMD CPUs.
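 *
 *	As a hedged illustration only (not the code used later in this file),
 *	such a width could be derived from the largest ID value in use:
 *
 *		uint_t nbits = 0;
 *
 *		while ((max_id >> nbits) != 0)
 *			nbits++;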
1033  *
1034  * -----------
1035  * Hypervisors
1036  * -----------
1037  *
1038  * If trying to manage the differences between vendors wasn't bad enough, it can
1039  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1040  * the ability to interpose on all cpuid instructions and change them to suit
1041  * their purposes. In general, this is necessary as the hypervisor wants to be
1042  * able to present a more uniform set of features or not necessarily give the
1043  * guest operating system kernel knowledge of all features so it can be
1044  * more easily migrated between systems.
1045  *
1046  * When it comes to trying to determine topology information, this can be a
1047  * double edged sword. When a hypervisor doesn't actually implement a cpuid
1048  * leaf, it'll often return all zeros. Because of that, you'll often see various
1049  * checks scattered about that verify fields are non-zero before we assume we
1050  * can use them.
1051  *
1052  * When it comes to topology information, the hypervisor is often incentivized
1053  * to lie to you about topology. This is because it doesn't always actually
1054  * guarantee that topology at all. The topology path we take in the system
1055  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1056  * or AMD CPU, then we basically do our normal path. However, when they don't
1057  * use an actual vendor, we usually end up enumerating multiple one-core CPUs
1058  * that often appear to be on different sockets. The actual behavior
1059  * depends greatly on what the hypervisor actually exposes to us.
1060  *
1061  * --------------------
1062  * Exposing Information
1063  * --------------------
1064  *
1065  * We expose CPUID information in three different forms in the system.
1066  *
1067  * The first is through the x86_featureset variable. This is used in conjunction
1068  * with the is_x86_feature() function. This is queried by x86-specific functions
1069  * to determine which features are or aren't present in the system and to make
1070  * decisions based upon them. For example, users of this include everything from
1071  * parts of the system dedicated to reliability, availability, and
1072  * serviceability (RAS), to making decisions about how to handle security
1073  * mitigations, to various x86-specific drivers. General purpose or
1074  * architecture independent drivers should never be calling this function.
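 *
 * As a sketch of the first form (a hypothetical consumer rather than any
 * specific piece of kernel code), an x86-specific subsystem might do:
 *
 *	boolean_t has_avx = is_x86_feature(x86_featureset, X86FSET_AVX);
 *
 * and then select an implementation based on the result.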
1075  *
1076  * The second means is through the auxiliary vector. The auxiliary vector is a
1077  * series of tagged data that the kernel passes down to a user program when it
1078  * begins executing. This information is used to indicate to programs what
1079  * instruction set extensions are present. For example, information about the
1080  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1081  * since user programs cannot make use of it. However, things like the AVX
1082  * instruction sets are. Programs use this information to make run-time
1083  * decisions about what features they should use. As an example, the run-time
1084  * link-editor (rtld) can relocate different functions depending on the hardware
1085  * support available.
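 *
 * As a userland-side sketch (assuming the AV_386_AVX flag from
 * <sys/auxv_386.h>; error handling omitted), a program can query these bits
 * through getisax(3C):
 *
 *	uint32_t hwcap[2] = { 0, 0 };
 *	boolean_t has_avx;
 *
 *	(void) getisax(hwcap, 2);
 *	has_avx = (hwcap[0] & AV_386_AVX) != 0;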
1086  *
1087  * The final form is through a series of accessor functions that all have the
1088  * form cpuid_get*. This is used by a number of different subsystems in the
1089  * kernel to determine more detailed information about what we're running on,
1090  * topology information, etc. Some of these subsystems include processor groups
1091  * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1092  * microcode, and performance monitoring. These functions all ASSERT that the
1093  * CPU they're being called on has reached a certain cpuid pass. If the passes
1094  * are rearranged, then this needs to be adjusted.
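 *
 * For instance, once the relevant pass has completed, a topology-aware
 * consumer might do something like the following (a sketch; see the accessor
 * definitions later in this file for the exact set and signatures):
 *
 *	chipid_t chip = cpuid_get_chipid(CPU);
 *	id_t core = cpuid_get_coreid(CPU);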
1095  *
1096  * -----------------------------------------------
1097  * Speculative Execution CPU Side Channel Security
1098  * -----------------------------------------------
1099  *
1100  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1101  * execution in the CPU to create side channels there have been a number of
1102  * different attacks and corresponding issues that the operating system needs to
1103  * mitigate against. The following is a common, though not exhaustive, list of
1104  * issues that we know about and for which we have done some work, or still
1105  * need to do more work, in the system to mitigate:
1106  *
1107  *   - Spectre v1
1108  *   - swapgs (Spectre v1 variant)
1109  *   - Spectre v2
1110  *   - Meltdown (Spectre v3)
1111  *   - Rogue Register Read (Spectre v3a)
1112  *   - Speculative Store Bypass (Spectre v4)
1113  *   - ret2spec, SpectreRSB
1114  *   - L1 Terminal Fault (L1TF)
1115  *   - Microarchitectural Data Sampling (MDS)
1116  *
1117  * Each of these requires different sets of mitigations and has different attack
1118  * surfaces. For the most part, this discussion is about protecting the kernel
1119  * from non-kernel executing environments such as user processes and hardware
1120  * virtual machines. Unfortunately, there are a number of user vs. user
1121  * scenarios that exist with these. The rest of this section will describe the
1122  * overall approach that the system has taken to address these as well as their
1123  * shortcomings. Unfortunately, not all of the above have been handled today.
1124  *
1125  * SPECTRE v2, ret2spec, SpectreRSB
1126  *
1127  * The second variant of the spectre attack focuses on performing branch target
1128  * injection. This generally impacts indirect call instructions in the system.
1129  * There are four different ways to mitigate this issue that are commonly
1130  * described today:
1131  *
1132  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1133  *  2. Using Retpolines and RSB Stuffing
1134  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1135  *  4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1136  *
1137  * IBRS uses a feature added to microcode to restrict speculation, among other
1138  * things. This form of mitigation has not been used as it has been generally
1139  * seen as too expensive and requires reactivation upon various transitions in
1140  * the system.
1141  *
1142  * As a less impactful alternative to IBRS, retpolines were developed by
1143  * Google. These basically require one to replace indirect calls with a specific
1144  * trampoline that will cause speculation to fail and break the attack.
1145  * Retpolines require compiler support. We always build with retpolines in the
1146  * external thunk mode. This means that a traditional indirect call is replaced
1147  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1148  * of this is that all indirect function calls are performed through a register.
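 *
 * Conceptually (the precise instruction sequences live in the thunks
 * themselves), an indirect call that would traditionally be emitted as
 *
 *	call	*%rax
 *
 * is instead emitted as
 *
 *	call	__x86_indirect_thunk_rax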
1149  *
1150  * We have to use a common external location of the thunk and not inline it into
1151  * the callsite so that way we can have a single place to patch these functions.
1152  * As it turns out, we currently have two different forms of retpolines that
1153  * exist in the system:
1154  *
1155  *  1. A full retpoline
1156  *  2. A no-op version
1157  *
1158  * The first one is used in the general case. Historically, there was an
1159  * AMD-specific optimized retpoline variant that was based around using a
1160  * serializing lfence instruction; however, in March 2022 it was announced that
1161  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1162  * use it and it is no longer available in the system.
1163  *
1164  * The third mitigation listed above, eIBRS, is the most curious. The way
1165  * that retpolines are implemented is that they rely on how speculation is
1166  * performed on a 'ret' instruction. Intel has continued to optimize this
1167  * process (which is partly why we need to have return stack buffer stuffing,
1168  * but more on that in a bit) and in processors starting with Cascade Lake
1169  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1170  * mechanism has been introduced called Enhanced IBRS (eIBRS).
1171  *
1172  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1173  * physical core. However, if this is the case, we don't want to use retpolines
1174  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1175  * function (called a thunk) into a jmp instruction. This means that we're still
1176  * paying the cost of an extra jump to the external thunk, but it gives us
1177  * flexibility and the ability to have a single kernel image that works across a
1178  * wide variety of systems and hardware features.
1179  *
1180  * Unfortunately, this alone is insufficient. First, Skylake systems have
1181  * additional speculation for the Return Stack Buffer (RSB) which is used to
1182  * return from call instructions which retpolines take advantage of. However,
1183  * this problem is not just limited to Skylake and is actually more pernicious.
1184  * The SpectreRSB paper introduces several more problems that can arise with
1185  * dealing with this. The RSB can be poisoned just like the indirect branch
1186  * predictor. This means that one needs to clear the RSB when transitioning
1187  * between two different privilege domains. Some examples include:
1188  *
1189  *  - Switching between two different user processes
1190  *  - Going between user land and the kernel
1191  *  - Returning to the kernel from a hardware virtual machine
1192  *
1193  * Mitigating this involves combining a couple of different things. The first is
1194  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1195  * Bridge. When an RSB entry refers to a user address and we're executing in the
1196  * kernel, speculation through it will be stopped when SMEP is enabled. This
1197  * protects against a number of the different cases that we would normally be
1198  * worried about such as when we enter the kernel from user land.
1199  *
1200  * To prevent against additional manipulation of the RSB from other contexts
1201  * such as a non-root VMX context attacking the kernel we first look to
1202  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1203  * nothing else that we need to do to protect the kernel at this time.
1204  *
1205  * Unfortunately, eIBRS or not, we need to manually overwrite the contents of
1206  * the return stack buffer. We do this through the x86_rsb_stuff() function.
1207  * Currently this is employed on context switch and vmx_exit. The
1208  * x86_rsb_stuff() function is disabled only when mitigations in general are.
1209  *
1210  * If SMEP is not present, then we would have to stuff the RSB every time we
1211  * transitioned from user mode to the kernel, which isn't very practical right
1212  * now.
1213  *
1214  * To fully protect against user-to-user and vmx-to-vmx attacks of these
1215  * classes, we would also need to allow them to opt into performing an Indirect
1216  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1217  *
1218  * The fourth form of mitigation here is specific to AMD and is called Automated
1219  * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1220  * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1221  * (extended feature enable register) MSR. This bit basically says that IBRS
1222  * acts as though it is always active when executing at CPL0 and when executing
1223  * in the 'host' context when SEV-SNP is enabled.
1224  *
1225  * When this is active, AMD states that the RSB is cleared on VMEXIT and
1226  * therefore stuffing is unnecessary. While this handles RSB attacks from SVM
1227  * to the kernel, we must still consider the remaining cases that exist, just
1228  * like above. While traditionally AMD employed a 32 entry RSB allowing the
1229  * traditional technique to work, this is not true on all CPUs. While a write to
1230  * IBRS would clear the RSB if the processor supports more than 32 entries (but
1231  * not otherwise), AMD states that as long as at least a single 4 KiB unmapped
1232  * guard page is present between user and kernel address spaces and SMEP is
1233  * enabled, then there is no need to clear the RSB at all.
1234  *
1235  * By default, the system will enable RSB stuffing and the required variant of
1236  * retpolines and store that information in the x86_spectrev2_mitigation value.
1237  * This will be evaluated after a microcode update as well, though it is
1238  * expected that microcode updates will not take away features. This may mean
1239  * that a late loaded microcode may not end up in the optimal configuration
1240  * (though this should be rare).
1241  *
1242  * Currently we do not build kmdb with retpolines or perform any additional side
1243  * channel security mitigations for it. One complication with kmdb is that it
1244  * requires its own retpoline thunks and it would need to adjust itself based on
1245  * what the kernel does. The threat model of kmdb is more limited and therefore
1246  * it may make more sense to investigate using prediction barriers as the whole
1247  * system is only executing a single instruction at a time while in kmdb.
1248  *
1249  * SPECTRE v1, v4
1250  *
1251  * The v1 and v4 variants of spectre are not currently mitigated in the
1252  * system and require other classes of changes to occur in the code.
1253  *
1254  * SPECTRE v1 (SWAPGS VARIANT)
1255  *
1256  * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1257  * can generally affect any branch-dependent code. The swapgs issue is one
1258  * variant of this. If we are coming in from userspace, we can have code like
1259  * this:
1260  *
1261  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1262  *	je	1f
1263  *	movq	$0, REGOFF_SAVFP(%rsp)
1264  *	swapgs
1265  *	1:
1266  *	movq	%gs:CPU_THREAD, %rax
1267  *
1268  * If an attacker can cause a mis-speculation of the branch here, we could skip
1269  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1270  * load. If subsequent code can act as the usual Spectre cache gadget, this
1271  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1272  * any use of the %gs override.
1273  *
1274  * The other case is also an issue: if we're coming into a trap from kernel
1275  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1276  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1277  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1278  * case, and the fix is the same in both cases (an lfence at the branch target
1279  * 1: in this example), we'll just do it unconditionally.
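 *
 * With that mitigation applied, the example above conceptually becomes:
 *
 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
 *	je	1f
 *	movq	$0, REGOFF_SAVFP(%rsp)
 *	swapgs
 *	1:
 *	lfence
 *	movq	%gs:CPU_THREAD, %rax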
1280  *
1281  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1282  * harder for user-space to actually set a useful %gsbase value: although it's
1283  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1284  * mitigate anyway.
1285  *
1286  * MELTDOWN
1287  *
1288  * Meltdown, or spectre v3, allowed a user process to read any data in their
1289  * address space regardless of whether or not the page tables in question
1290  * allowed the user to have the ability to read them. The solution to meltdown
1291  * is kernel page table isolation. In this world, there are two page tables that
1292  * are used for a process, one in user land and one in the kernel. To implement
1293  * this we use per-CPU page tables and switch between the user and kernel
1294  * variants when entering and exiting the kernel.  For more information about
1295  * this process and how the trampolines work, please see the big theory
1296  * statements and additional comments in:
1297  *
1298  *  - uts/i86pc/ml/kpti_trampolines.s
1299  *  - uts/i86pc/vm/hat_i86.c
1300  *
1301  * While Meltdown only impacted Intel systems, and there are also Intel systems
1302  * that have Meltdown (Intel's 'Rogue Data Cache Load') fixed, we always have
1303  * kernel page table isolation enabled. While this may at first seem weird, an
1304  * important thing to remember is that you can't speculatively read an address
1305  * if it's never in your page table at all. Having user processes without kernel
1306  * pages present provides us with an important layer of defense in the kernel
1307  * against any other side channel attacks that exist and have yet to be
1308  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1309  * default, no matter the x86 system.
1310  *
1311  * L1 TERMINAL FAULT
1312  *
1313  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1314  * execution uses page table entries. Effectively, it is two different problems.
1315  * The first is that it ignores the not present bit in the page table entries
1316  * when performing speculative execution. This means that something can
1317  * speculatively read the listed physical address if it's present in the L1
1318  * cache under certain conditions (see Intel's documentation for the full set of
1319  * conditions). Secondly, this can be used to bypass hardware virtualization
1320  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1321  * instructions.
1322  *
1323  * For the non-hardware virtualized case, this is relatively easy to deal with.
1324  * We must make sure that all unmapped pages have an address of zero. This means
1325  * that they could read the first 4k of physical memory; however, we never use
1326  * that first page in the operating system and always skip putting it in our
1327  * memory map, even if firmware tells us we can use it in our memory map. While
1328  * other systems try to put extra metadata in the address and reserved bits,
1329  * which led to this being problematic in those cases, we do not.
1330  *
1331  * For hardware virtual machines things are more complicated. Because they can
1332  * construct their own page tables, it isn't hard for them to perform this
1333  * attack against any physical address. The one wrinkle is that this physical
1334  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1335  * to flush the L1 data cache. We wrap this up in the function
1336  * spec_uarch_flush(). This function is also used in the mitigation of
1337  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1338  * hypervisors such as KVM or bhyve are responsible for performing this before
1339  * entering the guest.
1340  *
1341  * Because this attack takes place in the L1 cache, there's another wrinkle
1342  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1343  * designs. This means that when a thread enters a hardware virtualized context
1344  * and flushes the L1 data cache, the other thread on the processor may then go
1345  * ahead and put new data in it that can be potentially attacked. While one
1346  * solution is to disable SMT on the system, another option that is available is
1347  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1348  * goes through and makes sure that if an HVM is being scheduled on one thread,
1349  * then what is on the other thread is from the same hardware virtual machine.
1350  * If an interrupt comes in or the guest exits to the broader system, then the
1351  * other SMT thread will be kicked out.
1352  *
1353  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1354  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1355  * perform L1TF related mitigations.
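 *
 * Expressed as a predicate (a sketch in terms of the feature set bits
 * described in this file, not the literal mitigation code):
 *
 *	boolean_t need_l1tf_mitigation =
 *	    !is_x86_feature(x86_featureset, X86FSET_RDCL_NO);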
1356  *
1357  * MICROARCHITECTURAL DATA SAMPLING
1358  *
1359  * Microarchitectural data sampling (MDS) is a combination of four discrete
1360  * vulnerabilities that are similar issues affecting various parts of the CPU's
1361  * microarchitectural implementation around load, store, and fill buffers.
1362  * Specifically it is made up of the following subcomponents:
1363  *
1364  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1365  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1366  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1367  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1368  *
1369  * To begin addressing these, Intel has introduced another feature in microcode
1370  * called MD_CLEAR. This changes the verw instruction to operate in a different
1371  * way. This allows us to execute the verw instruction in a particular way to
1372  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1373  * updated when this microcode is present to flush this state.
1374  *
1375  * Primarily we need to flush this state whenever we transition from the kernel
1376  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1377  * little bit different. Here the structures are statically sized when a logical
1378  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1379  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1380  * mwait, or another ACPI method. To perform these flushes, we call
1381  * x86_md_clear() at all of these transition points.
1382  *
1383  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1384  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1385  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1386  * a no-op.
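 *
 * Expressed as a predicate (again a sketch in terms of the feature set bits,
 * not the kernel's actual startup patching logic):
 *
 *	boolean_t verw_flush_useful =
 *	    is_x86_feature(x86_featureset, X86FSET_MD_CLEAR) &&
 *	    !is_x86_feature(x86_featureset, X86FSET_MDS_NO);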
1387  *
1388  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1389  * particular, everything we've discussed above is only valid for a single
1390  * thread executing on a core. In the case where you have hyper-threading
1391  * present, this attack can be performed between threads. The theoretical fix
1392  * for this is to ensure that both threads are always in the same security
1393  * domain. This means that they are executing in the same ring and mutually
1394  * trust each other. Practically speaking, this would mean that a system call
1395  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1396  * Rather than implement this, we recommend that one disables hyper-threading
1397  * through the use of psradm -aS.
1398  *
1399  * TSX ASYNCHRONOUS ABORT
1400  *
1401  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1402  * behaves like MDS, but leverages Intel's transactional instructions as another
1403  * vector. Effectively, when a transaction hits one of these cases (unmapped
1404  * page, various cache snoop activity, etc.) then the same data can be exposed
1405  * as in the case of MDS. This means that you can attack your twin.
1406  *
1407  * Intel has described that there are two different ways that we can mitigate
1408  * this problem on affected processors:
1409  *
1410  *   1) We can use the same techniques used to deal with MDS. Flushing the
1411  *      microarchitectural buffers and disabling hyperthreading will mitigate
1412  *      this in the same way.
1413  *
1414  *   2) Using microcode to disable TSX.
1415  *
1416  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1417  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1418  * That's OK as we're already doing all such mitigations. On the other hand,
1419  * processors with MDS_NO are all supposed to receive microcode updates that
1420  * enumerate support for disabling TSX. In general, we'd rather use this method
1421  * when available as it doesn't require disabling hyperthreading to be
1422  * effective. Currently we basically are relying on microcode for processors
1423  * that enumerate MDS_NO.
1424  *
1425  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1426  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1427  * different powers. The first allows us to cause all transactions to
1428  * immediately abort. The second gives us a means of disabling TSX completely,
1429  * which includes removing it from cpuid. If we have support for this in
1430  * microcode during the first cpuid pass, then we'll disable TSX completely such
1431  * that user land never has a chance to observe the bit. However, if we are late
1432  * loading the microcode, then we must use the functionality to cause
1433  * transactions to automatically abort. This is necessary for user land's sake.
1434  * Once a program sees a cpuid bit, it must not be taken away.
1435  *
1436  * We track whether or not we should do this based on what cpuid pass we're in.
1437  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1438  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1439  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1440  * second time after we do the initial microcode update.  As a result we need to
1441  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1442  * suitable microcode on the current CPU (which happens prior to
1443  * cpuid_pass_ucode()).
1444  *
1445  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1446  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1447  * unfortunate feature in a number of ways, and taking the opportunity to
1448  * finally be able to turn it off is likely to be of benefit in the future.
1449  *
1450  * SUMMARY
1451  *
1452  * The following table attempts to summarize the mitigations for various issues
1453  * and what's done in various places:
1454  *
1455  *  - Spectre v1: Not currently mitigated
1456  *  - swapgs: lfences after swapgs paths
1457  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1458  *  - Meltdown: Kernel Page Table Isolation
1459  *  - Spectre v3a: Updated CPU microcode
1460  *  - Spectre v4: Not currently mitigated
1461  *  - SpectreRSB: SMEP and RSB Stuffing
1462  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1463  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1464  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1465  *
1466  * The following table indicates the x86 feature set bits that indicate that a
1467  * given problem has been solved or a notable feature is present:
1468  *
1469  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1470  *  - MDS_NO: All forms of MDS
1471  *  - TAA_NO: TAA
1472  */
1473 
1474 #include <sys/types.h>
1475 #include <sys/archsystm.h>
1476 #include <sys/x86_archext.h>
1477 #include <sys/kmem.h>
1478 #include <sys/systm.h>
1479 #include <sys/cmn_err.h>
1480 #include <sys/sunddi.h>
1481 #include <sys/sunndi.h>
1482 #include <sys/cpuvar.h>
1483 #include <sys/processor.h>
1484 #include <sys/sysmacros.h>
1485 #include <sys/pg.h>
1486 #include <sys/fp.h>
1487 #include <sys/controlregs.h>
1488 #include <sys/bitmap.h>
1489 #include <sys/auxv_386.h>
1490 #include <sys/memnode.h>
1491 #include <sys/pci_cfgspace.h>
1492 #include <sys/comm_page.h>
1493 #include <sys/mach_mmu.h>
1494 #include <sys/ucode.h>
1495 #include <sys/tsc.h>
1496 #include <sys/kobj.h>
1497 #include <sys/asm_misc.h>
1498 
1499 #ifdef __xpv
1500 #include <sys/hypervisor.h>
1501 #else
1502 #include <sys/ontrap.h>
1503 #endif
1504 
1505 uint_t x86_vendor = X86_VENDOR_IntelClone;
1506 uint_t x86_type = X86_TYPE_OTHER;
1507 uint_t x86_clflush_size = 0;
1508 
1509 #if defined(__xpv)
1510 int x86_use_pcid = 0;
1511 int x86_use_invpcid = 0;
1512 #else
1513 int x86_use_pcid = -1;
1514 int x86_use_invpcid = -1;
1515 #endif
1516 
1517 typedef enum {
1518 	X86_SPECTREV2_RETPOLINE,
1519 	X86_SPECTREV2_ENHANCED_IBRS,
1520 	X86_SPECTREV2_AUTO_IBRS,
1521 	X86_SPECTREV2_DISABLED
1522 } x86_spectrev2_mitigation_t;
1523 
1524 uint_t x86_disable_spectrev2 = 0;
1525 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1526     X86_SPECTREV2_RETPOLINE;
1527 
1528 /*
1529  * The mitigation status for TAA:
1530  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1531  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1532  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1533  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1534  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1535  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1536  */
1537 typedef enum {
1538 	X86_TAA_NOTHING,
1539 	X86_TAA_DISABLED,
1540 	X86_TAA_MD_CLEAR,
1541 	X86_TAA_TSX_FORCE_ABORT,
1542 	X86_TAA_TSX_DISABLE,
1543 	X86_TAA_HW_MITIGATED
1544 } x86_taa_mitigation_t;
1545 
1546 uint_t x86_disable_taa = 0;
1547 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1548 
1549 uint_t pentiumpro_bug4046376;
1550 
1551 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1552 
1553 static char *x86_feature_names[NUM_X86_FEATURES] = {
1554 	"lgpg",
1555 	"tsc",
1556 	"msr",
1557 	"mtrr",
1558 	"pge",
1559 	"de",
1560 	"cmov",
1561 	"mmx",
1562 	"mca",
1563 	"pae",
1564 	"cv8",
1565 	"pat",
1566 	"sep",
1567 	"sse",
1568 	"sse2",
1569 	"htt",
1570 	"asysc",
1571 	"nx",
1572 	"sse3",
1573 	"cx16",
1574 	"cmp",
1575 	"tscp",
1576 	"mwait",
1577 	"sse4a",
1578 	"cpuid",
1579 	"ssse3",
1580 	"sse4_1",
1581 	"sse4_2",
1582 	"1gpg",
1583 	"clfsh",
1584 	"64",
1585 	"aes",
1586 	"pclmulqdq",
1587 	"xsave",
1588 	"avx",
1589 	"vmx",
1590 	"svm",
1591 	"topoext",
1592 	"f16c",
1593 	"rdrand",
1594 	"x2apic",
1595 	"avx2",
1596 	"bmi1",
1597 	"bmi2",
1598 	"fma",
1599 	"smep",
1600 	"smap",
1601 	"adx",
1602 	"rdseed",
1603 	"mpx",
1604 	"avx512f",
1605 	"avx512dq",
1606 	"avx512pf",
1607 	"avx512er",
1608 	"avx512cd",
1609 	"avx512bw",
1610 	"avx512vl",
1611 	"avx512fma",
1612 	"avx512vbmi",
1613 	"avx512_vpopcntdq",
1614 	"avx512_4vnniw",
1615 	"avx512_4fmaps",
1616 	"xsaveopt",
1617 	"xsavec",
1618 	"xsaves",
1619 	"sha",
1620 	"umip",
1621 	"pku",
1622 	"ospke",
1623 	"pcid",
1624 	"invpcid",
1625 	"ibrs",
1626 	"ibpb",
1627 	"stibp",
1628 	"ssbd",
1629 	"ssbd_virt",
1630 	"rdcl_no",
1631 	"ibrs_all",
1632 	"rsba",
1633 	"ssb_no",
1634 	"stibp_all",
1635 	"flush_cmd",
1636 	"l1d_vmentry_no",
1637 	"fsgsbase",
1638 	"clflushopt",
1639 	"clwb",
1640 	"monitorx",
1641 	"clzero",
1642 	"xop",
1643 	"fma4",
1644 	"tbm",
1645 	"avx512_vnni",
1646 	"amd_pcec",
1647 	"md_clear",
1648 	"mds_no",
1649 	"core_thermal",
1650 	"pkg_thermal",
1651 	"tsx_ctrl",
1652 	"taa_no",
1653 	"ppin",
1654 	"vaes",
1655 	"vpclmulqdq",
1656 	"lfence_serializing",
1657 	"gfni",
1658 	"avx512_vp2intersect",
1659 	"avx512_bitalg",
1660 	"avx512_vbmi2",
1661 	"avx512_bf16",
1662 	"auto_ibrs"
1663 };
1664 
1665 boolean_t
1666 is_x86_feature(void *featureset, uint_t feature)
1667 {
1668 	ASSERT(feature < NUM_X86_FEATURES);
1669 	return (BT_TEST((ulong_t *)featureset, feature));
1670 }
1671 
1672 void
1673 add_x86_feature(void *featureset, uint_t feature)
1674 {
1675 	ASSERT(feature < NUM_X86_FEATURES);
1676 	BT_SET((ulong_t *)featureset, feature);
1677 }
1678 
1679 void
1680 remove_x86_feature(void *featureset, uint_t feature)
1681 {
1682 	ASSERT(feature < NUM_X86_FEATURES);
1683 	BT_CLEAR((ulong_t *)featureset, feature);
1684 }
1685 
1686 boolean_t
1687 compare_x86_featureset(void *setA, void *setB)
1688 {
1689 	/*
1690 	 * We assume that the unused bits of the bitmap are always zero.
1691 	 */
1692 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1693 		return (B_TRUE);
1694 	} else {
1695 		return (B_FALSE);
1696 	}
1697 }
1698 
1699 void
1700 print_x86_featureset(void *featureset)
1701 {
1702 	uint_t i;
1703 
1704 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1705 		if (is_x86_feature(featureset, i)) {
1706 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1707 			    x86_feature_names[i]);
1708 		}
1709 	}
1710 }
1711 
1712 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1713 static size_t xsave_state_size = 0;
1714 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1715 boolean_t xsave_force_disable = B_FALSE;
1716 extern int disable_smap;
1717 
1718 /*
1719  * This is set to platform type we are running on.
1720  */
1721 static int platform_type = -1;
1722 
1723 #if !defined(__xpv)
1724 /*
1725  * Variable to patch if hypervisor platform detection needs to be
1726  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1727  */
1728 int enable_platform_detection = 1;
1729 #endif
1730 
1731 /*
1732  * monitor/mwait info.
1733  *
1734  * size_actual and buf_actual are the real address and size allocated to get
1735  * proper mwait_buf alignement.  buf_actual and size_actual should be passed
1736  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1737  * processor cache-line alignment, but this is not guarantied in the furture.
1738  */
1739 struct mwait_info {
1740 	size_t		mon_min;	/* min size to avoid missed wakeups */
1741 	size_t		mon_max;	/* size to avoid false wakeups */
1742 	size_t		size_actual;	/* size actually allocated */
1743 	void		*buf_actual;	/* memory actually allocated */
1744 	uint32_t	support;	/* processor support of monitor/mwait */
1745 };
1746 
1747 /*
1748  * xsave/xrestor info.
1749  *
1750  * This structure contains HW feature bits and the size of the xsave save area.
1751  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1752  * (xsave_state) to describe the xsave layout. However, at runtime the
1753  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1754  * xsave_state structure simply represents the legacy layout of the beginning
1755  * of the xsave area.
1756  */
1757 struct xsave_info {
1758 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1759 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1760 	size_t		xsav_max_size;  /* max size save area for HW features */
1761 	size_t		ymm_size;	/* AVX: size of ymm save area */
1762 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1763 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1764 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1765 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1766 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1767 	size_t		opmask_size;	/* AVX512: size of opmask save */
1768 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1769 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1770 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1771 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1772 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1773 };
1774 
1775 
1776 /*
1777  * These constants determine how many of the elements of the
1778  * cpuid we cache in the cpuid_info data structure; the
1779  * remaining elements are accessible via the cpuid instruction.
1780  */
1781 
1782 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1783 #define	NMAX_CPI_EXTD	0x22		/* eax = 0x80000000 .. 0x80000021 */
1784 
1785 /*
1786  * See the big theory statement for a more detailed explanation of what some of
1787  * these members mean.
1788  */
1789 struct cpuid_info {
1790 	uint_t cpi_pass;		/* last pass completed */
1791 	/*
1792 	 * standard function information
1793 	 */
1794 	uint_t cpi_maxeax;		/* fn 0: %eax */
1795 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1796 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1797 
1798 	uint_t cpi_family;		/* fn 1: extended family */
1799 	uint_t cpi_model;		/* fn 1: extended model */
1800 	uint_t cpi_step;		/* fn 1: stepping */
1801 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1802 					/*		AMD: package/socket # */
1803 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1804 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1805 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1806 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1807 	uint_t cpi_ncache;		/* fn 2: number of elements */
1808 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1809 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1810 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1811 					/* Intel fn: 4, AMD fn: 8000001d */
1812 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1813 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1814 	struct cpuid_regs cpi_sub7[1];	/* Leaf 7, sub-leaf 1 */
1815 	/*
1816 	 * extended function information
1817 	 */
1818 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1819 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1820 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1821 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1822 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1823 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1824 
1825 	id_t cpi_coreid;		/* same coreid => strands share core */
1826 	int cpi_pkgcoreid;		/* core number within single package */
1827 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1828 					/* Intel: fn 4: %eax[31-26] */
1829 
1830 	/*
1831 	 * These values represent the number of bits that are required to store
1832 	 * information about the number of cores and threads.
1833 	 */
1834 	uint_t cpi_ncore_bits;
1835 	uint_t cpi_nthread_bits;
1836 	/*
1837 	 * supported feature information
1838 	 */
1839 	uint32_t cpi_support[6];
1840 #define	STD_EDX_FEATURES	0
1841 #define	AMD_EDX_FEATURES	1
1842 #define	TM_EDX_FEATURES		2
1843 #define	STD_ECX_FEATURES	3
1844 #define	AMD_ECX_FEATURES	4
1845 #define	STD_EBX_FEATURES	5
1846 	/*
1847 	 * Synthesized information, where known.
1848 	 */
1849 	x86_chiprev_t cpi_chiprev;	/* See X86_CHIPREV_* in x86_archext.h */
1850 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1851 	uint32_t cpi_socket;		/* Chip package/socket type */
1852 	x86_uarchrev_t cpi_uarchrev;	/* Microarchitecture and revision */
1853 
1854 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1855 	uint32_t cpi_apicid;
1856 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1857 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1858 					/* Intel: 1 */
1859 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1860 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1861 
1862 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1863 };
1864 
1865 
1866 static struct cpuid_info cpuid_info0;
1867 
1868 /*
1869  * These bit fields are defined by the Intel Application Note AP-485
1870  * "Intel Processor Identification and the CPUID Instruction"
1871  */
1872 #define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1873 #define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1874 #define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1875 #define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1876 #define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1877 #define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1878 
1879 #define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1880 #define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1881 #define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1882 #define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1883 #define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1884 #define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1885 #define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1886 #define	CPI_FEATURES_7_1_EAX(cpi)	((cpi)->cpi_sub7[0].cp_eax)
1887 
1888 #define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1889 #define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1890 #define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1891 #define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1892 
1893 #define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1894 #define	CPI_XMAXEAX_MAX		0x80000100
1895 #define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1896 #define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1897 
1898 /*
1899  * Function 4 (Deterministic Cache Parameters) macros
1900  * Defined by Intel Application Note AP-485
1901  */
1902 #define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1903 #define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1904 #define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1905 #define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1906 #define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1907 #define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1908 #define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1909 
1910 #define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1911 #define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1912 #define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1913 
1914 #define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1915 
1916 #define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1917 
1918 
1919 /*
1920  * A couple of shorthand macros to identify "later" P6-family chips
1921  * like the Pentium M and Core.  First, the "older" P6-based stuff
1922  * (loosely defined as "pre-Pentium-4"):
1923  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1924  */
1925 #define	IS_LEGACY_P6(cpi) (			\
1926 	cpi->cpi_family == 6 &&			\
1927 		(cpi->cpi_model == 1 ||		\
1928 		cpi->cpi_model == 3 ||		\
1929 		cpi->cpi_model == 5 ||		\
1930 		cpi->cpi_model == 6 ||		\
1931 		cpi->cpi_model == 7 ||		\
1932 		cpi->cpi_model == 8 ||		\
1933 		cpi->cpi_model == 0xA ||	\
1934 		cpi->cpi_model == 0xB)		\
1935 )
1936 
1937 /* A "new F6" is everything with family 6 that's not the above */
1938 #define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1939 
1940 /* Extended family/model support */
1941 #define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1942 	cpi->cpi_family >= 0xf)
1943 
1944 /*
1945  * Info for monitor/mwait idle loop.
1946  *
1947  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1948  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1949  * 2006.
1950  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1951  * Documentation Updates" #33633, Rev 2.05, December 2006.
1952  */
1953 #define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
1954 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
1955 #define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1956 #define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1957 #define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1958 #define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1959 #define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1960 #define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1961 /*
1962  * Number of sub-cstates for a given c-state.
1963  */
1964 #define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1965 	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1966 
1967 /*
1968  * XSAVE leaf 0xD enumeration
1969  */
1970 #define	CPUID_LEAFD_2_YMM_OFFSET	576
1971 #define	CPUID_LEAFD_2_YMM_SIZE		256
1972 
1973 /*
1974  * Common extended leaf names to cut down on typos.
1975  */
1976 #define	CPUID_LEAF_EXT_0		0x80000000
1977 #define	CPUID_LEAF_EXT_8		0x80000008
1978 #define	CPUID_LEAF_EXT_1d		0x8000001d
1979 #define	CPUID_LEAF_EXT_1e		0x8000001e
1980 #define	CPUID_LEAF_EXT_21		0x80000021
1981 
1982 /*
1983  * Functions we consume from cpuid_subr.c;  don't publish these in a header
1984  * file to try and keep people using the expected cpuid_* interfaces.
1985  */
1986 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1987 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1988 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1989 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1990 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
1991 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1992 
1993 /*
1994  * Apply various platform-dependent restrictions where the
1995  * underlying platform restrictions mean the CPU can be marked
1996  * as less capable than its cpuid instruction would imply.
1997  */
1998 #if defined(__xpv)
1999 static void
2000 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
2001 {
2002 	switch (eax) {
2003 	case 1: {
2004 		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
2005 		    0 : CPUID_INTC_EDX_MCA;
2006 		cp->cp_edx &=
2007 		    ~(mcamask |
2008 		    CPUID_INTC_EDX_PSE |
2009 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2010 		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
2011 		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
2012 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2013 		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
2014 		break;
2015 	}
2016 
2017 	case 0x80000001:
2018 		cp->cp_edx &=
2019 		    ~(CPUID_AMD_EDX_PSE |
2020 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2021 		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
2022 		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
2023 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2024 		    CPUID_AMD_EDX_TSCP);
2025 		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
2026 		break;
2027 	default:
2028 		break;
2029 	}
2030 
2031 	switch (vendor) {
2032 	case X86_VENDOR_Intel:
2033 		switch (eax) {
2034 		case 4:
2035 			/*
2036 			 * Zero out the (ncores-per-chip - 1) field
2037 			 */
2038 			cp->cp_eax &= 0x03ffffff;
2039 			break;
2040 		default:
2041 			break;
2042 		}
2043 		break;
2044 	case X86_VENDOR_AMD:
2045 	case X86_VENDOR_HYGON:
2046 		switch (eax) {
2047 
2048 		case 0x80000001:
2049 			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2050 			break;
2051 
2052 		case CPUID_LEAF_EXT_8:
2053 			/*
2054 			 * Zero out the (ncores-per-chip - 1) field
2055 			 */
2056 			cp->cp_ecx &= 0xffffff00;
2057 			break;
2058 		default:
2059 			break;
2060 		}
2061 		break;
2062 	default:
2063 		break;
2064 	}
2065 }
2066 #else
2067 #define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
2068 #endif
2069 
2070 /*
2071  *  Some undocumented ways of patching the results of the cpuid
2072  *  instruction to permit running Solaris 10 on future cpus that
2073  *  we don't currently support.  Could be set to non-zero values
2074  *  via settings in eeprom.
2075  */
2076 
2077 uint32_t cpuid_feature_ecx_include;
2078 uint32_t cpuid_feature_ecx_exclude;
2079 uint32_t cpuid_feature_edx_include;
2080 uint32_t cpuid_feature_edx_exclude;
2081 
2082 /*
2083  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2084  */
2085 void
2086 cpuid_alloc_space(cpu_t *cpu)
2087 {
2088 	/*
2089 	 * By convention, cpu0 is the boot cpu, which is set up
2090 	 * before memory allocation is available.  All other cpus get
2091 	 * their cpuid_info struct allocated here.
2092 	 */
2093 	ASSERT(cpu->cpu_id != 0);
2094 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2095 	cpu->cpu_m.mcpu_cpi =
2096 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2097 }
2098 
2099 void
2100 cpuid_free_space(cpu_t *cpu)
2101 {
2102 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2103 	int i;
2104 
2105 	ASSERT(cpi != NULL);
2106 	ASSERT(cpi != &cpuid_info0);
2107 
2108 	/*
2109 	 * Free up any cache leaf related dynamic storage. The first entry was
2110 	 * cached from the standard cpuid storage, so we should not free it.
2111 	 */
2112 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2113 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2114 	if (cpi->cpi_cache_leaf_size > 0)
2115 		kmem_free(cpi->cpi_cache_leaves,
2116 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2117 
2118 	kmem_free(cpi, sizeof (*cpi));
2119 	cpu->cpu_m.mcpu_cpi = NULL;
2120 }
2121 
2122 #if !defined(__xpv)
2123 /*
2124  * Determine the type of the underlying platform. This is used to customize
2125  * initialization of various subsystems (e.g. TSC). determine_platform() must
2126  * only ever be called once to prevent two processors from seeing different
2127  * values of platform_type. Must be called before cpuid_pass_ident(), the
2128  * earliest consumer to execute; the identification pass will call
2129  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2130  */
2131 void
2132 determine_platform(void)
2133 {
2134 	struct cpuid_regs cp;
2135 	uint32_t base;
2136 	uint32_t regs[4];
2137 	char *hvstr = (char *)regs;
2138 
2139 	ASSERT(platform_type == -1);
2140 
2141 	platform_type = HW_NATIVE;
2142 
2143 	if (!enable_platform_detection)
2144 		return;
2145 
2146 	/*
2147 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2148 	 * vendor signature, and set platform type accordingly.
2149 	 *
2150 	 * References:
2151 	 * http://lkml.org/lkml/2008/10/1/246
2152 	 * http://kb.vmware.com/kb/1009458
2153 	 */
2154 	cp.cp_eax = 0x1;
2155 	(void) __cpuid_insn(&cp);
2156 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2157 		cp.cp_eax = 0x40000000;
2158 		(void) __cpuid_insn(&cp);
2159 		regs[0] = cp.cp_ebx;
2160 		regs[1] = cp.cp_ecx;
2161 		regs[2] = cp.cp_edx;
2162 		regs[3] = 0;
2163 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2164 			platform_type = HW_XEN_HVM;
2165 			return;
2166 		}
2167 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2168 			platform_type = HW_VMWARE;
2169 			return;
2170 		}
2171 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2172 			platform_type = HW_KVM;
2173 			return;
2174 		}
2175 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2176 			platform_type = HW_BHYVE;
2177 			return;
2178 		}
2179 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
2180 			platform_type = HW_MICROSOFT;
2181 	} else {
2182 		/*
2183 		 * Check older VMware hardware versions. VMware hypervisor is
2184 		 * detected by performing an IN operation to VMware hypervisor
2185 		 * port and checking that value returned in %ebx is VMware
2186 		 * hypervisor magic value.
2187 		 *
2188 		 * References: http://kb.vmware.com/kb/1009458
2189 		 */
2190 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2191 		if (regs[1] == VMWARE_HVMAGIC) {
2192 			platform_type = HW_VMWARE;
2193 			return;
2194 		}
2195 	}
2196 
2197 	/*
2198 	 * Check Xen hypervisor. In a fully virtualized domain,
2199 	 * Xen's pseudo-cpuid function returns a string representing the
2200 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2201 	 * supported cpuid function. We need at least a (base + 2) leaf value
2202 	 * to do what we want to do. Try different base values, since the
2203 	 * hypervisor might use a different one depending on whether Hyper-V
2204 	 * emulation is switched on by default or not.
2205 	 */
2206 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2207 		cp.cp_eax = base;
2208 		(void) __cpuid_insn(&cp);
2209 		regs[0] = cp.cp_ebx;
2210 		regs[1] = cp.cp_ecx;
2211 		regs[2] = cp.cp_edx;
2212 		regs[3] = 0;
2213 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2214 		    cp.cp_eax >= (base + 2)) {
2215 			platform_type &= ~HW_NATIVE;
2216 			platform_type |= HW_XEN_HVM;
2217 			return;
2218 		}
2219 	}
2220 }
2221 
2222 int
2223 get_hwenv(void)
2224 {
2225 	ASSERT(platform_type != -1);
2226 	return (platform_type);
2227 }
2228 
2229 int
2230 is_controldom(void)
2231 {
2232 	return (0);
2233 }
2234 
2235 #else
2236 
2237 int
2238 get_hwenv(void)
2239 {
2240 	return (HW_XEN_PV);
2241 }
2242 
2243 int
2244 is_controldom(void)
2245 {
2246 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2247 }
2248 
2249 #endif	/* __xpv */
2250 
2251 /*
2252  * Make sure that we have gathered all of the CPUID leaves that we might need to
2253  * determine topology. We assume that the standard leaf 1 has already been done
2254  * and that xmaxeax has already been calculated.
2255  */
2256 static void
2257 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2258 {
2259 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2260 
2261 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2262 		struct cpuid_regs *cp;
2263 
2264 		cp = &cpi->cpi_extd[8];
2265 		cp->cp_eax = CPUID_LEAF_EXT_8;
2266 		(void) __cpuid_insn(cp);
2267 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2268 	}
2269 
2270 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2271 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2272 		struct cpuid_regs *cp;
2273 
2274 		cp = &cpi->cpi_extd[0x1e];
2275 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2276 		(void) __cpuid_insn(cp);
2277 	}
2278 }
2279 
2280 /*
2281  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2282  * it to everything else. If not, and we're on an AMD system where 8000001e is
2283  * valid, then we use that. Otherwise, we fall back to the default value for the
2284  * APIC ID in leaf 1.
2285  */
2286 static uint32_t
2287 cpuid_gather_apicid(struct cpuid_info *cpi)
2288 {
2289 	/*
2290 	 * Leaf B changes based on the arguments to it. Because we don't cache
2291 	 * it, we need to gather it again.
2292 	 */
2293 	if (cpi->cpi_maxeax >= 0xB) {
2294 		struct cpuid_regs regs;
2295 		struct cpuid_regs *cp;
2296 
2297 		cp = &regs;
2298 		cp->cp_eax = 0xB;
2299 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2300 		(void) __cpuid_insn(cp);
2301 
2302 		if (cp->cp_ebx != 0) {
2303 			return (cp->cp_edx);
2304 		}
2305 	}
2306 
2307 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2308 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2309 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2310 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2311 		return (cpi->cpi_extd[0x1e].cp_eax);
2312 	}
2313 
2314 	return (CPI_APIC_ID(cpi));
2315 }
2316 
2317 /*
2318  * For AMD processors, attempt to calculate the number of chips and cores that
2319  * exist. The way that we do this varies based on the generation, because the
2320  * generations themselves have changed dramatically.
2321  *
2322  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2323  * However, with the advent of family 17h (Zen) it actually tells us the number
2324  * of threads, so we need to look at leaf 0x8000001e if available to determine
2325  * its value. Otherwise, for all prior families, the number of enabled cores is
2326  * the same as threads.
2327  *
2328  * If we do not have leaf 0x80000008, we assume this processor has no multi-core
2329  * support. AMD's older CPUID specification says there is no reason to fall back
2330  * to leaf 1, though we still use its logical CPU count when the HTT bit is set.
2331  *
2332  * In some virtualization cases we will not have leaf 8000001e or it will be
2333  * zero. When that happens we assume the number of threads is one.
2334  */
2335 static void
2336 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2337 {
2338 	uint_t nthreads, nthread_per_core;
2339 
2340 	nthreads = nthread_per_core = 1;
2341 
2342 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2343 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2344 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2345 		nthreads = CPI_CPU_COUNT(cpi);
2346 	}
2347 
2348 	/*
2349 	 * For us to have threads, and know about it, we have to be at least at
2350 	 * family 17h and have the cpuid bit that says we have extended
2351 	 * topology.
2352 	 */
2353 	if (cpi->cpi_family >= 0x17 &&
2354 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2355 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2356 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2357 	}
2358 
2359 	*ncpus = nthreads;
2360 	*ncores = nthreads / nthread_per_core;
2361 }
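
/*
 * For illustration, with hypothetical values: an 8-core, 16-thread Zen part
 * would report 15 in leaf 0x80000008 %ecx[7:0] (nthreads = 16) and 1 in leaf
 * 0x8000001e %ebx[15:8] (nthread_per_core = 2), yielding 16 cpus and 8 cores.
 */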
2362 
2363 /*
2364  * Seed the initial values for the cores and threads for an Intel based
2365  * processor. These values will be overwritten if we detect that the processor
2366  * supports CPUID leaf 0xb.
2367  */
2368 static void
2369 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2370 {
2371 	/*
2372 	 * Only seed the number of physical cores from the first level leaf 4
2373 	 * information. The number of threads there indicates how many share the
2374 	 * L1 cache, which may or may not have anything to do with the number of
2375 	 * logical CPUs per core.
2376 	 */
2377 	if (cpi->cpi_maxeax >= 4) {
2378 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2379 	} else {
2380 		*ncores = 1;
2381 	}
2382 
2383 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2384 		*ncpus = CPI_CPU_COUNT(cpi);
2385 	} else {
2386 		*ncpus = *ncores;
2387 	}
2388 }
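
/*
 * For illustration, with hypothetical values: a 4-core, 8-thread part with
 * HTT set would report 3 in leaf 4 %eax[31:26] (ncores = 4) and 8 in the
 * leaf 1 logical processor count (ncpus = 8). As noted above, leaf 0xb
 * later overrides these values when it is available.
 */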
2389 
2390 static boolean_t
2391 cpuid_leafB_getids(cpu_t *cpu)
2392 {
2393 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2394 	struct cpuid_regs regs;
2395 	struct cpuid_regs *cp;
2396 
2397 	if (cpi->cpi_maxeax < 0xB)
2398 		return (B_FALSE);
2399 
2400 	cp = &regs;
2401 	cp->cp_eax = 0xB;
2402 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2403 
2404 	(void) __cpuid_insn(cp);
2405 
2406 	/*
2407 	 * Check that CPUID.(EAX=0BH, ECX=0H):EBX is non-zero, which
2408 	 * indicates that the extended topology enumeration leaf is
2409 	 * available.
2410 	 */
2411 	if (cp->cp_ebx != 0) {
2412 		uint32_t x2apic_id = 0;
2413 		uint_t coreid_shift = 0;
2414 		uint_t ncpu_per_core = 1;
2415 		uint_t chipid_shift = 0;
2416 		uint_t ncpu_per_chip = 1;
2417 		uint_t i;
2418 		uint_t level;
2419 
2420 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2421 			cp->cp_eax = 0xB;
2422 			cp->cp_ecx = i;
2423 
2424 			(void) __cpuid_insn(cp);
2425 			level = CPI_CPU_LEVEL_TYPE(cp);
2426 
2427 			if (level == 1) {
2428 				x2apic_id = cp->cp_edx;
2429 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2430 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2431 			} else if (level == 2) {
2432 				x2apic_id = cp->cp_edx;
2433 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2434 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2435 			}
2436 		}
2437 
2438 		/*
2439 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2440 		 */
2441 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2442 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2443 		    ncpu_per_core;
2444 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2445 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2446 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2447 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2448 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2449 		cpi->cpi_compunitid = cpi->cpi_coreid;
2450 
2451 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2452 			cpi->cpi_nthread_bits = coreid_shift;
2453 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2454 		}
2455 
2456 		return (B_TRUE);
2457 	} else {
2458 		return (B_FALSE);
2459 	}
2460 }
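
/*
 * For illustration, with hypothetical values: if the SMT level of leaf 0xb
 * reports a shift of 1, the core level reports a shift of 5, and the x2APIC
 * ID is 0x1d, then chipid = 0, clogid = 0x1d, coreid = 0xe, pkgcoreid = 0xe,
 * and we record 1 thread bit and 4 core bits.
 */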
2461 
2462 static void
2463 cpuid_intel_getids(cpu_t *cpu, void *feature)
2464 {
2465 	uint_t i;
2466 	uint_t chipid_shift = 0;
2467 	uint_t coreid_shift = 0;
2468 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2469 
2470 	/*
2471 	 * There are no compute units or processor nodes currently on Intel.
2472 	 * Always set these to one.
2473 	 */
2474 	cpi->cpi_procnodes_per_pkg = 1;
2475 	cpi->cpi_cores_per_compunit = 1;
2476 
2477 	/*
2478 	 * If cpuid Leaf B is present, use that to try and get this information.
2479 	 * It will be the most accurate for Intel CPUs.
2480 	 */
2481 	if (cpuid_leafB_getids(cpu))
2482 		return;
2483 
2484 	/*
2485 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2486 	 * and ncore_per_chip. These represent the largest power of two values
2487 	 * that we need to cover all of the IDs in the system. Therefore, we use
2488 	 * those values to seed the number of bits needed to cover information
2489 	 * in the case when leaf B is not available. These values will probably
2490 	 * be larger than required, but that's OK.
2491 	 */
2492 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2493 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2494 
2495 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2496 		chipid_shift++;
2497 
2498 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2499 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2500 
2501 	if (is_x86_feature(feature, X86FSET_CMP)) {
2502 		/*
2503 		 * Multi-core (and possibly multi-threaded)
2504 		 * processors.
2505 		 */
2506 		uint_t ncpu_per_core = 0;
2507 
2508 		if (cpi->cpi_ncore_per_chip == 1)
2509 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2510 		else if (cpi->cpi_ncore_per_chip > 1)
2511 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2512 			    cpi->cpi_ncore_per_chip;
2513 		/*
2514 		 * 8bit APIC IDs on dual core Pentiums
2515 		 * look like this:
2516 		 *
2517 		 * +-----------------------+------+------+
2518 		 * | Physical Package ID   |  MC  |  HT  |
2519 		 * +-----------------------+------+------+
2520 		 * <------- chipid -------->
2521 		 * <------- coreid --------------->
2522 		 *			   <--- clogid -->
2523 		 *			   <------>
2524 		 *			   pkgcoreid
2525 		 *
2526 		 * Where the number of bits necessary to
2527 		 * represent MC and HT fields together equals
2528 		 * the minimum number of bits necessary to
2529 		 * store the value of cpi->cpi_ncpu_per_chip.
2530 		 * Of those bits, the MC part uses the number
2531 		 * of bits necessary to store the value of
2532 		 * cpi->cpi_ncore_per_chip.
2533 		 */
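		/*
		 * For illustration, with hypothetical values: if
		 * cpi_ncpu_per_chip is 4 and cpi_ncore_per_chip is 2, then
		 * ncpu_per_core is 2 and coreid_shift becomes 1, while the
		 * earlier loop made chipid_shift 2. An APIC ID of 0x5 then
		 * decodes to chipid = 1, clogid = 1, coreid = 2 and
		 * pkgcoreid = 0.
		 */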
2534 		for (i = 1; i < ncpu_per_core; i <<= 1)
2535 			coreid_shift++;
2536 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2537 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2538 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2539 		/*
2540 		 * Single-core multi-threaded processors.
2541 		 */
2542 		cpi->cpi_coreid = cpi->cpi_chipid;
2543 		cpi->cpi_pkgcoreid = 0;
2544 	} else {
2545 		/*
2546 		 * Single-core single-thread processors.
2547 		 */
2548 		cpi->cpi_coreid = cpu->cpu_id;
2549 		cpi->cpi_pkgcoreid = 0;
2550 	}
2551 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2552 	cpi->cpi_compunitid = cpi->cpi_coreid;
2553 }
2554 
2555 /*
2556  * Historically, AMD has had CMP chips with only a single thread per core.
2557  * However, starting in family 17h (Zen), this has changed and they now have
2558  * multiple threads. Our internal core id needs to be a unique value.
2559  *
2560  * To determine the core id of an AMD system, if we're from a family before 17h,
2561  * then we just use the cpu id, as that gives us a good value that will be
2562  * unique for each core. If instead, we're on family 17h or later, then we need
2563  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2564  * how many threads share a core. Based on that, we'll shift the APIC ID.
2565  * We can't use the normal core id in that leaf as it's only unique within the
2566  * socket, which is perfect for cpi_pkgcoreid, but not for us.
2567  */
2568 static id_t
2569 cpuid_amd_get_coreid(cpu_t *cpu)
2570 {
2571 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2572 
2573 	if (cpi->cpi_family >= 0x17 &&
2574 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2575 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2576 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2577 		if (nthreads > 1) {
2578 			VERIFY3U(nthreads, ==, 2);
2579 			return (cpi->cpi_apicid >> 1);
2580 		}
2581 	}
2582 
2583 	return (cpu->cpu_id);
2584 }
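
/*
 * For illustration, with hypothetical values: a family 17h part reporting two
 * threads per core in leaf 0x8000001e %ebx[15:8] and an APIC ID of 0x13 gets
 * a core id of 0x13 >> 1 = 0x9; a pre-17h part simply gets cpu->cpu_id.
 */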
2585 
2586 /*
2587  * Computing IDs on AMD is more challenging. This is notable because of the
2588  * following two facts:
2589  *
2590  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2591  *     also no way to get an actual unique core id from the system. As such, we
2592  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2593  *     however, guarantee that sibling cores of a chip will have sequential
2594  *     coreids starting at a multiple of the number of cores per chip - that is
2595  *     usually the case, but if the APIC IDs have been set up in a different
2596  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2597  *
2598  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2599  *     called compute units. These compute units share the L1I cache, L2 cache,
2600  *     and the FPU. To deal with this, a new topology leaf was added in
2601  *     0x8000001e. However, parts of this leaf have different meanings
2602  *     once we get to family 0x17.
2603  */
2604 
2605 static void
2606 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2607 {
2608 	int i, first_half, coreidsz;
2609 	uint32_t nb_caps_reg;
2610 	uint_t node2_1;
2611 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2612 	struct cpuid_regs *cp;
2613 
2614 	/*
2615 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2616 	 * hasn't been stripped by virtualization). We always set the compute
2617 	 * unit id to the same value. Also, initialize the default number of
2618 	 * cores per compute unit and nodes per package. This will be
2619 	 * overwritten when we know information about a particular family.
2620 	 */
2621 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2622 	cpi->cpi_compunitid = cpi->cpi_coreid;
2623 	cpi->cpi_cores_per_compunit = 1;
2624 	cpi->cpi_procnodes_per_pkg = 1;
2625 
2626 	/*
2627 	 * To construct the logical ID, we need to determine how many APIC IDs
2628 	 * are dedicated to the cores and threads. This is provided for us in
2629 	 * 0x80000008. However, if it's not present (say due to virtualization),
2630 	 * then we assume it's one. This should be present on all 64-bit AMD
2631 	 * processors.  It was added in family 0xf (Hammer).
2632 	 */
2633 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2634 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2635 
2636 		/*
2637 		 * In AMD parlance a chip is really a node, while illumos
2638 		 * uses chip as equivalent to socket/package.
2639 		 */
2640 		if (coreidsz == 0) {
2641 			/* Use legacy method */
2642 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2643 				coreidsz++;
2644 			if (coreidsz == 0)
2645 				coreidsz = 1;
2646 		}
2647 	} else {
2648 		/* Assume single-core part */
2649 		coreidsz = 1;
2650 	}
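	/*
	 * For illustration, with hypothetical values: a coreidsz of 4 and an
	 * APIC ID of 0x23 give cpi_clogid = 0x23 & 0xf = 0x3 below.
	 */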
2651 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2652 
2653 	/*
2654 	 * The package core ID varies depending on the family. While it may be
2655 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2656 	 * this value is the core id in the given node. For non-virtualized
2657 	 * family 17h, we need to take the logical core id and shift off the
2658 	 * threads like we do when getting the core id.  Otherwise, we can use
2659 	 * the clogid as is. When family 17h is virtualized, the clogid should
2660 	 * be sufficient: if we don't have valid data in the leaf, then we
2661 	 * won't think we have SMT, and the cpi_clogid alone is
2662 	 * enough.
2663 	 */
2664 	if (cpi->cpi_family >= 0x17 &&
2665 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2666 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2667 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2668 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2669 		if (nthreads > 1) {
2670 			VERIFY3U(nthreads, ==, 2);
2671 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2672 		} else {
2673 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2674 		}
2675 	} else {
2676 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2677 	}
2678 
2679 	/*
2680 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2681 	 * (bulldozer) or newer, then we can derive all of this from leaf
2682 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2683 	 */
2684 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2685 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2686 		cp = &cpi->cpi_extd[0x1e];
2687 
2688 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2689 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2690 
2691 		/*
2692 		 * For Bulldozer-era CPUs, recalculate the compute unit
2693 		 * information.
2694 		 */
2695 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2696 			cpi->cpi_cores_per_compunit =
2697 			    BITX(cp->cp_ebx, 15, 8) + 1;
2698 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2699 			    (cpi->cpi_ncore_per_chip /
2700 			    cpi->cpi_cores_per_compunit) *
2701 			    (cpi->cpi_procnodeid /
2702 			    cpi->cpi_procnodes_per_pkg);
2703 		}
2704 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2705 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2706 	} else if (cpi->cpi_family == 0x10) {
2707 		/*
2708 		 * See if we are a multi-node processor.
2709 		 * All processors in the system have the same number of nodes
2710 		 */
2711 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2712 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2713 			/* Single-node */
2714 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2715 			    coreidsz);
2716 		} else {
2717 
2718 			/*
2719 			 * Multi-node revision D (2 nodes per package
2720 			 * are supported)
2721 			 */
2722 			cpi->cpi_procnodes_per_pkg = 2;
2723 
2724 			first_half = (cpi->cpi_pkgcoreid <=
2725 			    (cpi->cpi_ncore_per_chip/2 - 1));
2726 
2727 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2728 				/* We are BSP */
2729 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2730 			} else {
2731 
2732 				/* We are AP */
2733 				/* NodeId[2:1] bits to use for reading F3xe8 */
2734 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2735 
2736 				nb_caps_reg =
2737 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2738 
2739 				/*
2740 				 * Check IntNodeNum bit (31:30, but bit 31 is
2741 				 * always 0 on dual-node processors)
2742 				 */
2743 				if (BITX(nb_caps_reg, 30, 30) == 0)
2744 					cpi->cpi_procnodeid = node2_1 +
2745 					    !first_half;
2746 				else
2747 					cpi->cpi_procnodeid = node2_1 +
2748 					    first_half;
2749 			}
2750 		}
2751 	} else {
2752 		cpi->cpi_procnodeid = 0;
2753 	}
2754 
2755 	cpi->cpi_chipid =
2756 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2757 
2758 	cpi->cpi_ncore_bits = coreidsz;
2759 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2760 	    cpi->cpi_ncore_per_chip);
2761 }
2762 
2763 static void
2764 spec_uarch_flush_noop(void)
2765 {
2766 }
2767 
2768 /*
2769  * When microcode is present that mitigates MDS, this wrmsr will also perform
2770  * the MDS-related micro-architectural state flush that would normally happen
2771  * by calling x86_md_clear().
2772  */
2773 static void
2774 spec_uarch_flush_msr(void)
2775 {
2776 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2777 }
2778 
2779 /*
2780  * This function pointer refers to a function that will flush certain
2781  * micro-architectural state on the processor. This flush is used to mitigate
2782  * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2783  * function can point to one of three functions:
2784  *
2785  * - A noop, which is used either because we are vulnerable but do not have
2786  *   microcode available to help mitigate the issue, or because we aren't
2787  *   vulnerable at all.
2788  *
2789  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2790  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2791  *   however, it only flushes the MDS related micro-architectural state on the
2792  *   current hyperthread; it does not do anything for the twin.
2793  *
2794  * - x86_md_clear which will flush the MDS related state. This is done when we
2795  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2796  *   (RDCL_NO is set).
2797  */
2798 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
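
/*
 * Callers do not need to know which implementation is in effect; they simply
 * invoke the pointer, e.g. spec_uarch_flush(), at the relevant transition
 * points. The target is selected in cpuid_update_l1d_flush() below.
 */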
2799 
2800 static void
2801 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2802 {
2803 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2804 
2805 	/*
2806 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2807 	 * has been fixed in hardware, it doesn't cover everything related to
2808 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2809 	 * need to mitigate this.
2810 	 */
2811 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2812 	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2813 		return;
2814 	}
2815 
2816 	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2817 		const uint8_t nop = NOP_INSTR;
2818 		uint8_t *md = (uint8_t *)x86_md_clear;
2819 
2820 		*md = nop;
2821 	}
2822 
2823 	membar_producer();
2824 }
2825 
2826 static void
2827 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2828 {
2829 	boolean_t need_l1d, need_mds;
2830 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2831 
2832 	/*
2833 	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2834 	 * hardware, then there's nothing left for us to do for enabling the
2835 	 * flush. We can also go ahead and say that SMT exclusion is
2836 	 * unnecessary.
2837 	 */
2838 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2839 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2840 	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2841 		extern int smt_exclusion;
2842 		smt_exclusion = 0;
2843 		spec_uarch_flush = spec_uarch_flush_noop;
2844 		membar_producer();
2845 		return;
2846 	}
2847 
2848 	/*
2849 	 * The locations where we need to perform an L1D flush are required for
2850 	 * mitigating both L1TF and MDS. When verw support is present in
2851 	 * microcode, then the L1D flush will take care of doing that as well.
2852 	 * However, if we have a system where RDCL_NO is present, but we don't
2853 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2854 	 * L1D flush.
2855 	 */
2856 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2857 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2858 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2859 		need_l1d = B_TRUE;
2860 	} else {
2861 		need_l1d = B_FALSE;
2862 	}
2863 
2864 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2865 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2866 		need_mds = B_TRUE;
2867 	} else {
2868 		need_mds = B_FALSE;
2869 	}
2870 
2871 	if (need_l1d) {
2872 		spec_uarch_flush = spec_uarch_flush_msr;
2873 	} else if (need_mds) {
2874 		spec_uarch_flush = x86_md_clear;
2875 	} else {
2876 		/*
2877 		 * We have no hardware mitigations available to us.
2878 		 */
2879 		spec_uarch_flush = spec_uarch_flush_noop;
2880 	}
2881 	membar_producer();
2882 }
2883 
2884 /*
2885  * We default to enabling RSB mitigations.
2886  *
2887  * NOTE: We used to skip RSB mitigations with eIBRS, but developments around
2888  * post-barrier RSB guessing suggests we should enable RSB mitigations always
2889  * post-barrier RSB guessing suggest we should enable RSB mitigations always
2890  *
2891  * AMD indicates that when Automatic IBRS is enabled we do not need to implement
2892  * return stack buffer clearing for VMEXIT as it takes care of it. The manual
2893  * also states that as long as SMEP is enabled and we maintain at least one
2894  * page between the kernel and user space (we have much more of a red zone),
2895  * then we do not need to clear the RSB. We constrain this to only when
2896  * Automatic IBRS is present.
2897  */
2898 static void
2899 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2900 {
2901 	const uint8_t ret = RET_INSTR;
2902 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2903 
2904 	switch (mit) {
2905 	case X86_SPECTREV2_AUTO_IBRS:
2906 	case X86_SPECTREV2_DISABLED:
2907 		*stuff = ret;
2908 		break;
2909 	default:
2910 		break;
2911 	}
2912 }
2913 
2914 static void
2915 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2916 {
2917 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2918 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2919 	    "_r14", "_r15" };
2920 	const uint_t nthunks = ARRAY_SIZE(thunks);
2921 	const char *type;
2922 	uint_t i;
2923 
2924 	if (mit == x86_spectrev2_mitigation)
2925 		return;
2926 
2927 	switch (mit) {
2928 	case X86_SPECTREV2_RETPOLINE:
2929 		type = "gen";
2930 		break;
2931 	case X86_SPECTREV2_AUTO_IBRS:
2932 	case X86_SPECTREV2_ENHANCED_IBRS:
2933 	case X86_SPECTREV2_DISABLED:
2934 		type = "jmp";
2935 		break;
2936 	default:
2937 		panic("asked to update retpoline state with unknown state!");
2938 	}
2939 
2940 	for (i = 0; i < nthunks; i++) {
2941 		uintptr_t source, dest;
2942 		int ssize, dsize;
2943 		char sourcebuf[64], destbuf[64];
2944 
2945 		(void) snprintf(destbuf, sizeof (destbuf),
2946 		    "__x86_indirect_thunk%s", thunks[i]);
2947 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
2948 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
2949 
2950 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2951 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
2952 		VERIFY3U(source, !=, 0);
2953 		VERIFY3U(dest, !=, 0);
2954 		VERIFY3S(dsize, >=, ssize);
2955 		bcopy((void *)source, (void *)dest, ssize);
2956 	}
2957 }
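
/*
 * For illustration: on the first iteration (thunks[0] is "_rax") with type
 * "gen", the snprintf() calls above produce destbuf "__x86_indirect_thunk_rax"
 * and sourcebuf "__x86_indirect_thunk_gen_rax", and the bcopy() overwrites the
 * former with the body of the latter.
 */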
2958 
2959 static void
2960 cpuid_enable_enhanced_ibrs(void)
2961 {
2962 	uint64_t val;
2963 
2964 	val = rdmsr(MSR_IA32_SPEC_CTRL);
2965 	val |= IA32_SPEC_CTRL_IBRS;
2966 	wrmsr(MSR_IA32_SPEC_CTRL, val);
2967 }
2968 
2969 static void
2970 cpuid_enable_auto_ibrs(void)
2971 {
2972 	uint64_t val;
2973 
2974 	val = rdmsr(MSR_AMD_EFER);
2975 	val |= AMD_EFER_AIBRSE;
2976 	wrmsr(MSR_AMD_EFER, val);
2977 }
2978 
2979 /*
2980  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2981  * we can disable TSX, we do so.
2982  *
2983  * This determination is done only on the boot CPU, potentially after loading
2984  * updated microcode.
2985  */
2986 static void
2987 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2988 {
2989 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2990 
2991 	VERIFY(cpu->cpu_id == 0);
2992 
2993 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2994 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2995 		return;
2996 	}
2997 
2998 	if (x86_disable_taa) {
2999 		x86_taa_mitigation = X86_TAA_DISABLED;
3000 		return;
3001 	}
3002 
3003 	/*
3004 	 * If we do not have the ability to disable TSX, then our only
3005 	 * mitigation options are in hardware (TAA_NO) or using our existing
3006 	 * MDS mitigation as described above.  The latter relies upon us having
3007 	 * configured MDS mitigations correctly! This includes disabling SMT if
3008 	 * we want cross-CPU-thread protection.
3009 	 */
3010 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3011 		/*
3012 		 * It's not clear whether any parts will enumerate TAA_NO
3013 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
3014 		 */
3015 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3016 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3017 			return;
3018 		}
3019 
3020 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3021 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3022 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
3023 		} else {
3024 			x86_taa_mitigation = X86_TAA_NOTHING;
3025 		}
3026 		return;
3027 	}
3028 
3029 	/*
3030 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3031 	 * enough in boot.
3032 	 *
3033 	 * Otherwise, we'll fall back to causing transactions to abort as our
3034 	 * mitigation. TSX-using code will always take the fallback path.
3035 	 */
3036 	if (cpi->cpi_pass < 4) {
3037 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3038 	} else {
3039 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3040 	}
3041 }
3042 
3043 /*
3044  * As mentioned, we should only touch the MSR when we've got a suitable
3045  * microcode loaded on this CPU.
3046  */
3047 static void
3048 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3049 {
3050 	uint64_t val;
3051 
3052 	switch (taa) {
3053 	case X86_TAA_TSX_DISABLE:
3054 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3055 			return;
3056 		val = rdmsr(MSR_IA32_TSX_CTRL);
3057 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3058 		wrmsr(MSR_IA32_TSX_CTRL, val);
3059 		break;
3060 	case X86_TAA_TSX_FORCE_ABORT:
3061 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3062 			return;
3063 		val = rdmsr(MSR_IA32_TSX_CTRL);
3064 		val |= IA32_TSX_CTRL_RTM_DISABLE;
3065 		wrmsr(MSR_IA32_TSX_CTRL, val);
3066 		break;
3067 	case X86_TAA_HW_MITIGATED:
3068 	case X86_TAA_MD_CLEAR:
3069 	case X86_TAA_DISABLED:
3070 	case X86_TAA_NOTHING:
3071 		break;
3072 	}
3073 }
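
/*
 * For illustration: X86_TAA_TSX_DISABLE sets both RTM_DISABLE and CPUID_CLEAR
 * above, so transactions always abort and RTM stops being enumerated in cpuid,
 * while X86_TAA_TSX_FORCE_ABORT sets only RTM_DISABLE and leaves the cpuid
 * enumeration alone.
 */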
3074 
3075 static void
3076 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3077 {
3078 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3079 	x86_spectrev2_mitigation_t v2mit;
3080 
3081 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3082 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3083 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3084 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3085 			add_x86_feature(featureset, X86FSET_IBPB);
3086 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3087 			add_x86_feature(featureset, X86FSET_IBRS);
3088 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3089 			add_x86_feature(featureset, X86FSET_STIBP);
3090 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3091 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3092 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3093 			add_x86_feature(featureset, X86FSET_SSBD);
3094 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3095 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3096 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3097 			add_x86_feature(featureset, X86FSET_SSB_NO);
3098 
3099 		/*
3100 		 * Rather than Enhanced IBRS, AMD has a different feature that
3101 		 * is a bit in EFER that can be enabled and will basically do
3102 		 * the right thing while executing in the kernel.
3103 		 */
3104 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3105 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3106 		    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3107 		    (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3108 			add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3109 		}
3110 
3111 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3112 	    cpi->cpi_maxeax >= 7) {
3113 		struct cpuid_regs *ecp;
3114 		ecp = &cpi->cpi_std[7];
3115 
3116 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3117 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3118 		}
3119 
3120 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3121 			add_x86_feature(featureset, X86FSET_IBRS);
3122 			add_x86_feature(featureset, X86FSET_IBPB);
3123 		}
3124 
3125 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3126 			add_x86_feature(featureset, X86FSET_STIBP);
3127 		}
3128 
3129 		/*
3130 		 * Don't read the arch caps MSR on xpv where we lack the
3131 		 * on_trap().
3132 		 */
3133 #ifndef __xpv
3134 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3135 			on_trap_data_t otd;
3136 
3137 			/*
3138 			 * Be paranoid and assume we'll get a #GP.
3139 			 */
3140 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3141 				uint64_t reg;
3142 
3143 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3144 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3145 					add_x86_feature(featureset,
3146 					    X86FSET_RDCL_NO);
3147 				}
3148 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3149 					add_x86_feature(featureset,
3150 					    X86FSET_IBRS_ALL);
3151 				}
3152 				if (reg & IA32_ARCH_CAP_RSBA) {
3153 					add_x86_feature(featureset,
3154 					    X86FSET_RSBA);
3155 				}
3156 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3157 					add_x86_feature(featureset,
3158 					    X86FSET_L1D_VM_NO);
3159 				}
3160 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3161 					add_x86_feature(featureset,
3162 					    X86FSET_SSB_NO);
3163 				}
3164 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3165 					add_x86_feature(featureset,
3166 					    X86FSET_MDS_NO);
3167 				}
3168 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3169 					add_x86_feature(featureset,
3170 					    X86FSET_TSX_CTRL);
3171 				}
3172 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3173 					add_x86_feature(featureset,
3174 					    X86FSET_TAA_NO);
3175 				}
3176 			}
3177 			no_trap();
3178 		}
3179 #endif	/* !__xpv */
3180 
3181 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3182 			add_x86_feature(featureset, X86FSET_SSBD);
3183 
3184 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3185 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3186 	}
3187 
3188 	/*
3189 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3190 	 * will have already run this function and determined what we need to
3191 	 * do. This gives us a hook for per-HW thread mitigations such as
3192 	 * enhanced IBRS, or disabling TSX.
3193 	 */
3194 	if (cpu->cpu_id != 0) {
3195 		switch (x86_spectrev2_mitigation) {
3196 		case X86_SPECTREV2_ENHANCED_IBRS:
3197 			cpuid_enable_enhanced_ibrs();
3198 			break;
3199 		case X86_SPECTREV2_AUTO_IBRS:
3200 			cpuid_enable_auto_ibrs();
3201 			break;
3202 		default:
3203 			break;
3204 		}
3205 
3206 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3207 		return;
3208 	}
3209 
3210 	/*
3211 	 * Go through and initialize various security mechanisms that we should
3212 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3213 	 * TAA.
3214 	 */
3215 
3216 	/*
3217 	 * By default we've come in with retpolines enabled. Check whether we
3218 	 * should disable them or enable enhanced or automatic IBRS. RSB
3219 	 * stuffing is enabled by default. Note, we do not allow the use of AMD
3220 	 * optimized retpolines as it was disclosed by AMD in March 2022 that
3221 	 * they were still vulnerable. Prior to that point, we used them.
3222 	 */
3223 	if (x86_disable_spectrev2 != 0) {
3224 		v2mit = X86_SPECTREV2_DISABLED;
3225 	} else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3226 		cpuid_enable_auto_ibrs();
3227 		v2mit = X86_SPECTREV2_AUTO_IBRS;
3228 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3229 		cpuid_enable_enhanced_ibrs();
3230 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3231 	} else {
3232 		v2mit = X86_SPECTREV2_RETPOLINE;
3233 	}
3234 
3235 	cpuid_patch_retpolines(v2mit);
3236 	cpuid_patch_rsb(v2mit);
3237 	x86_spectrev2_mitigation = v2mit;
3238 	membar_producer();
3239 
3240 	/*
3241 	 * We need to determine what changes are required for mitigating L1TF
3242 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3243 	 * is required.
3244 	 *
3245 	 * If any of these are present, then we need to flush u-arch state at
3246 	 * various points. For MDS, we need to do so whenever we change to a
3247 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3248 	 * flush the L1D cache at VM entry. When we have microcode that handles
3249 	 * MDS, the L1D flush also clears the other u-arch state that the
3250 	 * md_clear does.
3251 	 */
3252 
3253 	/*
3254 	 * Update whether or not we need to be taking explicit action against
3255 	 * MDS.
3256 	 */
3257 	cpuid_update_md_clear(cpu, featureset);
3258 
3259 	/*
3260 	 * Determine whether SMT exclusion is required and whether or not we
3261 	 * need to perform an l1d flush.
3262 	 */
3263 	cpuid_update_l1d_flush(cpu, featureset);
3264 
3265 	/*
3266 	 * Determine what our mitigation strategy should be for TAA and then
3267 	 * also apply TAA mitigations.
3268 	 */
3269 	cpuid_update_tsx(cpu, featureset);
3270 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3271 }
3272 
3273 /*
3274  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3275  */
3276 void
3277 setup_xfem(void)
3278 {
3279 	uint64_t flags = XFEATURE_LEGACY_FP;
3280 
3281 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3282 
3283 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3284 		flags |= XFEATURE_SSE;
3285 
3286 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3287 		flags |= XFEATURE_AVX;
3288 
3289 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3290 		flags |= XFEATURE_AVX512;
3291 
3292 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3293 
3294 	xsave_bv_all = flags;
3295 }
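
/*
 * For illustration: on a CPU with SSE and AVX but without AVX-512, flags ends
 * up as XFEATURE_LEGACY_FP | XFEATURE_SSE | XFEATURE_AVX, i.e. the low three
 * architectural XCR0 bits (0x7), and that same value is recorded in
 * xsave_bv_all.
 */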
3296 
3297 static void
3298 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3299 {
3300 	struct cpuid_info *cpi;
3301 
3302 	cpi = cpu->cpu_m.mcpu_cpi;
3303 
3304 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3305 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3306 		cpuid_gather_amd_topology_leaves(cpu);
3307 	}
3308 
3309 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3310 
3311 	/*
3312 	 * Before we can calculate the IDs that we should assign to this
3313 	 * processor, we need to understand how many cores and threads it has.
3314 	 */
3315 	switch (cpi->cpi_vendor) {
3316 	case X86_VENDOR_Intel:
3317 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3318 		    &cpi->cpi_ncore_per_chip);
3319 		break;
3320 	case X86_VENDOR_AMD:
3321 	case X86_VENDOR_HYGON:
3322 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3323 		    &cpi->cpi_ncore_per_chip);
3324 		break;
3325 	default:
3326 		/*
3327 		 * If we have some other x86 compatible chip, it's not clear how
3328 		 * it would behave. The most common case is virtualization
3329 		 * today, though there are also 64-bit VIA chips. Assume that
3330 		 * all we can get is the basic Leaf 1 HTT information.
3331 		 */
3332 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3333 			cpi->cpi_ncore_per_chip = 1;
3334 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3335 		}
3336 		break;
3337 	}
3338 
3339 	/*
3340 	 * Based on the calculated number of threads and cores, potentially
3341 	 * assign the HTT and CMP features.
3342 	 */
3343 	if (cpi->cpi_ncore_per_chip > 1) {
3344 		add_x86_feature(featureset, X86FSET_CMP);
3345 	}
3346 
3347 	if (cpi->cpi_ncpu_per_chip > 1 &&
3348 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3349 		add_x86_feature(featureset, X86FSET_HTT);
3350 	}
3351 
3352 	/*
3353 	 * Now that has been set up, we need to go through and calculate all of
3354 	 * the rest of the parameters that exist. If we think the CPU doesn't
3355 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3356 	 * up information in some way. The most likely case for this is
3357 	 * virtualization where we have a lot of partial topology information.
3358 	 */
3359 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3360 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3361 		/*
3362 		 * This is a single core, single-threaded processor.
3363 		 */
3364 		cpi->cpi_procnodes_per_pkg = 1;
3365 		cpi->cpi_cores_per_compunit = 1;
3366 		cpi->cpi_compunitid = 0;
3367 		cpi->cpi_chipid = -1;
3368 		cpi->cpi_clogid = 0;
3369 		cpi->cpi_coreid = cpu->cpu_id;
3370 		cpi->cpi_pkgcoreid = 0;
3371 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3372 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3373 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3374 		} else {
3375 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3376 		}
3377 	} else {
3378 		switch (cpi->cpi_vendor) {
3379 		case X86_VENDOR_Intel:
3380 			cpuid_intel_getids(cpu, featureset);
3381 			break;
3382 		case X86_VENDOR_AMD:
3383 		case X86_VENDOR_HYGON:
3384 			cpuid_amd_getids(cpu, featureset);
3385 			break;
3386 		default:
3387 			/*
3388 			 * In this case, it's hard to say what we should do.
3389 			 * We're going to model them to the OS as single core
3390 			 * threads. We don't have a good identifier for them, so
3391 			 * we're just going to use the cpu id all on a single
3392 			 * chip.
3393 			 *
3394 			 * This case has historically been different from the
3395 			 * case above where we don't have HTT or CMP. While they
3396 			 * could be combined, we've opted to keep it separate to
3397 			 * minimize the risk of topology changes in weird cases.
3398 			 */
3399 			cpi->cpi_procnodes_per_pkg = 1;
3400 			cpi->cpi_cores_per_compunit = 1;
3401 			cpi->cpi_chipid = 0;
3402 			cpi->cpi_coreid = cpu->cpu_id;
3403 			cpi->cpi_clogid = cpu->cpu_id;
3404 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3405 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3406 			cpi->cpi_compunitid = cpi->cpi_coreid;
3407 			break;
3408 		}
3409 	}
3410 }
3411 
3412 /*
3413  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3414  * always gather leaf 6 if it's supported; however, we only look for features on
3415  * Intel systems as AMD does not currently define any of the features we look
3416  * for below.
3417  */
3418 static void
3419 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3420 {
3421 	struct cpuid_regs *cp;
3422 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3423 
3424 	if (cpi->cpi_maxeax < 6) {
3425 		return;
3426 	}
3427 
3428 	cp = &cpi->cpi_std[6];
3429 	cp->cp_eax = 6;
3430 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3431 	(void) __cpuid_insn(cp);
3432 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3433 
3434 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3435 		return;
3436 	}
3437 
3438 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3439 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3440 	}
3441 
3442 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3443 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3444 	}
3445 }
3446 
3447 /*
3448  * This is used when we discover that we have AVX support in cpuid. This
3449  * proceeds to scan for the rest of the AVX derived features.
3450  */
3451 static void
3452 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3453 {
3454 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3455 
3456 	/*
3457 	 * If we don't have AVX, don't bother with most of this.
3458 	 */
3459 	if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3460 		return;
3461 
3462 	add_x86_feature(featureset, X86FSET_AVX);
3463 
3464 	/*
3465 	 * Intel says we can't check these without also
3466 	 * checking AVX.
3467 	 */
3468 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3469 		add_x86_feature(featureset, X86FSET_F16C);
3470 
3471 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3472 		add_x86_feature(featureset, X86FSET_FMA);
3473 
3474 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3475 		add_x86_feature(featureset, X86FSET_BMI1);
3476 
3477 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3478 		add_x86_feature(featureset, X86FSET_BMI2);
3479 
3480 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3481 		add_x86_feature(featureset, X86FSET_AVX2);
3482 
3483 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3484 		add_x86_feature(featureset, X86FSET_VAES);
3485 
3486 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3487 		add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3488 
3489 	/*
3490 	 * The rest of the AVX features require AVX512. Do not check them unless
3491 	 * it is present.
3492 	 */
3493 	if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3494 		return;
3495 	add_x86_feature(featureset, X86FSET_AVX512F);
3496 
3497 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3498 		add_x86_feature(featureset, X86FSET_AVX512DQ);
3499 
3500 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3501 		add_x86_feature(featureset, X86FSET_AVX512FMA);
3502 
3503 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3504 		add_x86_feature(featureset, X86FSET_AVX512PF);
3505 
3506 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3507 		add_x86_feature(featureset, X86FSET_AVX512ER);
3508 
3509 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3510 		add_x86_feature(featureset, X86FSET_AVX512CD);
3511 
3512 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3513 		add_x86_feature(featureset, X86FSET_AVX512BW);
3514 
3515 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3516 		add_x86_feature(featureset, X86FSET_AVX512VL);
3517 
3518 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3519 		add_x86_feature(featureset, X86FSET_AVX512VBMI);
3520 
3521 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3522 		add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3523 
3524 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3525 		add_x86_feature(featureset, X86FSET_AVX512VNNI);
3526 
3527 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3528 		add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3529 
3530 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3531 		add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3532 
3533 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3534 		add_x86_feature(featureset, X86FSET_AVX512NNIW);
3535 
3536 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3537 		add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3538 
3539 	/*
3540 	 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3541 	 * we don't need to.
3542 	 */
3543 	if (cpi->cpi_std[7].cp_eax < 1)
3544 		return;
3545 
3546 	if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3547 		add_x86_feature(featureset, X86FSET_AVX512_BF16);
3548 }
3549 
3550 /*
3551  * PPIN is the protected processor inventory number. On AMD this is an actual
3552  * feature bit. However, on Intel systems we need to read the platform
3553  * information MSR if we're on a specific model.
3554  */
3555 #if !defined(__xpv)
3556 static void
3557 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3558 {
3559 	on_trap_data_t otd;
3560 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3561 
3562 	switch (cpi->cpi_vendor) {
3563 	case X86_VENDOR_AMD:
3564 		/*
3565 		 * This leaf will have already been gathered in the topology
3566 		 * functions.
3567 		 */
3568 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3569 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3570 				add_x86_feature(featureset, X86FSET_PPIN);
3571 			}
3572 		}
3573 		break;
3574 	case X86_VENDOR_Intel:
3575 		if (cpi->cpi_family != 6)
3576 			break;
3577 		switch (cpi->cpi_model) {
3578 		case INTC_MODEL_IVYBRIDGE_XEON:
3579 		case INTC_MODEL_HASWELL_XEON:
3580 		case INTC_MODEL_BROADWELL_XEON:
3581 		case INTC_MODEL_BROADWELL_XEON_D:
3582 		case INTC_MODEL_SKYLAKE_XEON:
3583 		case INTC_MODEL_ICELAKE_XEON:
3584 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3585 				uint64_t value;
3586 
3587 				value = rdmsr(MSR_PLATFORM_INFO);
3588 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3589 					add_x86_feature(featureset,
3590 					    X86FSET_PPIN);
3591 				}
3592 			}
3593 			no_trap();
3594 			break;
3595 		default:
3596 			break;
3597 		}
3598 		break;
3599 	default:
3600 		break;
3601 	}
3602 }
3603 #endif	/* ! __xpv */
3604 
3605 static void
3606 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3607 {
3608 	uchar_t *featureset = (uchar_t *)arg;
3609 
3610 	/*
3611 	 * We don't run on any processor that doesn't have cpuid, and could not
3612 	 * possibly have arrived here.
3613 	 */
3614 	add_x86_feature(featureset, X86FSET_CPUID);
3615 }
3616 
3617 static void
3618 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3619 {
3620 	struct cpuid_info *cpi;
3621 	struct cpuid_regs *cp;
3622 
3623 	/*
3624 	 * We require that virtual/native detection be complete and that PCI
3625 	 * config space access has been set up; at present there is no reliable
3626 	 * way to determine the latter.
3627 	 */
3628 #if !defined(__xpv)
3629 	ASSERT3S(platform_type, !=, -1);
3630 #endif	/* !__xpv */
3631 
3632 	cpi = cpu->cpu_m.mcpu_cpi;
3633 	ASSERT(cpi != NULL);
3634 
3635 	cp = &cpi->cpi_std[0];
3636 	cp->cp_eax = 0;
3637 	cpi->cpi_maxeax = __cpuid_insn(cp);
3638 	{
3639 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3640 		*iptr++ = cp->cp_ebx;
3641 		*iptr++ = cp->cp_edx;
3642 		*iptr++ = cp->cp_ecx;
3643 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3644 	}
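	/*
	 * For illustration: a vendor string such as "GenuineIntel" comes back
	 * split across the registers as %ebx = "Genu", %edx = "ineI" and
	 * %ecx = "ntel", which is why they are copied in that order above.
	 */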
3645 
3646 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3647 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3648 
3649 	/*
3650 	 * Limit the range in case of weird hardware
3651 	 */
3652 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3653 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3654 	if (cpi->cpi_maxeax < 1)
3655 		return;
3656 
3657 	cp = &cpi->cpi_std[1];
3658 	cp->cp_eax = 1;
3659 	(void) __cpuid_insn(cp);
3660 
3661 	/*
3662 	 * Extract identifying constants for easy access.
3663 	 */
3664 	cpi->cpi_model = CPI_MODEL(cpi);
3665 	cpi->cpi_family = CPI_FAMILY(cpi);
3666 
3667 	if (cpi->cpi_family == 0xf)
3668 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3669 
3670 	/*
3671 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3672 	 * Intel, and presumably everyone else, uses model == 0xf, as
3673 	 * one would expect (max value means possible overflow).  Sigh.
3674 	 */
3675 
3676 	switch (cpi->cpi_vendor) {
3677 	case X86_VENDOR_Intel:
3678 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3679 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3680 		break;
3681 	case X86_VENDOR_AMD:
3682 		if (CPI_FAMILY(cpi) == 0xf)
3683 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3684 		break;
3685 	case X86_VENDOR_HYGON:
3686 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3687 		break;
3688 	default:
3689 		if (cpi->cpi_model == 0xf)
3690 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3691 		break;
3692 	}
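	/*
	 * For illustration, with hypothetical values: a part reporting a base
	 * family of 0xf with an extended family of 0x8, and a base model of
	 * 0x1 with an extended model of 0x7, decodes above to cpi_family =
	 * 0x17 and cpi_model = 0x71.
	 */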
3693 
3694 	cpi->cpi_step = CPI_STEP(cpi);
3695 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3696 
3697 	/*
3698 	 * Synthesize chip "revision" and socket type
3699 	 */
3700 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3701 	    cpi->cpi_model, cpi->cpi_step);
3702 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3703 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3704 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3705 	    cpi->cpi_model, cpi->cpi_step);
3706 	cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
3707 	    cpi->cpi_model, cpi->cpi_step);
3708 }
3709 
3710 static void
3711 cpuid_pass_basic(cpu_t *cpu, void *arg)
3712 {
3713 	uchar_t *featureset = (uchar_t *)arg;
3714 	uint32_t mask_ecx, mask_edx;
3715 	struct cpuid_info *cpi;
3716 	struct cpuid_regs *cp;
3717 	int xcpuid;
3718 #if !defined(__xpv)
3719 	extern int idle_cpu_prefer_mwait;
3720 #endif
3721 
3722 	cpi = cpu->cpu_m.mcpu_cpi;
3723 	ASSERT(cpi != NULL);
3724 
3725 	if (cpi->cpi_maxeax < 1)
3726 		return;
3727 
3728 	/*
3729 	 * This was filled during the identification pass.
3730 	 */
3731 	cp = &cpi->cpi_std[1];
3732 
3733 	/*
3734 	 * *default* assumptions:
3735 	 * - believe %edx feature word
3736 	 * - ignore %ecx feature word
3737 	 * - 32-bit virtual and physical addressing
3738 	 */
3739 	mask_edx = 0xffffffff;
3740 	mask_ecx = 0;
3741 
3742 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3743 
3744 	switch (cpi->cpi_vendor) {
3745 	case X86_VENDOR_Intel:
3746 		if (cpi->cpi_family == 5)
3747 			x86_type = X86_TYPE_P5;
3748 		else if (IS_LEGACY_P6(cpi)) {
3749 			x86_type = X86_TYPE_P6;
3750 			pentiumpro_bug4046376 = 1;
3751 			/*
3752 			 * Clear the SEP bit when it was set erroneously
3753 			 */
3754 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3755 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3756 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3757 			x86_type = X86_TYPE_P4;
3758 			/*
3759 			 * We don't currently depend on any of the %ecx
3760 			 * features until Prescott, so we'll only check
3761 			 * this from P4 onwards.  We might want to revisit
3762 			 * that idea later.
3763 			 */
3764 			mask_ecx = 0xffffffff;
3765 		} else if (cpi->cpi_family > 0xf)
3766 			mask_ecx = 0xffffffff;
3767 		/*
3768 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3769 		 * to obtain the monitor linesize.
3770 		 */
3771 		if (cpi->cpi_maxeax < 5)
3772 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3773 		break;
3774 	case X86_VENDOR_IntelClone:
3775 	default:
3776 		break;
3777 	case X86_VENDOR_AMD:
3778 #if defined(OPTERON_ERRATUM_108)
3779 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3780 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3781 			cpi->cpi_model = 0xc;
3782 		} else
3783 #endif
3784 		if (cpi->cpi_family == 5) {
3785 			/*
3786 			 * AMD K5 and K6
3787 			 *
3788 			 * These CPUs have an incomplete implementation
3789 			 * of MCA/MCE which we mask away.
3790 			 */
3791 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3792 
3793 			/*
3794 			 * Model 0 uses the wrong (APIC) bit
3795 			 * to indicate PGE.  Fix it here.
3796 			 */
3797 			if (cpi->cpi_model == 0) {
3798 				if (cp->cp_edx & 0x200) {
3799 					cp->cp_edx &= ~0x200;
3800 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3801 				}
3802 			}
3803 
3804 			/*
3805 			 * Early models had problems w/ MMX; disable.
3806 			 */
3807 			if (cpi->cpi_model < 6)
3808 				mask_edx &= ~CPUID_INTC_EDX_MMX;
3809 		}
3810 
3811 		/*
3812 		 * For newer families, SSE3 and CX16, at least, are valid;
3813 		 * enable all
3814 		 */
3815 		if (cpi->cpi_family >= 0xf)
3816 			mask_ecx = 0xffffffff;
3817 		/*
3818 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3819 		 * to obtain the monitor linesize.
3820 		 */
3821 		if (cpi->cpi_maxeax < 5)
3822 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3823 
3824 #if !defined(__xpv)
3825 		/*
3826 		 * AMD has not historically used MWAIT in the CPU's idle loop.
3827 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3828 		 * know for certain that in at least family 17h, per AMD, mwait
3829 		 * is preferred. Families in-between are less certain.
3830 		 */
3831 		if (cpi->cpi_family < 0x17) {
3832 			idle_cpu_prefer_mwait = 0;
3833 		}
3834 #endif
3835 
3836 		break;
3837 	case X86_VENDOR_HYGON:
3838 		/* Enable all for Hygon Dhyana CPU */
3839 		mask_ecx = 0xffffffff;
3840 		break;
3841 	case X86_VENDOR_TM:
3842 		/*
3843 		 * Work around the NT workaround in CMS 4.1
3844 		 */
3845 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3846 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3847 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3848 		break;
3849 	case X86_VENDOR_Centaur:
3850 		/*
3851 		 * Work around the NT workarounds again
3852 		 */
3853 		if (cpi->cpi_family == 6)
3854 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3855 		break;
3856 	case X86_VENDOR_Cyrix:
3857 		/*
3858 		 * We rely heavily on the probing in locore
3859 		 * to actually figure out what parts, if any,
3860 		 * of the Cyrix cpuid instruction to believe.
3861 		 */
3862 		switch (x86_type) {
3863 		case X86_TYPE_CYRIX_486:
3864 			mask_edx = 0;
3865 			break;
3866 		case X86_TYPE_CYRIX_6x86:
3867 			mask_edx = 0;
3868 			break;
3869 		case X86_TYPE_CYRIX_6x86L:
3870 			mask_edx =
3871 			    CPUID_INTC_EDX_DE |
3872 			    CPUID_INTC_EDX_CX8;
3873 			break;
3874 		case X86_TYPE_CYRIX_6x86MX:
3875 			mask_edx =
3876 			    CPUID_INTC_EDX_DE |
3877 			    CPUID_INTC_EDX_MSR |
3878 			    CPUID_INTC_EDX_CX8 |
3879 			    CPUID_INTC_EDX_PGE |
3880 			    CPUID_INTC_EDX_CMOV |
3881 			    CPUID_INTC_EDX_MMX;
3882 			break;
3883 		case X86_TYPE_CYRIX_GXm:
3884 			mask_edx =
3885 			    CPUID_INTC_EDX_MSR |
3886 			    CPUID_INTC_EDX_CX8 |
3887 			    CPUID_INTC_EDX_CMOV |
3888 			    CPUID_INTC_EDX_MMX;
3889 			break;
3890 		case X86_TYPE_CYRIX_MediaGX:
3891 			break;
3892 		case X86_TYPE_CYRIX_MII:
3893 		case X86_TYPE_VIA_CYRIX_III:
3894 			mask_edx =
3895 			    CPUID_INTC_EDX_DE |
3896 			    CPUID_INTC_EDX_TSC |
3897 			    CPUID_INTC_EDX_MSR |
3898 			    CPUID_INTC_EDX_CX8 |
3899 			    CPUID_INTC_EDX_PGE |
3900 			    CPUID_INTC_EDX_CMOV |
3901 			    CPUID_INTC_EDX_MMX;
3902 			break;
3903 		default:
3904 			break;
3905 		}
3906 		break;
3907 	}
3908 
3909 #if defined(__xpv)
3910 	/*
3911 	 * Do not support MONITOR/MWAIT under a hypervisor
3912 	 */
3913 	mask_ecx &= ~CPUID_INTC_ECX_MON;
3914 	/*
3915 	 * Do not support XSAVE under a hypervisor for now
3916 	 */
3917 	xsave_force_disable = B_TRUE;
3918 
3919 #endif	/* __xpv */
3920 
3921 	if (xsave_force_disable) {
3922 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3923 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
3924 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
3925 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
3926 	}
3927 
3928 	/*
3929 	 * Now that we've figured out the masks that determine
3930 	 * which bits we choose to believe, apply the masks
3931 	 * to the feature words, then map the kernel's view
3932 	 * of these feature words into its feature word.
3933 	 */
3934 	cp->cp_edx &= mask_edx;
3935 	cp->cp_ecx &= mask_ecx;
3936 
3937 	/*
3938 	 * apply any platform restrictions (we don't call this
3939 	 * immediately after __cpuid_insn here, because we need the
3940 	 * workarounds applied above first)
3941 	 */
3942 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3943 
3944 	/*
3945 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3946 	 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
3947 	 * 7 has sub-leaves determined by ecx.
3948 	 */
3949 	if (cpi->cpi_maxeax >= 7) {
3950 		struct cpuid_regs *ecp;
3951 		ecp = &cpi->cpi_std[7];
3952 		ecp->cp_eax = 7;
3953 		ecp->cp_ecx = 0;
3954 		(void) __cpuid_insn(ecp);
3955 
3956 		/*
3957 		 * If XSAVE has been disabled, just ignore all of the
3958 		 * extended-save-area dependent flags here. Removing most of
3959 		 * the leaf 7, sub-leaf 0 flags ensures that we don't
3960 		 * end up looking at additional xsave dependent leaves right
3961 		 * now.
3962 		 */
3963 		if (xsave_force_disable) {
3964 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3965 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3966 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3967 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3968 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3969 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3970 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3971 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
3972 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
3973 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
3974 		}
3975 
3976 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3977 			add_x86_feature(featureset, X86FSET_SMEP);
3978 
3979 		/*
3980 		 * We check disable_smap here in addition to in startup_smap()
3981 		 * to ensure CPUs that aren't the boot CPU don't accidentally
3982 		 * include it in the feature set and thus generate a mismatched
3983 		 * x86 feature set across CPUs.
3984 		 */
3985 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3986 		    disable_smap == 0)
3987 			add_x86_feature(featureset, X86FSET_SMAP);
3988 
3989 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3990 			add_x86_feature(featureset, X86FSET_RDSEED);
3991 
3992 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3993 			add_x86_feature(featureset, X86FSET_ADX);
3994 
3995 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3996 			add_x86_feature(featureset, X86FSET_FSGSBASE);
3997 
3998 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3999 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4000 
4001 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4002 			add_x86_feature(featureset, X86FSET_INVPCID);
4003 
4004 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4005 			add_x86_feature(featureset, X86FSET_UMIP);
4006 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4007 			add_x86_feature(featureset, X86FSET_PKU);
4008 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4009 			add_x86_feature(featureset, X86FSET_OSPKE);
4010 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4011 			add_x86_feature(featureset, X86FSET_GFNI);
4012 
4013 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4014 			add_x86_feature(featureset, X86FSET_CLWB);
4015 
4016 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4017 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4018 				add_x86_feature(featureset, X86FSET_MPX);
4019 		}
4020 
4021 		/*
4022 		 * If we have subleaf 1 available, grab and store that. This is
4023 		 * used for more AVX and related features.
4024 		 */
4025 		if (ecp->cp_eax >= 1) {
4026 			struct cpuid_regs *c71;
4027 			c71 = &cpi->cpi_sub7[0];
4028 			c71->cp_eax = 7;
4029 			c71->cp_ecx = 1;
4030 			(void) __cpuid_insn(c71);
4031 		}
4032 	}
4033 
4034 	/*
4035 	 * fold in overrides from the "eeprom" mechanism
4036 	 */
4037 	cp->cp_edx |= cpuid_feature_edx_include;
4038 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
4039 
4040 	cp->cp_ecx |= cpuid_feature_ecx_include;
4041 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4042 
4043 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4044 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
4045 	}
4046 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4047 		add_x86_feature(featureset, X86FSET_TSC);
4048 	}
4049 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4050 		add_x86_feature(featureset, X86FSET_MSR);
4051 	}
4052 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4053 		add_x86_feature(featureset, X86FSET_MTRR);
4054 	}
4055 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4056 		add_x86_feature(featureset, X86FSET_PGE);
4057 	}
4058 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4059 		add_x86_feature(featureset, X86FSET_CMOV);
4060 	}
4061 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4062 		add_x86_feature(featureset, X86FSET_MMX);
4063 	}
4064 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4065 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4066 		add_x86_feature(featureset, X86FSET_MCA);
4067 	}
4068 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4069 		add_x86_feature(featureset, X86FSET_PAE);
4070 	}
4071 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4072 		add_x86_feature(featureset, X86FSET_CX8);
4073 	}
4074 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4075 		add_x86_feature(featureset, X86FSET_CX16);
4076 	}
4077 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4078 		add_x86_feature(featureset, X86FSET_PAT);
4079 	}
4080 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4081 		add_x86_feature(featureset, X86FSET_SEP);
4082 	}
4083 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4084 		/*
4085 		 * In our implementation, fxsave/fxrstor
4086 		 * are prerequisites before we'll even
4087 		 * try and do SSE things.
4088 		 */
4089 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4090 			add_x86_feature(featureset, X86FSET_SSE);
4091 		}
4092 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4093 			add_x86_feature(featureset, X86FSET_SSE2);
4094 		}
4095 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4096 			add_x86_feature(featureset, X86FSET_SSE3);
4097 		}
4098 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4099 			add_x86_feature(featureset, X86FSET_SSSE3);
4100 		}
4101 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4102 			add_x86_feature(featureset, X86FSET_SSE4_1);
4103 		}
4104 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4105 			add_x86_feature(featureset, X86FSET_SSE4_2);
4106 		}
4107 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4108 			add_x86_feature(featureset, X86FSET_AES);
4109 		}
4110 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4111 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4112 		}
4113 
4114 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4115 			add_x86_feature(featureset, X86FSET_SHA);
4116 
4117 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4118 			add_x86_feature(featureset, X86FSET_XSAVE);
4119 
4120 			/* We only test AVX & AVX512 when there is XSAVE */
4121 			cpuid_basic_avx(cpu, featureset);
4122 		}
4123 	}
4124 
4125 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4126 		add_x86_feature(featureset, X86FSET_PCID);
4127 	}
4128 
4129 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4130 		add_x86_feature(featureset, X86FSET_X2APIC);
4131 	}
4132 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4133 		add_x86_feature(featureset, X86FSET_DE);
4134 	}
4135 #if !defined(__xpv)
4136 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4137 
4138 		/*
4139 		 * We require the CLFLUSH instruction for the erratum
4140 		 * workaround needed to use MONITOR/MWAIT.
4141 		 */
4142 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4143 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4144 			add_x86_feature(featureset, X86FSET_MWAIT);
4145 		} else {
4146 			extern int idle_cpu_assert_cflush_monitor;
4147 
4148 			/*
4149 			 * All processors we are aware of which have
4150 			 * MONITOR/MWAIT also have CLFLUSH.
4151 			 */
4152 			if (idle_cpu_assert_cflush_monitor) {
4153 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4154 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4155 			}
4156 		}
4157 	}
4158 #endif	/* __xpv */
4159 
4160 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4161 		add_x86_feature(featureset, X86FSET_VMX);
4162 	}
4163 
4164 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4165 		add_x86_feature(featureset, X86FSET_RDRAND);
4166 
4167 	/*
4168 	 * We only need to capture this the first time; the rest of the CPUs
4169 	 * follow suit, so effectively only the boot CPU's value is used.
4170 	 */
4171 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
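		/*
		 * %ebx[15:8] of leaf 1 gives the CLFLUSH line size in 8-byte
		 * units, so for example the common value of 8 yields an
		 * x86_clflush_size of 64 bytes below.
		 */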
4172 		add_x86_feature(featureset, X86FSET_CLFSH);
4173 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4174 	}
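	/*
	 * 36 bits is just the classic PAE baseline guess; if the extended
	 * leaf 0x80000008 is available further below, cpi_pabits and
	 * cpi_vabits are overridden with the CPU-reported limits.
	 */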
4175 	if (is_x86_feature(featureset, X86FSET_PAE))
4176 		cpi->cpi_pabits = 36;
4177 
4178 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4179 		struct cpuid_regs r, *ecp;
4180 
4181 		ecp = &r;
4182 		ecp->cp_eax = 0xD;
4183 		ecp->cp_ecx = 1;
4184 		ecp->cp_edx = ecp->cp_ebx = 0;
4185 		(void) __cpuid_insn(ecp);
4186 
4187 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4188 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4189 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4190 			add_x86_feature(featureset, X86FSET_XSAVEC);
4191 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4192 			add_x86_feature(featureset, X86FSET_XSAVES);
4193 
4194 		/*
4195 		 * Zen 2 family processors suffer from erratum 1386 that causes
4196 		 * xsaves to not function correctly in some circumstances. There
4197 		 * are no supervisor states in Zen 2 and earlier. Practically
4198 		 * speaking this has no impact for us as we currently do not
4199 		 * leverage compressed xsave formats. To safeguard against
4200 		 * issues in the future where we may opt to use it, we remove
4201 		 * it from the feature set now. While Matisse has a microcode
4202 		 * update available with a fix, not all Zen 2 CPUs do, so it's
4203 		 * simpler for the moment to unconditionally remove it.
4204 		 */
4205 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4206 		    uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4207 			remove_x86_feature(featureset, X86FSET_XSAVES);
4208 		}
4209 	}
4210 
4211 	/*
4212 	 * Work on the "extended" feature information, doing
4213 	 * some basic initialization to be used in the extended pass.
4214 	 */
4215 	xcpuid = 0;
4216 	switch (cpi->cpi_vendor) {
4217 	case X86_VENDOR_Intel:
4218 		/*
4219 		 * On KVM we know we will have proper support for extended
4220 		 * cpuid.
4221 		 */
4222 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4223 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4224 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4225 			xcpuid++;
4226 		break;
4227 	case X86_VENDOR_AMD:
4228 		if (cpi->cpi_family > 5 ||
4229 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4230 			xcpuid++;
4231 		break;
4232 	case X86_VENDOR_Cyrix:
4233 		/*
4234 		 * Only these Cyrix CPUs are -known- to support
4235 		 * extended cpuid operations.
4236 		 */
4237 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4238 		    x86_type == X86_TYPE_CYRIX_GXm)
4239 			xcpuid++;
4240 		break;
4241 	case X86_VENDOR_HYGON:
4242 	case X86_VENDOR_Centaur:
4243 	case X86_VENDOR_TM:
4244 	default:
4245 		xcpuid++;
4246 		break;
4247 	}
4248 
4249 	if (xcpuid) {
4250 		cp = &cpi->cpi_extd[0];
4251 		cp->cp_eax = CPUID_LEAF_EXT_0;
4252 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4253 	}
4254 
4255 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4256 
4257 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4258 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4259 
4260 		switch (cpi->cpi_vendor) {
4261 		case X86_VENDOR_Intel:
4262 		case X86_VENDOR_AMD:
4263 		case X86_VENDOR_HYGON:
4264 			if (cpi->cpi_xmaxeax < 0x80000001)
4265 				break;
4266 			cp = &cpi->cpi_extd[1];
4267 			cp->cp_eax = 0x80000001;
4268 			(void) __cpuid_insn(cp);
4269 
4270 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4271 			    cpi->cpi_family == 5 &&
4272 			    cpi->cpi_model == 6 &&
4273 			    cpi->cpi_step == 6) {
4274 				/*
4275 				 * K6 model 6 uses bit 10 to indicate SYSC.
4276 				 * Later models use bit 11. Fix it here.
4277 				 */
4278 				if (cp->cp_edx & 0x400) {
4279 					cp->cp_edx &= ~0x400;
4280 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4281 				}
4282 			}
4283 
4284 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4285 
4286 			/*
4287 			 * Compute the additions to the kernel's feature word.
4288 			 */
4289 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4290 				add_x86_feature(featureset, X86FSET_NX);
4291 			}
4292 
4293 			/*
4294 			 * Regardless of whether or not we boot 64-bit,
4295 			 * we should have a way to identify whether
4296 			 * the CPU is capable of running 64-bit.
4297 			 */
4298 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4299 				add_x86_feature(featureset, X86FSET_64);
4300 			}
4301 
4302 			/* 1 GB large page - enable only for 64 bit kernel */
4303 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4304 				add_x86_feature(featureset, X86FSET_1GPG);
4305 			}
4306 
4307 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4308 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4309 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4310 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4311 				add_x86_feature(featureset, X86FSET_SSE4A);
4312 			}
4313 
4314 			/*
4315 			 * It's really tricky to support syscall/sysret in
4316 			 * the i386 kernel; we rely on sysenter/sysexit
4317 			 * instead.  In the amd64 kernel, things are -way-
4318 			 * better.
4319 			 */
4320 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4321 				add_x86_feature(featureset, X86FSET_ASYSC);
4322 			}
4323 
4324 			/*
4325 			 * While we're thinking about system calls, note
4326 			 * that AMD processors don't support sysenter
4327 			 * in long mode at all, so don't try to program them.
4328 			 */
4329 			if (x86_vendor == X86_VENDOR_AMD ||
4330 			    x86_vendor == X86_VENDOR_HYGON) {
4331 				remove_x86_feature(featureset, X86FSET_SEP);
4332 			}
4333 
4334 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4335 				add_x86_feature(featureset, X86FSET_TSCP);
4336 			}
4337 
4338 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4339 				add_x86_feature(featureset, X86FSET_SVM);
4340 			}
4341 
4342 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4343 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4344 			}
4345 
4346 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4347 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4348 			}
4349 
4350 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4351 				add_x86_feature(featureset, X86FSET_XOP);
4352 			}
4353 
4354 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4355 				add_x86_feature(featureset, X86FSET_FMA4);
4356 			}
4357 
4358 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4359 				add_x86_feature(featureset, X86FSET_TBM);
4360 			}
4361 
4362 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4363 				add_x86_feature(featureset, X86FSET_MONITORX);
4364 			}
4365 			break;
4366 		default:
4367 			break;
4368 		}
4369 
4370 		/*
4371 		 * Get CPUID data about processor cores and hyperthreads.
4372 		 */
4373 		switch (cpi->cpi_vendor) {
4374 		case X86_VENDOR_Intel:
4375 			if (cpi->cpi_maxeax >= 4) {
4376 				cp = &cpi->cpi_std[4];
4377 				cp->cp_eax = 4;
4378 				cp->cp_ecx = 0;
4379 				(void) __cpuid_insn(cp);
4380 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4381 			}
4382 			/*FALLTHROUGH*/
4383 		case X86_VENDOR_AMD:
4384 		case X86_VENDOR_HYGON:
4385 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4386 				break;
4387 			cp = &cpi->cpi_extd[8];
4388 			cp->cp_eax = CPUID_LEAF_EXT_8;
4389 			(void) __cpuid_insn(cp);
4390 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4391 			    cp);
4392 
4393 			/*
4394 			 * AMD uses ebx for some extended functions.
4395 			 */
4396 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4397 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4398 				/*
4399 				 * While we're here, check for the AMD "Error
4400 				 * Pointer Zero/Restore" feature. This can be
4401 				 * used to set up the FP save handlers
4402 				 * appropriately.
4403 				 */
4404 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4405 					cpi->cpi_fp_amd_save = 0;
4406 				} else {
4407 					cpi->cpi_fp_amd_save = 1;
4408 				}
4409 
4410 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4411 					add_x86_feature(featureset,
4412 					    X86FSET_CLZERO);
4413 				}
4414 			}
4415 
4416 			/*
4417 			 * Virtual and physical address limits from
4418 			 * cpuid override previously guessed values.
4419 			 */
4420 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4421 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4422 			break;
4423 		default:
4424 			break;
4425 		}
4426 
4427 		/*
4428 		 * Get CPUID data about TSC Invariance in Deep C-State.
4429 		 */
4430 		switch (cpi->cpi_vendor) {
4431 		case X86_VENDOR_Intel:
4432 		case X86_VENDOR_AMD:
4433 		case X86_VENDOR_HYGON:
4434 			if (cpi->cpi_maxeax >= 7) {
4435 				cp = &cpi->cpi_extd[7];
4436 				cp->cp_eax = 0x80000007;
4437 				cp->cp_ecx = 0;
4438 				(void) __cpuid_insn(cp);
4439 			}
4440 			break;
4441 		default:
4442 			break;
4443 		}
4444 	}
4445 
4446 	/*
4447 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4448 	 * run and thus gathered some of its dependent leaves.
4449 	 */
4450 	cpuid_basic_topology(cpu, featureset);
4451 	cpuid_basic_thermal(cpu, featureset);
4452 #if !defined(__xpv)
4453 	cpuid_basic_ppin(cpu, featureset);
4454 #endif
4455 
4456 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4457 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4458 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4459 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4460 			/* Special handling for AMD FP not necessary. */
4461 			cpi->cpi_fp_amd_save = 0;
4462 		} else {
4463 			cpi->cpi_fp_amd_save = 1;
4464 		}
4465 	}
4466 
4467 	/*
4468 	 * Check (and potentially set) if lfence is serializing.
4469 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4470 	 */
4471 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4472 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4473 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4474 		/*
4475 		 * The AMD white paper Software Techniques For Managing
4476 		 * Speculation on AMD Processors details circumstances for when
4477 		 * lfence instructions are serializing.
4478 		 *
4479 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4480 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4481 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4482 		 * committed to supporting that MSR on all later CPUs.
4483 		 */
4484 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4485 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4486 		} else if (cpi->cpi_family >= 0x10) {
4487 #if !defined(__xpv)
4488 			uint64_t val;
4489 
4490 			/*
4491 			 * Be careful when attempting to enable the bit, and
4492 			 * verify that it was actually set in case we are
4493 			 * running in a hypervisor which is less than faithful
4494 			 * about its emulation of this feature.
4495 			 */
4496 			on_trap_data_t otd;
4497 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4498 				val = rdmsr(MSR_AMD_DE_CFG);
4499 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4500 				wrmsr(MSR_AMD_DE_CFG, val);
4501 				val = rdmsr(MSR_AMD_DE_CFG);
4502 			} else {
4503 				val = 0;
4504 			}
4505 			no_trap();
4506 
4507 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4508 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4509 			}
4510 #endif
4511 		}
4512 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4513 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4514 		/*
4515 		 * Documentation and other OSes indicate that lfence is always
4516 		 * serializing on Intel CPUs.
4517 		 */
4518 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4519 	}
4520 
4521 
4522 	/*
4523 	 * Check the processor leaves that are used for security features. Grab
4524 	 * any additional processor-specific leaves that we may not have yet.
4525 	 */
4526 	switch (cpi->cpi_vendor) {
4527 	case X86_VENDOR_AMD:
4528 	case X86_VENDOR_HYGON:
4529 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4530 			cp = &cpi->cpi_extd[7];
4531 			cp->cp_eax = CPUID_LEAF_EXT_21;
4532 			cp->cp_ecx = 0;
4533 			(void) __cpuid_insn(cp);
4534 		}
4535 		break;
4536 	default:
4537 		break;
4538 	}
4539 
4540 	cpuid_scan_security(cpu, featureset);
4541 }
4542 
4543 /*
4544  * Make copies of the cpuid table entries we depend on, in
4545  * part for ease of parsing now, in part so that we have only
4546  * one place to correct any of it, in part for ease of
4547  * later export to userland, and in part so we can look at
4548  * this stuff in a crash dump.
4549  */
4550 
4551 static void
4552 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4553 {
4554 	uint_t n, nmax;
4555 	int i;
4556 	struct cpuid_regs *cp;
4557 	uint8_t *dp;
4558 	uint32_t *iptr;
4559 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4560 
4561 	if (cpi->cpi_maxeax < 1)
4562 		return;
4563 
4564 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4565 		nmax = NMAX_CPI_STD;
4566 	/*
4567 	 * (We already handled n == 0 and n == 1 in the basic pass)
4568 	 */
4569 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4570 		/*
4571 		 * leaves 6 and 7 were handled in the basic pass
4572 		 */
4573 		if (n == 6 || n == 7)
4574 			continue;
4575 
4576 		cp->cp_eax = n;
4577 
4578 		/*
4579 		 * CPUID function 4 expects %ecx to be initialized
4580 		 * with an index which indicates which cache to return
4581 		 * information about. The OS is expected to call function 4
4582 		 * with %ecx set to 0, 1, 2, ... until it returns with
4583 		 * EAX[4:0] set to 0, which indicates there are no more
4584 		 * caches.
4585 		 *
4586 		 * Here, populate cpi_std[4] with the information returned by
4587 		 * function 4 when %ecx == 0, and do the rest in a later pass
4588 		 * when dynamic memory allocation becomes available.
4589 		 *
4590 		 * Note: we need to explicitly initialize %ecx here, since
4591 		 * function 4 may have been previously invoked.
4592 		 */
4593 		if (n == 4)
4594 			cp->cp_ecx = 0;
4595 
4596 		(void) __cpuid_insn(cp);
4597 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4598 		switch (n) {
4599 		case 2:
4600 			/*
4601 			 * "the lower 8 bits of the %eax register
4602 			 * contain a value that identifies the number
4603 			 * of times the cpuid [instruction] has to be
4604 			 * executed to obtain a complete image of the
4605 			 * processor's caching systems."
4606 			 *
4607 			 * How *do* they make this stuff up?
4608 			 */
4609 			cpi->cpi_ncache = sizeof (*cp) *
4610 			    BITX(cp->cp_eax, 7, 0);
4611 			if (cpi->cpi_ncache == 0)
4612 				break;
4613 			cpi->cpi_ncache--;	/* skip count byte */
4614 
4615 			/*
4616 			 * Well, for now, rather than attempt to implement
4617 			 * this slightly dubious algorithm, we just look
4618 			 * at the first 15 ..
4619 			 */
4620 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4621 				cpi->cpi_ncache = sizeof (*cp) - 1;
4622 
4623 			dp = cpi->cpi_cacheinfo;
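			/*
			 * Each of %eax, %ebx, %ecx and %edx carries four
			 * one-byte cache/TLB descriptors; a register with
			 * bit 31 set holds no valid descriptors and is
			 * skipped, as are zero (null) descriptor bytes.
			 * Byte 0 of %eax is the iteration count stripped
			 * above, which is why that loop starts at i = 1.
			 */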
4624 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4625 				uint8_t *p = (void *)&cp->cp_eax;
4626 				for (i = 1; i < 4; i++)
4627 					if (p[i] != 0)
4628 						*dp++ = p[i];
4629 			}
4630 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4631 				uint8_t *p = (void *)&cp->cp_ebx;
4632 				for (i = 0; i < 4; i++)
4633 					if (p[i] != 0)
4634 						*dp++ = p[i];
4635 			}
4636 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4637 				uint8_t *p = (void *)&cp->cp_ecx;
4638 				for (i = 0; i < 4; i++)
4639 					if (p[i] != 0)
4640 						*dp++ = p[i];
4641 			}
4642 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4643 				uint8_t *p = (void *)&cp->cp_edx;
4644 				for (i = 0; i < 4; i++)
4645 					if (p[i] != 0)
4646 						*dp++ = p[i];
4647 			}
4648 			break;
4649 
4650 		case 3:	/* Processor serial number, if PSN supported */
4651 			break;
4652 
4653 		case 4:	/* Deterministic cache parameters */
4654 			break;
4655 
4656 		case 5:	/* Monitor/Mwait parameters */
4657 		{
4658 			size_t mwait_size;
4659 
4660 			/*
4661 			 * check cpi_mwait.support which was set in
4662 			 * cpuid_pass_basic()
4663 			 */
4664 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4665 				break;
4666 
4667 			/*
4668 			 * Protect ourselves from an insane mwait line size.
4669 			 * Workaround for incomplete hardware emulator(s).
4670 			 */
4671 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4672 			if (mwait_size < sizeof (uint32_t) ||
4673 			    !ISP2(mwait_size)) {
4674 #if DEBUG
4675 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4676 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4677 #endif
4678 				break;
4679 			}
4680 
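			/*
			 * Leaf 5 reports the smallest and largest monitor
			 * line sizes in %eax and %ebx respectively, while
			 * %ecx advertises the MONITOR/MWAIT extensions
			 * checked below.
			 */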
4681 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4682 			cpi->cpi_mwait.mon_max = mwait_size;
4683 			if (MWAIT_EXTENSION(cpi)) {
4684 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4685 				if (MWAIT_INT_ENABLE(cpi))
4686 					cpi->cpi_mwait.support |=
4687 					    MWAIT_ECX_INT_ENABLE;
4688 			}
4689 			break;
4690 		}
4691 		default:
4692 			break;
4693 		}
4694 	}
4695 
4696 	/*
4697 	 * XSAVE enumeration
4698 	 */
4699 	if (cpi->cpi_maxeax >= 0xD) {
4700 		struct cpuid_regs regs;
4701 		boolean_t cpuid_d_valid = B_TRUE;
4702 
4703 		cp = &regs;
4704 		cp->cp_eax = 0xD;
4705 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4706 
4707 		(void) __cpuid_insn(cp);
4708 
4709 		/*
4710 		 * Sanity checks for debug
4711 		 */
4712 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4713 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4714 			cpuid_d_valid = B_FALSE;
4715 		}
4716 
4717 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4718 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4719 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4720 
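		/*
		 * The sub-leaves of leaf 0xD queried below are indexed by
		 * XSAVE state component: 2 is the AVX ymm state, 3 and 4 are
		 * the MPX BNDREGS and BNDCSR state, and 5, 6 and 7 are the
		 * AVX-512 opmask, ZMM_Hi256 and Hi16_ZMM state. Each
		 * sub-leaf returns the component's size in %eax and its
		 * offset in the standard save area in %ebx.
		 */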
4721 		/*
4722 		 * If the hw supports AVX, get the size and offset in the save
4723 		 * area for the ymm state.
4724 		 */
4725 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4726 			cp->cp_eax = 0xD;
4727 			cp->cp_ecx = 2;
4728 			cp->cp_edx = cp->cp_ebx = 0;
4729 
4730 			(void) __cpuid_insn(cp);
4731 
4732 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4733 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4734 				cpuid_d_valid = B_FALSE;
4735 			}
4736 
4737 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4738 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4739 		}
4740 
4741 		/*
4742 		 * If the hw supports MPX, get the size and offset in the
4743 		 * save area for BNDREGS and BNDCSR.
4744 		 */
4745 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4746 			cp->cp_eax = 0xD;
4747 			cp->cp_ecx = 3;
4748 			cp->cp_edx = cp->cp_ebx = 0;
4749 
4750 			(void) __cpuid_insn(cp);
4751 
4752 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4753 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4754 
4755 			cp->cp_eax = 0xD;
4756 			cp->cp_ecx = 4;
4757 			cp->cp_edx = cp->cp_ebx = 0;
4758 
4759 			(void) __cpuid_insn(cp);
4760 
4761 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4762 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4763 		}
4764 
4765 		/*
4766 		 * If the hw supports AVX512, get the size and offset in the
4767 		 * save area for the opmask registers and zmm state.
4768 		 */
4769 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4770 			cp->cp_eax = 0xD;
4771 			cp->cp_ecx = 5;
4772 			cp->cp_edx = cp->cp_ebx = 0;
4773 
4774 			(void) __cpuid_insn(cp);
4775 
4776 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
4777 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4778 
4779 			cp->cp_eax = 0xD;
4780 			cp->cp_ecx = 6;
4781 			cp->cp_edx = cp->cp_ebx = 0;
4782 
4783 			(void) __cpuid_insn(cp);
4784 
4785 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4786 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4787 
4788 			cp->cp_eax = 0xD;
4789 			cp->cp_ecx = 7;
4790 			cp->cp_edx = cp->cp_ebx = 0;
4791 
4792 			(void) __cpuid_insn(cp);
4793 
4794 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4795 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4796 		}
4797 
4798 		if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4799 			xsave_state_size = 0;
4800 		} else if (cpuid_d_valid) {
4801 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4802 		} else {
4803 			/* Broken CPUID 0xD, probably in HVM */
4804 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4805 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4806 			    ", ymm_size = %d, ymm_offset = %d\n",
4807 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4808 			    cpi->cpi_xsave.xsav_hw_features_high,
4809 			    (int)cpi->cpi_xsave.xsav_max_size,
4810 			    (int)cpi->cpi_xsave.ymm_size,
4811 			    (int)cpi->cpi_xsave.ymm_offset);
4812 
4813 			if (xsave_state_size != 0) {
4814 				/*
4815 				 * This must be a non-boot CPU. We cannot
4816 				 * continue, because boot cpu has already
4817 				 * enabled XSAVE.
4818 				 */
4819 				ASSERT(cpu->cpu_id != 0);
4820 				cmn_err(CE_PANIC, "cpu%d: we have already "
4821 				    "enabled XSAVE on boot cpu, cannot "
4822 				    "continue.", cpu->cpu_id);
4823 			} else {
4824 				/*
4825 				 * If we reached here on the boot CPU, it's also
4826 				 * almost certain that we'll reach here on the
4827 				 * non-boot CPUs. When we're here on the boot CPU
4828 				 * we should disable the feature; on a non-boot
4829 				 * CPU we need to confirm that it has been.
4830 				 */
4831 				if (cpu->cpu_id == 0) {
4832 					remove_x86_feature(x86_featureset,
4833 					    X86FSET_XSAVE);
4834 					remove_x86_feature(x86_featureset,
4835 					    X86FSET_AVX);
4836 					remove_x86_feature(x86_featureset,
4837 					    X86FSET_F16C);
4838 					remove_x86_feature(x86_featureset,
4839 					    X86FSET_BMI1);
4840 					remove_x86_feature(x86_featureset,
4841 					    X86FSET_BMI2);
4842 					remove_x86_feature(x86_featureset,
4843 					    X86FSET_FMA);
4844 					remove_x86_feature(x86_featureset,
4845 					    X86FSET_AVX2);
4846 					remove_x86_feature(x86_featureset,
4847 					    X86FSET_MPX);
4848 					remove_x86_feature(x86_featureset,
4849 					    X86FSET_AVX512F);
4850 					remove_x86_feature(x86_featureset,
4851 					    X86FSET_AVX512DQ);
4852 					remove_x86_feature(x86_featureset,
4853 					    X86FSET_AVX512PF);
4854 					remove_x86_feature(x86_featureset,
4855 					    X86FSET_AVX512ER);
4856 					remove_x86_feature(x86_featureset,
4857 					    X86FSET_AVX512CD);
4858 					remove_x86_feature(x86_featureset,
4859 					    X86FSET_AVX512BW);
4860 					remove_x86_feature(x86_featureset,
4861 					    X86FSET_AVX512VL);
4862 					remove_x86_feature(x86_featureset,
4863 					    X86FSET_AVX512FMA);
4864 					remove_x86_feature(x86_featureset,
4865 					    X86FSET_AVX512VBMI);
4866 					remove_x86_feature(x86_featureset,
4867 					    X86FSET_AVX512VNNI);
4868 					remove_x86_feature(x86_featureset,
4869 					    X86FSET_AVX512VPOPCDQ);
4870 					remove_x86_feature(x86_featureset,
4871 					    X86FSET_AVX512NNIW);
4872 					remove_x86_feature(x86_featureset,
4873 					    X86FSET_AVX512FMAPS);
4874 					remove_x86_feature(x86_featureset,
4875 					    X86FSET_VAES);
4876 					remove_x86_feature(x86_featureset,
4877 					    X86FSET_VPCLMULQDQ);
4878 					remove_x86_feature(x86_featureset,
4879 					    X86FSET_GFNI);
4880 					remove_x86_feature(x86_featureset,
4881 					    X86FSET_AVX512_VP2INT);
4882 					remove_x86_feature(x86_featureset,
4883 					    X86FSET_AVX512_BITALG);
4884 					remove_x86_feature(x86_featureset,
4885 					    X86FSET_AVX512_VBMI2);
4886 					remove_x86_feature(x86_featureset,
4887 					    X86FSET_AVX512_BF16);
4888 
4889 					xsave_force_disable = B_TRUE;
4890 				} else {
4891 					VERIFY(is_x86_feature(x86_featureset,
4892 					    X86FSET_XSAVE) == B_FALSE);
4893 				}
4894 			}
4895 		}
4896 	}
4897 
4898 
4899 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4900 		return;
4901 
4902 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4903 		nmax = NMAX_CPI_EXTD;
4904 	/*
4905 	 * Copy the extended properties, fixing them as we go. We start at 2
4906 	 * because we've already handled a few cases in the basic pass; the
4907 	 * rest we simply grab again (e.g. 0x8, 0x21).
4908 	 */
4909 	iptr = (void *)cpi->cpi_brandstr;
4910 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4911 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4912 		(void) __cpuid_insn(cp);
4913 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4914 		    cp);
4915 		switch (n) {
4916 		case 2:
4917 		case 3:
4918 		case 4:
4919 			/*
4920 			 * Extract the brand string
4921 			 */
4922 			*iptr++ = cp->cp_eax;
4923 			*iptr++ = cp->cp_ebx;
4924 			*iptr++ = cp->cp_ecx;
4925 			*iptr++ = cp->cp_edx;
4926 			break;
4927 		case 5:
4928 			switch (cpi->cpi_vendor) {
4929 			case X86_VENDOR_AMD:
4930 				/*
4931 				 * The Athlon and Duron were the first
4932 				 * parts to report the sizes of the
4933 				 * TLB for large pages. Before then,
4934 				 * we don't trust the data.
4935 				 */
4936 				if (cpi->cpi_family < 6 ||
4937 				    (cpi->cpi_family == 6 &&
4938 				    cpi->cpi_model < 1))
4939 					cp->cp_eax = 0;
4940 				break;
4941 			default:
4942 				break;
4943 			}
4944 			break;
4945 		case 6:
4946 			switch (cpi->cpi_vendor) {
4947 			case X86_VENDOR_AMD:
4948 				/*
4949 				 * The Athlon and Duron were the first
4950 				 * AMD parts with L2 TLB's.
4951 				 * Before then, don't trust the data.
4952 				 */
4953 				if (cpi->cpi_family < 6 ||
4954 				    (cpi->cpi_family == 6 &&
4955 				    cpi->cpi_model < 1))
4956 					cp->cp_eax = cp->cp_ebx = 0;
4957 				/*
4958 				 * AMD Duron rev A0 reports L2
4959 				 * cache size incorrectly as 1K
4960 				 * when it is really 64K
4961 				 */
4962 				if (cpi->cpi_family == 6 &&
4963 				    cpi->cpi_model == 3 &&
4964 				    cpi->cpi_step == 0) {
4965 					cp->cp_ecx &= 0xffff;
4966 					cp->cp_ecx |= 0x400000;
4967 				}
4968 				break;
4969 			case X86_VENDOR_Cyrix:	/* VIA C3 */
4970 				/*
4971 				 * VIA C3 processors are a bit messed
4972 				 * up w.r.t. encoding cache sizes in %ecx
4973 				 */
4974 				if (cpi->cpi_family != 6)
4975 					break;
4976 				/*
4977 				 * model 7 and 8 were incorrectly encoded
4978 				 *
4979 				 * xxx is model 8 really broken?
4980 				 */
4981 				if (cpi->cpi_model == 7 ||
4982 				    cpi->cpi_model == 8)
4983 					cp->cp_ecx =
4984 					    BITX(cp->cp_ecx, 31, 24) << 16 |
4985 					    BITX(cp->cp_ecx, 23, 16) << 12 |
4986 					    BITX(cp->cp_ecx, 15, 8) << 8 |
4987 					    BITX(cp->cp_ecx, 7, 0);
4988 				/*
4989 				 * model 9 stepping 1 has wrong associativity
4990 				 */
4991 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4992 					cp->cp_ecx |= 8 << 12;
4993 				break;
4994 			case X86_VENDOR_Intel:
4995 				/*
4996 				 * Extended L2 Cache features function.
4997 				 * First appeared on Prescott.
4998 				 */
4999 			default:
5000 				break;
5001 			}
5002 			break;
5003 		default:
5004 			break;
5005 		}
5006 	}
5007 }
5008 
5009 static const char *
5010 intel_cpubrand(const struct cpuid_info *cpi)
5011 {
5012 	int i;
5013 
5014 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5015 
5016 	switch (cpi->cpi_family) {
5017 	case 5:
5018 		return ("Intel Pentium(r)");
5019 	case 6:
5020 		switch (cpi->cpi_model) {
5021 			uint_t celeron, xeon;
5022 			const struct cpuid_regs *cp;
5023 		case 0:
5024 		case 1:
5025 		case 2:
5026 			return ("Intel Pentium(r) Pro");
5027 		case 3:
5028 		case 4:
5029 			return ("Intel Pentium(r) II");
5030 		case 6:
5031 			return ("Intel Celeron(r)");
5032 		case 5:
5033 		case 7:
5034 			celeron = xeon = 0;
5035 			cp = &cpi->cpi_std[2];	/* cache info */
5036 
5037 			for (i = 1; i < 4; i++) {
5038 				uint_t tmp;
5039 
5040 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5041 				if (tmp == 0x40)
5042 					celeron++;
5043 				if (tmp >= 0x44 && tmp <= 0x45)
5044 					xeon++;
5045 			}
5046 
5047 			for (i = 0; i < 2; i++) {
5048 				uint_t tmp;
5049 
5050 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5051 				if (tmp == 0x40)
5052 					celeron++;
5053 				else if (tmp >= 0x44 && tmp <= 0x45)
5054 					xeon++;
5055 			}
5056 
5057 			for (i = 0; i < 4; i++) {
5058 				uint_t tmp;
5059 
5060 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5061 				if (tmp == 0x40)
5062 					celeron++;
5063 				else if (tmp >= 0x44 && tmp <= 0x45)
5064 					xeon++;
5065 			}
5066 
5067 			for (i = 0; i < 4; i++) {
5068 				uint_t tmp;
5069 
5070 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5071 				if (tmp == 0x40)
5072 					celeron++;
5073 				else if (tmp >= 0x44 && tmp <= 0x45)
5074 					xeon++;
5075 			}
5076 
5077 			if (celeron)
5078 				return ("Intel Celeron(r)");
5079 			if (xeon)
5080 				return (cpi->cpi_model == 5 ?
5081 				    "Intel Pentium(r) II Xeon(tm)" :
5082 				    "Intel Pentium(r) III Xeon(tm)");
5083 			return (cpi->cpi_model == 5 ?
5084 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5085 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5086 		default:
5087 			break;
5088 		}
5089 	default:
5090 		break;
5091 	}
5092 
5093 	/* BrandID is present if the field is nonzero */
5094 	if (cpi->cpi_brandid != 0) {
5095 		static const struct {
5096 			uint_t bt_bid;
5097 			const char *bt_str;
5098 		} brand_tbl[] = {
5099 			{ 0x1,	"Intel(r) Celeron(r)" },
5100 			{ 0x2,	"Intel(r) Pentium(r) III" },
5101 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
5102 			{ 0x4,	"Intel(r) Pentium(r) III" },
5103 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
5104 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
5105 			{ 0x8,	"Intel(r) Pentium(r) 4" },
5106 			{ 0x9,	"Intel(r) Pentium(r) 4" },
5107 			{ 0xa,	"Intel(r) Celeron(r)" },
5108 			{ 0xb,	"Intel(r) Xeon(tm)" },
5109 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
5110 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
5111 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
5112 			{ 0x11, "Mobile Genuine Intel(r)" },
5113 			{ 0x12, "Intel(r) Celeron(r) M" },
5114 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
5115 			{ 0x14, "Intel(r) Celeron(r)" },
5116 			{ 0x15, "Mobile Genuine Intel(r)" },
5117 			{ 0x16,	"Intel(r) Pentium(r) M" },
5118 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
5119 		};
5120 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5121 		uint_t sgn;
5122 
5123 		sgn = (cpi->cpi_family << 8) |
5124 		    (cpi->cpi_model << 4) | cpi->cpi_step;
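		/*
		 * e.g. family 6, model 0xb, stepping 1 packs into 0x6b1,
		 * the Celeron special case tested below.
		 */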
5125 
5126 		for (i = 0; i < btblmax; i++)
5127 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5128 				break;
5129 		if (i < btblmax) {
5130 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5131 				return ("Intel(r) Celeron(r)");
5132 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5133 				return ("Intel(r) Xeon(tm) MP");
5134 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5135 				return ("Intel(r) Xeon(tm)");
5136 			return (brand_tbl[i].bt_str);
5137 		}
5138 	}
5139 
5140 	return (NULL);
5141 }
5142 
5143 static const char *
5144 amd_cpubrand(const struct cpuid_info *cpi)
5145 {
5146 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5147 
5148 	switch (cpi->cpi_family) {
5149 	case 5:
5150 		switch (cpi->cpi_model) {
5151 		case 0:
5152 		case 1:
5153 		case 2:
5154 		case 3:
5155 		case 4:
5156 		case 5:
5157 			return ("AMD-K5(r)");
5158 		case 6:
5159 		case 7:
5160 			return ("AMD-K6(r)");
5161 		case 8:
5162 			return ("AMD-K6(r)-2");
5163 		case 9:
5164 			return ("AMD-K6(r)-III");
5165 		default:
5166 			return ("AMD (family 5)");
5167 		}
5168 	case 6:
5169 		switch (cpi->cpi_model) {
5170 		case 1:
5171 			return ("AMD-K7(tm)");
5172 		case 0:
5173 		case 2:
5174 		case 4:
5175 			return ("AMD Athlon(tm)");
5176 		case 3:
5177 		case 7:
5178 			return ("AMD Duron(tm)");
5179 		case 6:
5180 		case 8:
5181 		case 10:
5182 			/*
5183 			 * Use the L2 cache size to distinguish
5184 			 */
5185 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5186 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5187 		default:
5188 			return ("AMD (family 6)");
5189 		}
5190 	default:
5191 		break;
5192 	}
5193 
5194 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5195 	    cpi->cpi_brandid != 0) {
5196 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5197 		case 3:
5198 			return ("AMD Opteron(tm) UP 1xx");
5199 		case 4:
5200 			return ("AMD Opteron(tm) DP 2xx");
5201 		case 5:
5202 			return ("AMD Opteron(tm) MP 8xx");
5203 		default:
5204 			return ("AMD Opteron(tm)");
5205 		}
5206 	}
5207 
5208 	return (NULL);
5209 }
5210 
5211 static const char *
5212 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5213 {
5214 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5215 
5216 	switch (type) {
5217 	case X86_TYPE_CYRIX_6x86:
5218 		return ("Cyrix 6x86");
5219 	case X86_TYPE_CYRIX_6x86L:
5220 		return ("Cyrix 6x86L");
5221 	case X86_TYPE_CYRIX_6x86MX:
5222 		return ("Cyrix 6x86MX");
5223 	case X86_TYPE_CYRIX_GXm:
5224 		return ("Cyrix GXm");
5225 	case X86_TYPE_CYRIX_MediaGX:
5226 		return ("Cyrix MediaGX");
5227 	case X86_TYPE_CYRIX_MII:
5228 		return ("Cyrix M2");
5229 	case X86_TYPE_VIA_CYRIX_III:
5230 		return ("VIA Cyrix M3");
5231 	default:
5232 		/*
5233 		 * Have another wild guess ..
5234 		 */
5235 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5236 			return ("Cyrix 5x86");
5237 		else if (cpi->cpi_family == 5) {
5238 			switch (cpi->cpi_model) {
5239 			case 2:
5240 				return ("Cyrix 6x86");	/* Cyrix M1 */
5241 			case 4:
5242 				return ("Cyrix MediaGX");
5243 			default:
5244 				break;
5245 			}
5246 		} else if (cpi->cpi_family == 6) {
5247 			switch (cpi->cpi_model) {
5248 			case 0:
5249 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5250 			case 5:
5251 			case 6:
5252 			case 7:
5253 			case 8:
5254 			case 9:
5255 				return ("VIA C3");
5256 			default:
5257 				break;
5258 			}
5259 		}
5260 		break;
5261 	}
5262 	return (NULL);
5263 }
5264 
5265 /*
5266  * This only gets called in the case that the CPU extended
5267  * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
5268  * aren't available, or contain null bytes for some reason.
5269  */
5270 static void
5271 fabricate_brandstr(struct cpuid_info *cpi)
5272 {
5273 	const char *brand = NULL;
5274 
5275 	switch (cpi->cpi_vendor) {
5276 	case X86_VENDOR_Intel:
5277 		brand = intel_cpubrand(cpi);
5278 		break;
5279 	case X86_VENDOR_AMD:
5280 		brand = amd_cpubrand(cpi);
5281 		break;
5282 	case X86_VENDOR_Cyrix:
5283 		brand = cyrix_cpubrand(cpi, x86_type);
5284 		break;
5285 	case X86_VENDOR_NexGen:
5286 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5287 			brand = "NexGen Nx586";
5288 		break;
5289 	case X86_VENDOR_Centaur:
5290 		if (cpi->cpi_family == 5)
5291 			switch (cpi->cpi_model) {
5292 			case 4:
5293 				brand = "Centaur C6";
5294 				break;
5295 			case 8:
5296 				brand = "Centaur C2";
5297 				break;
5298 			case 9:
5299 				brand = "Centaur C3";
5300 				break;
5301 			default:
5302 				break;
5303 			}
5304 		break;
5305 	case X86_VENDOR_Rise:
5306 		if (cpi->cpi_family == 5 &&
5307 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5308 			brand = "Rise mP6";
5309 		break;
5310 	case X86_VENDOR_SiS:
5311 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5312 			brand = "SiS 55x";
5313 		break;
5314 	case X86_VENDOR_TM:
5315 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5316 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5317 		break;
5318 	case X86_VENDOR_NSC:
5319 	case X86_VENDOR_UMC:
5320 	default:
5321 		break;
5322 	}
5323 	if (brand) {
5324 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5325 		return;
5326 	}
5327 
5328 	/*
5329 	 * If all else fails ...
5330 	 */
5331 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5332 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5333 	    cpi->cpi_model, cpi->cpi_step);
5334 }
5335 
5336 /*
5337  * This routine is called just after kernel memory allocation
5338  * becomes available on cpu0, and as part of mp_startup() on
5339  * the other cpus.
5340  *
5341  * Fixup the brand string, and collect any information from cpuid
5342  * that requires dynamically allocated storage to represent.
5343  */
5344 
5345 static void
5346 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5347 {
5348 	int	i, max, shft, level, size;
5349 	struct cpuid_regs regs;
5350 	struct cpuid_regs *cp;
5351 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5352 
5353 	/*
5354 	 * Deterministic cache parameters
5355 	 *
5356 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5357 	 * values that are present are currently defined to be the same. This
5358 	 * means we can use the same logic to parse it as long as we use the
5359 	 * appropriate leaf to get the data. If you're updating this, make sure
5360 	 * you're careful about which vendor supports which aspect.
5361 	 *
5362 	 * Take this opportunity to detect the number of threads sharing the
5363 	 * last level cache, and construct a corresponding cache id. The
5364 	 * respective cpuid_info members are initialized to the default case of
5365 	 * "no last level cache sharing".
5366 	 */
5367 	cpi->cpi_ncpu_shr_last_cache = 1;
5368 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5369 
5370 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5371 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5372 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5373 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5374 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5375 		uint32_t leaf;
5376 
5377 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5378 			leaf = 4;
5379 		} else {
5380 			leaf = CPUID_LEAF_EXT_1d;
5381 		}
5382 
5383 		/*
5384 		 * Find the # of elements (size) returned by the leaf and along
5385 		 * the way detect last level cache sharing details.
5386 		 */
5387 		bzero(&regs, sizeof (regs));
5388 		cp = &regs;
5389 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5390 			cp->cp_eax = leaf;
5391 			cp->cp_ecx = i;
5392 
5393 			(void) __cpuid_insn(cp);
5394 
5395 			if (CPI_CACHE_TYPE(cp) == 0)
5396 				break;
5397 			level = CPI_CACHE_LVL(cp);
5398 			if (level > max) {
5399 				max = level;
5400 				cpi->cpi_ncpu_shr_last_cache =
5401 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5402 			}
5403 		}
5404 		cpi->cpi_cache_leaf_size = size = i;
5405 
5406 		/*
5407 		 * Allocate the cpi_cache_leaves array. The first element
5408 		 * references the regs for the corresponding leaf with %ecx set
5409 		 * to 0. This was gathered in cpuid_pass_extended().
5410 		 */
5411 		if (size > 0) {
5412 			cpi->cpi_cache_leaves =
5413 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5414 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5415 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5416 			} else {
5417 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5418 			}
5419 
5420 			/*
5421 			 * Allocate storage to hold the additional regs
5422 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5423 			 *
5424 			 * The regs for the leaf with %ecx == 0 have already
5425 			 * been allocated as indicated above.
5426 			 */
5427 			for (i = 1; i < size; i++) {
5428 				cp = cpi->cpi_cache_leaves[i] =
5429 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5430 				cp->cp_eax = leaf;
5431 				cp->cp_ecx = i;
5432 
5433 				(void) __cpuid_insn(cp);
5434 			}
5435 		}
5436 		/*
5437 		 * Determine the number of bits needed to represent
5438 		 * the number of CPUs sharing the last level cache.
5439 		 *
5440 		 * Shift off that number of bits from the APIC id to
5441 		 * derive the cache id.
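		 *
		 * For example, with 8 CPUs sharing the last level cache the
		 * loop below computes shft = 3, so CPUs whose APIC ids differ
		 * only in their low three bits share a cache id.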
5442 		 */
5443 		shft = 0;
5444 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5445 			shft++;
5446 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5447 	}
5448 
5449 	/*
5450 	 * Now fixup the brand string
5451 	 */
5452 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5453 		fabricate_brandstr(cpi);
5454 	} else {
5455 
5456 		/*
5457 		 * If we successfully extracted a brand string from the cpuid
5458 		 * instruction, clean it up by removing leading spaces and
5459 		 * similar junk.
5460 		 */
5461 		if (cpi->cpi_brandstr[0]) {
5462 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5463 			char *src, *dst;
5464 
5465 			dst = src = (char *)cpi->cpi_brandstr;
5466 			src[maxlen - 1] = '\0';
5467 			/*
5468 			 * strip leading spaces
5469 			 */
5470 			while (*src == ' ')
5471 				src++;
5472 			/*
5473 			 * Remove any "Genuine" or "Authentic" prefixes
5474 			 */
5475 			if (strncmp(src, "Genuine ", 8) == 0)
5476 				src += 8;
5477 			if (strncmp(src, "Authentic ", 10) == 0)
5478 				src += 10;
5479 
5480 			/*
5481 			 * Now do an in-place copy.
5482 			 * Map (R) to (r) and (TM) to (tm).
5483 			 * The era of teletypes is long gone, and there's
5484 			 * -really- no need to shout.
5485 			 */
5486 			while (*src != '\0') {
5487 				if (src[0] == '(') {
5488 					if (strncmp(src + 1, "R)", 2) == 0) {
5489 						(void) strncpy(dst, "(r)", 3);
5490 						src += 3;
5491 						dst += 3;
5492 						continue;
5493 					}
5494 					if (strncmp(src + 1, "TM)", 3) == 0) {
5495 						(void) strncpy(dst, "(tm)", 4);
5496 						src += 4;
5497 						dst += 4;
5498 						continue;
5499 					}
5500 				}
5501 				*dst++ = *src++;
5502 			}
5503 			*dst = '\0';
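			/*
			 * For example, a raw string such as
			 * "      Genuine Intel(R) CPU @ 2.40GHz" comes out
			 * of the transformations above as
			 * "Intel(r) CPU @ 2.40GHz".
			 */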
5504 
5505 			/*
5506 			 * Finally, remove any trailing spaces
5507 			 */
5508 			while (--dst > cpi->cpi_brandstr)
5509 				if (*dst == ' ')
5510 					*dst = '\0';
5511 				else
5512 					break;
5513 		} else
5514 			fabricate_brandstr(cpi);
5515 	}
5516 }
5517 
5518 typedef struct {
5519 	uint32_t avm_av;
5520 	uint32_t avm_feat;
5521 } av_feat_map_t;
5522 
5523 /*
5524  * These arrays are used to map features that we should add based on x86
5525  * features that are present. As a large number depend on kernel features,
5526  * rather than rechecking and clearing CPUID everywhere, we simply map these.
5527  * There is an array of these for each hwcap word. Some features aren't tracked
5528  * in the kernel x86 featureset and that's ok; they just won't show up here.
5529  */
5530 static const av_feat_map_t x86fset_to_av1[] = {
5531 	{ AV_386_CX8, X86FSET_CX8 },
5532 	{ AV_386_SEP, X86FSET_SEP },
5533 	{ AV_386_AMD_SYSC, X86FSET_ASYSC },
5534 	{ AV_386_CMOV, X86FSET_CMOV },
5535 	{ AV_386_FXSR, X86FSET_SSE },
5536 	{ AV_386_SSE, X86FSET_SSE },
5537 	{ AV_386_SSE2, X86FSET_SSE2 },
5538 	{ AV_386_SSE3, X86FSET_SSE3 },
5539 	{ AV_386_CX16, X86FSET_CX16 },
5540 	{ AV_386_TSCP, X86FSET_TSCP },
5541 	{ AV_386_AMD_SSE4A, X86FSET_SSE4A },
5542 	{ AV_386_SSSE3, X86FSET_SSSE3 },
5543 	{ AV_386_SSE4_1, X86FSET_SSE4_1 },
5544 	{ AV_386_SSE4_2, X86FSET_SSE4_2 },
5545 	{ AV_386_AES, X86FSET_AES },
5546 	{ AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5547 	{ AV_386_XSAVE, X86FSET_XSAVE },
5548 	{ AV_386_AVX, X86FSET_AVX },
5549 	{ AV_386_VMX, X86FSET_VMX },
5550 	{ AV_386_AMD_SVM, X86FSET_SVM }
5551 };
5552 
5553 static const av_feat_map_t x86fset_to_av2[] = {
5554 	{ AV_386_2_F16C, X86FSET_F16C },
5555 	{ AV_386_2_RDRAND, X86FSET_RDRAND },
5556 	{ AV_386_2_BMI1, X86FSET_BMI1 },
5557 	{ AV_386_2_BMI2, X86FSET_BMI2 },
5558 	{ AV_386_2_FMA, X86FSET_FMA },
5559 	{ AV_386_2_AVX2, X86FSET_AVX2 },
5560 	{ AV_386_2_ADX, X86FSET_ADX },
5561 	{ AV_386_2_RDSEED, X86FSET_RDSEED },
5562 	{ AV_386_2_AVX512F, X86FSET_AVX512F },
5563 	{ AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5564 	{ AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5565 	{ AV_386_2_AVX512PF, X86FSET_AVX512PF },
5566 	{ AV_386_2_AVX512ER, X86FSET_AVX512ER },
5567 	{ AV_386_2_AVX512CD, X86FSET_AVX512CD },
5568 	{ AV_386_2_AVX512BW, X86FSET_AVX512BW },
5569 	{ AV_386_2_AVX512VL, X86FSET_AVX512VL },
5570 	{ AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5571 	{ AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5572 	{ AV_386_2_SHA, X86FSET_SHA },
5573 	{ AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5574 	{ AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5575 	{ AV_386_2_CLWB, X86FSET_CLWB },
5576 	{ AV_386_2_MONITORX, X86FSET_MONITORX },
5577 	{ AV_386_2_CLZERO, X86FSET_CLZERO },
5578 	{ AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5579 	{ AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5580 	{ AV_386_2_VAES, X86FSET_VAES },
5581 	{ AV_386_2_GFNI, X86FSET_GFNI },
5582 	{ AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5583 	{ AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5584 };
5585 
5586 static const av_feat_map_t x86fset_to_av3[] = {
5587 	{ AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5588 	{ AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5589 };
5590 
5591 /*
5592  * This routine is called out of bind_hwcap() much later in the life
5593  * of the kernel (post_startup()).  The job of this routine is to resolve
5594  * the hardware feature support and kernel support for those features into
5595  * what we're actually going to tell applications via the aux vector.
5596  *
5597  * Most of the aux vector is derived from the x86_featureset array vector where
5598  * a given feature indicates that an aux vector should be plumbed through. This
5599  * allows the kernel to use one tracking mechanism for these based on whether or
5600  * not it has the required hardware support (most often xsave). Most newer
5601  * features are added there in case we need them in the kernel. Otherwise,
5602  * features are evaluated based on looking at the cpuid features that remain. If
5603  * you find yourself wanting to clear out cpuid features for some reason, they
5604  * should instead be driven by the feature set so we have a consistent view.
5605  */
5606 
5607 static void
5608 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5609 {
5610 	uint_t *hwcap_out = (uint_t *)arg;
5611 	struct cpuid_info *cpi;
5612 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5613 
5614 	cpi = cpu->cpu_m.mcpu_cpi;
5615 
5616 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5617 		if (is_x86_feature(x86_featureset,
5618 		    x86fset_to_av1[i].avm_feat)) {
5619 			hwcap_flags |= x86fset_to_av1[i].avm_av;
5620 		}
5621 	}
5622 
5623 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5624 		if (is_x86_feature(x86_featureset,
5625 		    x86fset_to_av2[i].avm_feat)) {
5626 			hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5627 		}
5628 	}
5629 
5630 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5631 		if (is_x86_feature(x86_featureset,
5632 		    x86fset_to_av3[i].avm_feat)) {
5633 			hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5634 		}
5635 	}
5636 
5637 	/*
5638 	 * From here on out we're working through features that don't have
5639 	 * corresponding kernel feature flags for various reasons that are
5640 	 * mostly just due to the historical implementation.
5641 	 */
5642 	if (cpi->cpi_maxeax >= 1) {
5643 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5644 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5645 
5646 		*edx = CPI_FEATURES_EDX(cpi);
5647 		*ecx = CPI_FEATURES_ECX(cpi);
5648 
5649 		/*
5650 		 * [no explicit support required beyond x87 fp context]
5651 		 */
5652 		if (!fpu_exists)
5653 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5654 
5655 		/*
5656 		 * Now map the supported feature vector to things that we
5657 		 * think userland will care about.
5658 		 */
5659 		if (*ecx & CPUID_INTC_ECX_MOVBE)
5660 			hwcap_flags |= AV_386_MOVBE;
5661 
5662 		if (*ecx & CPUID_INTC_ECX_POPCNT)
5663 			hwcap_flags |= AV_386_POPCNT;
5664 		if (*edx & CPUID_INTC_EDX_FPU)
5665 			hwcap_flags |= AV_386_FPU;
5666 		if (*edx & CPUID_INTC_EDX_MMX)
5667 			hwcap_flags |= AV_386_MMX;
5668 		if (*edx & CPUID_INTC_EDX_TSC)
5669 			hwcap_flags |= AV_386_TSC;
5670 	}
5671 
5672 	/*
5673 	 * Check a few miscellaneous features.
5674 	 */
5675 	if (cpi->cpi_xmaxeax < 0x80000001)
5676 		goto resolve_done;
5677 
5678 	switch (cpi->cpi_vendor) {
5679 		uint32_t *edx, *ecx;
5680 
5681 	case X86_VENDOR_Intel:
5682 		/*
5683 		 * Seems like Intel duplicated what was necessary
5684 		 * here to make the initial crop of 64-bit OS's work.
5685 		 * Hopefully, those are the only "extended" bits
5686 		 * they'll add.
5687 		 */
5688 		/*FALLTHROUGH*/
5689 
5690 	case X86_VENDOR_AMD:
5691 	case X86_VENDOR_HYGON:
5692 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5693 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5694 
5695 		*edx = CPI_FEATURES_XTD_EDX(cpi);
5696 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5697 
5698 		/*
5699 		 * [no explicit support required beyond
5700 		 * x87 fp context and exception handlers]
5701 		 */
5702 		if (!fpu_exists)
5703 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
5704 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5705 
5706 		/*
5707 		 * Now map the supported feature vector to
5708 		 * things that we think userland will care about.
5709 		 */
5710 		if (*edx & CPUID_AMD_EDX_MMXamd)
5711 			hwcap_flags |= AV_386_AMD_MMX;
5712 		if (*edx & CPUID_AMD_EDX_3DNow)
5713 			hwcap_flags |= AV_386_AMD_3DNow;
5714 		if (*edx & CPUID_AMD_EDX_3DNowx)
5715 			hwcap_flags |= AV_386_AMD_3DNowx;
5716 
5717 		switch (cpi->cpi_vendor) {
5718 		case X86_VENDOR_AMD:
5719 		case X86_VENDOR_HYGON:
5720 			if (*ecx & CPUID_AMD_ECX_AHF64)
5721 				hwcap_flags |= AV_386_AHF;
5722 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5723 				hwcap_flags |= AV_386_AMD_LZCNT;
5724 			break;
5725 
5726 		case X86_VENDOR_Intel:
5727 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5728 				hwcap_flags |= AV_386_AMD_LZCNT;
5729 			/*
5730 			 * Aarrgh.
5731 			 * Intel uses a different bit in the same word.
5732 			 */
5733 			if (*ecx & CPUID_INTC_ECX_AHF64)
5734 				hwcap_flags |= AV_386_AHF;
5735 			break;
5736 		default:
5737 			break;
5738 		}
5739 		break;
5740 
5741 	default:
5742 		break;
5743 	}
5744 
5745 resolve_done:
5746 	if (hwcap_out != NULL) {
5747 		hwcap_out[0] = hwcap_flags;
5748 		hwcap_out[1] = hwcap_flags_2;
5749 		hwcap_out[2] = hwcap_flags_3;
5750 	}
5751 }
5752 
5753 
5754 /*
5755  * Simulate the cpuid instruction using the data we previously
5756  * captured about this CPU.  We try our best to return the truth
5757  * about the hardware, independently of kernel support.
5758  */
5759 uint32_t
5760 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5761 {
5762 	struct cpuid_info *cpi;
5763 	struct cpuid_regs *xcp;
5764 
5765 	if (cpu == NULL)
5766 		cpu = CPU;
5767 	cpi = cpu->cpu_m.mcpu_cpi;
5768 
5769 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5770 
5771 	/*
5772 	 * CPUID data is cached in two separate places: cpi_std for standard
5773 	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5774 	 */
5775 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5776 		xcp = &cpi->cpi_std[cp->cp_eax];
5777 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5778 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
5779 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5780 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5781 	} else {
5782 		/*
5783 		 * The caller is asking for data from an input parameter which
5784 		 * the kernel has not cached.  In this case we go fetch from
5785 		 * the hardware and return the data directly to the user.
5786 		 */
5787 		return (__cpuid_insn(cp));
5788 	}
5789 
5790 	cp->cp_eax = xcp->cp_eax;
5791 	cp->cp_ebx = xcp->cp_ebx;
5792 	cp->cp_ecx = xcp->cp_ecx;
5793 	cp->cp_edx = xcp->cp_edx;
5794 	return (cp->cp_eax);
5795 }
5796 
5797 boolean_t
5798 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
5799 {
5800 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5801 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5802 }
5803 
5804 int
5805 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5806 {
5807 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5808 
5809 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5810 }
5811 
5812 int
5813 cpuid_is_cmt(cpu_t *cpu)
5814 {
5815 	if (cpu == NULL)
5816 		cpu = CPU;
5817 
5818 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5819 
5820 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5821 }
5822 
5823 /*
5824  * AMD and Intel both implement the 64-bit variant of the syscall
5825  * instruction (syscallq), so if there's -any- support for syscall,
5826  * cpuid currently says "yes, we support this".
5827  *
5828  * However, Intel decided to -not- implement the 32-bit variant of the
5829  * syscall instruction, so we provide a predicate to allow our caller
5830  * to test that subtlety here.
5831  *
5832  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
5833  *	even in the case where the hardware would in fact support it.
5834  */
5835 /*ARGSUSED*/
5836 int
5837 cpuid_syscall32_insn(cpu_t *cpu)
5838 {
5839 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
5840 
5841 #if !defined(__xpv)
5842 	if (cpu == NULL)
5843 		cpu = CPU;
5844 
5845 	/*CSTYLED*/
5846 	{
5847 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5848 
5849 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5850 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5851 		    cpi->cpi_xmaxeax >= 0x80000001 &&
5852 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5853 			return (1);
5854 	}
5855 #endif
5856 	return (0);
5857 }
5858 
5859 int
5860 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5861 {
5862 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5863 
5864 	static const char fmt[] =
5865 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
5866 	static const char fmt_ht[] =
5867 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5868 
5869 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5870 
5871 	if (cpuid_is_cmt(cpu))
5872 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5873 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5874 		    cpi->cpi_family, cpi->cpi_model,
5875 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5876 	return (snprintf(s, n, fmt,
5877 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5878 	    cpi->cpi_family, cpi->cpi_model,
5879 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5880 }
5881 
5882 const char *
5883 cpuid_getvendorstr(cpu_t *cpu)
5884 {
5885 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5886 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5887 }
5888 
5889 uint_t
5890 cpuid_getvendor(cpu_t *cpu)
5891 {
5892 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5893 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5894 }
5895 
5896 uint_t
5897 cpuid_getfamily(cpu_t *cpu)
5898 {
5899 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5900 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
5901 }
5902 
5903 uint_t
5904 cpuid_getmodel(cpu_t *cpu)
5905 {
5906 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5907 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
5908 }
5909 
5910 uint_t
5911 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5912 {
5913 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5914 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5915 }
5916 
5917 uint_t
5918 cpuid_get_ncore_per_chip(cpu_t *cpu)
5919 {
5920 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5921 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5922 }
5923 
5924 uint_t
5925 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5926 {
5927 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5928 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5929 }
5930 
5931 id_t
5932 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5933 {
5934 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5935 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5936 }
5937 
5938 uint_t
5939 cpuid_getstep(cpu_t *cpu)
5940 {
5941 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5942 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
5943 }
5944 
5945 uint_t
5946 cpuid_getsig(struct cpu *cpu)
5947 {
5948 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5949 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5950 }
5951 
5952 uint32_t
5953 cpuid_getchiprev(struct cpu *cpu)
5954 {
5955 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5956 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5957 }
5958 
5959 const char *
5960 cpuid_getchiprevstr(struct cpu *cpu)
5961 {
5962 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5963 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5964 }
5965 
5966 uint32_t
5967 cpuid_getsockettype(struct cpu *cpu)
5968 {
5969 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5970 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5971 }
5972 
5973 const char *
5974 cpuid_getsocketstr(cpu_t *cpu)
5975 {
5976 	static const char *socketstr = NULL;
5977 	struct cpuid_info *cpi;
5978 
5979 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5980 	cpi = cpu->cpu_m.mcpu_cpi;
5981 
5982 	/* Assume that socket types are the same across the system */
5983 	if (socketstr == NULL)
5984 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5985 		    cpi->cpi_model, cpi->cpi_step);
5986 
5987 
5988 	return (socketstr);
5989 }
5990 
5991 x86_uarchrev_t
5992 cpuid_getuarchrev(cpu_t *cpu)
5993 {
5994 	return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
5995 }
5996 
5997 int
5998 cpuid_get_chipid(cpu_t *cpu)
5999 {
6000 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6001 
6002 	if (cpuid_is_cmt(cpu))
6003 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6004 	return (cpu->cpu_id);
6005 }
6006 
6007 id_t
6008 cpuid_get_coreid(cpu_t *cpu)
6009 {
6010 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6011 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6012 }
6013 
6014 int
6015 cpuid_get_pkgcoreid(cpu_t *cpu)
6016 {
6017 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6018 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6019 }
6020 
6021 int
6022 cpuid_get_clogid(cpu_t *cpu)
6023 {
6024 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6025 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6026 }
6027 
6028 int
6029 cpuid_get_cacheid(cpu_t *cpu)
6030 {
6031 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6032 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6033 }
6034 
6035 uint_t
6036 cpuid_get_procnodeid(cpu_t *cpu)
6037 {
6038 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6039 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6040 }
6041 
6042 uint_t
6043 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6044 {
6045 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6046 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6047 }
6048 
6049 uint_t
6050 cpuid_get_compunitid(cpu_t *cpu)
6051 {
6052 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6053 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6054 }
6055 
6056 uint_t
6057 cpuid_get_cores_per_compunit(cpu_t *cpu)
6058 {
6059 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6060 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6061 }
6062 
6063 uint32_t
6064 cpuid_get_apicid(cpu_t *cpu)
6065 {
6066 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6067 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6068 		return (UINT32_MAX);
6069 	} else {
6070 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6071 	}
6072 }
6073 
6074 void
6075 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6076 {
6077 	struct cpuid_info *cpi;
6078 
6079 	if (cpu == NULL)
6080 		cpu = CPU;
6081 	cpi = cpu->cpu_m.mcpu_cpi;
6082 
6083 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6084 
6085 	if (pabits)
6086 		*pabits = cpi->cpi_pabits;
6087 	if (vabits)
6088 		*vabits = cpi->cpi_vabits;
6089 }
6090 
6091 size_t
6092 cpuid_get_xsave_size(void)
6093 {
6094 	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6095 	    sizeof (struct xsave_state)));
6096 }
6097 
6098 /*
6099  * Export information about known offsets to the kernel. We only care about
6100  * things we have actually enabled support for in %xcr0.
6101  */
6102 void
6103 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6104 {
6105 	size_t size, off;
6106 
6107 	VERIFY3U(bit & xsave_bv_all, !=, 0);
6108 
6109 	if (sizep == NULL)
6110 		sizep = &size;
6111 	if (offp == NULL)
6112 		offp = &off;
6113 
6114 	switch (bit) {
6115 	case XFEATURE_LEGACY_FP:
6116 	case XFEATURE_SSE:
6117 		*sizep = sizeof (struct fxsave_state);
6118 		*offp = 0;
6119 		break;
6120 	case XFEATURE_AVX:
6121 		*sizep = cpuid_info0.cpi_xsave.ymm_size;
6122 		*offp = cpuid_info0.cpi_xsave.ymm_offset;
6123 		break;
6124 	case XFEATURE_AVX512_OPMASK:
6125 		*sizep = cpuid_info0.cpi_xsave.opmask_size;
6126 		*offp = cpuid_info0.cpi_xsave.opmask_offset;
6127 		break;
6128 	case XFEATURE_AVX512_ZMM:
6129 		*sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6130 		*offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6131 		break;
6132 	case XFEATURE_AVX512_HI_ZMM:
6133 		*sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6134 		*offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6135 		break;
6136 	default:
6137 		panic("asked for unsupported xsave feature: 0x%lx", bit);
6138 	}
6139 }
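
/*
 * Illustrative sketch (hypothetical consumer, not from this file): a caller
 * that wants the AVX (ymm) state within a standard-format xsave area could
 * combine the size and offset reported above, e.g.
 *
 *	size_t sz, off;
 *	uint8_t *ymm;
 *
 *	cpuid_get_xsave_info(XFEATURE_AVX, &sz, &off);
 *	ymm = (uint8_t *)xsave_area + off;
 *
 * where 'xsave_area' is assumed to point at a buffer laid out in the
 * non-compacted xsave format that these offsets describe.
 */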
6140 
6141 /*
6142  * Return true if the CPUs on this system require 'pointer clearing' for the
6143  * floating point error pointer exception handling. In the past, this has been
6144  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6145  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6146  * feature bit and is reflected in the cpi_fp_amd_save member.
6147  */
6148 boolean_t
6149 cpuid_need_fp_excp_handling(void)
6150 {
6151 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6152 	    cpuid_info0.cpi_fp_amd_save != 0);
6153 }
6154 
6155 /*
6156  * Returns the number of data TLB entries for a corresponding
6157  * pagesize.  If it can't be computed, or isn't known, the
6158  * routine returns zero.  If you ask about an architecturally
6159  * impossible pagesize, the routine will panic (so that the
6160  * hat implementor knows that things are inconsistent).
6161  */
6162 uint_t
6163 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6164 {
6165 	struct cpuid_info *cpi;
6166 	uint_t dtlb_nent = 0;
6167 
6168 	if (cpu == NULL)
6169 		cpu = CPU;
6170 	cpi = cpu->cpu_m.mcpu_cpi;
6171 
6172 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6173 
6174 	/*
6175 	 * Check the L2 TLB info
6176 	 */
6177 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6178 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6179 
6180 		switch (pagesize) {
6181 
6182 		case 4 * 1024:
6183 			/*
6184 			 * All zero in the top 16 bits of the register
6185 			 * indicates a unified TLB. Size is in low 16 bits.
6186 			 */
6187 			if ((cp->cp_ebx & 0xffff0000) == 0)
6188 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6189 			else
6190 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6191 			break;
6192 
6193 		case 2 * 1024 * 1024:
6194 			if ((cp->cp_eax & 0xffff0000) == 0)
6195 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6196 			else
6197 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6198 			break;
6199 
6200 		default:
6201 			panic("unknown L2 pagesize");
6202 			/*NOTREACHED*/
6203 		}
6204 	}
6205 
6206 	if (dtlb_nent != 0)
6207 		return (dtlb_nent);
6208 
6209 	/*
6210 	 * No L2 TLB support for this size, try L1.
6211 	 */
6212 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6213 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6214 
6215 		switch (pagesize) {
6216 		case 4 * 1024:
6217 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6218 			break;
6219 		case 2 * 1024 * 1024:
6220 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6221 			break;
6222 		default:
6223 			panic("unknown L1 d-TLB pagesize");
6224 			/*NOTREACHED*/
6225 		}
6226 	}
6227 
6228 	return (dtlb_nent);
6229 }
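
/*
 * Worked example of the BITX() extraction above (illustrative value, not
 * taken from any particular part): for the 4K case of leaf 0x80000006, a
 * cp_ebx of 0x20800400 has non-zero upper 16 bits, so the TLB is split and
 * the d-TLB entry count is BITX(cp_ebx, 27, 16) = 0x080 = 128 entries.  Had
 * the upper 16 bits been zero, the TLB would be unified and the low 16 bits
 * would hold the count.
 */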
6230 
6231 /*
6232  * Return 0 if the erratum is not present or not applicable, positive
6233  * if it is, and negative if the status of the erratum is unknown.
6234  *
6235  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6236  * Processors" #25759, Rev 3.57, August 2005
6237  */
6238 int
6239 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6240 {
6241 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6242 	uint_t eax;
6243 
6244 	/*
6245 	 * Bail out if this CPU isn't an AMD CPU, or if it's
6246 	 * a legacy (32-bit) AMD CPU.
6247 	 */
6248 	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6249 	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6250 	    cpi->cpi_family == 6) {
6251 		return (0);
6252 	}
6253 
6254 	eax = cpi->cpi_std[1].cp_eax;
6255 
6256 #define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
6257 #define	SH_B3(eax)	(eax == 0xf51)
6258 #define	B(eax)		(SH_B0(eax) || SH_B3(eax))
6259 
6260 #define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
6261 
6262 #define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6263 #define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6264 #define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
6265 #define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6266 
6267 #define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6268 #define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
6269 #define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
6270 #define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6271 
6272 #define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6273 #define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
6274 #define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
6275 #define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
6276 #define	BH_E4(eax)	(eax == 0x20fb1)
6277 #define	SH_E5(eax)	(eax == 0x20f42)
6278 #define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
6279 #define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
6280 #define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6281 			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6282 			    DH_E6(eax) || JH_E6(eax))
6283 
6284 #define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6285 #define	DR_B0(eax)	(eax == 0x100f20)
6286 #define	DR_B1(eax)	(eax == 0x100f21)
6287 #define	DR_BA(eax)	(eax == 0x100f2a)
6288 #define	DR_B2(eax)	(eax == 0x100f22)
6289 #define	DR_B3(eax)	(eax == 0x100f23)
6290 #define	RB_C0(eax)	(eax == 0x100f40)
6291 
6292 	switch (erratum) {
6293 	case 1:
6294 		return (cpi->cpi_family < 0x10);
6295 	case 51:	/* what does the asterisk mean? */
6296 		return (B(eax) || SH_C0(eax) || CG(eax));
6297 	case 52:
6298 		return (B(eax));
6299 	case 57:
6300 		return (cpi->cpi_family <= 0x11);
6301 	case 58:
6302 		return (B(eax));
6303 	case 60:
6304 		return (cpi->cpi_family <= 0x11);
6305 	case 61:
6306 	case 62:
6307 	case 63:
6308 	case 64:
6309 	case 65:
6310 	case 66:
6311 	case 68:
6312 	case 69:
6313 	case 70:
6314 	case 71:
6315 		return (B(eax));
6316 	case 72:
6317 		return (SH_B0(eax));
6318 	case 74:
6319 		return (B(eax));
6320 	case 75:
6321 		return (cpi->cpi_family < 0x10);
6322 	case 76:
6323 		return (B(eax));
6324 	case 77:
6325 		return (cpi->cpi_family <= 0x11);
6326 	case 78:
6327 		return (B(eax) || SH_C0(eax));
6328 	case 79:
6329 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6330 	case 80:
6331 	case 81:
6332 	case 82:
6333 		return (B(eax));
6334 	case 83:
6335 		return (B(eax) || SH_C0(eax) || CG(eax));
6336 	case 85:
6337 		return (cpi->cpi_family < 0x10);
6338 	case 86:
6339 		return (SH_C0(eax) || CG(eax));
6340 	case 88:
6341 		return (B(eax) || SH_C0(eax));
6342 	case 89:
6343 		return (cpi->cpi_family < 0x10);
6344 	case 90:
6345 		return (B(eax) || SH_C0(eax) || CG(eax));
6346 	case 91:
6347 	case 92:
6348 		return (B(eax) || SH_C0(eax));
6349 	case 93:
6350 		return (SH_C0(eax));
6351 	case 94:
6352 		return (B(eax) || SH_C0(eax) || CG(eax));
6353 	case 95:
6354 		return (B(eax) || SH_C0(eax));
6355 	case 96:
6356 		return (B(eax) || SH_C0(eax) || CG(eax));
6357 	case 97:
6358 	case 98:
6359 		return (SH_C0(eax) || CG(eax));
6360 	case 99:
6361 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6362 	case 100:
6363 		return (B(eax) || SH_C0(eax));
6364 	case 101:
6365 	case 103:
6366 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6367 	case 104:
6368 		return (SH_C0(eax) || CG(eax) || D0(eax));
6369 	case 105:
6370 	case 106:
6371 	case 107:
6372 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6373 	case 108:
6374 		return (DH_CG(eax));
6375 	case 109:
6376 		return (SH_C0(eax) || CG(eax) || D0(eax));
6377 	case 110:
6378 		return (D0(eax) || EX(eax));
6379 	case 111:
6380 		return (CG(eax));
6381 	case 112:
6382 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6383 	case 113:
6384 		return (eax == 0x20fc0);
6385 	case 114:
6386 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6387 	case 115:
6388 		return (SH_E0(eax) || JH_E1(eax));
6389 	case 116:
6390 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6391 	case 117:
6392 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6393 	case 118:
6394 		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6395 		    JH_E6(eax));
6396 	case 121:
6397 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6398 	case 122:
6399 		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6400 	case 123:
6401 		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6402 	case 131:
6403 		return (cpi->cpi_family < 0x10);
6404 	case 6336786:
6405 
6406 		/*
6407 		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6408 		 * if this is a K8 family or newer processor. We're testing for
6409 		 * this 'erratum' to determine whether or not we have a constant
6410 		 * TSC.
6411 		 *
6412 		 * Our current fix for this is to disable the C1-Clock ramping.
6413 		 * However, this doesn't work on newer processor families nor
6414 		 * does it work when virtualized as those devices don't exist.
6415 		 */
6416 		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6417 			return (0);
6418 		}
6419 
6420 		if (CPI_FAMILY(cpi) == 0xf) {
6421 			struct cpuid_regs regs;
6422 			regs.cp_eax = 0x80000007;
6423 			(void) __cpuid_insn(&regs);
6424 			return (!(regs.cp_edx & 0x100));
6425 		}
6426 		return (0);
6427 	case 147:
6428 		/*
6429 		 * This erratum (K8 #147) is not present on family 10 and newer.
6430 		 */
6431 		if (cpi->cpi_family >= 0x10) {
6432 			return (0);
6433 		}
6434 		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6435 		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6436 
6437 	case 6671130:
6438 		/*
6439 		 * check for processors (pre-Shanghai) that do not provide
6440 		 * optimal management of 1gb ptes in their tlb.
6441 		 */
6442 		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6443 
6444 	case 298:
6445 		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6446 		    DR_B2(eax) || RB_C0(eax));
6447 
6448 	case 721:
6449 		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6450 
6451 	default:
6452 		return (-1);
6453 
6454 	}
6455 }
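
/*
 * Hypothetical caller sketch for the return convention above (positive means
 * present, zero not present or not applicable, negative unknown):
 *
 *	if (cpuid_opteron_erratum(CPU, 95) > 0) {
 *		... apply the erratum 95 workaround ...
 *	}
 *
 * Callers that care about the indeterminate case must test for a negative
 * return explicitly rather than treating the result as a boolean.
 */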
6456 
6457 /*
6458  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6459  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6460  */
6461 int
6462 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6463 {
6464 	struct cpuid_info	*cpi;
6465 	uint_t			osvwid;
6466 	static int		osvwfeature = -1;
6467 	uint64_t		osvwlength;
6468 
6469 
6470 	cpi = cpu->cpu_m.mcpu_cpi;
6471 
6472 	/* confirm OSVW supported */
6473 	if (osvwfeature == -1) {
6474 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6475 	} else {
6476 		/* assert that osvw feature setting is consistent on all cpus */
6477 		ASSERT(osvwfeature ==
6478 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6479 	}
6480 	if (!osvwfeature)
6481 		return (-1);
6482 
6483 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6484 
6485 	switch (erratum) {
6486 	case 298:	/* osvwid is 0 */
6487 		osvwid = 0;
6488 		if (osvwlength <= (uint64_t)osvwid) {
6489 			/* osvwid 0 is unknown */
6490 			return (-1);
6491 		}
6492 
6493 		/*
6494 		 * Check the OSVW STATUS MSR to determine the state
6495 		 * of the erratum where:
6496 		 *   0 - fixed by HW
6497 		 *   1 - BIOS has applied the workaround when BIOS
6498 		 *   workaround is available. (Or for other errata,
6499 		 *   OS workaround is required.)
6500 		 * For a value of 1, caller will confirm that the
6501 		 * erratum 298 workaround has indeed been applied by BIOS.
6502 		 *
6503 		 * A 1 may be set in cpus that have a HW fix
6504 		 * in a mixed cpu system. Regarding erratum 298:
6505 		 *   In a multiprocessor platform, the workaround above
6506 		 *   should be applied to all processors regardless of
6507 		 *   silicon revision when an affected processor is
6508 		 *   present.
6509 		 */
6510 
6511 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6512 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6513 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6514 
6515 	default:
6516 		return (-1);
6517 	}
6518 }
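
/*
 * Worked example of the OSVW indexing above: with OSVW_ID_CNT_PER_MSR status
 * bits per MSR, erratum id 'osvwid' lives in status MSR number
 * (osvwid / OSVW_ID_CNT_PER_MSR) at bit (osvwid % OSVW_ID_CNT_PER_MSR).  For
 * osvwid 0 (erratum 298) that is bit 0 of MSR_AMD_OSVW_STATUS itself.
 */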
6519 
6520 static const char assoc_str[] = "associativity";
6521 static const char line_str[] = "line-size";
6522 static const char size_str[] = "size";
6523 
6524 static void
6525 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6526     uint32_t val)
6527 {
6528 	char buf[128];
6529 
6530 	/*
6531 	 * ndi_prop_update_int() is used because it is desirable for
6532 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6533 	 */
6534 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6535 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6536 }
6537 
6538 /*
6539  * Intel-style cache/tlb description
6540  *
6541  * Standard cpuid level 2 gives a randomly ordered
6542  * selection of tags that index into a table that describes
6543  * cache and tlb properties.
6544  */
6545 
6546 static const char l1_icache_str[] = "l1-icache";
6547 static const char l1_dcache_str[] = "l1-dcache";
6548 static const char l2_cache_str[] = "l2-cache";
6549 static const char l3_cache_str[] = "l3-cache";
6550 static const char itlb4k_str[] = "itlb-4K";
6551 static const char dtlb4k_str[] = "dtlb-4K";
6552 static const char itlb2M_str[] = "itlb-2M";
6553 static const char itlb4M_str[] = "itlb-4M";
6554 static const char dtlb4M_str[] = "dtlb-4M";
6555 static const char dtlb24_str[] = "dtlb0-2M-4M";
6556 static const char itlb424_str[] = "itlb-4K-2M-4M";
6557 static const char itlb24_str[] = "itlb-2M-4M";
6558 static const char dtlb44_str[] = "dtlb-4K-4M";
6559 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6560 static const char sl2_cache_str[] = "sectored-l2-cache";
6561 static const char itrace_str[] = "itrace-cache";
6562 static const char sl3_cache_str[] = "sectored-l3-cache";
6563 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6564 
6565 static const struct cachetab {
6566 	uint8_t		ct_code;
6567 	uint8_t		ct_assoc;
6568 	uint16_t	ct_line_size;
6569 	size_t		ct_size;
6570 	const char	*ct_label;
6571 } intel_ctab[] = {
6572 	/*
6573 	 * maintain descending order!
6574 	 *
6575 	 * Codes ignored - Reason
6576 	 * ----------------------
6577 	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6578 	 * f0H/f1H - Currently we do not interpret prefetch size by design
6579 	 */
6580 	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6581 	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6582 	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6583 	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6584 	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6585 	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6586 	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6587 	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6588 	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6589 	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6590 	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6591 	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6592 	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6593 	{ 0xc0, 4, 0, 8, dtlb44_str },
6594 	{ 0xba, 4, 0, 64, dtlb4k_str },
6595 	{ 0xb4, 4, 0, 256, dtlb4k_str },
6596 	{ 0xb3, 4, 0, 128, dtlb4k_str },
6597 	{ 0xb2, 4, 0, 64, itlb4k_str },
6598 	{ 0xb0, 4, 0, 128, itlb4k_str },
6599 	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6600 	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6601 	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6602 	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6603 	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6604 	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6605 	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6606 	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6607 	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6608 	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6609 	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6610 	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6611 	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6612 	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6613 	{ 0x73, 8, 0, 64*1024, itrace_str},
6614 	{ 0x72, 8, 0, 32*1024, itrace_str},
6615 	{ 0x71, 8, 0, 16*1024, itrace_str},
6616 	{ 0x70, 8, 0, 12*1024, itrace_str},
6617 	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6618 	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6619 	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6620 	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6621 	{ 0x5d, 0, 0, 256, dtlb44_str},
6622 	{ 0x5c, 0, 0, 128, dtlb44_str},
6623 	{ 0x5b, 0, 0, 64, dtlb44_str},
6624 	{ 0x5a, 4, 0, 32, dtlb24_str},
6625 	{ 0x59, 0, 0, 16, dtlb4k_str},
6626 	{ 0x57, 4, 0, 16, dtlb4k_str},
6627 	{ 0x56, 4, 0, 16, dtlb4M_str},
6628 	{ 0x55, 0, 0, 7, itlb24_str},
6629 	{ 0x52, 0, 0, 256, itlb424_str},
6630 	{ 0x51, 0, 0, 128, itlb424_str},
6631 	{ 0x50, 0, 0, 64, itlb424_str},
6632 	{ 0x4f, 0, 0, 32, itlb4k_str},
6633 	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6634 	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6635 	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6636 	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6637 	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6638 	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6639 	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6640 	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6641 	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6642 	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6643 	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
6644 	{ 0x43, 4, 32, 512*1024, l2_cache_str},
6645 	{ 0x42, 4, 32, 256*1024, l2_cache_str},
6646 	{ 0x41, 4, 32, 128*1024, l2_cache_str},
6647 	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
6648 	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
6649 	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
6650 	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
6651 	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
6652 	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
6653 	{ 0x30, 8, 64, 32*1024, l1_icache_str},
6654 	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
6655 	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
6656 	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
6657 	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
6658 	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
6659 	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
6660 	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
6661 	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
6662 	{ 0x0b, 4, 0, 4, itlb4M_str},
6663 	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
6664 	{ 0x08, 4, 32, 16*1024, l1_icache_str},
6665 	{ 0x06, 4, 32, 8*1024, l1_icache_str},
6666 	{ 0x05, 4, 0, 32, dtlb4M_str},
6667 	{ 0x04, 4, 0, 8, dtlb4M_str},
6668 	{ 0x03, 4, 0, 64, dtlb4k_str},
6669 	{ 0x02, 4, 0, 2, itlb4M_str},
6670 	{ 0x01, 4, 0, 32, itlb4k_str},
6671 	{ 0 }
6672 };
6673 
6674 static const struct cachetab cyrix_ctab[] = {
6675 	{ 0x70, 4, 0, 32, "tlb-4K" },
6676 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6677 	{ 0 }
6678 };
6679 
6680 /*
6681  * Search a cache table for a matching entry
6682  */
6683 static const struct cachetab *
6684 find_cacheent(const struct cachetab *ct, uint_t code)
6685 {
6686 	if (code != 0) {
6687 		for (; ct->ct_code != 0; ct++)
6688 			if (ct->ct_code <= code)
6689 				break;
6690 		if (ct->ct_code == code)
6691 			return (ct);
6692 	}
6693 	return (NULL);
6694 }
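
/*
 * Example of the descending-order search above: looking up code 0x7e, which
 * has no entry in intel_ctab, stops at the first code that is <= 0x7e (0x7d)
 * and, since that is not an exact match, returns NULL.
 */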
6695 
6696 /*
6697  * Populate cachetab entry with L2 or L3 cache-information using
6698  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6699  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6700  * information is found.
6701  */
6702 static int
6703 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6704 {
6705 	uint32_t level, i;
6706 	int ret = 0;
6707 
6708 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6709 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6710 
6711 		if (level == 2 || level == 3) {
6712 			ct->ct_assoc =
6713 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6714 			ct->ct_line_size =
6715 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6716 			ct->ct_size = ct->ct_assoc *
6717 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6718 			    ct->ct_line_size *
6719 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6720 
6721 			if (level == 2) {
6722 				ct->ct_label = l2_cache_str;
6723 			} else if (level == 3) {
6724 				ct->ct_label = l3_cache_str;
6725 			}
6726 			ret = 1;
6727 		}
6728 	}
6729 
6730 	return (ret);
6731 }
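
/*
 * Worked example of the size computation above (illustrative values): an
 * 8-way cache with one partition, a 64-byte coherency line size, and a
 * cp_ecx of 2047 (2048 sets) works out to 8 * 1 * 64 * 2048 bytes, i.e. a
 * 1 MB L2 or L3 cache.
 */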
6732 
6733 /*
6734  * Walk the cacheinfo descriptor, applying 'func' to every valid element
6735  * The walk is terminated if the walker returns non-zero.
6736  */
6737 static void
6738 intel_walk_cacheinfo(struct cpuid_info *cpi,
6739     void *arg, int (*func)(void *, const struct cachetab *))
6740 {
6741 	const struct cachetab *ct;
6742 	struct cachetab des_49_ct, des_b1_ct;
6743 	uint8_t *dp;
6744 	int i;
6745 
6746 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6747 		return;
6748 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6749 		/*
6750 		 * For overloaded descriptor 0x49 we use cpuid function 4
6751 		 * if supported by the current processor, to create
6752 		 * cache information.
6753 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
6754 		 * to disambiguate the cache information.
6755 		 */
6756 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6757 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6758 			ct = &des_49_ct;
6759 		} else if (*dp == 0xb1) {
6760 			des_b1_ct.ct_code = 0xb1;
6761 			des_b1_ct.ct_assoc = 4;
6762 			des_b1_ct.ct_line_size = 0;
6763 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6764 				des_b1_ct.ct_size = 8;
6765 				des_b1_ct.ct_label = itlb2M_str;
6766 			} else {
6767 				des_b1_ct.ct_size = 4;
6768 				des_b1_ct.ct_label = itlb4M_str;
6769 			}
6770 			ct = &des_b1_ct;
6771 		} else {
6772 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6773 				continue;
6774 			}
6775 		}
6776 
6777 		if (func(arg, ct) != 0) {
6778 			break;
6779 		}
6780 	}
6781 }
6782 
6783 /*
6784  * (Like the Intel one, except for Cyrix CPUs)
6785  */
6786 static void
6787 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6788     void *arg, int (*func)(void *, const struct cachetab *))
6789 {
6790 	const struct cachetab *ct;
6791 	uint8_t *dp;
6792 	int i;
6793 
6794 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6795 		return;
6796 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6797 		/*
6798 		 * Search Cyrix-specific descriptor table first ..
6799 		 */
6800 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6801 			if (func(arg, ct) != 0)
6802 				break;
6803 			continue;
6804 		}
6805 		/*
6806 		 * .. else fall back to the Intel one
6807 		 */
6808 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6809 			if (func(arg, ct) != 0)
6810 				break;
6811 			continue;
6812 		}
6813 	}
6814 }
6815 
6816 /*
6817  * A cacheinfo walker that adds associativity, line-size, and size properties
6818  * to the devinfo node it is passed as an argument.
6819  */
6820 static int
6821 add_cacheent_props(void *arg, const struct cachetab *ct)
6822 {
6823 	dev_info_t *devi = arg;
6824 
6825 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6826 	if (ct->ct_line_size != 0)
6827 		add_cache_prop(devi, ct->ct_label, line_str,
6828 		    ct->ct_line_size);
6829 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6830 	return (0);
6831 }
6832 
6833 
6834 static const char fully_assoc[] = "fully-associative?";
6835 
6836 /*
6837  * AMD style cache/tlb description
6838  *
6839  * Extended functions 5 and 6 directly describe properties of
6840  * tlbs and various cache levels.
6841  */
6842 static void
6843 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6844 {
6845 	switch (assoc) {
6846 	case 0:	/* reserved; ignore */
6847 		break;
6848 	default:
6849 		add_cache_prop(devi, label, assoc_str, assoc);
6850 		break;
6851 	case 0xff:
6852 		add_cache_prop(devi, label, fully_assoc, 1);
6853 		break;
6854 	}
6855 }
6856 
6857 static void
6858 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6859 {
6860 	if (size == 0)
6861 		return;
6862 	add_cache_prop(devi, label, size_str, size);
6863 	add_amd_assoc(devi, label, assoc);
6864 }
6865 
6866 static void
6867 add_amd_cache(dev_info_t *devi, const char *label,
6868     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6869 {
6870 	if (size == 0 || line_size == 0)
6871 		return;
6872 	add_amd_assoc(devi, label, assoc);
6873 	/*
6874 	 * Most AMD parts have a sectored cache. Multiple cache lines are
6875 	 * associated with each tag. A sector consists of all cache lines
6876 	 * associated with a tag. For example, the AMD K6-III has a sector
6877 	 * size of 2 cache lines per tag.
6878 	 */
6879 	if (lines_per_tag != 0)
6880 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6881 	add_cache_prop(devi, label, line_str, line_size);
6882 	add_cache_prop(devi, label, size_str, size * 1024);
6883 }
6884 
6885 static void
6886 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6887 {
6888 	switch (assoc) {
6889 	case 0:	/* off */
6890 		break;
6891 	case 1:
6892 	case 2:
6893 	case 4:
6894 		add_cache_prop(devi, label, assoc_str, assoc);
6895 		break;
6896 	case 6:
6897 		add_cache_prop(devi, label, assoc_str, 8);
6898 		break;
6899 	case 8:
6900 		add_cache_prop(devi, label, assoc_str, 16);
6901 		break;
6902 	case 0xf:
6903 		add_cache_prop(devi, label, fully_assoc, 1);
6904 		break;
6905 	default: /* reserved; ignore */
6906 		break;
6907 	}
6908 }
6909 
6910 static void
6911 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6912 {
6913 	if (size == 0 || assoc == 0)
6914 		return;
6915 	add_amd_l2_assoc(devi, label, assoc);
6916 	add_cache_prop(devi, label, size_str, size);
6917 }
6918 
6919 static void
6920 add_amd_l2_cache(dev_info_t *devi, const char *label,
6921     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6922 {
6923 	if (size == 0 || assoc == 0 || line_size == 0)
6924 		return;
6925 	add_amd_l2_assoc(devi, label, assoc);
6926 	if (lines_per_tag != 0)
6927 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6928 	add_cache_prop(devi, label, line_str, line_size);
6929 	add_cache_prop(devi, label, size_str, size * 1024);
6930 }
6931 
6932 static void
6933 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6934 {
6935 	struct cpuid_regs *cp;
6936 
6937 	if (cpi->cpi_xmaxeax < 0x80000005)
6938 		return;
6939 	cp = &cpi->cpi_extd[5];
6940 
6941 	/*
6942 	 * 4M/2M L1 TLB configuration
6943 	 *
6944 	 * We report the size for 2M pages because AMD uses two
6945 	 * TLB entries for one 4M page.
6946 	 */
6947 	add_amd_tlb(devi, "dtlb-2M",
6948 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6949 	add_amd_tlb(devi, "itlb-2M",
6950 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6951 
6952 	/*
6953 	 * 4K L1 TLB configuration
6954 	 */
6955 
6956 	switch (cpi->cpi_vendor) {
6957 		uint_t nentries;
6958 	case X86_VENDOR_TM:
6959 		if (cpi->cpi_family >= 5) {
6960 			/*
6961 			 * Crusoe processors have 256 TLB entries, but
6962 			 * cpuid data format constrains them to only
6963 			 * reporting 255 of them.
6964 			 */
6965 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6966 				nentries = 256;
6967 			/*
6968 			 * Crusoe processors also have a unified TLB
6969 			 */
6970 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6971 			    nentries);
6972 			break;
6973 		}
6974 		/*FALLTHROUGH*/
6975 	default:
6976 		add_amd_tlb(devi, itlb4k_str,
6977 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6978 		add_amd_tlb(devi, dtlb4k_str,
6979 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6980 		break;
6981 	}
6982 
6983 	/*
6984 	 * data L1 cache configuration
6985 	 */
6986 
6987 	add_amd_cache(devi, l1_dcache_str,
6988 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6989 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6990 
6991 	/*
6992 	 * code L1 cache configuration
6993 	 */
6994 
6995 	add_amd_cache(devi, l1_icache_str,
6996 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6997 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6998 
6999 	if (cpi->cpi_xmaxeax < 0x80000006)
7000 		return;
7001 	cp = &cpi->cpi_extd[6];
7002 
7003 	/* Check for a unified L2 TLB for large pages */
7004 
7005 	if (BITX(cp->cp_eax, 31, 16) == 0)
7006 		add_amd_l2_tlb(devi, "l2-tlb-2M",
7007 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7008 	else {
7009 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
7010 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7011 		add_amd_l2_tlb(devi, "l2-itlb-2M",
7012 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7013 	}
7014 
7015 	/* Check for a unified L2 TLB for 4K pages */
7016 
7017 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
7018 		add_amd_l2_tlb(devi, "l2-tlb-4K",
7019 		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
7020 	} else {
7021 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
7022 		    BITX(cp->cp_ebx, 31, 28), BITX(cp->cp_ebx, 27, 16));
7023 		add_amd_l2_tlb(devi, "l2-itlb-4K",
7024 		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
7025 	}
7026 
7027 	add_amd_l2_cache(devi, l2_cache_str,
7028 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7029 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7030 }
7031 
7032 /*
7033  * There are two basic ways that the x86 world describes it cache
7034  * There are two basic ways that the x86 world describes its cache
7035  *
7036  * Return which flavor of cache architecture we should use
7037  */
7038 static int
7039 x86_which_cacheinfo(struct cpuid_info *cpi)
7040 {
7041 	switch (cpi->cpi_vendor) {
7042 	case X86_VENDOR_Intel:
7043 		if (cpi->cpi_maxeax >= 2)
7044 			return (X86_VENDOR_Intel);
7045 		break;
7046 	case X86_VENDOR_AMD:
7047 		/*
7048 		 * The K5 model 1 was the first part from AMD that reported
7049 		 * cache sizes via extended cpuid functions.
7050 		 */
7051 		if (cpi->cpi_family > 5 ||
7052 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7053 			return (X86_VENDOR_AMD);
7054 		break;
7055 	case X86_VENDOR_HYGON:
7056 		return (X86_VENDOR_AMD);
7057 	case X86_VENDOR_TM:
7058 		if (cpi->cpi_family >= 5)
7059 			return (X86_VENDOR_AMD);
7060 		/*FALLTHROUGH*/
7061 	default:
7062 		/*
7063 		 * If they have extended CPU data for 0x80000005
7064 		 * then we assume they have AMD-format cache
7065 		 * information.
7066 		 *
7067 		 * If not, and the vendor happens to be Cyrix,
7068 		 * then try our Cyrix-specific handler.
7069 		 *
7070 		 * If we're not Cyrix, then assume we're using Intel's
7071 		 * table-driven format instead.
7072 		 */
7073 		if (cpi->cpi_xmaxeax >= 0x80000005)
7074 			return (X86_VENDOR_AMD);
7075 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7076 			return (X86_VENDOR_Cyrix);
7077 		else if (cpi->cpi_maxeax >= 2)
7078 			return (X86_VENDOR_Intel);
7079 		break;
7080 	}
7081 	return (-1);
7082 }
7083 
7084 void
7085 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7086     struct cpuid_info *cpi)
7087 {
7088 	dev_info_t *cpu_devi;
7089 	int create;
7090 
7091 	cpu_devi = (dev_info_t *)dip;
7092 
7093 	/* device_type */
7094 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7095 	    "device_type", "cpu");
7096 
7097 	/* reg */
7098 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7099 	    "reg", cpu_id);
7100 
7101 	/* cpu-mhz, and clock-frequency */
7102 	if (cpu_freq > 0) {
7103 		long long mul;
7104 
7105 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7106 		    "cpu-mhz", cpu_freq);
7107 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7108 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7109 			    "clock-frequency", (int)mul);
7110 	}
7111 
7112 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7113 
7114 	/* vendor-id */
7115 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7116 	    "vendor-id", cpi->cpi_vendorstr);
7117 
7118 	if (cpi->cpi_maxeax == 0) {
7119 		return;
7120 	}
7121 
7122 	/*
7123 	 * family, model, and step
7124 	 */
7125 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7126 	    "family", CPI_FAMILY(cpi));
7127 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7128 	    "cpu-model", CPI_MODEL(cpi));
7129 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7130 	    "stepping-id", CPI_STEP(cpi));
7131 
7132 	/* type */
7133 	switch (cpi->cpi_vendor) {
7134 	case X86_VENDOR_Intel:
7135 		create = 1;
7136 		break;
7137 	default:
7138 		create = 0;
7139 		break;
7140 	}
7141 	if (create)
7142 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7143 		    "type", CPI_TYPE(cpi));
7144 
7145 	/* ext-family */
7146 	switch (cpi->cpi_vendor) {
7147 	case X86_VENDOR_Intel:
7148 	case X86_VENDOR_AMD:
7149 		create = cpi->cpi_family >= 0xf;
7150 		break;
7151 	case X86_VENDOR_HYGON:
7152 		create = 1;
7153 		break;
7154 	default:
7155 		create = 0;
7156 		break;
7157 	}
7158 	if (create)
7159 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7160 		    "ext-family", CPI_FAMILY_XTD(cpi));
7161 
7162 	/* ext-model */
7163 	switch (cpi->cpi_vendor) {
7164 	case X86_VENDOR_Intel:
7165 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7166 		break;
7167 	case X86_VENDOR_AMD:
7168 		create = CPI_FAMILY(cpi) == 0xf;
7169 		break;
7170 	case X86_VENDOR_HYGON:
7171 		create = 1;
7172 		break;
7173 	default:
7174 		create = 0;
7175 		break;
7176 	}
7177 	if (create)
7178 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7179 		    "ext-model", CPI_MODEL_XTD(cpi));
7180 
7181 	/* generation */
7182 	switch (cpi->cpi_vendor) {
7183 	case X86_VENDOR_AMD:
7184 	case X86_VENDOR_HYGON:
7185 		/*
7186 		 * AMD K5 model 1 was the first part to support this
7187 		 */
7188 		create = cpi->cpi_xmaxeax >= 0x80000001;
7189 		break;
7190 	default:
7191 		create = 0;
7192 		break;
7193 	}
7194 	if (create)
7195 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7196 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7197 
7198 	/* brand-id */
7199 	switch (cpi->cpi_vendor) {
7200 	case X86_VENDOR_Intel:
7201 		/*
7202 		 * brand id first appeared on Pentium III Xeon model 8 and
7203 		 * Celeron model 8 processors, and on Opteron
7204 		 */
7205 		create = cpi->cpi_family > 6 ||
7206 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7207 		break;
7208 	case X86_VENDOR_AMD:
7209 		create = cpi->cpi_family >= 0xf;
7210 		break;
7211 	case X86_VENDOR_HYGON:
7212 		create = 1;
7213 		break;
7214 	default:
7215 		create = 0;
7216 		break;
7217 	}
7218 	if (create && cpi->cpi_brandid != 0) {
7219 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7220 		    "brand-id", cpi->cpi_brandid);
7221 	}
7222 
7223 	/* chunks, and apic-id */
7224 	switch (cpi->cpi_vendor) {
7225 		/*
7226 		 * first available on Pentium IV and Opteron (K8)
7227 		 */
7228 	case X86_VENDOR_Intel:
7229 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7230 		break;
7231 	case X86_VENDOR_AMD:
7232 		create = cpi->cpi_family >= 0xf;
7233 		break;
7234 	case X86_VENDOR_HYGON:
7235 		create = 1;
7236 		break;
7237 	default:
7238 		create = 0;
7239 		break;
7240 	}
7241 	if (create) {
7242 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7243 		    "chunks", CPI_CHUNKS(cpi));
7244 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7245 		    "apic-id", cpi->cpi_apicid);
7246 		if (cpi->cpi_chipid >= 0) {
7247 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7248 			    "chip#", cpi->cpi_chipid);
7249 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7250 			    "clog#", cpi->cpi_clogid);
7251 		}
7252 	}
7253 
7254 	/* cpuid-features */
7255 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7256 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7257 
7258 
7259 	/* cpuid-features-ecx */
7260 	switch (cpi->cpi_vendor) {
7261 	case X86_VENDOR_Intel:
7262 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7263 		break;
7264 	case X86_VENDOR_AMD:
7265 		create = cpi->cpi_family >= 0xf;
7266 		break;
7267 	case X86_VENDOR_HYGON:
7268 		create = 1;
7269 		break;
7270 	default:
7271 		create = 0;
7272 		break;
7273 	}
7274 	if (create)
7275 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7276 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7277 
7278 	/* ext-cpuid-features */
7279 	switch (cpi->cpi_vendor) {
7280 	case X86_VENDOR_Intel:
7281 	case X86_VENDOR_AMD:
7282 	case X86_VENDOR_HYGON:
7283 	case X86_VENDOR_Cyrix:
7284 	case X86_VENDOR_TM:
7285 	case X86_VENDOR_Centaur:
7286 		create = cpi->cpi_xmaxeax >= 0x80000001;
7287 		break;
7288 	default:
7289 		create = 0;
7290 		break;
7291 	}
7292 	if (create) {
7293 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7294 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7295 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7296 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7297 	}
7298 
7299 	/*
7300 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7301 	 * model 1, and Cyrix GXm.  On earlier models we try and
7302 	 * model 1, and Cyrix GXm.  On earlier models we try to
7303 	 * simulate something similar ... so this string should always
7304 	 * say -something- about the processor, however lame.
7305 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7306 	    "brand-string", cpi->cpi_brandstr);
7307 
7308 	/*
7309 	 * Finally, cache and tlb information
7310 	 */
7311 	switch (x86_which_cacheinfo(cpi)) {
7312 	case X86_VENDOR_Intel:
7313 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7314 		break;
7315 	case X86_VENDOR_Cyrix:
7316 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7317 		break;
7318 	case X86_VENDOR_AMD:
7319 		amd_cache_info(cpi, cpu_devi);
7320 		break;
7321 	default:
7322 		break;
7323 	}
7324 }
7325 
7326 struct l2info {
7327 	int *l2i_csz;
7328 	int *l2i_lsz;
7329 	int *l2i_assoc;
7330 	int l2i_ret;
7331 };
7332 
7333 /*
7334  * A cacheinfo walker that fetches the size, line-size and associativity
7335  * of the L2 cache
7336  */
7337 static int
7338 intel_l2cinfo(void *arg, const struct cachetab *ct)
7339 {
7340 	struct l2info *l2i = arg;
7341 	int *ip;
7342 
7343 	if (ct->ct_label != l2_cache_str &&
7344 	    ct->ct_label != sl2_cache_str)
7345 		return (0);	/* not an L2 -- keep walking */
7346 
7347 	if ((ip = l2i->l2i_csz) != NULL)
7348 		*ip = ct->ct_size;
7349 	if ((ip = l2i->l2i_lsz) != NULL)
7350 		*ip = ct->ct_line_size;
7351 	if ((ip = l2i->l2i_assoc) != NULL)
7352 		*ip = ct->ct_assoc;
7353 	l2i->l2i_ret = ct->ct_size;
7354 	return (1);		/* was an L2 -- terminate walk */
7355 }
7356 
7357 /*
7358  * AMD L2/L3 Cache and TLB Associativity Field Definition:
7359  *
7360  *	Unlike the associativity for the L1 cache and tlb where the 8 bit
7361  *	value is the associativity, the associativity for the L2 cache and
7362  *	tlb is encoded in the following table. The 4 bit L2 value serves as
7363  *	an index into the amd_afd[] array to determine the associativity.
7364  *	-1 is undefined. 0 is fully associative.
7365  */
7366 
7367 static int amd_afd[] =
7368 	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
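
/*
 * Worked example of the encoding above: an L2/L3 associativity field of 6
 * indexes amd_afd[6] == 8, i.e. 8-way set associative, while a field of 0xf
 * indexes amd_afd[0xf] == 0, i.e. fully associative.  (add_amd_l2_assoc()
 * decodes the same 4-bit field directly when creating devinfo properties.)
 */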
7369 
7370 static void
7371 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7372 {
7373 	struct cpuid_regs *cp;
7374 	uint_t size, assoc;
7375 	int i;
7376 	int *ip;
7377 
7378 	if (cpi->cpi_xmaxeax < 0x80000006)
7379 		return;
7380 	cp = &cpi->cpi_extd[6];
7381 
7382 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7383 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7384 		uint_t cachesz = size * 1024;
7385 		assoc = amd_afd[i];
7386 
7387 		ASSERT(assoc != -1);
7388 
7389 		if ((ip = l2i->l2i_csz) != NULL)
7390 			*ip = cachesz;
7391 		if ((ip = l2i->l2i_lsz) != NULL)
7392 			*ip = BITX(cp->cp_ecx, 7, 0);
7393 		if ((ip = l2i->l2i_assoc) != NULL)
7394 			*ip = assoc;
7395 		l2i->l2i_ret = cachesz;
7396 	}
7397 }
7398 
7399 int
7400 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7401 {
7402 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7403 	struct l2info __l2info, *l2i = &__l2info;
7404 
7405 	l2i->l2i_csz = csz;
7406 	l2i->l2i_lsz = lsz;
7407 	l2i->l2i_assoc = assoc;
7408 	l2i->l2i_ret = -1;
7409 
7410 	switch (x86_which_cacheinfo(cpi)) {
7411 	case X86_VENDOR_Intel:
7412 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7413 		break;
7414 	case X86_VENDOR_Cyrix:
7415 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7416 		break;
7417 	case X86_VENDOR_AMD:
7418 		amd_l2cacheinfo(cpi, l2i);
7419 		break;
7420 	default:
7421 		break;
7422 	}
7423 	return (l2i->l2i_ret);
7424 }
7425 
7426 #if !defined(__xpv)
7427 
7428 uint32_t *
7429 cpuid_mwait_alloc(cpu_t *cpu)
7430 {
7431 	uint32_t	*ret;
7432 	size_t		mwait_size;
7433 
7434 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7435 
7436 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7437 	if (mwait_size == 0)
7438 		return (NULL);
7439 
7440 	/*
7441 	 * kmem_alloc() returns cache line size aligned data for mwait_size
7442 	 * allocations.  mwait_size is currently cache line sized.  Neither
7443 	 * of these implementation details is guaranteed to be true in the
7444 	 * future.
7445 	 *
7446 	 * First try allocating mwait_size as kmem_alloc() currently returns
7447 	 * correctly aligned memory.  If kmem_alloc() does not return
7448 	 * mwait_size-aligned memory, then allocate twice as much and round up.
7449 	 *
7450 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7451 	 * decide to free this memory.
7452 	 */
7453 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7454 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7455 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7456 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7457 		*ret = MWAIT_RUNNING;
7458 		return (ret);
7459 	} else {
7460 		kmem_free(ret, mwait_size);
7461 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7462 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7463 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7464 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7465 		*ret = MWAIT_RUNNING;
7466 		return (ret);
7467 	}
7468 }
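
/*
 * Worked example of the alignment fallback above (illustrative numbers): with
 * an mwait_size of 64, a kmem_zalloc() pointer ending in 0x40 is already
 * 64-byte aligned and is returned as-is.  A pointer ending in 0x48 is not, so
 * that buffer is freed, 128 bytes are allocated instead, and P2ROUNDUP()
 * advances the returned pointer to the next 64-byte boundary within the
 * larger buffer.
 */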
7469 
7470 void
7471 cpuid_mwait_free(cpu_t *cpu)
7472 {
7473 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7474 		return;
7475 	}
7476 
7477 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7478 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7479 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7480 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7481 	}
7482 
7483 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7484 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7485 }
7486 
7487 void
7488 patch_tsc_read(int flag)
7489 {
7490 	size_t cnt;
7491 
7492 	switch (flag) {
7493 	case TSC_NONE:
7494 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7495 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7496 		break;
7497 	case TSC_RDTSC_LFENCE:
7498 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7499 		(void) memcpy((void *)tsc_read,
7500 		    (void *)&_tsc_lfence_start, cnt);
7501 		break;
7502 	case TSC_TSCP:
7503 		cnt = &_tscp_end - &_tscp_start;
7504 		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7505 		break;
7506 	default:
7507 		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7508 		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7509 		break;
7510 	}
7511 	tsc_type = flag;
7512 }
7513 
7514 int
7515 cpuid_deep_cstates_supported(void)
7516 {
7517 	struct cpuid_info *cpi;
7518 	struct cpuid_regs regs;
7519 
7520 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7521 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7522 
7523 	cpi = CPU->cpu_m.mcpu_cpi;
7524 
7525 	switch (cpi->cpi_vendor) {
7526 	case X86_VENDOR_Intel:
7527 		if (cpi->cpi_xmaxeax < 0x80000007)
7528 			return (0);
7529 
7530 		/*
7531 		 * Does TSC run at a constant rate in all C-states?
7532 		 */
7533 		regs.cp_eax = 0x80000007;
7534 		(void) __cpuid_insn(&regs);
7535 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7536 
7537 	default:
7538 		return (0);
7539 	}
7540 }
7541 
7542 #endif	/* !__xpv */
7543 
7544 void
7545 post_startup_cpu_fixups(void)
7546 {
7547 #ifndef __xpv
7548 	/*
7549 	 * Some AMD processors support C1E state. Entering this state will
7550 	 * cause the local APIC timer to stop, which we can't deal with at
7551 	 * this time.
7552 	 */
7553 	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7554 		on_trap_data_t otd;
7555 		uint64_t reg;
7556 
7557 		if (!on_trap(&otd, OT_DATA_ACCESS)) {
7558 			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7559 			/* Disable C1E state if it is enabled by BIOS */
7560 			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7561 			    AMD_ACTONCMPHALT_MASK) {
7562 				reg &= ~(AMD_ACTONCMPHALT_MASK <<
7563 				    AMD_ACTONCMPHALT_SHIFT);
7564 				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7565 			}
7566 		}
7567 		no_trap();
7568 	}
7569 #endif	/* !__xpv */
7570 }
7571 
7572 void
7573 enable_pcid(void)
7574 {
7575 	if (x86_use_pcid == -1)
7576 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7577 
7578 	if (x86_use_invpcid == -1) {
7579 		x86_use_invpcid = is_x86_feature(x86_featureset,
7580 		    X86FSET_INVPCID);
7581 	}
7582 
7583 	if (!x86_use_pcid)
7584 		return;
7585 
7586 	/*
7587 	 * Intel says that on setting PCIDE, it immediately starts using the PCID
7588 	 * bits; better make sure there's nothing there.
7589 	 */
7590 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7591 
7592 	setcr4(getcr4() | CR4_PCIDE);
7593 }
7594 
7595 /*
7596  * Set up the necessary registers to enable the XSAVE feature on this processor.
7597  * This function needs to be called early enough, so that no xsave/xrstor
7598  * ops will execute on the processor before the MSRs are properly set up.
7599  *
7600  * Current implementation has the following assumption:
7601  * - cpuid_pass_basic() is done, so that X86 features are known.
7602  * - fpu_probe() is done, so that fp_save_mech is chosen.
7603  */
7604 void
7605 xsave_setup_msr(cpu_t *cpu)
7606 {
7607 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7608 	ASSERT(fp_save_mech == FP_XSAVE);
7609 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7610 
7611 	/* Enable OSXSAVE in CR4. */
7612 	setcr4(getcr4() | CR4_OSXSAVE);
7613 	/*
7614 	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7615 	 * the correct value.
7616 	 */
7617 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7618 	setup_xfem();
7619 }
7620 
7621 /*
7622  * Starting with the Westmere processor the local
7623  * APIC timer will continue running in all C-states,
7624  * including the deepest C-states.
7625  */
7626 int
7627 cpuid_arat_supported(void)
7628 {
7629 	struct cpuid_info *cpi;
7630 	struct cpuid_regs regs;
7631 
7632 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7633 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7634 
7635 	cpi = CPU->cpu_m.mcpu_cpi;
7636 
7637 	switch (cpi->cpi_vendor) {
7638 	case X86_VENDOR_Intel:
7639 		/*
7640 		 * Always-running Local APIC Timer is
7641 		 * indicated by CPUID.6.EAX[2].
7642 		 */
7643 		if (cpi->cpi_maxeax >= 6) {
7644 			regs.cp_eax = 6;
7645 			(void) cpuid_insn(NULL, &regs);
7646 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7647 		} else {
7648 			return (0);
7649 		}
7650 	default:
7651 		return (0);
7652 	}
7653 }
7654 
7655 /*
7656  * Check support for Intel ENERGY_PERF_BIAS feature
7657  */
7658 int
7659 cpuid_iepb_supported(struct cpu *cp)
7660 {
7661 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7662 	struct cpuid_regs regs;
7663 
7664 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7665 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7666 
7667 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7668 		return (0);
7669 	}
7670 
7671 	/*
7672 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
7673 	 * capability bit CPUID.6.ECX.3
7674 	 */
7675 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7676 		return (0);
7677 
7678 	regs.cp_eax = 0x6;
7679 	(void) cpuid_insn(NULL, &regs);
7680 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7681 }
7682 
7683 /*
7684  * Check support for TSC deadline timer
7685  *
7686  * The TSC deadline timer provides a software programming model superior
7687  * to the local APIC timer, one that eliminates "time drifts".
7688  * Instead of specifying a relative time, software specifies an
7689  * absolute time as the target at which the processor should
7690  * generate a timer event.
7691  */
7692 int
7693 cpuid_deadline_tsc_supported(void)
7694 {
7695 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7696 	struct cpuid_regs regs;
7697 
7698 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7699 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7700 
7701 	switch (cpi->cpi_vendor) {
7702 	case X86_VENDOR_Intel:
7703 		if (cpi->cpi_maxeax >= 1) {
7704 			regs.cp_eax = 1;
7705 			(void) cpuid_insn(NULL, &regs);
7706 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7707 		} else {
7708 			return (0);
7709 		}
7710 	default:
7711 		return (0);
7712 	}
7713 }
7714 
7715 #if !defined(__xpv)
7716 /*
7717  * Patch in versions of bcopy for high performance Intel Nhm processors
7718  * and later...
7719  */
7720 void
7721 patch_memops(uint_t vendor)
7722 {
7723 	size_t cnt, i;
7724 	caddr_t to, from;
7725 
7726 	if ((vendor == X86_VENDOR_Intel) &&
7727 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7728 		cnt = &bcopy_patch_end - &bcopy_patch_start;
7729 		to = &bcopy_ck_size;
7730 		from = &bcopy_patch_start;
7731 		for (i = 0; i < cnt; i++) {
7732 			*to++ = *from++;
7733 		}
7734 	}
7735 }
7736 #endif  /*  !__xpv */
7737 
7738 /*
7739  * We're being asked to tell the system how many bits are required to represent
7740  * the various core and strand IDs. While it's tempting to derive this based
7741  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7742  * correct. Instead, this needs to be based on the number of bits that the APIC
7743  * allows for these different configurations. We only update these to a larger
7744  * value if we find one.
7745  */
7746 void
7747 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7748 {
7749 	struct cpuid_info *cpi;
7750 
7751 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7752 	cpi = cpu->cpu_m.mcpu_cpi;
7753 
7754 	if (cpi->cpi_ncore_bits > *core_nbits) {
7755 		*core_nbits = cpi->cpi_ncore_bits;
7756 	}
7757 
7758 	if (cpi->cpi_nthread_bits > *strand_nbits) {
7759 		*strand_nbits = cpi->cpi_nthread_bits;
7760 	}
7761 }
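
/*
 * Illustrative sketch only: a caller sizing APIC ID fields across all CPUs
 * would seed both counts at zero and let each call grow them. The
 * apic_id_mask variable is hypothetical.
 *
 *	uint_t core_nbits = 0, strand_nbits = 0;
 *
 *	cpuid_get_ext_topo(cp, &core_nbits, &strand_nbits);
 *	apic_id_mask = (1U << (core_nbits + strand_nbits)) - 1;
 */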
7762 
7763 void
7764 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7765 {
7766 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7767 	struct cpuid_regs cp;
7768 
7769 	/*
7770 	 * Reread the CPUID portions that we need for various security
7771 	 * information.
7772 	 */
7773 	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7774 		/*
7775 		 * Check if we now have leaf 7 available to us.
7776 		 */
7777 		if (cpi->cpi_maxeax < 7) {
7778 			bzero(&cp, sizeof (cp));
7779 			cp.cp_eax = 0;
7780 			cpi->cpi_maxeax = __cpuid_insn(&cp);
7781 			if (cpi->cpi_maxeax < 7)
7782 				return;
7783 		}
7784 
7785 		bzero(&cp, sizeof (cp));
7786 		cp.cp_eax = 7;
7787 		cp.cp_ecx = 0;
7788 		(void) __cpuid_insn(&cp);
7789 		cpi->cpi_std[7] = cp;
7790 	} else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
7791 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
7792 		/* No xcpuid support */
7793 		if (cpi->cpi_family < 5 ||
7794 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7795 			return;
7796 
7797 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7798 			bzero(&cp, sizeof (cp));
7799 			cp.cp_eax = CPUID_LEAF_EXT_0;
7800 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7801 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7802 				return;
7803 			}
7804 		}
7805 
7806 		/*
7807 	 * Most of the AMD features we care about are in extended leaf 8;
7808 	 * Automatic IBRS was added in extended leaf 0x21, so we check that too.
7809 		 */
7810 		bzero(&cp, sizeof (cp));
7811 		cp.cp_eax = CPUID_LEAF_EXT_8;
7812 		(void) __cpuid_insn(&cp);
7813 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7814 		cpi->cpi_extd[8] = cp;
7815 
7816 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21) {
7817 			return;
7818 		}
7819 
7820 		bzero(&cp, sizeof (cp));
7821 		cp.cp_eax = CPUID_LEAF_EXT_21;
7822 		(void) __cpuid_insn(&cp);
7823 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
7824 		cpi->cpi_extd[0x21] = cp;
7825 	} else {
7826 		/*
7827 		 * Nothing to do here. Return an empty set which has already
7828 		 * been zeroed for us.
7829 		 */
7830 		return;
7831 	}
7832 	cpuid_scan_security(cpu, fset);
7833 }
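
/*
 * To summarize the refresh above: for Intel we re-read standard leaf 7
 * into cpi_std[7]; for AMD and Hygon we re-read extended leaf 8 into
 * cpi_extd[8] and, when the processor reports it, extended leaf 0x21 into
 * cpi_extd[0x21]. Everything else keeps the values captured by the earlier
 * passes, and cpuid_scan_security() then recomputes the security features.
 */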
7834 
7835 /* ARGSUSED */
7836 static int
7837 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7838 {
7839 	uchar_t *fset;
7840 	boolean_t first_pass = (boolean_t)arg1;
7841 
7842 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7843 	if (first_pass && CPU->cpu_id != 0)
7844 		return (0);
7845 	if (!first_pass && CPU->cpu_id == 0)
7846 		return (0);
7847 	cpuid_pass_ucode(CPU, fset);
7848 
7849 	return (0);
7850 }
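
/*
 * The cross-call argument is one buffer carved into per-CPU slices of
 * sizeof (x86_featureset) bytes, which the handler above indexes by
 * CPU id:
 *
 *	+-----------+-----------+-----------+-- ... --+
 *	| CPU 0 set | CPU 1 set | CPU 2 set |         |
 *	+-----------+-----------+-----------+-- ... --+
 *
 * On the first pass only the boot CPU (cpu_id 0) runs cpuid_pass_ucode();
 * the second pass covers every other CPU.
 */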
7851 
7852 /*
7853  * After a microcode update where the version has changed, we need to rescan
7854  * CPUID. To do this we check every CPU to make sure that they all have the
7855  * same microcode, then perform a cross call to all such CPUs. It's the
7856  * caller's job to make sure that no one else can end up doing an update while
7857  * this is going on.
7858  *
7859  * We assume that the system is microcode capable if we're called.
7860  */
7861 void
7862 cpuid_post_ucodeadm(void)
7863 {
7864 	uint32_t rev;
7865 	int i;
7866 	struct cpu *cpu;
7867 	cpuset_t cpuset;
7868 	void *argdata;
7869 	uchar_t *f0;
7870 
7871 	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7872 
7873 	mutex_enter(&cpu_lock);
7874 	cpu = cpu_get(0);
7875 	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7876 	CPUSET_ONLY(cpuset, 0);
7877 	for (i = 1; i < max_ncpus; i++) {
7878 		if ((cpu = cpu_get(i)) == NULL)
7879 			continue;
7880 
7881 		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7882 			panic("post microcode update CPU %d has differing "
7883 			    "microcode revision (%u) from CPU 0 (%u)",
7884 			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7885 		}
7886 		CPUSET_ADD(cpuset, i);
7887 	}
7888 
7889 	/*
7890 	 * We do the cross calls in two passes. The first pass is only for the
7891 	 * boot CPU. The second pass is for all of the other CPUs. This allows
7892 	 * the boot CPU to go through and change behavior related to patching or
7893 	 * whether or not Enhanced IBRS needs to be enabled, and then lets all
7894 	 * of the other CPUs follow suit.
7895 	 */
7896 	kpreempt_disable();
7897 	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7898 	    cpuid_post_ucodeadm_xc);
7899 	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7900 	    cpuid_post_ucodeadm_xc);
7901 	kpreempt_enable();
7902 
7903 	/*
7904 	 * OK, now check that each CPU's feature set matches the boot CPU's.
7905 	 */
7906 	f0 = argdata;
7907 	for (i = 1; i < max_ncpus; i++) {
7908 		uchar_t *fset;
7909 		if (!CPU_IN_SET(cpuset, i))
7910 			continue;
7911 
7912 		fset = (uchar_t *)((uintptr_t)argdata +
7913 		    sizeof (x86_featureset) * i);
7914 
7915 		if (!compare_x86_featureset(f0, fset)) {
7916 			panic("Post microcode update CPU %d has "
7917 			    "differing security feature (%p) set from CPU 0 "
7918 			    "(%p), not appending to feature set", i,
7919 			    (void *)fset, (void *)f0);
7920 		}
7921 	}
7922 
7923 	mutex_exit(&cpu_lock);
7924 
7925 	for (i = 0; i < NUM_X86_FEATURES; i++) {
7926 		if (is_x86_feature(f0, i)) {
7927 			cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
7928 			    x86_feature_names[i]);
7929 			add_x86_feature(x86_featureset, i);
7930 		}
7931 	}
7932 	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
7933 }
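
/*
 * Illustrative sketch only: a microcode update path would apply the new
 * microcode to every CPU first and only then trigger the rescan, while
 * preventing any concurrent update as described above. The
 * apply_ucode_to_all_cpus() helper is hypothetical.
 *
 *	if (apply_ucode_to_all_cpus(ucode_file) == 0)
 *		cpuid_post_ucodeadm();
 */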
7934 
7935 typedef void (*cpuid_pass_f)(cpu_t *, void *);
7936 
7937 typedef struct cpuid_pass_def {
7938 	cpuid_pass_t cpd_pass;
7939 	cpuid_pass_f cpd_func;
7940 } cpuid_pass_def_t;
7941 
7942 /*
7943  * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
7944  * normal sense and should not appear here.
7945  */
7946 static const cpuid_pass_def_t cpuid_pass_defs[] = {
7947 	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
7948 	{ CPUID_PASS_IDENT, cpuid_pass_ident },
7949 	{ CPUID_PASS_BASIC, cpuid_pass_basic },
7950 	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
7951 	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
7952 	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
7953 };
7954 
7955 void
7956 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
7957 {
7958 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
7959 
7960 	if (cp == NULL)
7961 		cp = CPU;
7962 
7963 	/*
7964 	 * Space is statically allocated for the BSP; ensure the pointer is set.
7965 	 */
7966 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
7967 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
7968 
7969 	ASSERT(cpuid_checkpass(cp, pass - 1));
7970 
7971 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
7972 		if (cpuid_pass_defs[i].cpd_pass == pass) {
7973 			cpuid_pass_defs[i].cpd_func(cp, arg);
7974 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
7975 			return;
7976 		}
7977 	}
7978 
7979 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
7980 	    pass, cp->cpu_id);
7981 }
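
/*
 * Illustrative sketch only: startup code runs the passes in the order they
 * appear in cpuid_pass_defs, one at a time, e.g. for the current CPU:
 *
 *	cpuid_execpass(NULL, CPUID_PASS_PRELUDE, NULL);
 *	cpuid_execpass(NULL, CPUID_PASS_IDENT, NULL);
 *	cpuid_execpass(NULL, CPUID_PASS_BASIC, NULL);
 *
 * Passing a NULL cpu_t means "the current CPU"; the third argument is
 * pass-specific and NULL here is purely for illustration. The ASSERT above
 * enforces that the preceding pass has already completed.
 */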
7982 
7983 /*
7984  * Extract the processor family from a chiprev.  Processor families are not the
7985  * same as cpuid families; see comments above and in x86_archext.h.
7986  */
7987 x86_processor_family_t
7988 chiprev_family(const x86_chiprev_t cr)
7989 {
7990 	return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
7991 }
7992 
7993 /*
7994  * A chiprev matches its template if the vendor and family are identical and the
7995  * revision of the chiprev matches one of the bits set in the template.  Callers
7996  * may bitwise-OR together chiprevs of the same vendor and family to form the
7997  * template, or use the _ANY variant.  It is not possible to match chiprevs of
7998  * multiple vendors or processor families with a single call.  Note that this
7999  * function operates on processor families, not cpuid families.
8000  */
8001 boolean_t
8002 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
8003 {
8004 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
8005 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
8006 	    (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
8007 }
8008 
8009 /*
8010  * A chiprev is at least min if the vendor and family are identical and the
8011  * revision of the chiprev is at least as recent as that of min.  Processor
8012  * families are considered unordered and cannot be compared using this function.
8013  * Note that this function operates on processor families, not cpuid families.
8014  * Use of the _ANY chiprev variant with this function is not useful; it will
8015  * always return B_FALSE if the _ANY variant is supplied as the minimum
8016  * revision.  To determine only whether a chiprev is of a given processor
8017  * family, test the return value of chiprev_family() instead.
8018  */
8019 boolean_t
8020 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
8021 {
8022 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
8023 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
8024 	    _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
8025 }
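
/*
 * Illustrative sketch only: callers generally compare against chiprev
 * constants from x86_archext.h. The X86_CHIPREV_AMD_EXAMPLE_* names below
 * are placeholders rather than real identifiers, and cpuid_getchiprev() is
 * assumed to be the accessor that yields a CPU's x86_chiprev_t.
 *
 *	x86_chiprev_t cr = cpuid_getchiprev(CPU);
 *
 *	if (chiprev_matches(cr,
 *	    X86_CHIPREV_AMD_EXAMPLE_A0 | X86_CHIPREV_AMD_EXAMPLE_B0)) {
 *		... apply an erratum workaround for those revisions ...
 *	}
 *
 *	if (chiprev_at_least(cr, X86_CHIPREV_AMD_EXAMPLE_B0)) {
 *		... rely on a fix first present in revision B0 ...
 *	}
 */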
8026 
8027 /*
8028  * The uarch functions operate in a manner similar to the chiprev functions
8029  * above.  While it is tempting to allow these to operate on microarchitectures
8030  * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
8031  * than ZEN2), we elect not to do so because a manufacturer may supply
8032  * processors of multiple different microarchitecture families, each of which
8033  * may be internally ordered but unordered with respect to the others.
8034  */
8035 x86_uarch_t
8036 uarchrev_uarch(const x86_uarchrev_t ur)
8037 {
8038 	return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8039 }
8040 
8041 boolean_t
8042 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8043 {
8044 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8045 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8046 	    (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8047 }
8048 
8049 boolean_t
8050 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8051 {
8052 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8053 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8054 	    _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8055 }
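
/*
 * Illustrative sketch only: uarchrev consumers typically branch on the
 * microarchitecture itself rather than trying to order across families.
 * The X86_UARCH_AMD_ZEN3 value and the cpuid_getuarchrev() accessor are
 * assumed here for illustration.
 *
 *	switch (uarchrev_uarch(cpuid_getuarchrev(CPU))) {
 *	case X86_UARCH_AMD_ZEN3:
 *		... enable a Zen 3 specific tuning ...
 *		break;
 *	default:
 *		break;
 *	}
 */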
8056