xref: /illumos-gate/usr/src/uts/intel/os/cpuid.c (revision a65c38a3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2023 Oxide Computer Company
28  * Copyright 2022 MNX Cloud, Inc.
29  */
30 /*
31  * Copyright (c) 2010, Intel Corporation.
32  * All rights reserved.
33  */
34 /*
35  * Portions Copyright 2009 Advanced Micro Devices, Inc.
36  */
37 
38 /*
39  * CPU Identification logic
40  *
41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42  * with the identification of CPUs, their features, and their topologies. More
43  * specifically, this file helps drive the following:
44  *
45  * 1. Enumeration of features of the processor which are used by the kernel to
46  *    determine what features to enable or disable. These may be instruction set
47  *    enhancements or features that we use.
48  *
49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
50  *    will be told about through the auxiliary vector.
51  *
52  * 3. Understanding the physical topology of the CPU such as the number of
53  *    caches, how many cores it has, whether or not it supports simultaneous
54  *    multi-threading (SMT), etc.
55  *
56  * ------------------------
57  * CPUID History and Basics
58  * ------------------------
59  *
60  * The cpuid instruction was added by Intel roughly around the time that the
61  * original Pentium was introduced. The purpose of cpuid was to tell in a
62  * programmatic fashion information about the CPU that previously was guessed
63  * at. For example, an important part of cpuid is that we can know what
64  * extensions to the ISA exist. If you use an invalid opcode you would get a
65  * #UD, so this method allows a program (whether a user program or the kernel)
66  * to determine what exists without crashing or getting a SIGILL. Of course,
67  * this was also during the era of the clones and the AMD Am5x86. The vendor
68  * name shows up first in cpuid for a reason.
69  *
70  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72  * its own meaning. The different leaves are broken down into different regions:
73  *
74  *	[ 0, 7fffffff ]			This region is called the 'basic'
75  *					region. This region is generally defined
76  *					by Intel, though some of the original
77  *					portions have different meanings based
78  *					on the manufacturer. These days, Intel
79  *					adds most new features to this region.
80  *					AMD adds non-Intel compatible
81  *					information in the third, extended
82  *					region. Intel uses this for everything
83  *					including ISA extensions, CPU
84  *					features, cache information, topology,
85  *					and more.
86  *
87  *					There is a hole carved out of this
88  *					region which is reserved for
89  *					hypervisors.
90  *
91  *	[ 40000000, 4fffffff ]		This region, which is found in the
92  *					middle of the previous region, is
93  *					explicitly promised to never be used by
94  *					CPUs. Instead, it is used by hypervisors
95  *					to communicate information about
96  *					themselves to the operating system. The
97  *					values and details are unique for each
98  *					hypervisor.
99  *
100  *	[ 80000000, ffffffff ]		This region is called the 'extended'
101  *					region. Some of the low leaves mirror
102  *					parts of the basic leaves. This region
103  *					has generally been used by AMD for
104  *					various extensions. For example, AMD-
105  *					specific information about caches,
106  *					features, and topology are found in this
107  *					region.
108  *
109  * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111  * the ranges, one of the primary things returned is the maximum valid leaf in
112  * that range. This allows for discovery of what range of CPUID is valid.
113  *
114  * The CPUs have potentially surprising behavior when using an invalid leaf or
115  * unimplemented leaf. If the requested leaf is within the valid basic or
116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117  * set to zero. However, if you specify a leaf that is outside of a valid range,
118  * then instead it will be filled with the last valid _basic_ leaf. For example,
119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120  * an invalid extended leaf will return the information for leaf 3.
121  *
122  * Some leaves are broken down into sub-leaves. This means that the value
123  * depends on both the leaf asked for in %eax and a secondary register. For
124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125  * additional information. Or when getting topology information in leaf 0xb, the
126  * initial value in %ecx changes which level of the topology that you are
127  * getting information about.
128  *
129  * cpuid values are always kept to 32 bits regardless of whether or not the
130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
131  * 32 bits of the register are always set to zero so that way the values are the
132  * same regardless of execution mode.
133  *
134  * ----------------------
135  * Identifying Processors
136  * ----------------------
137  *
138  * We can identify a processor in two steps. The first step looks at cpuid leaf
139  * 0. Leaf 0 contains the processor's vendor information. This is done by
140  * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
141  * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
142  *
143  * From there, a processor is identified by a combination of three different
144  * values:
145  *
146  *  1. Family
147  *  2. Model
148  *  3. Stepping
149  *
150  * Each vendor uses the family and model to uniquely identify a processor. The
151  * way that family and model are changed depends on the vendor. For example,
152  * Intel has been using family 0x6 for almost all of their processors since
153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
154  * identify the exact processor. Different models are often used for the client
155  * (consumer) and server parts. Even though each processor often has major
156  * architectural differences, they still are considered the same family by
157  * Intel.
158  *
159  * On the other hand, each major AMD architecture generally has its own family.
160  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
161  * the model number is used to help identify specific processors.  As AMD's
162  * product lines have expanded, they have started putting a mixed bag of
163  * processors into the same family, with each processor under a single
164  * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
165  * refer to each such collection as a processor family, distinct from cpuid
166  * family.  Importantly, each processor family has a BIOS and Kernel Developer's
167  * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168  * defines the processor family's non-architectural features.  In general, we'll
169  * use "family" here to mean the family number reported by the cpuid instruction
170  * and distinguish the processor family from it where appropriate.
171  *
172  * The stepping is used to refer to a revision of a specific microprocessor. The
173  * term comes from equipment used to produce masks that are used to create
174  * integrated circuits.
175  *
176  * The information is present in leaf 1, %eax. In technical documentation you
177  * will see the terms extended model and extended family. The original family,
178  * model, and stepping fields were each 4 bits wide. If the values in either
179  * are 0xf, then one is to consult the extended model and extended family, which
180  * take previously reserved bits and allow for a larger number of models and add
181  * 0xf to them.
182  *
183  * When we process this information, we store the full family, model, and
184  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185  * cpi_step, respectively. Whenever you are performing comparisons with the
186  * family, model, and stepping, you should use these members and not the raw
187  * values from cpuid. If you must use the raw values from cpuid directly, you
188  * must make sure that you add the extended model and family to the base model
189  * and family.
190  *
191  * In general, we do not use information about the family, model, and stepping
192  * to determine whether or not a feature is present; that is generally driven by
193  * specific leaves. However, when something we care about on the processor is
194  * not considered 'architectural' meaning that it is specific to a set of
195  * processors and not promised in the architecture model to be consistent from
196  * generation to generation, then we will fall back on this information. The
197  * most common cases where this comes up are when we have to work around errata
198  * in the processor, are dealing with processor-specific features such as CPU
199  * performance counters, or we want to provide additional information for things
200  * such as fault management.
201  *
202  * While processors also do have a brand string, which is the name that people
203  * are familiar with when buying the processor, it is not meant for
204  * programmatic consumption. That is what the family, model, and stepping are
205  * for.
206  *
207  * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208  * and stepping(s) that refer to a single or very closely related set of silicon
209  * implementations; while there are sometimes more specific ways to learn of the
210  * presence or absence of a particular erratum or workaround, one may generally
211  * assume that all processors of the same chiprev have the same errata and we
212  * have chosen to represent them this way precisely because that is how AMD
213  * groups them in their revision guides (errata documentation).  The processor
214  * family (x86_processor_family_t) may be extracted from the chiprev if that
215  * level of detail is not needed.  Processor families are considered unordered
216  * but revisions within a family may be compared for either an exact match or at
217  * least as recent as a reference revision.  See the chiprev_xxx() functions
218  * below.
219  *
220  * Similarly, each processor family implements a particular microarchitecture,
221  * which itself may have multiple revisions.  In general, non-architectural
222  * features are specific to a processor family, but some may exist across
223  * families containing cores that implement the same microarchitectural revision
224  * (and, such cores share common bugs, too).  We provide utility routines
225  * analogous to those for extracting and comparing chiprevs for
226  * microarchitectures as well; see the uarch_xxx() functions.
227  *
228  * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229  * present used and available only for AMD and AMD-like processors.
230  *
231  * ------------
232  * CPUID Passes
233  * ------------
234  *
235  * As part of performing feature detection, we break this into several different
236  * passes. There used to be a pass 0 that was done from assembly in locore.s to
237  * support processors that have a missing or broken cpuid instruction (notably
238  * certain Cyrix processors) but those were all 32-bit processors which are no
239  * longer supported. Passes are no longer numbered explicitly to make it easier
240  * to break them up or move them around as needed; however, they still have a
241  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242  * x86_archext.h. The external interface to execute a cpuid pass or determine
243  * whether a pass has been completed consists of cpuid_execpass() and
244  * cpuid_checkpass() respectively.  The passes now, in that execution order,
245  * are as follows:
246  *
247  *	PRELUDE		This pass does not have any dependencies on system
248  *			setup; in particular, unlike all subsequent passes it is
249  *			guaranteed not to require PCI config space access.  It
250  *			sets the flag indicating that the processor we are
251  *			running on supports the cpuid instruction, which all
252  *			64-bit processors do.  This would also be the place to
253  *			add any other basic state that is required later on and
254  *			can be learned without dependencies.
255  *
256  *	IDENT		Determine which vendor manufactured the CPU, the family,
257  *			model, and stepping information, and compute basic
258  *			identifying tags from those values.  This is done first
259  *			so that machine-dependent code can control the features
260  *			the cpuid instruction will report during subsequent
261  *			passes if needed, and so that any intervening
262  *			machine-dependent code that needs basic identity will
263  *			have it available.  This includes synthesised
264  *			identifiers such as chiprev and uarchrev as well as the
265  *			values obtained directly from cpuid.  Prior to executing
266  *			this pass, machine-dependent boot code is responsible
267  *			for ensuring that the PCI configuration space access
268  *			functions have been set up and, if necessary, that
269  *			determine_platform() has been called.
270  *
271  *	BASIC		This is the primary pass and is responsible for doing a
272  *			large number of different things:
273  *
274  *			1. Gathering a large number of feature flags to
275  *			determine which features the CPU supports and which
276  *			indicate things that we need to do other work in the OS
277  *			to enable. Features detected this way are added to the
278  *			x86_featureset which can be queried to
279  *			determine what we should do. This includes processing
280  *			all of the basic and extended CPU features that we care
281  *			about.
282  *
283  *			2. Determining the CPU's topology. This includes
284  *			information about how many cores and threads are present
285  *			in the package. It also is responsible for figuring out
286  *			which logical CPUs are potentially part of the same core
287  *			and what other resources they might share. For more
288  *			information see the 'Topology' section.
289  *
290  *			3. Determining the set of CPU security-specific features
291  *			that we need to worry about and determine the
292  *			appropriate set of workarounds.
293  *
294  *			This pass on the boot CPU occurs before KMDB is started.
295  *
296  *	EXTENDED	The second pass is done after startup(). Here, we check
297  *			other miscellaneous features. Most of this is gathering
298  *			additional basic and extended features that we'll use in
299  *			later passes or for debugging support.
300  *
301  *	DYNAMIC		The third pass occurs after the kernel memory allocator
302  *			has been fully initialized. This gathers information
303  *			where we might need dynamic memory available for our
304  *			uses. This includes several varying width leaves that
305  *			have cache information and the processor's brand string.
306  *
307  *	RESOLVE		The fourth and final normal pass is performed after the
308  *			kernel has brought most everything online. This is
309  *			invoked from post_startup(). In this pass, we go through
310  *			the set of features that we have enabled and turn that
311  *			into the hardware auxiliary vector features that
312  *			userland receives. This is used by userland, primarily
313  *			by the run-time link-editor (RTLD), though userland
314  *			software could also refer to it directly.
315  *
316  * The function that performs a pass is currently assumed to be infallible, and
317  * all existing implementations are.  This simplifies callers by allowing
318  * cpuid_execpass() to return void. Similarly, implementers do not need to check
319  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320  * Both of these assumptions can be relaxed if needed by future developments.
321  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322  * error to attempt to execute a pass before all previous passes have been
323  * completed on the specified CPU, or to request cpuid information before the
324  * pass that captures it has been executed.  These conditions can be tested
325  * using cpuid_checkpass().
326  *
327  * The Microcode Pass
328  *
329  * After a microcode update, we do a selective rescan of the cpuid leaves to
330  * determine what features have changed. Microcode updates can provide more
331  * details about security related features to deal with issues like Spectre and
332  * L1TF. On occasion, vendors have violated their contract and removed bits.
333  * However, we don't try to detect that because that puts us in a situation that
334  * we really can't deal with. As such, the only things we rescan are security
335  * related features today. See cpuid_pass_ucode().  This pass may be run in a
336  * different sequence on APs and therefore is not part of the sequential order;
337  * it is invoked directly instead of by cpuid_execpass() and its completion
338  * status cannot be checked by cpuid_checkpass().  This could be integrated with
339  * a more complex dependency mechanism if warranted by future developments.
340  *
341  * All of the passes are run on all CPUs. However, for the most part we only
342  * care about what the boot CPU says about this information and use the other
343  * CPUs as a rough guide to sanity check that we have the same feature set.
344  *
345  * We do not support running multiple logical CPUs with different, let alone
346  * disjoint, feature sets.
347  *
348  * ------------------
349  * Processor Topology
350  * ------------------
351  *
352  * One of the important things that we need to do is to understand the topology
353  * of the underlying processor. When we say topology in this case, we're trying
354  * to understand the relationship between the logical CPUs that the operating
355  * system sees and the underlying physical layout. Different logical CPUs may
356  * share different resources which can have important consequences for the
357  * performance of the system. For example, they may share caches, execution
358  * units, and more.
359  *
360  * The topology of the processor changes from generation to generation and
361  * vendor to vendor.  Along with that, different vendors use different
362  * terminology, and the operating system itself uses occasionally overlapping
363  * terminology. It's important to understand what this topology looks like so
364  * one can understand the different things that we try to calculate and
365  * determine.
366  *
367  * To get started, let's talk about a little bit of terminology that we've used
368  * so far, is used throughout this file, and is fairly generic across multiple
369  * vendors:
370  *
371  * CPU
372  *	A central processing unit (CPU) refers to a logical and/or virtual
373  *	entity that the operating system can execute instructions on. The
374  *	underlying resources for this CPU may be shared between multiple
375  *	entities; however, to the operating system it is a discrete unit.
376  *
377  * PROCESSOR and PACKAGE
378  *
379  *	Generally, when we use the term 'processor' on its own, we are referring
380  *	to the physical entity that one buys and plugs into a board. However,
381  *	because processor has been overloaded and one might see it used to mean
382  *	multiple different levels, we will instead use the term 'package' for
383  *	the rest of this file. The term package comes from the electrical
384  *	engineering side and refers to the physical entity that encloses the
385  *	electronics inside. Strictly speaking the package can contain more than
386  *	just the CPU, for example, on many processors it may also have what's
387  *	called an 'integrated graphics processing unit (GPU)'. Because the
388  *	package can encapsulate multiple units, it is the largest physical unit
389  *	that we refer to.
390  *
391  * SOCKET
392  *
393  *	A socket refers to a unit on a system board (generally the motherboard)
394  *	that can receive a package. A single package, or processor, is plugged
395  *	into a single socket. A system may have multiple sockets. Oftentimes,
396  *	the term socket is used interchangeably with package and refers to the
397  *	electrical component that is plugged in, and not the receptacle itself.
398  *
399  * CORE
400  *
401  *	A core refers to the physical instantiation of a CPU, generally, with a
402  *	full set of hardware resources available to it. A package may contain
403  *	multiple cores inside of it or it may just have a single one. A
404  *	processor with more than one core is often referred to as 'multi-core'.
405  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
406  *	that has 'multi-core' processors.
407  *
408  *	A core may expose a single logical CPU to the operating system, or it
409  *	may expose multiple CPUs, which we call threads, defined below.
410  *
411  *	Some resources may still be shared by cores in the same package. For
412  *	example, many processors will share the level 3 cache between cores.
413  *	Some AMD generations share hardware resources between cores. For more
414  *	information on that see the section 'AMD Topology'.
415  *
416  * THREAD and STRAND
417  *
418  *	In this file, generally a thread refers to a hardware resource and not
419  *	the operating system's logical abstraction. A thread is always exposed
420  *	as an independent logical CPU to the operating system. A thread belongs
421  *	to a specific core. A core may have more than one thread. When that is
422  *	the case, the threads that are part of the same core are often referred
423  *	to as 'siblings'.
424  *
425  *	When multiple threads exist, this is generally referred to as
426  *	simultaneous multi-threading (SMT). When Intel introduced this in their
427  *	processors they called it hyper-threading (HT). When multiple threads
428  *	are active in a core, they split the resources of the core. For example,
429  *	two threads may share the same set of hardware execution units.
430  *
431  *	The operating system often uses the term 'strand' to refer to a thread.
432  *	This helps disambiguate it from the software concept.
433  *
434  * CHIP
435  *
436  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
437  *	base meaning, it is used to refer to a single integrated circuit, which
438  *	may or may not be the only thing in the package. In illumos, when you
439  *	see the term 'chip' it is almost always referring to the same thing as
440  *	the 'package'. However, many vendors may use chip to refer to one of
441  *	many integrated circuits that have been placed in the package. As an
442  *	example, see the subsequent definition.
443  *
444  *	To try and keep things consistent, we will only use chip when referring
445  *	to the entire integrated circuit package, with the exception of the
446  *	definition of multi-chip module (because it is in the name) and use the
447  *	term 'die' when we want the more general, potential sub-component
448  *	definition.
449  *
450  * DIE
451  *
452  *	A die refers to an integrated circuit. Inside of the package there may
453  *	be a single die or multiple dies. This is sometimes called a 'chip' in
454  *	vendor's parlance, but in this file, we use the term die to refer to a
455  *	subcomponent.
456  *
457  * MULTI-CHIP MODULE
458  *
459  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
460  *	are connected together in the same package. When a multi-chip design is
461  *	used, generally each chip is manufactured independently and then joined
462  *	together in the package. For example, on AMD's Zen microarchitecture
463  *	(family 0x17), the package contains several dies (the second meaning of
464  *	chip from above) that are connected together.
465  *
466  * CACHE
467  *
468  *	A cache is a part of the processor that maintains copies of recently
469  *	accessed memory. Caches are split into levels and then into types.
470  *	Commonly there are one to three levels, called level one, two, and
471  *	three. The lower the level, the smaller it is, the closer it is to the
472  *	execution units of the CPU, and the faster it is to access. The layout
473  *	and design of the cache come in many different flavors, consult other
474  *	resources for a discussion of those.
475  *
476  *	Caches are generally split into two types, the instruction and data
477  *	cache. The caches contain what their names suggest, the instruction
478  *	cache has executable program text, while the data cache has all other
479  *	memory that the processor accesses. As of this writing, data is kept
480  *	coherent between all of the caches on x86, so if one modifies program
481  *	text before it is executed, that will be in the data cache, and the
482  *	instruction cache will be synchronized with that change when the
483  *	processor actually executes those instructions. This coherency also
484  *	covers the fact that data could show up in multiple caches.
485  *
486  *	Generally, the lowest level caches are specific to a core. However, the
487  *	last level cache is shared between some number of cores. The number of
488  *	CPUs sharing this last level cache is important. This has implications
489  *	for the choices that the scheduler makes, as accessing memory that might
490  *	be in a remote cache after thread migration can be quite expensive.
491  *
492  *	Sometimes, the word cache is abbreviated with a '$', because in US
493  *	English the word cache is pronounced the same as cash. So L1D$ refers to
494  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
495  *	in the rest of this theory statement for clarity.
496  *
497  * MEMORY CONTROLLER
498  *
499  *	The memory controller is a component that provides access to DRAM. Each
500  *	memory controller can access a set number of DRAM channels. Each channel
501  *	can have a number of DIMMs (sticks of memory) associated with it. A
502  *	given package may have more than one memory controller. The association
503  *	of the memory controller to a group of cores is important as it is
504  *	cheaper to access memory on the controller that you are associated with.
505  *
506  * NUMA
507  *
508  *	NUMA or non-uniform memory access, describes a way that systems are
509  *	built. On x86, any processor core can address all of the memory in the
510  *	system. However, when using multiple sockets or possibly within a
511  *	multi-chip module, some of that memory is physically closer and some of
512  *	it is further. Memory that is further away is more expensive to access.
513  *	Consider the following image of multiple sockets with memory:
514  *
515  *	+--------+                                                +--------+
516  *	| DIMM A |         +----------+      +----------+         | DIMM D |
517  *	+--------+-+       |          |      |          |       +-+------+-+
518  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
519  *	  +--------+-+     |          |      |          |     +-+------+-+
520  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
521  *	    +--------+                                        +--------+
522  *
523  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
524  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
525  *	access DIMMs A-C and more expensive to access D-F as it has to go
526  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
527  *	D-F are cheaper than A-C. While the socket form is the most common, when
528  *	using multi-chip modules, this can also sometimes occur. For another
529  *	example of this that's more involved, see the AMD topology section.
530  *
531  *
532  * Intel Topology
533  * --------------
534  *
535  * Most Intel processors since Nehalem (as of this writing the current gen
536  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
537  * the package is a single monolithic die. MCMs currently aren't used. Most
538  * parts have three levels of caches, with the L3 cache being shared between
539  * all of the cores on the package. The L1/L2 cache is generally specific to
540  * an individual core. The following image shows at a simplified level what
541  * this looks like. The memory controller is commonly part of something called
542  * the 'Uncore', that used to be separate physical chips that were not a part of
543  * the package, but are now part of the same chip.
544  *
545  *  +-----------------------------------------------------------------------+
546  *  | Package                                                               |
547  *  |  +-------------------+  +-------------------+  +-------------------+  |
548  *  |  | Core              |  | Core              |  | Core              |  |
549  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
550  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
551  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
552  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
553  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
554  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
555  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
556  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
557  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
558  *  |  +-------------------+  +-------------------+  +-------------------+  |
559  *  | +-------------------------------------------------------------------+ |
560  *  | |                         Shared L3 Cache                           | |
561  *  | +-------------------------------------------------------------------+ |
562  *  | +-------------------------------------------------------------------+ |
563  *  | |                        Memory Controller                          | |
564  *  | +-------------------------------------------------------------------+ |
565  *  +-----------------------------------------------------------------------+
566  *
567  * A side effect of this current architecture is that what we care about from a
568  * scheduling and topology perspective, is simplified. In general we care about
569  * understanding which logical CPUs are part of the same core and socket.
570  *
571  * To determine the relationship between threads and cores, Intel initially used
572  * the identifier in the advanced programmable interrupt controller (APIC). They
573  * also added cpuid leaf 4 to give additional information about the number of
574  * threads and CPUs in the processor. With the addition of x2apic (which
575  * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
576  * additional cpuid topology leaf 0xB was added.
577  *
578  * AMD Topology
579  * ------------
580  *
581  * When discussing AMD topology, we want to break this into three distinct
582  * generations of topology. There's the basic topology that has been used in
583  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
584  * with family 0x15 (Bulldozer), and there's the topology that was introduced
585  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
586  * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
587  * additional terminology that's worth talking about.
588  *
589  * Until the introduction of family 0x17 (Zen), AMD did not implement something
590  * that they considered SMT. Whether or not the AMD processors have SMT
591  * influences many things including scheduling and reliability, availability,
592  * and serviceability (RAS) features.
593  *
594  * NODE
595  *
596  *	AMD uses the term node to refer to a die that contains a number of cores
597  *	and I/O resources. Depending on the processor family and model, more
598  *	than one node can be present in the package. When there is more than one
599  *	node this indicates a multi-chip module. Usually each node has its own
600  *	access to memory and I/O devices. This is important and generally
601  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
602  *	result, we track this relationship in the operating system.
603  *
604  *	In processors with an L3 cache, the L3 cache is generally shared across
605  *	the entire node, though the way this is carved up varies from generation
606  *	to generation.
607  *
608  * BULLDOZER
609  *
610  *	Starting with the Bulldozer family (0x15) and continuing until the
611  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
612  *	compute unit. In a compute unit, two traditional cores share a number of
613  *	hardware resources. Critically, they share the FPU, L1 instruction
614  *	cache, and the L2 cache. Several compute units were then combined inside
615  *	of a single node.  Because the integer execution units, L1 data cache,
616  *	and some other resources were not shared between the cores, AMD never
617  *	considered this to be SMT.
618  *
619  * ZEN
620  *
621  *	The Zen family (0x17) uses a multi-chip module (MCM) design; the module
622  *	is called Zeppelin. These modules are similar to the idea of nodes used
623  *	previously. Each of these nodes has two DRAM channels which all of the
624  *	cores in the node can access uniformly. These nodes are linked together
625  *	in the package, creating a NUMA environment.
626  *
627  *	The Zeppelin die itself contains two different 'core complexes'. Each
628  *	core complex consists of four cores which each have two threads, for a
629  *	total of 8 logical CPUs per complex. Unlike other generations,
630  *	where all the logical CPUs in a given node share the L3 cache, here each
631  *	core complex has its own shared L3 cache.
632  *
633  *	A further thing that we need to consider is that in some configurations,
634  *	particularly with the Threadripper line of processors, not every die
635  *	actually has its memory controllers wired up to actual memory channels.
636  *	This means that some cores have memory attached to them and others
637  *	don't.
638  *
639  *	To put Zen in perspective, consider the following images:
640  *
641  *      +--------------------------------------------------------+
642  *      | Core Complex                                           |
643  *      | +-------------------+    +-------------------+  +---+  |
644  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
645  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
646  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
647  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
648  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
649  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
650  *      | +-------------------+    +-------------------+  | C |  |
651  *      | +-------------------+    +-------------------+  | a |  |
652  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
653  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
654  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
655  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
656  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
657  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
658  *      | +-------------------+    +-------------------+  +---+  |
659  *      |                                                        |
660  *	+--------------------------------------------------------+
661  *
662  *  This first image represents a single Zen core complex that consists of four
663  *  cores.
664  *
665  *
666  *	+--------------------------------------------------------+
667  *	| Zeppelin Die                                           |
668  *	|  +--------------------------------------------------+  |
669  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
670  *	|  +--------------------------------------------------+  |
671  *      |                           HH                           |
672  *	|          +-----------+    HH    +-----------+          |
673  *	|          |           |    HH    |           |          |
674  *	|          |    Core   |==========|    Core   |          |
675  *	|          |  Complex  |==========|  Complex  |          |
676  *	|          |           |    HH    |           |          |
677  *	|          +-----------+    HH    +-----------+          |
678  *      |                           HH                           |
679  *	|  +--------------------------------------------------+  |
680  *	|  |                Memory Controller                 |  |
681  *	|  +--------------------------------------------------+  |
682  *      |                                                        |
683  *	+--------------------------------------------------------+
684  *
685  *  This image represents a single Zeppelin Die. Note how both core complexes
686  *  are connected to the same memory controller and I/O units. While each core
687  *  complex has its own L3 cache as seen in the first image, they both have
688  *  uniform access to memory.
689  *
690  *
691  *                      PP                     PP
692  *                      PP                     PP
693  *           +----------PP---------------------PP---------+
694  *           |          PP                     PP         |
695  *           |    +-----------+          +-----------+    |
696  *           |    |           |          |           |    |
697  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
698  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
699  *           |    |           |          |           |    |
700  *           |    +-----------+ooo    ...+-----------+    |
701  *           |          HH      ooo  ...       HH         |
702  *           |          HH        oo..         HH         |
703  *           |          HH        ..oo         HH         |
704  *           |          HH      ...  ooo       HH         |
705  *           |    +-----------+...    ooo+-----------+    |
706  *           |    |           |          |           |    |
707  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
708  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
709  *           |    |           |          |           |    |
710  *           |    +-----------+          +-----------+    |
711  *           |          PP                     PP         |
712  *           +----------PP---------------------PP---------+
713  *                      PP                     PP
714  *                      PP                     PP
715  *
716  *  This image represents a single Zen package. In this example, it has four
717  *  Zeppelin dies, though some configurations only have a single one. In this
718  *  example, each die is directly connected to the next. Also, each die is
719  *  represented as being connected to memory by the 'M' character and connected
720  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
721  *  die is made up of two core complexes, we have multiple different NUMA
722  *  domains that we care about for these systems.
723  *
724  * ZEN 2
725  *
726  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
727  *	each Zeppelin Die had its own I/O die, that has been moved out of the
728  *	core complex in Zen 2. The actual core complex looks pretty similar, but
729  *	now the die actually looks much simpler:
730  *
731  *      +--------------------------------------------------------+
732  *      | Zen 2 Core Complex Die    HH                           |
733  *      |                           HH                           |
734  *      |          +-----------+    HH    +-----------+          |
735  *      |          |           |    HH    |           |          |
736  *      |          |    Core   |==========|    Core   |          |
737  *      |          |  Complex  |==========|  Complex  |          |
738  *      |          |           |    HH    |           |          |
739  *      |          +-----------+    HH    +-----------+          |
740  *      |                           HH                           |
741  *      |                           HH                           |
742  *      +--------------------------------------------------------+
743  *
744  *	From here, when we add the central I/O die, this changes things a bit.
745  *	Each die is connected to the I/O die, rather than trying to interconnect
746  *	them directly. The following image takes the same Zen 1 image that we
747  *	had earlier and shows what it looks like with the I/O die instead:
748  *
749  *                                 PP    PP
750  *                                 PP    PP
751  *           +---------------------PP----PP---------------------+
752  *           |                     PP    PP                     |
753  *           |  +-----------+      PP    PP      +-----------+  |
754  *           |  |           |      PP    PP      |           |  |
755  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
756  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
757  *           |  |         |o|oooo|          |oooo|o|         |  |
758  *           |  +-----------+    |          |    +-----------+  |
759  *           |                   |   I/O    |                   |
760  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
761  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
762  *           |                   |          |                   |
763  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
764  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
765  *           |                   |          |                   |
766  *           |  +-----------+    |          |    +-----------+  |
767  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
768  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
769  *           |  |    Die    |      PP    PP      |    Die    |  |
770  *           |  |           |      PP    PP      |           |  |
771  *           |  +-----------+      PP    PP      +-----------+  |
772  *           |                     PP    PP                     |
773  *           +---------------------PP----PP---------------------+
774  *                                 PP    PP
775  *                                 PP    PP
776  *
777  *	The above has four core complex dies installed, though the Zen 2 EPYC
778  *	and ThreadRipper parts allow for up to eight, while the Ryzen parts
779  *	generally only have one to two. The more notable difference here is how
780  *	everything communicates. Note that memory and PCIe come out of the
781  *	central die. This changes the way that one die accesses a resource. It
782  *	basically always has to go to the I/O die, whereas in Zen 1 it may have
783  *	satisfied it locally. In general, this ends up being a better strategy
784  *	for most things, though it is possible to still treat everything in four
785  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
786  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
787  *	now there is only one 'node' present.
788  *
789  * ZEN 3
790  *
791  *	From an architectural perspective, Zen 3 is a much smaller change from
792  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
793  *	its microarchitectural changes. The biggest thing for us is how the die
794  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
795  *	cache. However, in Zen 3, the L3 is now shared between the entire core
796  *	complex die and is no longer partitioned between each core complex. This
797  *	means that all cores on the die can share the same L3 cache. Otherwise,
798  *	the general layout of the overall package with various core complexes
799  *	and an I/O die stays the same. Here's what the Core Complex Die looks
800  *	like in a bit more detail:
801  *
802  *               +-------------------------------------------------+
803  *               | Zen 3 Core Complex Die                          |
804  *               | +-------------------+    +-------------------+  |
805  *               | | Core       +----+ |    | Core       +----+ |  |
806  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
807  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
808  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
809  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
810  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
811  *               | +-------------------+    +-------------------+  |
812  *               | +-------------------+    +-------------------+  |
813  *               | | Core       +----+ |    | Core       +----+ |  |
814  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
815  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
816  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
817  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
818  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
819  *               | +-------------------+    +-------------------+  |
820  *               |                                                 |
821  *               | +--------------------------------------------+  |
822  *               | |                 L3 Cache                   |  |
823  *               | +--------------------------------------------+  |
824  *               |                                                 |
825  *               | +-------------------+    +-------------------+  |
826  *               | | Core       +----+ |    | Core       +----+ |  |
827  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
828  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
829  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
830  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
831  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
832  *               | +-------------------+    +-------------------+  |
833  *               | +-------------------+    +-------------------+  |
834  *               | | Core       +----+ |    | Core       +----+ |  |
835  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
836  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
837  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
838  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
839  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
840  *               | +-------------------+    +-------------------+  |
841  *               +-------------------------------------------------+
842  *
843  *	While it is not pictured, there are connections from the die to the
844  *	broader data fabric and additional functional blocks to support that
845  *	communication and coherency.
846  *
847  * CPUID LEAVES
848  *
849  * There are a few different CPUID leaves that we can use to try and understand
850  * the actual state of the world. As part of the introduction of family 0xf, AMD
851  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
852  * processors that are in the system. Because families before Zen didn't have
853  * SMT, this was always the number of cores that were in the system. However, it
854  * should always be thought of as the number of logical threads to be consistent
855  * between generations. In addition we also get the size of the APIC ID that is
856  * used to represent the number of logical processors. This is important for
857  * deriving topology information.
858  *
859  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
860  * bit between Bulldozer and later families, but it is quite useful in
861  * determining the topology information. Because this information has changed
862  * across family generations, it's worth calling out what these mean
863  * explicitly. The registers have the following meanings:
864  *
865  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
866  *		APIC ID, even though on systems without x2apic support, it will
867  *		be limited to 8 bits.
868  *
869  *	%ebx	On Bulldozer-era systems this contains information about the
870  *		number of cores that are in a compute unit (cores that share
871  *		resources). It also contains a per-package compute unit ID that
872  *		identifies which compute unit the logical CPU is a part of.
873  *
874  *		On Zen-era systems this instead contains the number of threads
875  *		per core and the ID of the core that the logical CPU is a part
876  *		of. Note, this ID is unique only to the package, it is not
877  *		globally unique across the entire system.
878  *
879  *	%ecx	This contains the number of nodes that exist in the package. It
880  *		also contains an ID that identifies which node the logical CPU
881  *		is a part of.
882  *
883  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
884  * cache layout to determine which logical CPUs are sharing which caches.
885  *
886  * illumos Topology
887  * ----------------
888  *
889  * Based on the above we synthesize the information into several different
890  * variables that we store in the 'struct cpuid_info'. We'll go into the details
891  * of what each member is supposed to represent and their uniqueness. In
892  * general, there are two levels of uniqueness that we care about. We care about
893  * an ID that is globally unique. That means that it will be unique across all
894  * entities in the system. For example, the default logical CPU ID is globally
895  * unique. On the other hand, there is some information that we only care about
896  * being unique within the context of a single package / socket. Here are the
897  * variables that we keep track of and their meaning.
898  *
899  * Several of the values that are asking for an identifier, with the exception
900  * of cpi_apicid, are allowed to be synthetic.
901  *
902  *
903  * cpi_apicid
904  *
905  *	This is the value of the CPU's APIC id. This should be the full 32-bit
906  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
907  *	APIC ID. This value is globally unique between all logical CPUs across
908  *	all packages. This is usually required by the APIC.
909  *
910  * cpi_chipid
911  *
912  *	This value indicates the ID of the package that the logical CPU is a
913  *	part of. This value is allowed to be synthetic. It is usually derived by
914  *	taking the CPU's APIC ID and determining how many bits are used to
915  *	represent CPU cores in the package. All logical CPUs that are part of
916  *	the same package must have the same value.
917  *
918  * cpi_coreid
919  *
920  *	This represents the ID of a CPU core. Two logical CPUs should only have
921  *	the same cpi_coreid value if they are part of the same core. These
922  *	values may be synthetic. On systems that support SMT, this value is
923  *	usually derived from the APIC ID, otherwise it is often synthetic and
924  *	just set to the value of the cpu_id in the cpu_t.
925  *
926  * cpi_pkgcoreid
927  *
928  *	This is similar to the cpi_coreid in that logical CPUs that are part of
929  *	the same core should have the same ID. The main difference is that these
930  *	values are only required to be unique to a given socket.
931  *
932  * cpi_clogid
933  *
934  *	This represents the logical ID of a logical CPU. This value should be
935  *	unique within a given socket for each logical CPU. This is allowed to be
936  *	synthetic, though it is usually based off of the CPU's apic ID. The
937  *	broader system expects that logical CPUs that are part of the same
938  *	core have contiguous numbers. For example, if there were two threads per
939  *	core, then the IDs divided by two should be equal; the first ID modulo
940  *	two should be zero and the second should be one. Thus, IDs 4 and 5
941  *	indicate two logical CPUs that are part of the same core. But IDs 5 and
942  *	6 represent two logical CPUs that are part of different cores.
943  *
944  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
945  *	from the same source, strictly speaking, they don't have to be and the
946  *	two values should be considered logically independent. One should not
947  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
948  *	some kind of relationship. While this is tempting, we've seen cases on
949  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
950  *
951  * cpi_ncpu_per_chip
952  *
953  *	This value indicates the total number of logical CPUs that exist in the
954  *	physical package. Critically, this is not the number of logical CPUs
955  *	that exist for just the single core.
956  *
957  *	This value should be the same for all logical CPUs in the same package.
958  *
959  * cpi_ncore_per_chip
960  *
961  *	This value indicates the total number of physical CPU cores that exist
962  *	in the package. The system compares this value with cpi_ncpu_per_chip to
963  *	determine if simultaneous multi-threading (SMT) is enabled. When
964  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
965  *	the X86FSET_HTT feature is not set. If this value is greater than one,
966  *	then we consider the processor to have the feature X86FSET_CMP, to
967  *	indicate that there is support for more than one core.
968  *
969  *	This value should be the same for all logical CPUs in the same package.
970  *
971  * cpi_procnodes_per_pkg
972  *
973  *	This value indicates the number of 'nodes' that exist in the package.
974  *	When processors are actually a multi-chip module, this represents the
975  *	number of such modules that exist in the package. Currently, on Intel
976  *	based systems this member is always set to 1.
977  *
978  *	This value should be the same for all logical CPUs in the same package.
979  *
980  * cpi_procnodeid
981  *
982  *	This value indicates the ID of the node that the logical CPU is a part
983  *	of. All logical CPUs that are in the same node must have the same value
984  *	here. This value must be unique across all of the packages in the
985  *	system.  On Intel based systems, this is currently set to the value in
986  *	cpi_chipid because there is only one node.
987  *
988  * cpi_cores_per_compunit
989  *
990  *	This value indicates the number of cores that are part of a compute
991  *	unit. See the AMD topology section for this. This member only has real
992  *	meaning currently for AMD Bulldozer family processors. For all other
993  *	processors, this should currently be set to 1.
994  *
995  * cpi_compunitid
996  *
997  *	This indicates the compute unit that the logical CPU belongs to. For
998  *	processors without AMD Bulldozer-style compute units this should be set
999  *	to the value of cpi_coreid.
1000  *
1001  * cpi_ncpu_shr_last_cache
1002  *
1003  *	This indicates the number of logical CPUs that are sharing the same last
1004  *	level cache. This value should be the same for all CPUs that are sharing
1005  *	that cache. The last cache refers to the cache that is closest to memory
1006  *	and furthest away from the CPU.
1007  *
1008  * cpi_last_lvl_cacheid
1009  *
1010  *	This indicates the ID of the last cache that the logical CPU uses. This
1011  *	cache is often shared between multiple logical CPUs and is the cache
1012  *	that is closest to memory and furthest away from the CPU. This value
1013  *	should be the same for a group of logical CPUs only if they actually
1014  *	share the same last level cache. IDs should not overlap between
1015  *	packages.
1016  *
1017  * cpi_ncore_bits
1018  *
1019  *	This indicates the number of bits that are required to represent all of
1020  *	the cores in the system. As cores are derived based on their APIC IDs,
1021  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1022  *	this value to be larger than the actual number of IDs that are present
1023  *	in the system. This is used to size tables by the CMI framework. It is
1024  *	only filled in for Intel and AMD CPUs.
1025  *
1026  * cpi_nthread_bits
1027  *
1028  *	This indicates the number of bits required to represent all of the IDs
1029  *	that cover the logical CPUs that exist on a given core. It's OK for this
1030  *	value to be larger than the actual number of IDs that are present in the
1031  *	system.  This is used to size tables by the CMI framework. It is
1032  *	only filled in for Intel and AMD CPUs.
1033  *
1034  * -----------
1035  * Hypervisors
1036  * -----------
1037  *
1038  * If trying to manage the differences between vendors wasn't bad enough, it can
1039  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1040  * the ability to interpose on all cpuid instructions and change them to suit
1041  * their purposes. In general, this is necessary as the hypervisor wants to be
1042  * able to present a more uniform set of features or not necessarily give the
1043  * guest operating system kernel knowledge of all features so it can be
1044  * more easily migrated between systems.
1045  *
1046  * When it comes to trying to determine topology information, this can be a
1047  * double edged sword. When a hypervisor doesn't actually implement a cpuid
1048  * leaf, it'll often return all zeros. Because of that, you'll often see various
1049  * checks scattered about fields being non-zero before we assume we can use
1050  * them.
1051  *
1052  * When it comes to topology information, the hypervisor is often incentivized
1053  * to lie to you about topology. This is because it doesn't always actually
1054  * guarantee that topology at all. The topology path we take in the system
1055  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1056  * or AMD CPU, then we basically do our normal path. However, when they don't
1057  * use an actual vendor, then that usually turns into multiple one-core CPUs
1058  * that we enumerate that are often on different sockets. The actual behavior
1059  * depends greatly on what the hypervisor actually exposes to us.
1060  *
1061  * --------------------
1062  * Exposing Information
1063  * --------------------
1064  *
1065  * We expose CPUID information in three different forms in the system.
1066  *
1067  * The first is through the x86_featureset variable. This is used in conjunction
1068  * with the is_x86_feature() function. This is queried by x86-specific functions
1069  * to determine which features are or aren't present in the system and to make
1070  * decisions based upon them. For example, users of this include everything from
1071  * parts of the system dedicated to reliability, availability, and
1072  * serviceability (RAS), to making decisions about how to handle security
1073  * mitigations, to various x86-specific drivers. General purpose or
1074  * architecture independent drivers should never be calling this function.
1075  *
1076  * The second means is through the auxiliary vector. The auxiliary vector is a
1077  * series of tagged data that the kernel passes down to a user program when it
1078  * begins executing. This information is used to indicate to programs what
1079  * instruction set extensions are present. For example, information about the
1080  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1081  * since user programs cannot make use of it. However, things like the AVX
1082  * instruction sets are. Programs use this information to make run-time
1083  * decisions about what features they should use. As an example, the run-time
1084  * link-editor (rtld) can relocate different functions depending on the hardware
1085  * support available.
1086  *
1087  * The final form is through a series of accessor functions that all have the
1088  * form cpuid_get*. This is used by a number of different subsystems in the
1089  * kernel to determine more detailed information about what we're running on,
1090  * topology information, etc. Some of these subsystems include processor groups
1091  * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1092  * microcode, and performance monitoring. These functions all ASSERT that the
1093  * CPU they're being called on has reached a certain cpuid pass. If the passes
1094  * are rearranged, then this needs to be adjusted.
1095  *
1096  * -----------------------------------------------
1097  * Speculative Execution CPU Side Channel Security
1098  * -----------------------------------------------
1099  *
1100  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1101  * execution in the CPU to create side channels there have been a number of
1102  * different attacks and corresponding issues that the operating system needs to
1103  * mitigate against. The following is a common, but not exhaustive, list of
1104  * issues that we know about and have either done some work or need to do
1105  * more work in the system to mitigate against:
1106  *
1107  *   - Spectre v1
1108  *   - swapgs (Spectre v1 variant)
1109  *   - Spectre v2
1110  *   - Meltdown (Spectre v3)
1111  *   - Rogue Register Read (Spectre v3a)
1112  *   - Speculative Store Bypass (Spectre v4)
1113  *   - ret2spec, SpectreRSB
1114  *   - L1 Terminal Fault (L1TF)
1115  *   - Microarchitectural Data Sampling (MDS)
1116  *
1117  * Each of these requires different sets of mitigations and has different attack
1118  * surfaces. For the most part, this discussion is about protecting the kernel
1119  * from non-kernel executing environments such as user processes and hardware
1120  * virtual machines. Unfortunately, there are a number of user vs. user
1121  * scenarios that exist with these. The rest of this section will describe the
1122  * overall approach that the system has taken to address these as well as their
1123  * shortcomings. Unfortunately, not all of the above have been handled today.
1124  *
1125  * SPECTRE v2, ret2spec, SpectreRSB
1126  *
1127  * The second variant of the Spectre attack focuses on performing branch target
1128  * injection. This generally impacts indirect call instructions in the system.
1129  * There are three different ways to mitigate this issue that are commonly
1130  * described today:
1131  *
1132  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1133  *  2. Using Retpolines and RSB Stuffing
1134  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1135  *
1136  * IBRS uses a feature added to microcode to restrict speculation, among other
1137  * things. This form of mitigation has not been used as it has been generally
1138  * seen as too expensive and requires reactivation upon various transitions in
1139  * the system.
1140  *
1141  * As a less impactful alternative to IBRS, retpolines were developed by
1142  * Google. These basically require one to replace indirect calls with a specific
1143  * trampoline that will cause speculation to fail and break the attack.
1144  * Retpolines require compiler support. We always build with retpolines in the
1145  * external thunk mode. This means that a traditional indirect call is replaced
1146  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1147  * of this is that all indirect function calls are performed through a register.
1148  *
1149  * We have to use a common external location of the thunk and not inline it into
1150  * the callsite so that way we can have a single place to patch these functions.
1151  * As it turns out, we currently have two different forms of retpolines that
1152  * exist in the system:
1153  *
1154  *  1. A full retpoline
1155  *  2. A no-op version
1156  *
1157  * The first one is used in the general case. Historically, there was an
1158  * AMD-specific optimized retpoline variant that was based around using a
1159  * serializing lfence instruction; however, in March 2022 it was announced that
1160  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1161  * use it and it is no longer available in the system.
1162  *
1163  * The third form described above is the most curious. It turns out that the way
1164  * that retpolines are implemented is that they rely on how speculation is
1165  * performed on a 'ret' instruction. Intel has continued to optimize this
1166  * process (which is partly why we need to have return stack buffer stuffing,
1167  * but more on that in a bit) and in processors starting with Cascade Lake
1168  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1169  * mechanism has been introduced called Enhanced IBRS (eIBRS).
1170  *
1171  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1172  * physical core. However, if this is the case, we don't want to use retpolines
1173  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1174  * function (called a thunk) into a jmp instruction. This means that we're still
1175  * paying the cost of an extra jump to the external thunk, but it gives us
1176  * flexibility and the ability to have a single kernel image that works across a
1177  * wide variety of systems and hardware features.
1178  *
1179  * Unfortunately, this alone is insufficient. First, Skylake systems have
1180  * additional speculation for the Return Stack Buffer (RSB) which is used to
1181  * return from call instructions which retpolines take advantage of. However,
1182  * this problem is not just limited to Skylake and is actually more pernicious.
1183  * The SpectreRSB paper introduces several more problems that can arise with
1184  * dealing with this. The RSB can be poisoned just like the indirect branch
1185  * predictor. This means that one needs to clear the RSB when transitioning
1186  * between two different privilege domains. Some examples include:
1187  *
1188  *  - Switching between two different user processes
1189  *  - Going between user land and the kernel
1190  *  - Returning to the kernel from a hardware virtual machine
1191  *
1192  * Mitigating this involves combining a couple of different things. The first is
1193  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1194  * Bridge. When an RSB entry refers to a user address and we're executing in the
1195  * kernel, speculation through it will be stopped when SMEP is enabled. This
1196  * protects against a number of the different cases that we would normally be
1197  * worried about such as when we enter the kernel from user land.
1198  *
1199  * To prevent against additional manipulation of the RSB from other contexts
1200  * such as a non-root VMX context attacking the kernel we first look to
1201  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1202  * nothing else that we need to do to protect the kernel at this time.
1203  *
1204  * Unfortunately, eIBRS or not, we need to manually overwrite the contents of
1205  * the return stack buffer. We do this through the x86_rsb_stuff() function.
1206  * Currently this is employed on context switch and vmx_exit. The
1207  * x86_rsb_stuff() function is disabled only when mitigations in general are.
1208  *
1209  * If SMEP is not present, then we would have to stuff the RSB every time we
1210  * transitioned from user mode to the kernel, which isn't very practical right
1211  * now.
1212  *
1213  * To fully protect user to user and vmx to vmx attacks from these classes of
1214  * issues, we would also need to allow them to opt into performing an Indirect
1215  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1216  *
1217  * By default, the system will enable RSB stuffing and the required variant of
1218  * retpolines and store that information in the x86_spectrev2_mitigation value.
1219  * This will be evaluated after a microcode update as well, though it is
1220  * expected that microcode updates will not take away features. This may mean
1221  * that a late loaded microcode may not end up in the optimal configuration
1222  * (though this should be rare).
1223  *
1224  * Currently we do not build kmdb with retpolines or perform any additional side
1225  * channel security mitigations for it. One complication with kmdb is that it
1226  * requires its own retpoline thunks and it would need to adjust itself based on
1227  * what the kernel does. The threat model of kmdb is more limited and therefore
1228  * it may make more sense to investigate using prediction barriers as the whole
1229  * system is only executing a single instruction at a time while in kmdb.
1230  *
1231  * SPECTRE v1, v4
1232  *
1233  * The v1 and v4 variants of Spectre are not currently mitigated in the
1234  * system and require other classes of changes to occur in the code.
1235  *
1236  * SPECTRE v1 (SWAPGS VARIANT)
1237  *
1238  * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1239  * can generally affect any branch-dependent code. The swapgs issue is one
1240  * variant of this. If we are coming in from userspace, we can have code like
1241  * this:
1242  *
1243  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1244  *	je	1f
1245  *	movq	$0, REGOFF_SAVFP(%rsp)
1246  *	swapgs
1247  *	1:
1248  *	movq	%gs:CPU_THREAD, %rax
1249  *
1250  * If an attacker can cause a mis-speculation of the branch here, we could skip
1251  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1252  * load. If subsequent code can act as the usual Spectre cache gadget, this
1253  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1254  * any use of the %gs override.
1255  *
1256  * The other case is also an issue: if we're coming into a trap from kernel
1257  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1258  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1259  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1260  * case, and the fix is the same in both cases (an lfence at the branch target
1261  * 1: in this example), we'll just do it unconditionally.
1262  *
1263  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1264  * harder for user-space to actually set a useful %gsbase value: although it's
1265  * not clear, it might still be feasible via lwp_setprivate(), though, so we
1266  * mitigate anyway.
1267  *
1268  * MELTDOWN
1269  *
1270  * Meltdown, or Spectre v3, allowed a user process to read any data in their
1271  * address space regardless of whether or not the page tables in question
1272  * allowed the user to have the ability to read them. The solution to meltdown
1273  * is kernel page table isolation. In this world, there are two page tables that
1274  * are used for a process, one in user land and one in the kernel. To implement
1275  * this we use per-CPU page tables and switch between the user and kernel
1276  * variants when entering and exiting the kernel.  For more information about
1277  * this process and how the trampolines work, please see the big theory
1278  * statements and additional comments in:
1279  *
1280  *  - uts/i86pc/ml/kpti_trampolines.s
1281  *  - uts/i86pc/vm/hat_i86.c
1282  *
1283  * While Meltdown only impacted Intel systems and there are also Intel systems
1284  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1285  * kernel page table isolation enabled. While this may at first seem weird, an
1286  * important thing to remember is that you can't speculatively read an address
1287  * if it's never in your page table at all. Having user processes without kernel
1288  * pages present provides us with an important layer of defense in the kernel
1289  * against any other side channel attacks that exist and have yet to be
1290  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1291  * default, no matter the x86 system.
1292  *
1293  * L1 TERMINAL FAULT
1294  *
1295  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1296  * execution uses page table entries. Effectively, it is two different problems.
1297  * The first is that it ignores the not present bit in the page table entries
1298  * when performing speculative execution. This means that something can
1299  * speculatively read the listed physical address if it's present in the L1
1300  * cache under certain conditions (see Intel's documentation for the full set of
1301  * conditions). Secondly, this can be used to bypass hardware virtualization
1302  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1303  * instructions.
1304  *
1305  * For the non-hardware virtualized case, this is relatively easy to deal with.
1306  * We must make sure that all unmapped pages have an address of zero. This means
1307  * that they could read the first 4k of physical memory; however, we never use
1308  * that first page in the operating system and always skip putting it in our
1309  * memory map, even if firmware tells us we can use it in our memory map. While
1310  * other systems try to put extra metadata in the address and reserved bits,
1311  * which led to this being problematic in those cases, we do not.
1312  *
1313  * For hardware virtual machines things are more complicated. Because they can
1314  * construct their own page tables, it isn't hard for them to perform this
1315  * attack against any physical address. The one wrinkle is that this physical
1316  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1317  * to flush the L1 data cache. We wrap this up in the function
1318  * spec_uarch_flush(). This function is also used in the mitigation of
1319  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1320  * hypervisors such as KVM or bhyve are responsible for performing this before
1321  * entering the guest.
1322  *
1323  * Because this attack takes place in the L1 cache, there's another wrinkle
1324  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1325  * designs. This means that when a thread enters a hardware virtualized context
1326  * and flushes the L1 data cache, the other thread on the processor may then go
1327  * ahead and put new data in it that can be potentially attacked. While one
1328  * solution is to disable SMT on the system, another option that is available is
1329  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1330  * goes through and makes sure that if a HVM is being scheduled on one thread,
1331  * then the thing on the other thread is from the same hardware virtual machine.
1332  * If an interrupt comes in or the guest exits to the broader system, then the
1333  * other SMT thread will be kicked out.
1334  *
1335  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1336  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1337  * perform L1TF related mitigations.
1338  *
1339  * MICROARCHITECTURAL DATA SAMPLING
1340  *
1341  * Microarchitectural data sampling (MDS) is a combination of four discrete
1342  * vulnerabilities that are similar issues affecting various parts of the CPU's
1343  * microarchitectural implementation around load, store, and fill buffers.
1344  * Specifically it is made up of the following subcomponents:
1345  *
1346  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1347  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1348  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1349  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1350  *
1351  * To begin addressing these, Intel has introduced another feature in microcode
1352  * called MD_CLEAR. This changes the verw instruction to operate in a different
1353  * way. This allows us to execute the verw instruction in a particular way to
1354  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1355  * updated when this microcode is present to flush this state.
1356  *
1357  * Primarily we need to flush this state whenever we transition from the kernel
1358  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1359  * little bit different. Here the structures are statically sized when a logical
1360  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1361  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1362  * mwait, or another ACPI method. To perform these flushes, we call
1363  * x86_md_clear() at all of these transition points.
1364  *
1365  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1366  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1367  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1368  * a no-op.
1369  *
1370  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1371  * particular, everything we've discussed above is only valid for a single
1372  * thread executing on a core. In the case where you have hyper-threading
1373  * present, this attack can be performed between threads. The theoretical fix
1374  * for this is to ensure that both threads are always in the same security
1375  * domain. This means that they are executing in the same ring and mutually
1376  * trust each other. Practically speaking, this would mean that a system call
1377  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1378  * Rather than implement this, we recommend that one disables hyper-threading
1379  * through the use of psradm -aS.
1380  *
1381  * TSX ASYNCHRONOUS ABORT
1382  *
1383  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1384  * behaves like MDS, but leverages Intel's transactional instructions as another
1385  * vector. Effectively, when a transaction hits one of these cases (unmapped
1386  * page, various cache snoop activity, etc.) then the same data can be exposed
1387  * as in the case of MDS. This means that you can attack your twin.
1388  *
1389  * Intel has described that there are two different ways that we can mitigate
1390  * this problem on affected processors:
1391  *
1392  *   1) We can use the same techniques used to deal with MDS. Flushing the
1393  *      microarchitectural buffers and disabling hyperthreading will mitigate
1394  *      this in the same way.
1395  *
1396  *   2) Using microcode to disable TSX.
1397  *
1398  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1399  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1400  * That's OK as we're already doing all such mitigations. On the other hand,
1401  * processors with MDS_NO are all supposed to receive microcode updates that
1402  * enumerate support for disabling TSX. In general, we'd rather use this method
1403  * when available as it doesn't require disabling hyperthreading to be
1404  * effective. Currently we basically are relying on microcode for processors
1405  * that enumerate MDS_NO.
1406  *
1407  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1408  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1409  * different powers. The first allows us to cause all transactions to
1410  * immediately abort. The second gives us a means of disabling TSX completely,
1411  * which includes removing it from cpuid. If we have support for this in
1412  * microcode during the first cpuid pass, then we'll disable TSX completely such
1413  * that user land never has a chance to observe the bit. However, if we are late
1414  * loading the microcode, then we must use the functionality to cause
1415  * transactions to automatically abort. This is necessary for user land's sake.
1416  * Once a program sees a cpuid bit, it must not be taken away.
1417  *
1418  * We track whether or not we should do this based on what cpuid pass we're in.
1419  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1420  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1421  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1422  * second time after we do the initial microcode update.  As a result we need to
1423  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1424  * suitable microcode on the current CPU (which happens prior to
1425  * cpuid_pass_ucode()).
1426  *
1427  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1428  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1429  * unfortunate feature in a number of ways, and taking the opportunity to
1430  * finally be able to turn it off is likely to be of benefit in the future.
1431  *
1432  * SUMMARY
1433  *
1434  * The following table attempts to summarize the mitigations for various issues
1435  * and what's done in various places:
1436  *
1437  *  - Spectre v1: Not currently mitigated
1438  *  - swapgs: lfences after swapgs paths
1439  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS if HW support
1440  *  - Meltdown: Kernel Page Table Isolation
1441  *  - Spectre v3a: Updated CPU microcode
1442  *  - Spectre v4: Not currently mitigated
1443  *  - SpectreRSB: SMEP and RSB Stuffing
1444  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1445  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1446  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1447  *
1448  * The following table indicates the x86 feature set bits that indicate that a
1449  * given problem has been solved or a notable feature is present:
1450  *
1451  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1452  *  - MDS_NO: All forms of MDS
1453  *  - TAA_NO: TAA
1454  */
1455 
1456 #include <sys/types.h>
1457 #include <sys/archsystm.h>
1458 #include <sys/x86_archext.h>
1459 #include <sys/kmem.h>
1460 #include <sys/systm.h>
1461 #include <sys/cmn_err.h>
1462 #include <sys/sunddi.h>
1463 #include <sys/sunndi.h>
1464 #include <sys/cpuvar.h>
1465 #include <sys/processor.h>
1466 #include <sys/sysmacros.h>
1467 #include <sys/pg.h>
1468 #include <sys/fp.h>
1469 #include <sys/controlregs.h>
1470 #include <sys/bitmap.h>
1471 #include <sys/auxv_386.h>
1472 #include <sys/memnode.h>
1473 #include <sys/pci_cfgspace.h>
1474 #include <sys/comm_page.h>
1475 #include <sys/mach_mmu.h>
1476 #include <sys/ucode.h>
1477 #include <sys/tsc.h>
1478 #include <sys/kobj.h>
1479 #include <sys/asm_misc.h>
1480 
1481 #ifdef __xpv
1482 #include <sys/hypervisor.h>
1483 #else
1484 #include <sys/ontrap.h>
1485 #endif
1486 
/*
 * Initial values of the global CPU identity variables: vendor defaults to
 * X86_VENDOR_IntelClone, type to X86_TYPE_OTHER and the clflush size to 0.
 * NOTE(review): presumably these are overwritten once the cpuid passes have
 * identified the processor -- the code that assigns them is outside this
 * section; confirm.
 */
1487 uint_t x86_vendor = X86_VENDOR_IntelClone;
1488 uint_t x86_type = X86_TYPE_OTHER;
1489 uint_t x86_clflush_size = 0;
1490 
/*
 * PCID / INVPCID usage knobs. Both start at 0 when __xpv is defined (the
 * build that pulls in <sys/hypervisor.h>) and at -1 otherwise.
 * NOTE(review): -1 presumably means "decide automatically at boot" -- confirm
 * against the consumer of these variables.
 */
1491 #if defined(__xpv)
1492 int x86_use_pcid = 0;
1493 int x86_use_invpcid = 0;
1494 #else
1495 int x86_use_pcid = -1;
1496 int x86_use_invpcid = -1;
1497 #endif
1498 
/*
 * The Spectre v2 mitigation in effect; see "SPECTRE v2, ret2spec, SpectreRSB"
 * in the theory statement at the top of this file.
 * X86_SPECTREV2_RETPOLINE -- indirect calls go through full retpoline thunks
 * X86_SPECTREV2_ENHANCED_IBRS -- eIBRS is used and each thunk becomes a jmp
 * X86_SPECTREV2_DISABLED -- NOTE(review): presumably chosen when
 *     x86_disable_spectrev2 is non-zero; the selection logic is not in this
 *     section -- confirm.
 */
1499 typedef enum {
1500 	X86_SPECTREV2_RETPOLINE,
1501 	X86_SPECTREV2_ENHANCED_IBRS,
1502 	X86_SPECTREV2_DISABLED
1503 } x86_spectrev2_mitigation_t;
1504 
/* The recorded mitigation starts out as the full retpoline variant. */
1505 uint_t x86_disable_spectrev2 = 0;
1506 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1507     X86_SPECTREV2_RETPOLINE;
1508 
1509 /*
1510  * The mitigation status for TAA:
1511  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1512  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1513  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1514  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1515  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1516  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
 *
 * See "TSX ASYNCHRONOUS ABORT" in the theory statement at the top of this
 * file for how these states relate to MD_CLEAR and TSX_CTRL.
1517  */
1518 typedef enum {
1519 	X86_TAA_NOTHING,
1520 	X86_TAA_DISABLED,
1521 	X86_TAA_MD_CLEAR,
1522 	X86_TAA_TSX_FORCE_ABORT,
1523 	X86_TAA_TSX_DISABLE,
1524 	X86_TAA_HW_MITIGATED
1525 } x86_taa_mitigation_t;
1526 
/*
 * x86_disable_taa is the knob named by X86_TAA_DISABLED above; the recorded
 * state starts out as X86_TAA_NOTHING. NOTE(review): the code that evaluates
 * the knob and updates the state is outside this section.
 */
1527 uint_t x86_disable_taa = 0;
1528 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1529 
/*
 * NOTE(review): by its name this flags a Pentium Pro erratum workaround (bug
 * 4046376); it is set and consumed outside this section -- confirm.
 */
1530 uint_t pentiumpro_bug4046376;
1531 
/*
 * Global feature bitmap, sized by BT_SIZEOFMAP() to hold NUM_X86_FEATURES
 * bits. This is the same size compare_x86_featureset() compares.
 */
1532 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1533 
/*
 * Printable name of every feature, indexed by feature number (the same index
 * that is_x86_feature() takes). print_x86_featureset() emits these strings.
 * The table is positional, so entries must stay in feature-number order.
 * Because the array is declared with NUM_X86_FEATURES elements, any trailing
 * slot without an initializer is a NULL pointer.
 * NOTE(review): the feature-number constants are defined outside this
 * section; verify the ordering against them when adding an entry.
 */
1534 static char *x86_feature_names[NUM_X86_FEATURES] = {
1535 	"lgpg",
1536 	"tsc",
1537 	"msr",
1538 	"mtrr",
1539 	"pge",
1540 	"de",
1541 	"cmov",
1542 	"mmx",
1543 	"mca",
1544 	"pae",
1545 	"cv8",
1546 	"pat",
1547 	"sep",
1548 	"sse",
1549 	"sse2",
1550 	"htt",
1551 	"asysc",
1552 	"nx",
1553 	"sse3",
1554 	"cx16",
1555 	"cmp",
1556 	"tscp",
1557 	"mwait",
1558 	"sse4a",
1559 	"cpuid",
1560 	"ssse3",
1561 	"sse4_1",
1562 	"sse4_2",
1563 	"1gpg",
1564 	"clfsh",
1565 	"64",
1566 	"aes",
1567 	"pclmulqdq",
1568 	"xsave",
1569 	"avx",
1570 	"vmx",
1571 	"svm",
1572 	"topoext",
1573 	"f16c",
1574 	"rdrand",
1575 	"x2apic",
1576 	"avx2",
1577 	"bmi1",
1578 	"bmi2",
1579 	"fma",
1580 	"smep",
1581 	"smap",
1582 	"adx",
1583 	"rdseed",
1584 	"mpx",
1585 	"avx512f",
1586 	"avx512dq",
1587 	"avx512pf",
1588 	"avx512er",
1589 	"avx512cd",
1590 	"avx512bw",
1591 	"avx512vl",
1592 	"avx512fma",
1593 	"avx512vbmi",
1594 	"avx512_vpopcntdq",
1595 	"avx512_4vnniw",
1596 	"avx512_4fmaps",
1597 	"xsaveopt",
1598 	"xsavec",
1599 	"xsaves",
1600 	"sha",
1601 	"umip",
1602 	"pku",
1603 	"ospke",
1604 	"pcid",
1605 	"invpcid",
1606 	"ibrs",
1607 	"ibpb",
1608 	"stibp",
1609 	"ssbd",
1610 	"ssbd_virt",
1611 	"rdcl_no",
1612 	"ibrs_all",
1613 	"rsba",
1614 	"ssb_no",
1615 	"stibp_all",
1616 	"flush_cmd",
1617 	"l1d_vmentry_no",
1618 	"fsgsbase",
1619 	"clflushopt",
1620 	"clwb",
1621 	"monitorx",
1622 	"clzero",
1623 	"xop",
1624 	"fma4",
1625 	"tbm",
1626 	"avx512_vnni",
1627 	"amd_pcec",
1628 	"md_clear",
1629 	"mds_no",
1630 	"core_thermal",
1631 	"pkg_thermal",
1632 	"tsx_ctrl",
1633 	"taa_no",
1634 	"ppin",
1635 	"vaes",
1636 	"vpclmulqdq",
1637 	"lfence_serializing",
1638 	"gfni",
1639 	"avx512_vp2intersect",
1640 	"avx512_bitalg",
1641 	"avx512_vbmi2",
1642 	"avx512_bf16"
1643 };
1644 
/*
 * Report whether `feature` is present in `featureset`.
 *
 * featureset: feature bitmap; it is accessed through a ulong_t pointer.
 * feature: bit index to test; asserted to be below NUM_X86_FEATURES.
 *
 * Returns the result of BT_TEST() on that bit.
 */
1645 boolean_t
1646 is_x86_feature(void *featureset, uint_t feature)
1647 {
1648 	ASSERT(feature < NUM_X86_FEATURES);
1649 	return (BT_TEST((ulong_t *)featureset, feature));
1650 }
1651 
/*
 * Mark `feature` as present in `featureset` by setting its bit with BT_SET().
 *
 * featureset: feature bitmap; it is accessed through a ulong_t pointer.
 * feature: bit index to set; asserted to be below NUM_X86_FEATURES.
 */
1652 void
1653 add_x86_feature(void *featureset, uint_t feature)
1654 {
1655 	ASSERT(feature < NUM_X86_FEATURES);
1656 	BT_SET((ulong_t *)featureset, feature);
1657 }
1658 
/*
 * Mark `feature` as absent from `featureset` by clearing its bit with
 * BT_CLEAR().
 *
 * featureset: feature bitmap; it is accessed through a ulong_t pointer.
 * feature: bit index to clear; asserted to be below NUM_X86_FEATURES.
 */
1659 void
1660 remove_x86_feature(void *featureset, uint_t feature)
1661 {
1662 	ASSERT(feature < NUM_X86_FEATURES);
1663 	BT_CLEAR((ulong_t *)featureset, feature);
1664 }
1665 
/*
 * Compare two feature bitmaps byte for byte over BT_SIZEOFMAP(NUM_X86_FEATURES)
 * bytes. Returns B_TRUE when they are identical and B_FALSE otherwise.
 */
1666 boolean_t
1667 compare_x86_featureset(void *setA, void *setB)
1668 {
1669 	/*
1670 	 * A raw byte comparison is only valid because the unused bits of both
1671 	 * bitmaps are assumed to always be zero.
1672 	 */
1673 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) != 0)
1674 		return (B_FALSE);
1675 
1676 	return (B_TRUE);
1677 }
1678 
/*
 * Log the name of every feature that is set in `featureset`, one
 * cmn_err(CE_CONT) message per feature, in feature-number order.
 * NOTE(review): the leading '?' in the format string is presumably cmn_err's
 * verbose-boot marker -- confirm against the cmn_err documentation.
 */
1679 void
1680 print_x86_featureset(void *featureset)
1681 {
1682 	uint_t bit;
1683 
1684 	for (bit = 0; bit < NUM_X86_FEATURES; bit++) {
1685 		if (!is_x86_feature(featureset, bit))
1686 			continue;
1687 
1688 		cmn_err(CE_CONT, "?x86_feature: %s\n", x86_feature_names[bit]);
1689 	}
1690 }
1691 
1692 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1693 static size_t xsave_state_size = 0;
/* xsave_bv_all starts with only the legacy FP and SSE state components. */
1694 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1695 boolean_t xsave_force_disable = B_FALSE;
/* disable_smap is defined elsewhere; only referenced from this file. */
1696 extern int disable_smap;
1697 
1698 /*
1699  * This is set to the platform type we are running on; it starts at -1.
1700  */
1701 static int platform_type = -1;
1702 
1703 #if !defined(__xpv)
1704 /*
1705  * Variable to patch if hypervisor platform detection needs to be
1706  * disabled (i.e. platform_type will always be HW_NATIVE if this is 0).
1707  */
1708 int enable_platform_detection = 1;
1709 #endif
1710 
1711 /*
1712  * monitor/mwait info.
1713  *
1714  * size_actual and buf_actual are the real address and size allocated to get
1715  * proper mwait_buf alignment.  buf_actual and size_actual should be passed
1716  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1717  * processor cache-line alignment, but this is not guaranteed in the future.
1718  */
1719 struct mwait_info {
1720 	size_t		mon_min;	/* min size to avoid missed wakeups */
1721 	size_t		mon_max;	/* size to avoid false wakeups */
1722 	size_t		size_actual;	/* size actually allocated */
1723 	void		*buf_actual;	/* memory actually allocated */
1724 	uint32_t	support;	/* processor support of monitor/mwait */
1725 };
1726 
1727 /*
1728  * xsave/xrstor info.
1729  *
1730  * This structure contains HW feature bits and the size of the xsave save area.
1731  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1732  * (xsave_state) to describe the xsave layout. However, at runtime the
1733  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1734  * xsave_state structure simply represents the legacy layout of the beginning
1735  * of the xsave area.
 *
 * Each state component below is described by a size/offset pair.
1736  */
1737 struct xsave_info {
1738 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1739 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1740 	size_t		xsav_max_size;  /* max size save area for HW features */
1741 	size_t		ymm_size;	/* AVX: size of ymm save area */
1742 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1743 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1744 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1745 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1746 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1747 	size_t		opmask_size;	/* AVX512: size of opmask save */
1748 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1749 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1750 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1751 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1752 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1753 };
1754 
1755 
1756 /*
1757  * These constants determine how many of the elements of the
1758  * cpuid we cache in the cpuid_info data structure; the
1759  * remaining elements are accessible via the cpuid instruction.
 *
 * NMAX_CPI_STD sizes cpuid_info's cpi_std[] array and NMAX_CPI_EXTD sizes its
 * cpi_extd[] array.
1760  */
1761 
1762 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1763 #define	NMAX_CPI_EXTD	0x1f		/* eax = 0x80000000 .. 0x8000001e */
1764 
1765 /*
1766  * See the big theory statement for a more detailed explanation of what some of
1767  * these members mean.
1768  */
1769 struct cpuid_info {
1770 	uint_t cpi_pass;		/* last pass completed */
1771 	/*
1772 	 * standard function information
1773 	 */
1774 	uint_t cpi_maxeax;		/* fn 0: %eax */
1775 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1776 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1777 
1778 	uint_t cpi_family;		/* fn 1: extended family */
1779 	uint_t cpi_model;		/* fn 1: extended model */
1780 	uint_t cpi_step;		/* fn 1: stepping */
1781 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1782 					/*		AMD: package/socket # */
1783 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1784 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1785 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1786 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1787 	uint_t cpi_ncache;		/* fn 2: number of elements */
1788 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1789 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1790 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1791 					/* Intel fn: 4, AMD fn: 8000001d */
1792 	struct cpuid_regs **cpi_cache_leaves;	/* Acual leaves from above */
1793 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1794 	struct cpuid_regs cpi_sub7[1];	/* Leaf 7, sub-leaf 1 */
1795 	/*
1796 	 * extended function information
1797 	 */
1798 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1799 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1800 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1801 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1802 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1803 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1804 
1805 	id_t cpi_coreid;		/* same coreid => strands share core */
1806 	int cpi_pkgcoreid;		/* core number within single package */
1807 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1808 					/* Intel: fn 4: %eax[31-26] */
1809 
1810 	/*
1811 	 * These values represent the number of bits that are required to store
1812 	 * information about the number of cores and threads.
1813 	 */
1814 	uint_t cpi_ncore_bits;
1815 	uint_t cpi_nthread_bits;
1816 	/*
1817 	 * supported feature information
1818 	 */
1819 	uint32_t cpi_support[6];
1820 #define	STD_EDX_FEATURES	0
1821 #define	AMD_EDX_FEATURES	1
1822 #define	TM_EDX_FEATURES		2
1823 #define	STD_ECX_FEATURES	3
1824 #define	AMD_ECX_FEATURES	4
1825 #define	STD_EBX_FEATURES	5
1826 	/*
1827 	 * Synthesized information, where known.
1828 	 */
1829 	x86_chiprev_t cpi_chiprev;	/* See X86_CHIPREV_* in x86_archext.h */
1830 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1831 	uint32_t cpi_socket;		/* Chip package/socket type */
1832 	x86_uarchrev_t cpi_uarchrev;	/* Microarchitecture and revision */
1833 
1834 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1835 	uint32_t cpi_apicid;
1836 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1837 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1838 					/* Intel: 1 */
1839 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1840 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1841 
1842 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1843 };
1844 
1845 
/*
 * Statically allocated cpuid_info instance. cpuid_free_space() asserts that
 * this instance is never handed to it. Presumably this backs the boot CPU
 * (cpu0), which is set up before memory allocation is available -- TODO
 * confirm against the code that assigns it.
 */
static struct cpuid_info cpuid_info0;
1847 
/*
 * These bit fields are defined by the Intel Application Note AP-485
 * "Intel Processor Identification and the CPUID Instruction"
 *
 * Every macro takes a pointer to a struct cpuid_info whose cached leaves
 * have already been filled in.
 *
 * Leaf 1 %eax: processor signature.
 */
#define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
#define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
#define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
#define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
#define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
#define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)

/*
 * Raw feature words: standard leaf 1, extended leaf 0x80000001, and leaf 7
 * (sub-leaf 0 lives in cpi_std[7], sub-leaf 1 in cpi_sub7[0]).
 */
#define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
#define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
#define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
#define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
#define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
#define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
#define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
#define	CPI_FEATURES_7_1_EAX(cpi)	((cpi)->cpi_sub7[0].cp_eax)

/*
 * Leaf 1 %ebx is four independent byte-wide fields: brand index [7:0],
 * CLFLUSH line size in 8-byte chunks [15:8], logical processor count
 * [23:16] and initial APIC ID [31:24].  CPI_CHUNKS must therefore start at
 * bit 8; starting at bit 7 would pull in the top bit of the brand index.
 */
#define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
#define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 8)
#define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
#define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)

#define	CPI_MAXEAX_MAX		0x100		/* sanity control */
#define	CPI_XMAXEAX_MAX		0x80000100
#define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
#define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1877 
/*
 * Field extractors for CPUID function 4 (Deterministic Cache Parameters),
 * as defined by Intel Application Note AP-485.  Each macro takes a pointer
 * to the struct cpuid_regs holding the registers returned for one sub-leaf.
 * They are grouped by source register, lowest bits first.
 */

/* %eax */
#define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
#define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
#define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
#define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
#define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
#define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)

/* %ebx */
#define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
#define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
#define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)

/* %ecx */
#define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
#define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)

/* %edx */
#define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1897 
1898 
/*
 * A couple of shorthand macros to identify "later" P6-family chips
 * like the Pentium M and Core.  First, the "older" P6-based stuff
 * (loosely defined as "pre-Pentium-4"):
 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
 *
 * The argument is a pointer to a struct cpuid_info whose cpi_family and
 * cpi_model have been filled in.  It is parenthesized at every use so that
 * any pointer-valued expression (e.g. &info) may be passed; note that it is
 * evaluated more than once, so it must not have side effects.
 */
#define	IS_LEGACY_P6(cpi) (			\
	(cpi)->cpi_family == 6 &&		\
		((cpi)->cpi_model == 1 ||	\
		(cpi)->cpi_model == 3 ||	\
		(cpi)->cpi_model == 5 ||	\
		(cpi)->cpi_model == 6 ||	\
		(cpi)->cpi_model == 7 ||	\
		(cpi)->cpi_model == 8 ||	\
		(cpi)->cpi_model == 0xA ||	\
		(cpi)->cpi_model == 0xB)	\
)

/* A "new F6" is everything with family 6 that's not the above */
#define	IS_NEW_F6(cpi) (((cpi)->cpi_family == 6) && !IS_LEGACY_P6(cpi))

/* Extended family/model support */
#define	IS_EXTENDED_MODEL_INTEL(cpi) ((cpi)->cpi_family == 0x6 || \
	(cpi)->cpi_family >= 0xf)
1923 
/*
 * Info for monitor/mwait idle loop.
 *
 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
 * 2006.
 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
 * Documentation Updates" #33633, Rev 2.05, December 2006.
 *
 * The macros taking `cpi' expect a pointer to a struct cpuid_info with the
 * standard leaves 1 and 5 cached in cpi_std[].
 */
#define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
#define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
#define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
#define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
#define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
#define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
#define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
#define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
/*
 * Number of sub-cstates for a given c-state.  `c_state' is the bit offset of
 * the 4-bit field within leaf 5 %edx; it is parenthesized so that an
 * expression argument (e.g. a shift) is not re-associated with the "+ 3".
 */
#define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
	BITX((cpi)->cpi_std[5].cp_edx, (c_state) + 3, (c_state))
1946 
/*
 * XSAVE leaf 0xD enumeration: the offset and size, in bytes, associated
 * with the YMM component (component 2).
 */
#define	CPUID_LEAFD_2_YMM_OFFSET	576
#define	CPUID_LEAFD_2_YMM_SIZE		256

/*
 * Symbolic names for the extended (0x8000_00xx) leaves referenced in this
 * file.  Using them instead of bare literals cuts down on typos.
 */
#define	CPUID_LEAF_EXT_0		0x80000000
#define	CPUID_LEAF_EXT_8		0x80000008
#define	CPUID_LEAF_EXT_1d		0x8000001d
#define	CPUID_LEAF_EXT_1e		0x8000001e
1960 
/*
 * Functions we consume from cpuid_subr.c; don't publish these in a header
 * file to try and keep people using the expected cpuid_* interfaces.
 */
extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1971 
/*
 * Apply various platform-dependent restrictions where the
 * underlying platform restrictions mean the CPU can be marked
 * as less capable than its cpuid instruction would imply.
 *
 * vendor is the X86_VENDOR_* code of the processor, eax is the cpuid
 * function (leaf) that was queried, and cp points to the registers that the
 * query returned; the registers are edited in place.  On anything other
 * than the __xpv build this is a no-op macro.
 */
#if defined(__xpv)
static void
platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
{
	/* Vendor-independent feature masking. */
	switch (eax) {
	case 1: {
		/* MCA is only hidden when we are not the initial domain. */
		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
		    0 : CPUID_INTC_EDX_MCA;
		cp->cp_edx &=
		    ~(mcamask |
		    CPUID_INTC_EDX_PSE |
		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
		    CPUID_AMD_EDX_SYSC |
		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
		break;
	}

	case 0x80000001:
		cp->cp_edx &=
		    ~(CPUID_AMD_EDX_PSE |
		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
		    CPUID_AMD_EDX_TSCP);
		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
		break;
	default:
		break;
	}

	/* Vendor-specific topology masking. */
	switch (vendor) {
	case X86_VENDOR_Intel:
		switch (eax) {
		case 4:
			/*
			 * Zero out the (ncores-per-chip - 1) field, which
			 * occupies %eax[31:26]; keep bits 25:0 intact.
			 */
			cp->cp_eax &= 0x03ffffff;
			break;
		default:
			break;
		}
		break;
	case X86_VENDOR_AMD:
	case X86_VENDOR_HYGON:
		switch (eax) {

		case 0x80000001:
			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
			break;

		case CPUID_LEAF_EXT_8:
			/*
			 * Zero out the (ncores-per-chip - 1) field, which
			 * occupies %ecx[7:0].
			 */
			cp->cp_ecx &= 0xffffff00;
			break;
		default:
			break;
		}
		break;
	default:
		break;
	}
}
#else
#define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
#endif
2048 
/*
 * Undocumented knobs for patching the results of the cpuid instruction,
 * intended to permit running Solaris 10 on future cpus that we don't
 * currently support.  All default to zero; they could be set to non-zero
 * values via settings in eeprom.
 */

/* Overrides for the leaf 1 %edx feature word. */
uint32_t cpuid_feature_edx_include;
uint32_t cpuid_feature_edx_exclude;

/* Overrides for the leaf 1 %ecx feature word. */
uint32_t cpuid_feature_ecx_include;
uint32_t cpuid_feature_ecx_exclude;
2060 
2061 /*
2062  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2063  */
2064 void
2065 cpuid_alloc_space(cpu_t *cpu)
2066 {
2067 	/*
2068 	 * By convention, cpu0 is the boot cpu, which is set up
2069 	 * before memory allocation is available.  All other cpus get
2070 	 * their cpuid_info struct allocated here.
2071 	 */
2072 	ASSERT(cpu->cpu_id != 0);
2073 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2074 	cpu->cpu_m.mcpu_cpi =
2075 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2076 }
2077 
2078 void
2079 cpuid_free_space(cpu_t *cpu)
2080 {
2081 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2082 	int i;
2083 
2084 	ASSERT(cpi != NULL);
2085 	ASSERT(cpi != &cpuid_info0);
2086 
2087 	/*
2088 	 * Free up any cache leaf related dynamic storage. The first entry was
2089 	 * cached from the standard cpuid storage, so we should not free it.
2090 	 */
2091 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2092 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2093 	if (cpi->cpi_cache_leaf_size > 0)
2094 		kmem_free(cpi->cpi_cache_leaves,
2095 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2096 
2097 	kmem_free(cpi, sizeof (*cpi));
2098 	cpu->cpu_m.mcpu_cpi = NULL;
2099 }
2100 
2101 #if !defined(__xpv)
2102 /*
2103  * Determine the type of the underlying platform. This is used to customize
2104  * initialization of various subsystems (e.g. TSC). determine_platform() must
2105  * only ever be called once to prevent two processors from seeing different
2106  * values of platform_type. Must be called before cpuid_pass_ident(), the
2107  * earliest consumer to execute; the identification pass will call
2108  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2109  */
2110 void
2111 determine_platform(void)
2112 {
2113 	struct cpuid_regs cp;
2114 	uint32_t base;
2115 	uint32_t regs[4];
2116 	char *hvstr = (char *)regs;
2117 
2118 	ASSERT(platform_type == -1);
2119 
2120 	platform_type = HW_NATIVE;
2121 
2122 	if (!enable_platform_detection)
2123 		return;
2124 
2125 	/*
2126 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2127 	 * vendor signature, and set platform type accordingly.
2128 	 *
2129 	 * References:
2130 	 * http://lkml.org/lkml/2008/10/1/246
2131 	 * http://kb.vmware.com/kb/1009458
2132 	 */
2133 	cp.cp_eax = 0x1;
2134 	(void) __cpuid_insn(&cp);
2135 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2136 		cp.cp_eax = 0x40000000;
2137 		(void) __cpuid_insn(&cp);
2138 		regs[0] = cp.cp_ebx;
2139 		regs[1] = cp.cp_ecx;
2140 		regs[2] = cp.cp_edx;
2141 		regs[3] = 0;
2142 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2143 			platform_type = HW_XEN_HVM;
2144 			return;
2145 		}
2146 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2147 			platform_type = HW_VMWARE;
2148 			return;
2149 		}
2150 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2151 			platform_type = HW_KVM;
2152 			return;
2153 		}
2154 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2155 			platform_type = HW_BHYVE;
2156 			return;
2157 		}
2158 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
2159 			platform_type = HW_MICROSOFT;
2160 	} else {
2161 		/*
2162 		 * Check older VMware hardware versions. VMware hypervisor is
2163 		 * detected by performing an IN operation to VMware hypervisor
2164 		 * port and checking that value returned in %ebx is VMware
2165 		 * hypervisor magic value.
2166 		 *
2167 		 * References: http://kb.vmware.com/kb/1009458
2168 		 */
2169 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2170 		if (regs[1] == VMWARE_HVMAGIC) {
2171 			platform_type = HW_VMWARE;
2172 			return;
2173 		}
2174 	}
2175 
2176 	/*
2177 	 * Check Xen hypervisor. In a fully virtualized domain,
2178 	 * Xen's pseudo-cpuid function returns a string representing the
2179 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2180 	 * supported cpuid function. We need at least a (base + 2) leaf value
2181 	 * to do what we want to do. Try different base values, since the
2182 	 * hypervisor might use a different one depending on whether Hyper-V
2183 	 * emulation is switched on by default or not.
2184 	 */
2185 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2186 		cp.cp_eax = base;
2187 		(void) __cpuid_insn(&cp);
2188 		regs[0] = cp.cp_ebx;
2189 		regs[1] = cp.cp_ecx;
2190 		regs[2] = cp.cp_edx;
2191 		regs[3] = 0;
2192 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2193 		    cp.cp_eax >= (base + 2)) {
2194 			platform_type &= ~HW_NATIVE;
2195 			platform_type |= HW_XEN_HVM;
2196 			return;
2197 		}
2198 	}
2199 }
2200 
/*
 * Return the platform type recorded in platform_type.  The assertion checks
 * that platform_type is no longer -1, the value determine_platform() expects
 * to find before it has run.
 */
int
get_hwenv(void)
{
	ASSERT(platform_type != -1);
	return (platform_type);
}
2207 
/*
 * Variant compiled when __xpv is not defined: always reports that we are
 * not a control domain.
 */
int
is_controldom(void)
{
	return (0);
}
2213 
2214 #else
2215 
/*
 * Variant compiled when __xpv is defined: the platform type is always
 * HW_XEN_PV, so no detection is performed.
 */
int
get_hwenv(void)
{
	return (HW_XEN_PV);
}
2221 
/*
 * Variant compiled when __xpv is defined: report whether this domain is the
 * initial domain, as determined by DOMAIN_IS_INITDOMAIN(xen_info).
 */
int
is_controldom(void)
{
	return (DOMAIN_IS_INITDOMAIN(xen_info));
}
2227 
2228 #endif	/* __xpv */
2229 
2230 /*
2231  * Make sure that we have gathered all of the CPUID leaves that we might need to
2232  * determine topology. We assume that the standard leaf 1 has already been done
2233  * and that xmaxeax has already been calculated.
2234  */
2235 static void
2236 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2237 {
2238 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2239 
2240 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2241 		struct cpuid_regs *cp;
2242 
2243 		cp = &cpi->cpi_extd[8];
2244 		cp->cp_eax = CPUID_LEAF_EXT_8;
2245 		(void) __cpuid_insn(cp);
2246 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2247 	}
2248 
2249 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2250 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2251 		struct cpuid_regs *cp;
2252 
2253 		cp = &cpi->cpi_extd[0x1e];
2254 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2255 		(void) __cpuid_insn(cp);
2256 	}
2257 }
2258 
2259 /*
2260  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2261  * it to everything else. If not, and we're on an AMD system where 8000001e is
2262  * valid, then we use that. Othewrise, we fall back to the default value for the
2263  * APIC ID in leaf 1.
2264  */
2265 static uint32_t
2266 cpuid_gather_apicid(struct cpuid_info *cpi)
2267 {
2268 	/*
2269 	 * Leaf B changes based on the arguments to it. Beacuse we don't cache
2270 	 * it, we need to gather it again.
2271 	 */
2272 	if (cpi->cpi_maxeax >= 0xB) {
2273 		struct cpuid_regs regs;
2274 		struct cpuid_regs *cp;
2275 
2276 		cp = &regs;
2277 		cp->cp_eax = 0xB;
2278 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2279 		(void) __cpuid_insn(cp);
2280 
2281 		if (cp->cp_ebx != 0) {
2282 			return (cp->cp_edx);
2283 		}
2284 	}
2285 
2286 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2287 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2288 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2289 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2290 		return (cpi->cpi_extd[0x1e].cp_eax);
2291 	}
2292 
2293 	return (CPI_APIC_ID(cpi));
2294 }
2295 
2296 /*
2297  * For AMD processors, attempt to calculate the number of chips and cores that
2298  * exist. The way that we do this varies based on the generation, because the
2299  * generations themselves have changed dramatically.
2300  *
2301  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2302  * However, with the advent of family 17h (Zen) it actually tells us the number
2303  * of threads, so we need to look at leaf 0x8000001e if available to determine
2304  * its value. Otherwise, for all prior families, the number of enabled cores is
2305  * the same as threads.
2306  *
2307  * If we do not have leaf 0x80000008, then we assume that this processor does
2308  * not have anything. AMD's older CPUID specification says there's no reason to
2309  * fall back to leaf 1.
2310  *
2311  * In some virtualization cases we will not have leaf 8000001e or it will be
2312  * zero. When that happens we assume the number of threads is one.
2313  */
2314 static void
2315 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2316 {
2317 	uint_t nthreads, nthread_per_core;
2318 
2319 	nthreads = nthread_per_core = 1;
2320 
2321 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2322 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2323 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2324 		nthreads = CPI_CPU_COUNT(cpi);
2325 	}
2326 
2327 	/*
2328 	 * For us to have threads, and know about it, we have to be at least at
2329 	 * family 17h and have the cpuid bit that says we have extended
2330 	 * topology.
2331 	 */
2332 	if (cpi->cpi_family >= 0x17 &&
2333 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2334 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2335 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2336 	}
2337 
2338 	*ncpus = nthreads;
2339 	*ncores = nthreads / nthread_per_core;
2340 }
2341 
2342 /*
2343  * Seed the initial values for the cores and threads for an Intel based
2344  * processor. These values will be overwritten if we detect that the processor
2345  * supports CPUID leaf 0xb.
2346  */
2347 static void
2348 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2349 {
2350 	/*
2351 	 * Only seed the number of physical cores from the first level leaf 4
2352 	 * information. The number of threads there indicate how many share the
2353 	 * L1 cache, which may or may not have anything to do with the number of
2354 	 * logical CPUs per core.
2355 	 */
2356 	if (cpi->cpi_maxeax >= 4) {
2357 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2358 	} else {
2359 		*ncores = 1;
2360 	}
2361 
2362 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2363 		*ncpus = CPI_CPU_COUNT(cpi);
2364 	} else {
2365 		*ncpus = *ncores;
2366 	}
2367 }
2368 
2369 static boolean_t
2370 cpuid_leafB_getids(cpu_t *cpu)
2371 {
2372 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2373 	struct cpuid_regs regs;
2374 	struct cpuid_regs *cp;
2375 
2376 	if (cpi->cpi_maxeax < 0xB)
2377 		return (B_FALSE);
2378 
2379 	cp = &regs;
2380 	cp->cp_eax = 0xB;
2381 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2382 
2383 	(void) __cpuid_insn(cp);
2384 
2385 	/*
2386 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2387 	 * indicates that the extended topology enumeration leaf is
2388 	 * available.
2389 	 */
2390 	if (cp->cp_ebx != 0) {
2391 		uint32_t x2apic_id = 0;
2392 		uint_t coreid_shift = 0;
2393 		uint_t ncpu_per_core = 1;
2394 		uint_t chipid_shift = 0;
2395 		uint_t ncpu_per_chip = 1;
2396 		uint_t i;
2397 		uint_t level;
2398 
2399 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2400 			cp->cp_eax = 0xB;
2401 			cp->cp_ecx = i;
2402 
2403 			(void) __cpuid_insn(cp);
2404 			level = CPI_CPU_LEVEL_TYPE(cp);
2405 
2406 			if (level == 1) {
2407 				x2apic_id = cp->cp_edx;
2408 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2409 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2410 			} else if (level == 2) {
2411 				x2apic_id = cp->cp_edx;
2412 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2413 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2414 			}
2415 		}
2416 
2417 		/*
2418 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2419 		 */
2420 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2421 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2422 		    ncpu_per_core;
2423 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2424 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2425 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2426 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2427 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2428 		cpi->cpi_compunitid = cpi->cpi_coreid;
2429 
2430 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2431 			cpi->cpi_nthread_bits = coreid_shift;
2432 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2433 		}
2434 
2435 		return (B_TRUE);
2436 	} else {
2437 		return (B_FALSE);
2438 	}
2439 }
2440 
2441 static void
2442 cpuid_intel_getids(cpu_t *cpu, void *feature)
2443 {
2444 	uint_t i;
2445 	uint_t chipid_shift = 0;
2446 	uint_t coreid_shift = 0;
2447 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2448 
2449 	/*
2450 	 * There are no compute units or processor nodes currently on Intel.
2451 	 * Always set these to one.
2452 	 */
2453 	cpi->cpi_procnodes_per_pkg = 1;
2454 	cpi->cpi_cores_per_compunit = 1;
2455 
2456 	/*
2457 	 * If cpuid Leaf B is present, use that to try and get this information.
2458 	 * It will be the most accurate for Intel CPUs.
2459 	 */
2460 	if (cpuid_leafB_getids(cpu))
2461 		return;
2462 
2463 	/*
2464 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2465 	 * and ncore_per_chip. These represent the largest power of two values
2466 	 * that we need to cover all of the IDs in the system. Therefore, we use
2467 	 * those values to seed the number of bits needed to cover information
2468 	 * in the case when leaf B is not available. These values will probably
2469 	 * be larger than required, but that's OK.
2470 	 */
2471 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2472 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2473 
2474 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2475 		chipid_shift++;
2476 
2477 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2478 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2479 
2480 	if (is_x86_feature(feature, X86FSET_CMP)) {
2481 		/*
2482 		 * Multi-core (and possibly multi-threaded)
2483 		 * processors.
2484 		 */
2485 		uint_t ncpu_per_core = 0;
2486 
2487 		if (cpi->cpi_ncore_per_chip == 1)
2488 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2489 		else if (cpi->cpi_ncore_per_chip > 1)
2490 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2491 			    cpi->cpi_ncore_per_chip;
2492 		/*
2493 		 * 8bit APIC IDs on dual core Pentiums
2494 		 * look like this:
2495 		 *
2496 		 * +-----------------------+------+------+
2497 		 * | Physical Package ID   |  MC  |  HT  |
2498 		 * +-----------------------+------+------+
2499 		 * <------- chipid -------->
2500 		 * <------- coreid --------------->
2501 		 *			   <--- clogid -->
2502 		 *			   <------>
2503 		 *			   pkgcoreid
2504 		 *
2505 		 * Where the number of bits necessary to
2506 		 * represent MC and HT fields together equals
2507 		 * to the minimum number of bits necessary to
2508 		 * store the value of cpi->cpi_ncpu_per_chip.
2509 		 * Of those bits, the MC part uses the number
2510 		 * of bits necessary to store the value of
2511 		 * cpi->cpi_ncore_per_chip.
2512 		 */
2513 		for (i = 1; i < ncpu_per_core; i <<= 1)
2514 			coreid_shift++;
2515 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2516 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2517 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2518 		/*
2519 		 * Single-core multi-threaded processors.
2520 		 */
2521 		cpi->cpi_coreid = cpi->cpi_chipid;
2522 		cpi->cpi_pkgcoreid = 0;
2523 	} else {
2524 		/*
2525 		 * Single-core single-thread processors.
2526 		 */
2527 		cpi->cpi_coreid = cpu->cpu_id;
2528 		cpi->cpi_pkgcoreid = 0;
2529 	}
2530 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2531 	cpi->cpi_compunitid = cpi->cpi_coreid;
2532 }
2533 
2534 /*
2535  * Historically, AMD has had CMP chips with only a single thread per core.
2536  * However, starting in family 17h (Zen), this has changed and they now have
2537  * multiple threads. Our internal core id needs to be a unique value.
2538  *
2539  * To determine the core id of an AMD system, if we're from a family before 17h,
2540  * then we just use the cpu id, as that gives us a good value that will be
2541  * unique for each core. If instead, we're on family 17h or later, then we need
2542  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2543  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2544  * We can't use the normal core id in that leaf as it's only unique within the
2545  * socket, which is perfect for cpi_pkgcoreid, but not us.
2546  */
2547 static id_t
2548 cpuid_amd_get_coreid(cpu_t *cpu)
2549 {
2550 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2551 
2552 	if (cpi->cpi_family >= 0x17 &&
2553 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2554 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2555 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2556 		if (nthreads > 1) {
2557 			VERIFY3U(nthreads, ==, 2);
2558 			return (cpi->cpi_apicid >> 1);
2559 		}
2560 	}
2561 
2562 	return (cpu->cpu_id);
2563 }
2564 
/*
 * IDs on AMD is a more challenging task. This is notable because of the
 * following two facts:
 *
 *  1. Before family 0x17 (Zen), there was no support for SMT and there was
 *     also no way to get an actual unique core id from the system. As such, we
 *     synthesize this case by using cpu->cpu_id.  This scheme does not,
 *     however, guarantee that sibling cores of a chip will have sequential
 *     coreids starting at a multiple of the number of cores per chip - that is
 *     usually the case, but if the APIC IDs have been set up in a different
 *     order then we need to perform a few more gymnastics for the pkgcoreid.
 *
 *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
 *     called compute units. These compute units share the L1I cache, L2 cache,
 *     and the FPU. To deal with this, a new topology leaf was added in
 *     0x8000001e. However, parts of this leaf have different meanings
 *     once we get to family 0x17.
 *
 * cpuid_amd_getids() fills in the following fields of the CPU's cpuid_info:
 * cpi_coreid, cpi_compunitid, cpi_cores_per_compunit, cpi_procnodes_per_pkg,
 * cpi_clogid, cpi_pkgcoreid, cpi_procnodeid, cpi_chipid, cpi_ncore_bits and
 * cpi_nthread_bits. It reads cpi_apicid, cpi_ncore_per_chip and
 * cpi_ncpu_per_chip, so those must already be valid. The `features' argument
 * is not referenced in the body; the global x86_featureset is consulted
 * instead.
 */

static void
cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
{
	int i, first_half, coreidsz;
	uint32_t nb_caps_reg;
	uint_t node2_1;
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
	struct cpuid_regs *cp;

	/*
	 * Calculate the core id (this comes from hardware in family 0x17 if it
	 * hasn't been stripped by virtualization). We always set the compute
	 * unit id to the same value. Also, initialize the default number of
	 * cores per compute unit and nodes per package. This will be
	 * overwritten when we know information about a particular family.
	 */
	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
	cpi->cpi_compunitid = cpi->cpi_coreid;
	cpi->cpi_cores_per_compunit = 1;
	cpi->cpi_procnodes_per_pkg = 1;

	/*
	 * To construct the logical ID, we need to determine how many APIC IDs
	 * are dedicated to the cores and threads. This is provided for us in
	 * 0x80000008. However, if it's not present (say due to virtualization),
	 * then we assume it's one. This should be present on all 64-bit AMD
	 * processors.  It was added in family 0xf (Hammer).
	 */
	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);

		/*
		 * In AMD parlance chip is really a node while illumos
		 * uses chip as equivalent to socket/package.
		 */
		if (coreidsz == 0) {
			/*
			 * Use legacy method: count the bits needed to number
			 * cpi_ncore_per_chip cores, with a minimum of one.
			 */
			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
				coreidsz++;
			if (coreidsz == 0)
				coreidsz = 1;
		}
	} else {
		/* Assume single-core part */
		coreidsz = 1;
	}
	/* The logical id is the low coreidsz bits of the APIC ID. */
	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);

	/*
	 * The package core ID varies depending on the family. While it may be
	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
	 * this value is the core id in the given node. For non-virtualized
	 * family 17h, we need to take the logical core id and shift off the
	 * threads like we do when getting the core id.  Otherwise, we can use
	 * the clogid as is. When family 17h is virtualized, the clogid should
	 * be sufficient as if we don't have valid data in the leaf, then we
	 * won't think we have SMT, in which case the cpi_clogid should be
	 * sufficient.
	 */
	if (cpi->cpi_family >= 0x17 &&
	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
		if (nthreads > 1) {
			/* Only two threads per core is handled here. */
			VERIFY3U(nthreads, ==, 2);
			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
		} else {
			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
		}
	} else {
		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
	}

	/*
	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
	 * (bulldozer) or newer, then we can derive all of this from leaf
	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
	 */
	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
		cp = &cpi->cpi_extd[0x1e];

		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);

		/*
		 * For Bulldozer-era CPUs, recalculate the compute unit
		 * information.
		 */
		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
			cpi->cpi_cores_per_compunit =
			    BITX(cp->cp_ebx, 15, 8) + 1;
			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
			    (cpi->cpi_ncore_per_chip /
			    cpi->cpi_cores_per_compunit) *
			    (cpi->cpi_procnodeid /
			    cpi->cpi_procnodes_per_pkg);
		}
	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
		/* Three bits of node id sit directly above the core id bits. */
		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
	} else if (cpi->cpi_family == 0x10) {
		/*
		 * See if we are a multi-node processor.
		 * All processors in the system have the same number of nodes
		 *
		 * NOTE(review): the pci_getl_func() arguments are presumably
		 * (bus, device, function, register offset), i.e. F3xe8 of
		 * device 24 -- confirm against its definition.
		 */
		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
			/* Single-node */
			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
			    coreidsz);
		} else {

			/*
			 * Multi-node revision D (2 nodes per package
			 * are supported)
			 */
			cpi->cpi_procnodes_per_pkg = 2;

			/* True when this core is in the lower half. */
			first_half = (cpi->cpi_pkgcoreid <=
			    (cpi->cpi_ncore_per_chip/2 - 1));

			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
				/* We are BSP */
				cpi->cpi_procnodeid = (first_half ? 0 : 1);
			} else {

				/* We are AP */
				/* NodeId[2:1] bits to use for reading F3xe8 */
				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;

				nb_caps_reg =
				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);

				/*
				 * Check IntNodeNum bit (31:30, but bit 31 is
				 * always 0 on dual-node processors)
				 */
				if (BITX(nb_caps_reg, 30, 30) == 0)
					cpi->cpi_procnodeid = node2_1 +
					    !first_half;
				else
					cpi->cpi_procnodeid = node2_1 +
					    first_half;
			}
		}
	} else {
		cpi->cpi_procnodeid = 0;
	}

	/* The chip (package) id is the node id scaled by nodes per package. */
	cpi->cpi_chipid =
	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;

	cpi->cpi_ncore_bits = coreidsz;
	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
	    cpi->cpi_ncore_per_chip);
}
2741 
/*
 * Flush routine that deliberately does nothing. It is the initial value of
 * the spec_uarch_flush function pointer.
 */
static void
spec_uarch_flush_noop(void)
{
}
2746 
/*
 * Flush routine that writes IA32_FLUSH_CMD_L1D to the MSR_IA32_FLUSH_CMD
 * MSR.
 *
 * When microcode is present that mitigates MDS, this wrmsr will also flush the
 * MDS-related micro-architectural state that would normally happen by calling
 * x86_md_clear().
 */
static void
spec_uarch_flush_msr(void)
{
	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
}
2757 
/*
 * This function pointer refers to the routine that will flush certain
 * micro-architectural state on the processor. This flush is used to mitigate
 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. The
 * pointer can refer to one of three functions:
 *
 * - A noop, which is used either because we are vulnerable but do not have
 *   microcode available to help deal with a fix, or because we aren't
 *   vulnerable. This is the initial value.
 *
 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
 *   mitigate MDS is present, also perform the equivalent of the MDS flush;
 *   however, it only flushes the MDS related micro-architectural state on the
 *   current hyperthread, it does not do anything for the twin.
 *
 * - x86_md_clear which will flush the MDS related state. This is done when we
 *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
 *   (RDCL_NO is set).
 */
void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2778 
2779 static void
2780 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2781 {
2782 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2783 
2784 	/*
2785 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2786 	 * has been fixed in hardware, it doesn't cover everything related to
2787 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2788 	 * need to mitigate this.
2789 	 */
2790 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2791 	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2792 		return;
2793 	}
2794 
2795 	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2796 		const uint8_t nop = NOP_INSTR;
2797 		uint8_t *md = (uint8_t *)x86_md_clear;
2798 
2799 		*md = nop;
2800 	}
2801 
2802 	membar_producer();
2803 }
2804 
2805 static void
2806 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2807 {
2808 	boolean_t need_l1d, need_mds;
2809 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2810 
2811 	/*
2812 	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2813 	 * hardware, then there's nothing left for us to do for enabling the
2814 	 * flush. We can also go ahead and say that SMT exclusion is
2815 	 * unnecessary.
2816 	 */
2817 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2818 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2819 	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2820 		extern int smt_exclusion;
2821 		smt_exclusion = 0;
2822 		spec_uarch_flush = spec_uarch_flush_noop;
2823 		membar_producer();
2824 		return;
2825 	}
2826 
2827 	/*
2828 	 * The locations where we need to perform an L1D flush are required both
2829 	 * for mitigating L1TF and MDS. When verw support is present in
2830 	 * microcode, then the L1D flush will take care of doing that as well.
2831 	 * However, if we have a system where RDCL_NO is present, but we don't
2832 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2833 	 * L1D flush.
2834 	 */
2835 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2836 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2837 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2838 		need_l1d = B_TRUE;
2839 	} else {
2840 		need_l1d = B_FALSE;
2841 	}
2842 
2843 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2844 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2845 		need_mds = B_TRUE;
2846 	} else {
2847 		need_mds = B_FALSE;
2848 	}
2849 
2850 	if (need_l1d) {
2851 		spec_uarch_flush = spec_uarch_flush_msr;
2852 	} else if (need_mds) {
2853 		spec_uarch_flush = x86_md_clear;
2854 	} else {
2855 		/*
2856 		 * We have no hardware mitigations available to us.
2857 		 */
2858 		spec_uarch_flush = spec_uarch_flush_noop;
2859 	}
2860 	membar_producer();
2861 }
2862 
2863 /*
2864  * We default to enabling RSB mitigations.
2865  *
2866  * NOTE: We used to skip RSB mitigations with eIBRS, but developments around
2867  * post-barrier RSB guessing suggests we should enable RSB mitigations always
2868  * unless specifically instructed not to.
2869  */
2870 static void
2871 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2872 {
2873 	const uint8_t ret = RET_INSTR;
2874 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2875 
2876 	switch (mit) {
2877 	case X86_SPECTREV2_DISABLED:
2878 		*stuff = ret;
2879 		break;
2880 	default:
2881 		break;
2882 	}
2883 }
2884 
2885 static void
2886 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2887 {
2888 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2889 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2890 	    "_r14", "_r15" };
2891 	const uint_t nthunks = ARRAY_SIZE(thunks);
2892 	const char *type;
2893 	uint_t i;
2894 
2895 	if (mit == x86_spectrev2_mitigation)
2896 		return;
2897 
2898 	switch (mit) {
2899 	case X86_SPECTREV2_RETPOLINE:
2900 		type = "gen";
2901 		break;
2902 	case X86_SPECTREV2_ENHANCED_IBRS:
2903 	case X86_SPECTREV2_DISABLED:
2904 		type = "jmp";
2905 		break;
2906 	default:
2907 		panic("asked to updated retpoline state with unknown state!");
2908 	}
2909 
2910 	for (i = 0; i < nthunks; i++) {
2911 		uintptr_t source, dest;
2912 		int ssize, dsize;
2913 		char sourcebuf[64], destbuf[64];
2914 
2915 		(void) snprintf(destbuf, sizeof (destbuf),
2916 		    "__x86_indirect_thunk%s", thunks[i]);
2917 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
2918 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
2919 
2920 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2921 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
2922 		VERIFY3U(source, !=, 0);
2923 		VERIFY3U(dest, !=, 0);
2924 		VERIFY3S(dsize, >=, ssize);
2925 		bcopy((void *)source, (void *)dest, ssize);
2926 	}
2927 }
2928 
2929 static void
2930 cpuid_enable_enhanced_ibrs(void)
2931 {
2932 	uint64_t val;
2933 
2934 	val = rdmsr(MSR_IA32_SPEC_CTRL);
2935 	val |= IA32_SPEC_CTRL_IBRS;
2936 	wrmsr(MSR_IA32_SPEC_CTRL, val);
2937 }
2938 
2939 /*
2940  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2941  * we can disable TSX, we do so.
2942  *
2943  * This determination is done only on the boot CPU, potentially after loading
2944  * updated microcode.
2945  */
2946 static void
2947 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2948 {
2949 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2950 
2951 	VERIFY(cpu->cpu_id == 0);
2952 
2953 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2954 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2955 		return;
2956 	}
2957 
2958 	if (x86_disable_taa) {
2959 		x86_taa_mitigation = X86_TAA_DISABLED;
2960 		return;
2961 	}
2962 
2963 	/*
2964 	 * If we do not have the ability to disable TSX, then our only
2965 	 * mitigation options are in hardware (TAA_NO), or by using our existing
2966 	 * MDS mitigation as described above.  The latter relies upon us having
2967 	 * configured MDS mitigations correctly! This includes disabling SMT if
2968 	 * we want to cross-CPU-thread protection.
2969 	 */
2970 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2971 		/*
2972 		 * It's not clear whether any parts will enumerate TAA_NO
2973 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
2974 		 */
2975 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2976 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2977 			return;
2978 		}
2979 
2980 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2981 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2982 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
2983 		} else {
2984 			x86_taa_mitigation = X86_TAA_NOTHING;
2985 		}
2986 		return;
2987 	}
2988 
2989 	/*
2990 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
2991 	 * enough in boot.
2992 	 *
2993 	 * Otherwise, we'll fall back to causing transactions to abort as our
2994 	 * mitigation. TSX-using code will always take the fallback path.
2995 	 */
2996 	if (cpi->cpi_pass < 4) {
2997 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2998 	} else {
2999 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3000 	}
3001 }
3002 
3003 /*
3004  * As mentioned, we should only touch the MSR when we've got a suitable
3005  * microcode loaded on this CPU.
3006  */
3007 static void
3008 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3009 {
3010 	uint64_t val;
3011 
3012 	switch (taa) {
3013 	case X86_TAA_TSX_DISABLE:
3014 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3015 			return;
3016 		val = rdmsr(MSR_IA32_TSX_CTRL);
3017 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3018 		wrmsr(MSR_IA32_TSX_CTRL, val);
3019 		break;
3020 	case X86_TAA_TSX_FORCE_ABORT:
3021 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3022 			return;
3023 		val = rdmsr(MSR_IA32_TSX_CTRL);
3024 		val |= IA32_TSX_CTRL_RTM_DISABLE;
3025 		wrmsr(MSR_IA32_TSX_CTRL, val);
3026 		break;
3027 	case X86_TAA_HW_MITIGATED:
3028 	case X86_TAA_MD_CLEAR:
3029 	case X86_TAA_DISABLED:
3030 	case X86_TAA_NOTHING:
3031 		break;
3032 	}
3033 }
3034 
3035 static void
3036 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3037 {
3038 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3039 	x86_spectrev2_mitigation_t v2mit;
3040 
3041 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3042 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3043 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3044 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3045 			add_x86_feature(featureset, X86FSET_IBPB);
3046 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3047 			add_x86_feature(featureset, X86FSET_IBRS);
3048 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3049 			add_x86_feature(featureset, X86FSET_STIBP);
3050 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3051 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3052 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3053 			add_x86_feature(featureset, X86FSET_SSBD);
3054 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3055 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3056 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3057 			add_x86_feature(featureset, X86FSET_SSB_NO);
3058 		/*
3059 		 * Don't enable enhanced IBRS unless we're told that we should
3060 		 * prefer it and it has the same semantics as Intel. This is
3061 		 * split into two bits rather than a single one.
3062 		 */
3063 		if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3064 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
3065 			add_x86_feature(featureset, X86FSET_IBRS_ALL);
3066 		}
3067 
3068 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3069 	    cpi->cpi_maxeax >= 7) {
3070 		struct cpuid_regs *ecp;
3071 		ecp = &cpi->cpi_std[7];
3072 
3073 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3074 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3075 		}
3076 
3077 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3078 			add_x86_feature(featureset, X86FSET_IBRS);
3079 			add_x86_feature(featureset, X86FSET_IBPB);
3080 		}
3081 
3082 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3083 			add_x86_feature(featureset, X86FSET_STIBP);
3084 		}
3085 
3086 		/*
3087 		 * Don't read the arch caps MSR on xpv where we lack the
3088 		 * on_trap().
3089 		 */
3090 #ifndef __xpv
3091 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3092 			on_trap_data_t otd;
3093 
3094 			/*
3095 			 * Be paranoid and assume we'll get a #GP.
3096 			 */
3097 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3098 				uint64_t reg;
3099 
3100 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3101 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3102 					add_x86_feature(featureset,
3103 					    X86FSET_RDCL_NO);
3104 				}
3105 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3106 					add_x86_feature(featureset,
3107 					    X86FSET_IBRS_ALL);
3108 				}
3109 				if (reg & IA32_ARCH_CAP_RSBA) {
3110 					add_x86_feature(featureset,
3111 					    X86FSET_RSBA);
3112 				}
3113 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3114 					add_x86_feature(featureset,
3115 					    X86FSET_L1D_VM_NO);
3116 				}
3117 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3118 					add_x86_feature(featureset,
3119 					    X86FSET_SSB_NO);
3120 				}
3121 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3122 					add_x86_feature(featureset,
3123 					    X86FSET_MDS_NO);
3124 				}
3125 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3126 					add_x86_feature(featureset,
3127 					    X86FSET_TSX_CTRL);
3128 				}
3129 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3130 					add_x86_feature(featureset,
3131 					    X86FSET_TAA_NO);
3132 				}
3133 			}
3134 			no_trap();
3135 		}
3136 #endif	/* !__xpv */
3137 
3138 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3139 			add_x86_feature(featureset, X86FSET_SSBD);
3140 
3141 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3142 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3143 	}
3144 
3145 	/*
3146 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3147 	 * will have already run this function and determined what we need to
3148 	 * do. This gives us a hook for per-HW thread mitigations such as
3149 	 * enhanced IBRS, or disabling TSX.
3150 	 */
3151 	if (cpu->cpu_id != 0) {
3152 		if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
3153 			cpuid_enable_enhanced_ibrs();
3154 		}
3155 
3156 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3157 		return;
3158 	}
3159 
3160 	/*
3161 	 * Go through and initialize various security mechanisms that we should
3162 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3163 	 * TAA.
3164 	 */
3165 
3166 	/*
3167 	 * By default we've come in with retpolines enabled. Check whether we
3168 	 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
3169 	 * by default, but disabled if we are using enhanced IBRS. Note, we do
3170 	 * not allow the use of AMD optimized retpolines as it was disclosed by
3171 	 * AMD in March 2022 that they were still vulnerable. Prior to that
3172 	 * point, we used them.
3173 	 */
3174 	if (x86_disable_spectrev2 != 0) {
3175 		v2mit = X86_SPECTREV2_DISABLED;
3176 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3177 		cpuid_enable_enhanced_ibrs();
3178 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3179 	} else {
3180 		v2mit = X86_SPECTREV2_RETPOLINE;
3181 	}
3182 
3183 	cpuid_patch_retpolines(v2mit);
3184 	cpuid_patch_rsb(v2mit);
3185 	x86_spectrev2_mitigation = v2mit;
3186 	membar_producer();
3187 
3188 	/*
3189 	 * We need to determine what changes are required for mitigating L1TF
3190 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3191 	 * is required.
3192 	 *
3193 	 * If any of these are present, then we need to flush u-arch state at
3194 	 * various points. For MDS, we need to do so whenever we change to a
3195 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3196 	 * flush the L1D cache at VM entry. When we have microcode that handles
3197 	 * MDS, the L1D flush also clears the other u-arch state that the
3198 	 * md_clear does.
3199 	 */
3200 
3201 	/*
3202 	 * Update whether or not we need to be taking explicit action against
3203 	 * MDS.
3204 	 */
3205 	cpuid_update_md_clear(cpu, featureset);
3206 
3207 	/*
3208 	 * Determine whether SMT exclusion is required and whether or not we
3209 	 * need to perform an l1d flush.
3210 	 */
3211 	cpuid_update_l1d_flush(cpu, featureset);
3212 
3213 	/*
3214 	 * Determine what our mitigation strategy should be for TAA and then
3215 	 * also apply TAA mitigations.
3216 	 */
3217 	cpuid_update_tsx(cpu, featureset);
3218 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3219 }
3220 
3221 /*
3222  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3223  */
3224 void
3225 setup_xfem(void)
3226 {
3227 	uint64_t flags = XFEATURE_LEGACY_FP;
3228 
3229 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3230 
3231 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3232 		flags |= XFEATURE_SSE;
3233 
3234 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3235 		flags |= XFEATURE_AVX;
3236 
3237 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3238 		flags |= XFEATURE_AVX512;
3239 
3240 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3241 
3242 	xsave_bv_all = flags;
3243 }
3244 
3245 static void
3246 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3247 {
3248 	struct cpuid_info *cpi;
3249 
3250 	cpi = cpu->cpu_m.mcpu_cpi;
3251 
3252 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3253 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3254 		cpuid_gather_amd_topology_leaves(cpu);
3255 	}
3256 
3257 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3258 
3259 	/*
3260 	 * Before we can calculate the IDs that we should assign to this
3261 	 * processor, we need to understand how many cores and threads it has.
3262 	 */
3263 	switch (cpi->cpi_vendor) {
3264 	case X86_VENDOR_Intel:
3265 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3266 		    &cpi->cpi_ncore_per_chip);
3267 		break;
3268 	case X86_VENDOR_AMD:
3269 	case X86_VENDOR_HYGON:
3270 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3271 		    &cpi->cpi_ncore_per_chip);
3272 		break;
3273 	default:
3274 		/*
3275 		 * If we have some other x86 compatible chip, it's not clear how
3276 		 * they would behave. The most common case is virtualization
3277 		 * today, though there are also 64-bit VIA chips. Assume that
3278 		 * all we can get is the basic Leaf 1 HTT information.
3279 		 */
3280 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3281 			cpi->cpi_ncore_per_chip = 1;
3282 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3283 		}
3284 		break;
3285 	}
3286 
3287 	/*
3288 	 * Based on the calculated number of threads and cores, potentially
3289 	 * assign the HTT and CMT features.
3290 	 */
3291 	if (cpi->cpi_ncore_per_chip > 1) {
3292 		add_x86_feature(featureset, X86FSET_CMP);
3293 	}
3294 
3295 	if (cpi->cpi_ncpu_per_chip > 1 &&
3296 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3297 		add_x86_feature(featureset, X86FSET_HTT);
3298 	}
3299 
3300 	/*
3301 	 * Now that has been set up, we need to go through and calculate all of
3302 	 * the rest of the parameters that exist. If we think the CPU doesn't
3303 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3304 	 * up information in some way. The most likely case for this is
3305 	 * virtualization where we have a lot of partial topology information.
3306 	 */
3307 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3308 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3309 		/*
3310 		 * This is a single core, single-threaded processor.
3311 		 */
3312 		cpi->cpi_procnodes_per_pkg = 1;
3313 		cpi->cpi_cores_per_compunit = 1;
3314 		cpi->cpi_compunitid = 0;
3315 		cpi->cpi_chipid = -1;
3316 		cpi->cpi_clogid = 0;
3317 		cpi->cpi_coreid = cpu->cpu_id;
3318 		cpi->cpi_pkgcoreid = 0;
3319 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3320 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3321 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3322 		} else {
3323 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3324 		}
3325 	} else {
3326 		switch (cpi->cpi_vendor) {
3327 		case X86_VENDOR_Intel:
3328 			cpuid_intel_getids(cpu, featureset);
3329 			break;
3330 		case X86_VENDOR_AMD:
3331 		case X86_VENDOR_HYGON:
3332 			cpuid_amd_getids(cpu, featureset);
3333 			break;
3334 		default:
3335 			/*
3336 			 * In this case, it's hard to say what we should do.
3337 			 * We're going to model them to the OS as single core
3338 			 * threads. We don't have a good identifier for them, so
3339 			 * we're just going to use the cpu id all on a single
3340 			 * chip.
3341 			 *
3342 			 * This case has historically been different from the
3343 			 * case above where we don't have HTT or CMP. While they
3344 			 * could be combined, we've opted to keep it separate to
3345 			 * minimize the risk of topology changes in weird cases.
3346 			 */
3347 			cpi->cpi_procnodes_per_pkg = 1;
3348 			cpi->cpi_cores_per_compunit = 1;
3349 			cpi->cpi_chipid = 0;
3350 			cpi->cpi_coreid = cpu->cpu_id;
3351 			cpi->cpi_clogid = cpu->cpu_id;
3352 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3353 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3354 			cpi->cpi_compunitid = cpi->cpi_coreid;
3355 			break;
3356 		}
3357 	}
3358 }
3359 
3360 /*
3361  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3362  * always gather leaf 6 if it's supported; however, we only look for features on
3363  * Intel systems as AMD does not currently define any of the features we look
3364  * for below.
3365  */
3366 static void
3367 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3368 {
3369 	struct cpuid_regs *cp;
3370 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3371 
3372 	if (cpi->cpi_maxeax < 6) {
3373 		return;
3374 	}
3375 
3376 	cp = &cpi->cpi_std[6];
3377 	cp->cp_eax = 6;
3378 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3379 	(void) __cpuid_insn(cp);
3380 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3381 
3382 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3383 		return;
3384 	}
3385 
3386 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3387 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3388 	}
3389 
3390 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3391 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3392 	}
3393 }
3394 
3395 /*
3396  * This is used when we discover that we have AVX support in cpuid. This
3397  * proceeds to scan for the rest of the AVX derived features.
3398  */
3399 static void
3400 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3401 {
3402 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3403 
3404 	/*
3405 	 * If we don't have AVX, don't bother with most of this.
3406 	 */
3407 	if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3408 		return;
3409 
3410 	add_x86_feature(featureset, X86FSET_AVX);
3411 
3412 	/*
3413 	 * Intel says we can't check these without also
3414 	 * checking AVX.
3415 	 */
3416 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3417 		add_x86_feature(featureset, X86FSET_F16C);
3418 
3419 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3420 		add_x86_feature(featureset, X86FSET_FMA);
3421 
3422 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3423 		add_x86_feature(featureset, X86FSET_BMI1);
3424 
3425 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3426 		add_x86_feature(featureset, X86FSET_BMI2);
3427 
3428 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3429 		add_x86_feature(featureset, X86FSET_AVX2);
3430 
3431 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3432 		add_x86_feature(featureset, X86FSET_VAES);
3433 
3434 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3435 		add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3436 
3437 	/*
3438 	 * The rest of the AVX features require AVX512. Do not check them unless
3439 	 * it is present.
3440 	 */
3441 	if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3442 		return;
3443 	add_x86_feature(featureset, X86FSET_AVX512F);
3444 
3445 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3446 		add_x86_feature(featureset, X86FSET_AVX512DQ);
3447 
3448 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3449 		add_x86_feature(featureset, X86FSET_AVX512FMA);
3450 
3451 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3452 		add_x86_feature(featureset, X86FSET_AVX512PF);
3453 
3454 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3455 		add_x86_feature(featureset, X86FSET_AVX512ER);
3456 
3457 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3458 		add_x86_feature(featureset, X86FSET_AVX512CD);
3459 
3460 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3461 		add_x86_feature(featureset, X86FSET_AVX512BW);
3462 
3463 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3464 		add_x86_feature(featureset, X86FSET_AVX512VL);
3465 
3466 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3467 		add_x86_feature(featureset, X86FSET_AVX512VBMI);
3468 
3469 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3470 		add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3471 
3472 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3473 		add_x86_feature(featureset, X86FSET_AVX512VNNI);
3474 
3475 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3476 		add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3477 
3478 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3479 		add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3480 
3481 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3482 		add_x86_feature(featureset, X86FSET_AVX512NNIW);
3483 
3484 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3485 		add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3486 
3487 	/*
3488 	 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3489 	 * we don't need to.
3490 	 */
3491 	if (cpi->cpi_std[7].cp_eax < 1)
3492 		return;
3493 
3494 	if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3495 		add_x86_feature(featureset, X86FSET_AVX512_BF16);
3496 }
3497 
3498 /*
3499  * PPIN is the protected processor inventory number. On AMD this is an actual
3500  * feature bit. However, on Intel systems we need to read the platform
3501  * information MSR if we're on a specific model.
3502  */
3503 #if !defined(__xpv)
3504 static void
3505 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3506 {
3507 	on_trap_data_t otd;
3508 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3509 
3510 	switch (cpi->cpi_vendor) {
3511 	case X86_VENDOR_AMD:
3512 		/*
3513 		 * This leaf will have already been gathered in the topology
3514 		 * functions.
3515 		 */
3516 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3517 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3518 				add_x86_feature(featureset, X86FSET_PPIN);
3519 			}
3520 		}
3521 		break;
3522 	case X86_VENDOR_Intel:
3523 		if (cpi->cpi_family != 6)
3524 			break;
3525 		switch (cpi->cpi_model) {
3526 		case INTC_MODEL_IVYBRIDGE_XEON:
3527 		case INTC_MODEL_HASWELL_XEON:
3528 		case INTC_MODEL_BROADWELL_XEON:
3529 		case INTC_MODEL_BROADWELL_XEON_D:
3530 		case INTC_MODEL_SKYLAKE_XEON:
3531 		case INTC_MODEL_ICELAKE_XEON:
3532 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3533 				uint64_t value;
3534 
3535 				value = rdmsr(MSR_PLATFORM_INFO);
3536 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3537 					add_x86_feature(featureset,
3538 					    X86FSET_PPIN);
3539 				}
3540 			}
3541 			no_trap();
3542 			break;
3543 		default:
3544 			break;
3545 		}
3546 		break;
3547 	default:
3548 		break;
3549 	}
3550 }
3551 #endif	/* ! __xpv */
3552 
3553 static void
3554 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3555 {
3556 	uchar_t *featureset = (uchar_t *)arg;
3557 
3558 	/*
3559 	 * We don't run on any processor that doesn't have cpuid, and could not
3560 	 * possibly have arrived here.
3561 	 */
3562 	add_x86_feature(featureset, X86FSET_CPUID);
3563 }
3564 
3565 static void
3566 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3567 {
3568 	struct cpuid_info *cpi;
3569 	struct cpuid_regs *cp;
3570 
3571 	/*
3572 	 * We require that virtual/native detection be complete and that PCI
3573 	 * config space access has been set up; at present there is no reliable
3574 	 * way to determine the latter.
3575 	 */
3576 #if !defined(__xpv)
3577 	ASSERT3S(platform_type, !=, -1);
3578 #endif	/* !__xpv */
3579 
3580 	cpi = cpu->cpu_m.mcpu_cpi;
3581 	ASSERT(cpi != NULL);
3582 
3583 	cp = &cpi->cpi_std[0];
3584 	cp->cp_eax = 0;
3585 	cpi->cpi_maxeax = __cpuid_insn(cp);
3586 	{
3587 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3588 		*iptr++ = cp->cp_ebx;
3589 		*iptr++ = cp->cp_edx;
3590 		*iptr++ = cp->cp_ecx;
3591 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3592 	}
3593 
3594 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3595 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3596 
3597 	/*
3598 	 * Limit the range in case of weird hardware
3599 	 */
3600 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3601 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3602 	if (cpi->cpi_maxeax < 1)
3603 		return;
3604 
3605 	cp = &cpi->cpi_std[1];
3606 	cp->cp_eax = 1;
3607 	(void) __cpuid_insn(cp);
3608 
3609 	/*
3610 	 * Extract identifying constants for easy access.
3611 	 */
3612 	cpi->cpi_model = CPI_MODEL(cpi);
3613 	cpi->cpi_family = CPI_FAMILY(cpi);
3614 
3615 	if (cpi->cpi_family == 0xf)
3616 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3617 
3618 	/*
3619 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3620 	 * Intel, and presumably everyone else, uses model == 0xf, as
3621 	 * one would expect (max value means possible overflow).  Sigh.
3622 	 */
3623 
3624 	switch (cpi->cpi_vendor) {
3625 	case X86_VENDOR_Intel:
3626 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3627 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3628 		break;
3629 	case X86_VENDOR_AMD:
3630 		if (CPI_FAMILY(cpi) == 0xf)
3631 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3632 		break;
3633 	case X86_VENDOR_HYGON:
3634 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3635 		break;
3636 	default:
3637 		if (cpi->cpi_model == 0xf)
3638 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3639 		break;
3640 	}
3641 
3642 	cpi->cpi_step = CPI_STEP(cpi);
3643 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3644 
3645 	/*
3646 	 * Synthesize chip "revision" and socket type
3647 	 */
3648 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3649 	    cpi->cpi_model, cpi->cpi_step);
3650 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3651 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3652 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3653 	    cpi->cpi_model, cpi->cpi_step);
3654 	cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
3655 	    cpi->cpi_model, cpi->cpi_step);
3656 }
3657 
3658 static void
3659 cpuid_pass_basic(cpu_t *cpu, void *arg)
3660 {
3661 	uchar_t *featureset = (uchar_t *)arg;
3662 	uint32_t mask_ecx, mask_edx;
3663 	struct cpuid_info *cpi;
3664 	struct cpuid_regs *cp;
3665 	int xcpuid;
3666 #if !defined(__xpv)
3667 	extern int idle_cpu_prefer_mwait;
3668 #endif
3669 
3670 	cpi = cpu->cpu_m.mcpu_cpi;
3671 	ASSERT(cpi != NULL);
3672 
3673 	if (cpi->cpi_maxeax < 1)
3674 		return;
3675 
3676 	/*
3677 	 * This was filled during the identification pass.
3678 	 */
3679 	cp = &cpi->cpi_std[1];
3680 
3681 	/*
3682 	 * *default* assumptions:
3683 	 * - believe %edx feature word
3684 	 * - ignore %ecx feature word
3685 	 * - 32-bit virtual and physical addressing
3686 	 */
3687 	mask_edx = 0xffffffff;
3688 	mask_ecx = 0;
3689 
3690 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3691 
3692 	switch (cpi->cpi_vendor) {
3693 	case X86_VENDOR_Intel:
3694 		if (cpi->cpi_family == 5)
3695 			x86_type = X86_TYPE_P5;
3696 		else if (IS_LEGACY_P6(cpi)) {
3697 			x86_type = X86_TYPE_P6;
3698 			pentiumpro_bug4046376 = 1;
3699 			/*
3700 			 * Clear the SEP bit when it was set erroneously
3701 			 */
3702 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3703 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3704 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3705 			x86_type = X86_TYPE_P4;
3706 			/*
3707 			 * We don't currently depend on any of the %ecx
3708 			 * features until Prescott, so we'll only check
3709 			 * this from P4 onwards.  We might want to revisit
3710 			 * that idea later.
3711 			 */
3712 			mask_ecx = 0xffffffff;
3713 		} else if (cpi->cpi_family > 0xf)
3714 			mask_ecx = 0xffffffff;
3715 		/*
3716 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3717 		 * to obtain the monitor linesize.
3718 		 */
3719 		if (cpi->cpi_maxeax < 5)
3720 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3721 		break;
3722 	case X86_VENDOR_IntelClone:
3723 	default:
3724 		break;
3725 	case X86_VENDOR_AMD:
3726 #if defined(OPTERON_ERRATUM_108)
3727 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3728 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3729 			cpi->cpi_model = 0xc;
3730 		} else
3731 #endif
3732 		if (cpi->cpi_family == 5) {
3733 			/*
3734 			 * AMD K5 and K6
3735 			 *
3736 			 * These CPUs have an incomplete implementation
3737 			 * of MCA/MCE which we mask away.
3738 			 */
3739 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3740 
3741 			/*
3742 			 * Model 0 uses the wrong (APIC) bit
3743 			 * to indicate PGE.  Fix it here.
3744 			 */
3745 			if (cpi->cpi_model == 0) {
3746 				if (cp->cp_edx & 0x200) {
3747 					cp->cp_edx &= ~0x200;
3748 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3749 				}
3750 			}
3751 
3752 			/*
3753 			 * Early models had problems w/ MMX; disable.
3754 			 */
3755 			if (cpi->cpi_model < 6)
3756 				mask_edx &= ~CPUID_INTC_EDX_MMX;
3757 		}
3758 
3759 		/*
3760 		 * For newer families, SSE3 and CX16, at least, are valid;
3761 		 * enable all
3762 		 */
3763 		if (cpi->cpi_family >= 0xf)
3764 			mask_ecx = 0xffffffff;
3765 		/*
3766 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3767 		 * to obtain the monitor linesize.
3768 		 */
3769 		if (cpi->cpi_maxeax < 5)
3770 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3771 
3772 #if !defined(__xpv)
3773 		/*
3774 		 * AMD has not historically used MWAIT in the CPU's idle loop.
3775 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3776 		 * know for certain that in at least family 17h, per AMD, mwait
3777 		 * is preferred. Families in-between are less certain.
3778 		 */
3779 		if (cpi->cpi_family < 0x17) {
3780 			idle_cpu_prefer_mwait = 0;
3781 		}
3782 #endif
3783 
3784 		break;
3785 	case X86_VENDOR_HYGON:
3786 		/* Enable all for Hygon Dhyana CPU */
3787 		mask_ecx = 0xffffffff;
3788 		break;
3789 	case X86_VENDOR_TM:
3790 		/*
3791 		 * workaround the NT workaround in CMS 4.1
3792 		 */
3793 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3794 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3795 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3796 		break;
3797 	case X86_VENDOR_Centaur:
3798 		/*
3799 		 * workaround the NT workarounds again
3800 		 */
3801 		if (cpi->cpi_family == 6)
3802 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3803 		break;
3804 	case X86_VENDOR_Cyrix:
3805 		/*
3806 		 * We rely heavily on the probing in locore
3807 		 * to actually figure out what parts, if any,
3808 		 * of the Cyrix cpuid instruction to believe.
3809 		 */
3810 		switch (x86_type) {
3811 		case X86_TYPE_CYRIX_486:
3812 			mask_edx = 0;
3813 			break;
3814 		case X86_TYPE_CYRIX_6x86:
3815 			mask_edx = 0;
3816 			break;
3817 		case X86_TYPE_CYRIX_6x86L:
3818 			mask_edx =
3819 			    CPUID_INTC_EDX_DE |
3820 			    CPUID_INTC_EDX_CX8;
3821 			break;
3822 		case X86_TYPE_CYRIX_6x86MX:
3823 			mask_edx =
3824 			    CPUID_INTC_EDX_DE |
3825 			    CPUID_INTC_EDX_MSR |
3826 			    CPUID_INTC_EDX_CX8 |
3827 			    CPUID_INTC_EDX_PGE |
3828 			    CPUID_INTC_EDX_CMOV |
3829 			    CPUID_INTC_EDX_MMX;
3830 			break;
3831 		case X86_TYPE_CYRIX_GXm:
3832 			mask_edx =
3833 			    CPUID_INTC_EDX_MSR |
3834 			    CPUID_INTC_EDX_CX8 |
3835 			    CPUID_INTC_EDX_CMOV |
3836 			    CPUID_INTC_EDX_MMX;
3837 			break;
3838 		case X86_TYPE_CYRIX_MediaGX:
3839 			break;
3840 		case X86_TYPE_CYRIX_MII:
3841 		case X86_TYPE_VIA_CYRIX_III:
3842 			mask_edx =
3843 			    CPUID_INTC_EDX_DE |
3844 			    CPUID_INTC_EDX_TSC |
3845 			    CPUID_INTC_EDX_MSR |
3846 			    CPUID_INTC_EDX_CX8 |
3847 			    CPUID_INTC_EDX_PGE |
3848 			    CPUID_INTC_EDX_CMOV |
3849 			    CPUID_INTC_EDX_MMX;
3850 			break;
3851 		default:
3852 			break;
3853 		}
3854 		break;
3855 	}
3856 
3857 #if defined(__xpv)
3858 	/*
3859 	 * Do not support MONITOR/MWAIT under a hypervisor
3860 	 */
3861 	mask_ecx &= ~CPUID_INTC_ECX_MON;
3862 	/*
3863 	 * Do not support XSAVE under a hypervisor for now
3864 	 */
3865 	xsave_force_disable = B_TRUE;
3866 
3867 #endif	/* __xpv */
3868 
3869 	if (xsave_force_disable) {
3870 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3871 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
3872 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
3873 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
3874 	}
3875 
3876 	/*
3877 	 * Now we've figured out the masks that determine
3878 	 * which bits we choose to believe, apply the masks
3879 	 * to the feature words, then map the kernel's view
3880 	 * of these feature words into its feature word.
3881 	 */
3882 	cp->cp_edx &= mask_edx;
3883 	cp->cp_ecx &= mask_ecx;
3884 
3885 	/*
3886 	 * apply any platform restrictions (we don't call this
3887 	 * immediately after __cpuid_insn here, because we need the
3888 	 * workarounds applied above first)
3889 	 */
3890 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3891 
3892 	/*
3893 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3894 	 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
3895 	 * 7 has sub-leaves determined by ecx.
3896 	 */
3897 	if (cpi->cpi_maxeax >= 7) {
3898 		struct cpuid_regs *ecp;
3899 		ecp = &cpi->cpi_std[7];
3900 		ecp->cp_eax = 7;
3901 		ecp->cp_ecx = 0;
3902 		(void) __cpuid_insn(ecp);
3903 
3904 		/*
3905 		 * If XSAVE has been disabled, just ignore all of the
3906 		 * extended-save-area dependent flags here. By removing most of
3907 		 * the leaf 7, sub-leaf 0 flags, that will ensure that we don't
3908 		 * end up looking at additional xsave dependent leaves right
3909 		 * now.
3910 		 */
3911 		if (xsave_force_disable) {
3912 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3913 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3914 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3915 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3916 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3917 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3918 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3919 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
3920 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
3921 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
3922 		}
3923 
3924 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3925 			add_x86_feature(featureset, X86FSET_SMEP);
3926 
3927 		/*
3928 		 * We check disable_smap here in addition to in startup_smap()
3929 		 * to ensure CPUs that aren't the boot CPU don't accidentally
3930 		 * include it in the feature set and thus generate a mismatched
3931 		 * x86 feature set across CPUs.
3932 		 */
3933 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3934 		    disable_smap == 0)
3935 			add_x86_feature(featureset, X86FSET_SMAP);
3936 
3937 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3938 			add_x86_feature(featureset, X86FSET_RDSEED);
3939 
3940 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3941 			add_x86_feature(featureset, X86FSET_ADX);
3942 
3943 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3944 			add_x86_feature(featureset, X86FSET_FSGSBASE);
3945 
3946 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3947 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3948 
3949 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3950 			add_x86_feature(featureset, X86FSET_INVPCID);
3951 
3952 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3953 			add_x86_feature(featureset, X86FSET_UMIP);
3954 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3955 			add_x86_feature(featureset, X86FSET_PKU);
3956 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3957 			add_x86_feature(featureset, X86FSET_OSPKE);
3958 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
3959 			add_x86_feature(featureset, X86FSET_GFNI);
3960 
3961 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3962 			add_x86_feature(featureset, X86FSET_CLWB);
3963 
3964 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3965 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3966 				add_x86_feature(featureset, X86FSET_MPX);
3967 		}
3968 
3969 		/*
3970 		 * If we have subleaf 1 available, grab and store that. This is
3971 		 * used for more AVX and related features.
3972 		 */
3973 		if (ecp->cp_eax >= 1) {
3974 			struct cpuid_regs *c71;
3975 			c71 = &cpi->cpi_sub7[0];
3976 			c71->cp_eax = 7;
3977 			c71->cp_ecx = 1;
3978 			(void) __cpuid_insn(c71);
3979 		}
3980 	}
3981 
3982 	/*
3983 	 * fold in overrides from the "eeprom" mechanism
3984 	 */
3985 	cp->cp_edx |= cpuid_feature_edx_include;
3986 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
3987 
3988 	cp->cp_ecx |= cpuid_feature_ecx_include;
3989 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3990 
3991 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3992 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
3993 	}
3994 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3995 		add_x86_feature(featureset, X86FSET_TSC);
3996 	}
3997 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3998 		add_x86_feature(featureset, X86FSET_MSR);
3999 	}
4000 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4001 		add_x86_feature(featureset, X86FSET_MTRR);
4002 	}
4003 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4004 		add_x86_feature(featureset, X86FSET_PGE);
4005 	}
4006 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4007 		add_x86_feature(featureset, X86FSET_CMOV);
4008 	}
4009 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4010 		add_x86_feature(featureset, X86FSET_MMX);
4011 	}
4012 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4013 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4014 		add_x86_feature(featureset, X86FSET_MCA);
4015 	}
4016 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4017 		add_x86_feature(featureset, X86FSET_PAE);
4018 	}
4019 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4020 		add_x86_feature(featureset, X86FSET_CX8);
4021 	}
4022 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4023 		add_x86_feature(featureset, X86FSET_CX16);
4024 	}
4025 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4026 		add_x86_feature(featureset, X86FSET_PAT);
4027 	}
4028 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4029 		add_x86_feature(featureset, X86FSET_SEP);
4030 	}
4031 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4032 		/*
4033 		 * In our implementation, fxsave/fxrstor
4034 		 * are prerequisites before we'll even
4035 		 * try and do SSE things.
4036 		 */
4037 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4038 			add_x86_feature(featureset, X86FSET_SSE);
4039 		}
4040 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4041 			add_x86_feature(featureset, X86FSET_SSE2);
4042 		}
4043 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4044 			add_x86_feature(featureset, X86FSET_SSE3);
4045 		}
4046 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4047 			add_x86_feature(featureset, X86FSET_SSSE3);
4048 		}
4049 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4050 			add_x86_feature(featureset, X86FSET_SSE4_1);
4051 		}
4052 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4053 			add_x86_feature(featureset, X86FSET_SSE4_2);
4054 		}
4055 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4056 			add_x86_feature(featureset, X86FSET_AES);
4057 		}
4058 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4059 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4060 		}
4061 
4062 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4063 			add_x86_feature(featureset, X86FSET_SHA);
4064 
4065 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4066 			add_x86_feature(featureset, X86FSET_XSAVE);
4067 
4068 			/* We only test AVX & AVX512 when there is XSAVE */
4069 			cpuid_basic_avx(cpu, featureset);
4070 		}
4071 	}
4072 
4073 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4074 		add_x86_feature(featureset, X86FSET_PCID);
4075 	}
4076 
4077 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4078 		add_x86_feature(featureset, X86FSET_X2APIC);
4079 	}
4080 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4081 		add_x86_feature(featureset, X86FSET_DE);
4082 	}
4083 #if !defined(__xpv)
4084 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4085 
4086 		/*
4087 		 * We require the CLFLUSH instruction for erratum workaround
4088 		 * to use MONITOR/MWAIT.
4089 		 */
4090 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4091 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4092 			add_x86_feature(featureset, X86FSET_MWAIT);
4093 		} else {
4094 			extern int idle_cpu_assert_cflush_monitor;
4095 
4096 			/*
4097 			 * All processors we are aware of which have
4098 			 * MONITOR/MWAIT also have CLFLUSH.
4099 			 */
4100 			if (idle_cpu_assert_cflush_monitor) {
4101 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4102 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4103 			}
4104 		}
4105 	}
4106 #endif	/* __xpv */
4107 
4108 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4109 		add_x86_feature(featureset, X86FSET_VMX);
4110 	}
4111 
4112 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4113 		add_x86_feature(featureset, X86FSET_RDRAND);
4114 
4115 	/*
4116 	 * Only need it first time, rest of the cpus would follow suit.
4117 	 * we only capture this for the bootcpu.
4118 	 */
4119 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4120 		add_x86_feature(featureset, X86FSET_CLFSH);
4121 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4122 	}
4123 	if (is_x86_feature(featureset, X86FSET_PAE))
4124 		cpi->cpi_pabits = 36;
4125 
4126 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4127 		struct cpuid_regs r, *ecp;
4128 
4129 		ecp = &r;
4130 		ecp->cp_eax = 0xD;
4131 		ecp->cp_ecx = 1;
4132 		ecp->cp_edx = ecp->cp_ebx = 0;
4133 		(void) __cpuid_insn(ecp);
4134 
4135 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4136 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4137 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4138 			add_x86_feature(featureset, X86FSET_XSAVEC);
4139 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4140 			add_x86_feature(featureset, X86FSET_XSAVES);
4141 
4142 		/*
4143 		 * Zen 2 family processors suffer from erratum 1386 that causes
4144 		 * xsaves to not function correctly in some circumstances. There
4145 		 * are no supervisor states in Zen 2 and earlier. Practically
4146 		 * speaking this has no impact for us as we currently do not
4147 		 * leverage compressed xsave formats. To safeguard against
4148 		 * issues in the future where we may opt to using it, we remove
4149 		 * it from the feature set now. While Matisse has a microcode
4150 		 * update available with a fix, not all Zen 2 CPUs do so it's
4151 		 * simpler for the moment to unconditionally remove it.
4152 		 */
4153 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4154 		    uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4155 			remove_x86_feature(featureset, X86FSET_XSAVES);
4156 		}
4157 	}
4158 
4159 	/*
4160 	 * Work on the "extended" feature information, doing
4161 	 * some basic initialization to be used in the extended pass.
4162 	 */
4163 	xcpuid = 0;
4164 	switch (cpi->cpi_vendor) {
4165 	case X86_VENDOR_Intel:
4166 		/*
4167 		 * On KVM we know we will have proper support for extended
4168 		 * cpuid.
4169 		 */
4170 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4171 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4172 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4173 			xcpuid++;
4174 		break;
4175 	case X86_VENDOR_AMD:
4176 		if (cpi->cpi_family > 5 ||
4177 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4178 			xcpuid++;
4179 		break;
4180 	case X86_VENDOR_Cyrix:
4181 		/*
4182 		 * Only these Cyrix CPUs are -known- to support
4183 		 * extended cpuid operations.
4184 		 */
4185 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4186 		    x86_type == X86_TYPE_CYRIX_GXm)
4187 			xcpuid++;
4188 		break;
4189 	case X86_VENDOR_HYGON:
4190 	case X86_VENDOR_Centaur:
4191 	case X86_VENDOR_TM:
4192 	default:
4193 		xcpuid++;
4194 		break;
4195 	}
4196 
4197 	if (xcpuid) {
4198 		cp = &cpi->cpi_extd[0];
4199 		cp->cp_eax = CPUID_LEAF_EXT_0;
4200 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4201 	}
4202 
4203 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4204 
4205 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4206 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4207 
4208 		switch (cpi->cpi_vendor) {
4209 		case X86_VENDOR_Intel:
4210 		case X86_VENDOR_AMD:
4211 		case X86_VENDOR_HYGON:
4212 			if (cpi->cpi_xmaxeax < 0x80000001)
4213 				break;
4214 			cp = &cpi->cpi_extd[1];
4215 			cp->cp_eax = 0x80000001;
4216 			(void) __cpuid_insn(cp);
4217 
4218 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4219 			    cpi->cpi_family == 5 &&
4220 			    cpi->cpi_model == 6 &&
4221 			    cpi->cpi_step == 6) {
4222 				/*
4223 				 * K6 model 6 uses bit 10 to indicate SYSC
4224 				 * Later models use bit 11. Fix it here.
4225 				 */
4226 				if (cp->cp_edx & 0x400) {
4227 					cp->cp_edx &= ~0x400;
4228 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4229 				}
4230 			}
4231 
4232 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4233 
4234 			/*
4235 			 * Compute the additions to the kernel's feature word.
4236 			 */
4237 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4238 				add_x86_feature(featureset, X86FSET_NX);
4239 			}
4240 
4241 			/*
4242 			 * Regardless whether or not we boot 64-bit,
4243 			 * we should have a way to identify whether
4244 			 * the CPU is capable of running 64-bit.
4245 			 */
4246 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4247 				add_x86_feature(featureset, X86FSET_64);
4248 			}
4249 
4250 			/* 1 GB large page - enable only for 64 bit kernel */
4251 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4252 				add_x86_feature(featureset, X86FSET_1GPG);
4253 			}
4254 
4255 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4256 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4257 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4258 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4259 				add_x86_feature(featureset, X86FSET_SSE4A);
4260 			}
4261 
4262 			/*
4263 			 * It's really tricky to support syscall/sysret in
4264 			 * the i386 kernel; we rely on sysenter/sysexit
4265 			 * instead.  In the amd64 kernel, things are -way-
4266 			 * better.
4267 			 */
4268 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4269 				add_x86_feature(featureset, X86FSET_ASYSC);
4270 			}
4271 
4272 			/*
4273 			 * While we're thinking about system calls, note
4274 			 * that AMD processors don't support sysenter
4275 			 * in long mode at all, so don't try to program them.
4276 			 */
4277 			if (x86_vendor == X86_VENDOR_AMD ||
4278 			    x86_vendor == X86_VENDOR_HYGON) {
4279 				remove_x86_feature(featureset, X86FSET_SEP);
4280 			}
4281 
4282 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4283 				add_x86_feature(featureset, X86FSET_TSCP);
4284 			}
4285 
4286 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4287 				add_x86_feature(featureset, X86FSET_SVM);
4288 			}
4289 
4290 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4291 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4292 			}
4293 
4294 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4295 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4296 			}
4297 
4298 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4299 				add_x86_feature(featureset, X86FSET_XOP);
4300 			}
4301 
4302 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4303 				add_x86_feature(featureset, X86FSET_FMA4);
4304 			}
4305 
4306 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4307 				add_x86_feature(featureset, X86FSET_TBM);
4308 			}
4309 
4310 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4311 				add_x86_feature(featureset, X86FSET_MONITORX);
4312 			}
4313 			break;
4314 		default:
4315 			break;
4316 		}
4317 
4318 		/*
4319 		 * Get CPUID data about processor cores and hyperthreads.
4320 		 */
4321 		switch (cpi->cpi_vendor) {
4322 		case X86_VENDOR_Intel:
4323 			if (cpi->cpi_maxeax >= 4) {
4324 				cp = &cpi->cpi_std[4];
4325 				cp->cp_eax = 4;
4326 				cp->cp_ecx = 0;
4327 				(void) __cpuid_insn(cp);
4328 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4329 			}
4330 			/*FALLTHROUGH*/
4331 		case X86_VENDOR_AMD:
4332 		case X86_VENDOR_HYGON:
4333 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4334 				break;
4335 			cp = &cpi->cpi_extd[8];
4336 			cp->cp_eax = CPUID_LEAF_EXT_8;
4337 			(void) __cpuid_insn(cp);
4338 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4339 			    cp);
4340 
4341 			/*
4342 			 * AMD uses ebx for some extended functions.
4343 			 */
4344 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4345 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4346 				/*
4347 				 * While we're here, check for the AMD "Error
4348 				 * Pointer Zero/Restore" feature. This can be
4349 				 * used to setup the FP save handlers
4350 				 * appropriately.
4351 				 */
4352 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4353 					cpi->cpi_fp_amd_save = 0;
4354 				} else {
4355 					cpi->cpi_fp_amd_save = 1;
4356 				}
4357 
4358 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4359 					add_x86_feature(featureset,
4360 					    X86FSET_CLZERO);
4361 				}
4362 			}
4363 
4364 			/*
4365 			 * Virtual and physical address limits from
4366 			 * cpuid override previously guessed values.
4367 			 */
4368 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4369 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4370 			break;
4371 		default:
4372 			break;
4373 		}
4374 
4375 		/*
4376 		 * Get CPUID data about TSC Invariance in Deep C-State.
4377 		 */
4378 		switch (cpi->cpi_vendor) {
4379 		case X86_VENDOR_Intel:
4380 		case X86_VENDOR_AMD:
4381 		case X86_VENDOR_HYGON:
4382 			if (cpi->cpi_maxeax >= 7) {
4383 				cp = &cpi->cpi_extd[7];
4384 				cp->cp_eax = 0x80000007;
4385 				cp->cp_ecx = 0;
4386 				(void) __cpuid_insn(cp);
4387 			}
4388 			break;
4389 		default:
4390 			break;
4391 		}
4392 	}
4393 
4394 	/*
4395 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4396 	 * run and thus gathered some of its dependent leaves.
4397 	 */
4398 	cpuid_basic_topology(cpu, featureset);
4399 	cpuid_basic_thermal(cpu, featureset);
4400 #if !defined(__xpv)
4401 	cpuid_basic_ppin(cpu, featureset);
4402 #endif
4403 
4404 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4405 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4406 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4407 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4408 			/* Special handling for AMD FP not necessary. */
4409 			cpi->cpi_fp_amd_save = 0;
4410 		} else {
4411 			cpi->cpi_fp_amd_save = 1;
4412 		}
4413 	}
4414 
4415 	/*
4416 	 * Check (and potentially set) if lfence is serializing.
4417 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4418 	 */
4419 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4420 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4421 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4422 		/*
4423 		 * The AMD white paper Software Techniques For Managing
4424 		 * Speculation on AMD Processors details circumstances for when
4425 		 * lfence instructions are serializing.
4426 		 *
4427 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4428 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4429 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4430 		 * committed to supporting that MSR on all later CPUs.
4431 		 */
4432 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4433 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4434 		} else if (cpi->cpi_family >= 0x10) {
4435 #if !defined(__xpv)
4436 			uint64_t val;
4437 
4438 			/*
4439 			 * Be careful when attempting to enable the bit, and
4440 			 * verify that it was actually set in case we are
4441 			 * running in a hypervisor which is less than faithful
4442 			 * about its emulation of this feature.
4443 			 */
4444 			on_trap_data_t otd;
4445 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4446 				val = rdmsr(MSR_AMD_DE_CFG);
4447 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4448 				wrmsr(MSR_AMD_DE_CFG, val);
4449 				val = rdmsr(MSR_AMD_DE_CFG);
4450 			} else {
4451 				val = 0;
4452 			}
4453 			no_trap();
4454 
4455 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4456 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4457 			}
4458 #endif
4459 		}
4460 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4461 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4462 		/*
4463 		 * Documentation and other OSes indicate that lfence is always
4464 		 * serializing on Intel CPUs.
4465 		 */
4466 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4467 	}
4468 
4469 
4470 	/*
4471 	 * Check the processor leaves that are used for security features.
4472 	 */
4473 	cpuid_scan_security(cpu, featureset);
4474 }
4475 
4476 /*
4477  * Make copies of the cpuid table entries we depend on, in
4478  * part for ease of parsing now, in part so that we have only
4479  * one place to correct any of it, in part for ease of
4480  * later export to userland, and in part so we can look at
4481  * this stuff in a crash dump.
4482  */
4483 
4484 static void
4485 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4486 {
4487 	uint_t n, nmax;
4488 	int i;
4489 	struct cpuid_regs *cp;
4490 	uint8_t *dp;
4491 	uint32_t *iptr;
4492 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4493 
4494 	if (cpi->cpi_maxeax < 1)
4495 		return;
4496 
4497 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4498 		nmax = NMAX_CPI_STD;
4499 	/*
4500 	 * (We already handled n == 0 and n == 1 in the basic pass)
4501 	 */
4502 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4503 		/*
4504 		 * leaves 6 and 7 were handled in the basic pass
4505 		 */
4506 		if (n == 6 || n == 7)
4507 			continue;
4508 
4509 		cp->cp_eax = n;
4510 
4511 		/*
4512 		 * CPUID function 4 expects %ecx to be initialized
4513 		 * with an index which indicates which cache to return
4514 		 * information about. The OS is expected to call function 4
4515 		 * with %ecx set to 0, 1, 2, ... until it returns with
4516 		 * EAX[4:0] set to 0, which indicates there are no more
4517 		 * caches.
4518 		 *
4519 		 * Here, populate cpi_std[4] with the information returned by
4520 		 * function 4 when %ecx == 0, and do the rest in a later pass
4521 		 * when dynamic memory allocation becomes available.
4522 		 *
4523 		 * Note: we need to explicitly initialize %ecx here, since
4524 		 * function 4 may have been previously invoked.
4525 		 */
4526 		if (n == 4)
4527 			cp->cp_ecx = 0;
4528 
4529 		(void) __cpuid_insn(cp);
4530 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4531 		switch (n) {
4532 		case 2:
4533 			/*
4534 			 * "the lower 8 bits of the %eax register
4535 			 * contain a value that identifies the number
4536 			 * of times the cpuid [instruction] has to be
4537 			 * executed to obtain a complete image of the
4538 			 * processor's caching systems."
4539 			 *
4540 			 * How *do* they make this stuff up?
4541 			 */
4542 			cpi->cpi_ncache = sizeof (*cp) *
4543 			    BITX(cp->cp_eax, 7, 0);
4544 			if (cpi->cpi_ncache == 0)
4545 				break;
4546 			cpi->cpi_ncache--;	/* skip count byte */
4547 
4548 			/*
4549 			 * Well, for now, rather than attempt to implement
4550 			 * this slightly dubious algorithm, we just look
4551 			 * at the first 15 ..
4552 			 */
4553 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4554 				cpi->cpi_ncache = sizeof (*cp) - 1;
4555 
4556 			dp = cpi->cpi_cacheinfo;
4557 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4558 				uint8_t *p = (void *)&cp->cp_eax;
4559 				for (i = 1; i < 4; i++)
4560 					if (p[i] != 0)
4561 						*dp++ = p[i];
4562 			}
4563 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4564 				uint8_t *p = (void *)&cp->cp_ebx;
4565 				for (i = 0; i < 4; i++)
4566 					if (p[i] != 0)
4567 						*dp++ = p[i];
4568 			}
4569 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4570 				uint8_t *p = (void *)&cp->cp_ecx;
4571 				for (i = 0; i < 4; i++)
4572 					if (p[i] != 0)
4573 						*dp++ = p[i];
4574 			}
4575 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4576 				uint8_t *p = (void *)&cp->cp_edx;
4577 				for (i = 0; i < 4; i++)
4578 					if (p[i] != 0)
4579 						*dp++ = p[i];
4580 			}
4581 			break;
4582 
4583 		case 3:	/* Processor serial number, if PSN supported */
4584 			break;
4585 
4586 		case 4:	/* Deterministic cache parameters */
4587 			break;
4588 
4589 		case 5:	/* Monitor/Mwait parameters */
4590 		{
4591 			size_t mwait_size;
4592 
4593 			/*
4594 			 * check cpi_mwait.support which was set in
4595 			 * cpuid_pass_basic()
4596 			 */
4597 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4598 				break;
4599 
4600 			/*
4601 			 * Protect ourself from insane mwait line size.
4602 			 * Workaround for incomplete hardware emulator(s).
4603 			 */
4604 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4605 			if (mwait_size < sizeof (uint32_t) ||
4606 			    !ISP2(mwait_size)) {
4607 #if DEBUG
4608 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4609 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4610 #endif
4611 				break;
4612 			}
4613 
4614 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4615 			cpi->cpi_mwait.mon_max = mwait_size;
4616 			if (MWAIT_EXTENSION(cpi)) {
4617 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4618 				if (MWAIT_INT_ENABLE(cpi))
4619 					cpi->cpi_mwait.support |=
4620 					    MWAIT_ECX_INT_ENABLE;
4621 			}
4622 			break;
4623 		}
4624 		default:
4625 			break;
4626 		}
4627 	}
4628 
4629 	/*
4630 	 * XSAVE enumeration
4631 	 */
4632 	if (cpi->cpi_maxeax >= 0xD) {
4633 		struct cpuid_regs regs;
4634 		boolean_t cpuid_d_valid = B_TRUE;
4635 
4636 		cp = &regs;
4637 		cp->cp_eax = 0xD;
4638 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4639 
4640 		(void) __cpuid_insn(cp);
4641 
4642 		/*
4643 		 * Sanity checks for debug
4644 		 */
4645 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4646 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4647 			cpuid_d_valid = B_FALSE;
4648 		}
4649 
4650 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4651 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4652 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4653 
4654 		/*
4655 		 * If the hw supports AVX, get the size and offset in the save
4656 		 * area for the ymm state.
4657 		 */
4658 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4659 			cp->cp_eax = 0xD;
4660 			cp->cp_ecx = 2;
4661 			cp->cp_edx = cp->cp_ebx = 0;
4662 
4663 			(void) __cpuid_insn(cp);
4664 
4665 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4666 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4667 				cpuid_d_valid = B_FALSE;
4668 			}
4669 
4670 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4671 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4672 		}
4673 
4674 		/*
4675 		 * If the hw supports MPX, get the size and offset in the
4676 		 * save area for BNDREGS and BNDCSR.
4677 		 */
4678 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4679 			cp->cp_eax = 0xD;
4680 			cp->cp_ecx = 3;
4681 			cp->cp_edx = cp->cp_ebx = 0;
4682 
4683 			(void) __cpuid_insn(cp);
4684 
4685 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4686 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4687 
4688 			cp->cp_eax = 0xD;
4689 			cp->cp_ecx = 4;
4690 			cp->cp_edx = cp->cp_ebx = 0;
4691 
4692 			(void) __cpuid_insn(cp);
4693 
4694 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4695 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4696 		}
4697 
4698 		/*
4699 		 * If the hw supports AVX512, get the size and offset in the
4700 		 * save area for the opmask registers and zmm state.
4701 		 */
4702 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4703 			cp->cp_eax = 0xD;
4704 			cp->cp_ecx = 5;
4705 			cp->cp_edx = cp->cp_ebx = 0;
4706 
4707 			(void) __cpuid_insn(cp);
4708 
4709 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
4710 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4711 
4712 			cp->cp_eax = 0xD;
4713 			cp->cp_ecx = 6;
4714 			cp->cp_edx = cp->cp_ebx = 0;
4715 
4716 			(void) __cpuid_insn(cp);
4717 
4718 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4719 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4720 
4721 			cp->cp_eax = 0xD;
4722 			cp->cp_ecx = 7;
4723 			cp->cp_edx = cp->cp_ebx = 0;
4724 
4725 			(void) __cpuid_insn(cp);
4726 
4727 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4728 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4729 		}
4730 
4731 		if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4732 			xsave_state_size = 0;
4733 		} else if (cpuid_d_valid) {
4734 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4735 		} else {
4736 			/* Broken CPUID 0xD, probably in HVM */
4737 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4738 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4739 			    ", ymm_size = %d, ymm_offset = %d\n",
4740 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4741 			    cpi->cpi_xsave.xsav_hw_features_high,
4742 			    (int)cpi->cpi_xsave.xsav_max_size,
4743 			    (int)cpi->cpi_xsave.ymm_size,
4744 			    (int)cpi->cpi_xsave.ymm_offset);
4745 
4746 			if (xsave_state_size != 0) {
4747 				/*
4748 				 * This must be a non-boot CPU. We cannot
4749 				 * continue, because boot cpu has already
4750 				 * enabled XSAVE.
4751 				 */
4752 				ASSERT(cpu->cpu_id != 0);
4753 				cmn_err(CE_PANIC, "cpu%d: we have already "
4754 				    "enabled XSAVE on boot cpu, cannot "
4755 				    "continue.", cpu->cpu_id);
4756 			} else {
4757 				/*
4758 				 * If we reached here on the boot CPU, it's also
4759 				 * almost certain that we'll reach here on the
4760 				 * non-boot CPUs. When we're here on a boot CPU
4761 				 * we should disable the feature, on a non-boot
4762 				 * CPU we need to confirm that we have.
4763 				 */
4764 				if (cpu->cpu_id == 0) {
4765 					remove_x86_feature(x86_featureset,
4766 					    X86FSET_XSAVE);
4767 					remove_x86_feature(x86_featureset,
4768 					    X86FSET_AVX);
4769 					remove_x86_feature(x86_featureset,
4770 					    X86FSET_F16C);
4771 					remove_x86_feature(x86_featureset,
4772 					    X86FSET_BMI1);
4773 					remove_x86_feature(x86_featureset,
4774 					    X86FSET_BMI2);
4775 					remove_x86_feature(x86_featureset,
4776 					    X86FSET_FMA);
4777 					remove_x86_feature(x86_featureset,
4778 					    X86FSET_AVX2);
4779 					remove_x86_feature(x86_featureset,
4780 					    X86FSET_MPX);
4781 					remove_x86_feature(x86_featureset,
4782 					    X86FSET_AVX512F);
4783 					remove_x86_feature(x86_featureset,
4784 					    X86FSET_AVX512DQ);
4785 					remove_x86_feature(x86_featureset,
4786 					    X86FSET_AVX512PF);
4787 					remove_x86_feature(x86_featureset,
4788 					    X86FSET_AVX512ER);
4789 					remove_x86_feature(x86_featureset,
4790 					    X86FSET_AVX512CD);
4791 					remove_x86_feature(x86_featureset,
4792 					    X86FSET_AVX512BW);
4793 					remove_x86_feature(x86_featureset,
4794 					    X86FSET_AVX512VL);
4795 					remove_x86_feature(x86_featureset,
4796 					    X86FSET_AVX512FMA);
4797 					remove_x86_feature(x86_featureset,
4798 					    X86FSET_AVX512VBMI);
4799 					remove_x86_feature(x86_featureset,
4800 					    X86FSET_AVX512VNNI);
4801 					remove_x86_feature(x86_featureset,
4802 					    X86FSET_AVX512VPOPCDQ);
4803 					remove_x86_feature(x86_featureset,
4804 					    X86FSET_AVX512NNIW);
4805 					remove_x86_feature(x86_featureset,
4806 					    X86FSET_AVX512FMAPS);
4807 					remove_x86_feature(x86_featureset,
4808 					    X86FSET_VAES);
4809 					remove_x86_feature(x86_featureset,
4810 					    X86FSET_VPCLMULQDQ);
4811 					remove_x86_feature(x86_featureset,
4812 					    X86FSET_GFNI);
4813 					remove_x86_feature(x86_featureset,
4814 					    X86FSET_AVX512_VP2INT);
4815 					remove_x86_feature(x86_featureset,
4816 					    X86FSET_AVX512_BITALG);
4817 					remove_x86_feature(x86_featureset,
4818 					    X86FSET_AVX512_VBMI2);
4819 					remove_x86_feature(x86_featureset,
4820 					    X86FSET_AVX512_BF16);
4821 
4822 					xsave_force_disable = B_TRUE;
4823 				} else {
4824 					VERIFY(is_x86_feature(x86_featureset,
4825 					    X86FSET_XSAVE) == B_FALSE);
4826 				}
4827 			}
4828 		}
4829 	}
4830 
4831 
4832 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4833 		return;
4834 
4835 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4836 		nmax = NMAX_CPI_EXTD;
4837 	/*
4838 	 * Copy the extended properties, fixing them as we go.
4839 	 * (We already handled n == 0 and n == 1 in the basic pass)
4840 	 */
4841 	iptr = (void *)cpi->cpi_brandstr;
4842 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4843 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4844 		(void) __cpuid_insn(cp);
4845 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4846 		    cp);
4847 		switch (n) {
4848 		case 2:
4849 		case 3:
4850 		case 4:
4851 			/*
4852 			 * Extract the brand string
4853 			 */
4854 			*iptr++ = cp->cp_eax;
4855 			*iptr++ = cp->cp_ebx;
4856 			*iptr++ = cp->cp_ecx;
4857 			*iptr++ = cp->cp_edx;
4858 			break;
4859 		case 5:
4860 			switch (cpi->cpi_vendor) {
4861 			case X86_VENDOR_AMD:
4862 				/*
4863 				 * The Athlon and Duron were the first
4864 				 * parts to report the sizes of the
4865 				 * TLB for large pages. Before then,
4866 				 * we don't trust the data.
4867 				 */
4868 				if (cpi->cpi_family < 6 ||
4869 				    (cpi->cpi_family == 6 &&
4870 				    cpi->cpi_model < 1))
4871 					cp->cp_eax = 0;
4872 				break;
4873 			default:
4874 				break;
4875 			}
4876 			break;
4877 		case 6:
4878 			switch (cpi->cpi_vendor) {
4879 			case X86_VENDOR_AMD:
4880 				/*
4881 				 * The Athlon and Duron were the first
4882 				 * AMD parts with L2 TLB's.
4883 				 * Before then, don't trust the data.
4884 				 */
4885 				if (cpi->cpi_family < 6 ||
4886 				    (cpi->cpi_family == 6 &&
4887 				    cpi->cpi_model < 1))
4888 					cp->cp_eax = cp->cp_ebx = 0;
4889 				/*
4890 				 * AMD Duron rev A0 reports L2
4891 				 * cache size incorrectly as 1K
4892 				 * when it is really 64K
4893 				 */
4894 				if (cpi->cpi_family == 6 &&
4895 				    cpi->cpi_model == 3 &&
4896 				    cpi->cpi_step == 0) {
4897 					cp->cp_ecx &= 0xffff;
4898 					cp->cp_ecx |= 0x400000;
4899 				}
4900 				break;
4901 			case X86_VENDOR_Cyrix:	/* VIA C3 */
4902 				/*
4903 				 * VIA C3 processors are a bit messed
4904 				 * up w.r.t. encoding cache sizes in %ecx
4905 				 */
4906 				if (cpi->cpi_family != 6)
4907 					break;
4908 				/*
4909 				 * model 7 and 8 were incorrectly encoded
4910 				 *
4911 				 * xxx is model 8 really broken?
4912 				 */
4913 				if (cpi->cpi_model == 7 ||
4914 				    cpi->cpi_model == 8)
4915 					cp->cp_ecx =
4916 					    BITX(cp->cp_ecx, 31, 24) << 16 |
4917 					    BITX(cp->cp_ecx, 23, 16) << 12 |
4918 					    BITX(cp->cp_ecx, 15, 8) << 8 |
4919 					    BITX(cp->cp_ecx, 7, 0);
4920 				/*
4921 				 * model 9 stepping 1 has wrong associativity
4922 				 */
4923 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4924 					cp->cp_ecx |= 8 << 12;
4925 				break;
4926 			case X86_VENDOR_Intel:
4927 				/*
4928 				 * Extended L2 Cache features function.
4929 				 * First appeared on Prescott.
4930 				 */
4931 			default:
4932 				break;
4933 			}
4934 			break;
4935 		default:
4936 			break;
4937 		}
4938 	}
4939 }
4940 
4941 static const char *
4942 intel_cpubrand(const struct cpuid_info *cpi)
4943 {
4944 	int i;
4945 
4946 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
4947 
4948 	switch (cpi->cpi_family) {
4949 	case 5:
4950 		return ("Intel Pentium(r)");
4951 	case 6:
4952 		switch (cpi->cpi_model) {
4953 			uint_t celeron, xeon;
4954 			const struct cpuid_regs *cp;
4955 		case 0:
4956 		case 1:
4957 		case 2:
4958 			return ("Intel Pentium(r) Pro");
4959 		case 3:
4960 		case 4:
4961 			return ("Intel Pentium(r) II");
4962 		case 6:
4963 			return ("Intel Celeron(r)");
4964 		case 5:
4965 		case 7:
4966 			celeron = xeon = 0;
4967 			cp = &cpi->cpi_std[2];	/* cache info */
4968 
4969 			for (i = 1; i < 4; i++) {
4970 				uint_t tmp;
4971 
4972 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4973 				if (tmp == 0x40)
4974 					celeron++;
4975 				if (tmp >= 0x44 && tmp <= 0x45)
4976 					xeon++;
4977 			}
4978 
4979 			for (i = 0; i < 2; i++) {
4980 				uint_t tmp;
4981 
4982 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4983 				if (tmp == 0x40)
4984 					celeron++;
4985 				else if (tmp >= 0x44 && tmp <= 0x45)
4986 					xeon++;
4987 			}
4988 
4989 			for (i = 0; i < 4; i++) {
4990 				uint_t tmp;
4991 
4992 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4993 				if (tmp == 0x40)
4994 					celeron++;
4995 				else if (tmp >= 0x44 && tmp <= 0x45)
4996 					xeon++;
4997 			}
4998 
4999 			for (i = 0; i < 4; i++) {
5000 				uint_t tmp;
5001 
5002 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5003 				if (tmp == 0x40)
5004 					celeron++;
5005 				else if (tmp >= 0x44 && tmp <= 0x45)
5006 					xeon++;
5007 			}
5008 
5009 			if (celeron)
5010 				return ("Intel Celeron(r)");
5011 			if (xeon)
5012 				return (cpi->cpi_model == 5 ?
5013 				    "Intel Pentium(r) II Xeon(tm)" :
5014 				    "Intel Pentium(r) III Xeon(tm)");
5015 			return (cpi->cpi_model == 5 ?
5016 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5017 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5018 		default:
5019 			break;
5020 		}
5021 	default:
5022 		break;
5023 	}
5024 
5025 	/* BrandID is present if the field is nonzero */
5026 	if (cpi->cpi_brandid != 0) {
5027 		static const struct {
5028 			uint_t bt_bid;
5029 			const char *bt_str;
5030 		} brand_tbl[] = {
5031 			{ 0x1,	"Intel(r) Celeron(r)" },
5032 			{ 0x2,	"Intel(r) Pentium(r) III" },
5033 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
5034 			{ 0x4,	"Intel(r) Pentium(r) III" },
5035 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
5036 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
5037 			{ 0x8,	"Intel(r) Pentium(r) 4" },
5038 			{ 0x9,	"Intel(r) Pentium(r) 4" },
5039 			{ 0xa,	"Intel(r) Celeron(r)" },
5040 			{ 0xb,	"Intel(r) Xeon(tm)" },
5041 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
5042 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
5043 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
5044 			{ 0x11, "Mobile Genuine Intel(r)" },
5045 			{ 0x12, "Intel(r) Celeron(r) M" },
5046 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
5047 			{ 0x14, "Intel(r) Celeron(r)" },
5048 			{ 0x15, "Mobile Genuine Intel(r)" },
5049 			{ 0x16,	"Intel(r) Pentium(r) M" },
5050 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
5051 		};
5052 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5053 		uint_t sgn;
5054 
5055 		sgn = (cpi->cpi_family << 8) |
5056 		    (cpi->cpi_model << 4) | cpi->cpi_step;
5057 
5058 		for (i = 0; i < btblmax; i++)
5059 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5060 				break;
5061 		if (i < btblmax) {
5062 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5063 				return ("Intel(r) Celeron(r)");
5064 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5065 				return ("Intel(r) Xeon(tm) MP");
5066 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5067 				return ("Intel(r) Xeon(tm)");
5068 			return (brand_tbl[i].bt_str);
5069 		}
5070 	}
5071 
5072 	return (NULL);
5073 }
5074 
5075 static const char *
5076 amd_cpubrand(const struct cpuid_info *cpi)
5077 {
5078 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5079 
5080 	switch (cpi->cpi_family) {
5081 	case 5:
5082 		switch (cpi->cpi_model) {
5083 		case 0:
5084 		case 1:
5085 		case 2:
5086 		case 3:
5087 		case 4:
5088 		case 5:
5089 			return ("AMD-K5(r)");
5090 		case 6:
5091 		case 7:
5092 			return ("AMD-K6(r)");
5093 		case 8:
5094 			return ("AMD-K6(r)-2");
5095 		case 9:
5096 			return ("AMD-K6(r)-III");
5097 		default:
5098 			return ("AMD (family 5)");
5099 		}
5100 	case 6:
5101 		switch (cpi->cpi_model) {
5102 		case 1:
5103 			return ("AMD-K7(tm)");
5104 		case 0:
5105 		case 2:
5106 		case 4:
5107 			return ("AMD Athlon(tm)");
5108 		case 3:
5109 		case 7:
5110 			return ("AMD Duron(tm)");
5111 		case 6:
5112 		case 8:
5113 		case 10:
5114 			/*
5115 			 * Use the L2 cache size to distinguish
5116 			 */
5117 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5118 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5119 		default:
5120 			return ("AMD (family 6)");
5121 		}
5122 	default:
5123 		break;
5124 	}
5125 
5126 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5127 	    cpi->cpi_brandid != 0) {
5128 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5129 		case 3:
5130 			return ("AMD Opteron(tm) UP 1xx");
5131 		case 4:
5132 			return ("AMD Opteron(tm) DP 2xx");
5133 		case 5:
5134 			return ("AMD Opteron(tm) MP 8xx");
5135 		default:
5136 			return ("AMD Opteron(tm)");
5137 		}
5138 	}
5139 
5140 	return (NULL);
5141 }
5142 
5143 static const char *
5144 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5145 {
5146 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5147 
5148 	switch (type) {
5149 	case X86_TYPE_CYRIX_6x86:
5150 		return ("Cyrix 6x86");
5151 	case X86_TYPE_CYRIX_6x86L:
5152 		return ("Cyrix 6x86L");
5153 	case X86_TYPE_CYRIX_6x86MX:
5154 		return ("Cyrix 6x86MX");
5155 	case X86_TYPE_CYRIX_GXm:
5156 		return ("Cyrix GXm");
5157 	case X86_TYPE_CYRIX_MediaGX:
5158 		return ("Cyrix MediaGX");
5159 	case X86_TYPE_CYRIX_MII:
5160 		return ("Cyrix M2");
5161 	case X86_TYPE_VIA_CYRIX_III:
5162 		return ("VIA Cyrix M3");
5163 	default:
5164 		/*
5165 		 * Have another wild guess ..
5166 		 */
5167 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5168 			return ("Cyrix 5x86");
5169 		else if (cpi->cpi_family == 5) {
5170 			switch (cpi->cpi_model) {
5171 			case 2:
5172 				return ("Cyrix 6x86");	/* Cyrix M1 */
5173 			case 4:
5174 				return ("Cyrix MediaGX");
5175 			default:
5176 				break;
5177 			}
5178 		} else if (cpi->cpi_family == 6) {
5179 			switch (cpi->cpi_model) {
5180 			case 0:
5181 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5182 			case 5:
5183 			case 6:
5184 			case 7:
5185 			case 8:
5186 			case 9:
5187 				return ("VIA C3");
5188 			default:
5189 				break;
5190 			}
5191 		}
5192 		break;
5193 	}
5194 	return (NULL);
5195 }
5196 
5197 /*
5198  * This only gets called in the case that the CPU extended
5199  * feature brand string (0x80000002, 0x80000003, 0x80000004)
5200  * aren't available, or contain null bytes for some reason.
5201  */
5202 static void
5203 fabricate_brandstr(struct cpuid_info *cpi)
5204 {
5205 	const char *brand = NULL;
5206 
5207 	switch (cpi->cpi_vendor) {
5208 	case X86_VENDOR_Intel:
5209 		brand = intel_cpubrand(cpi);
5210 		break;
5211 	case X86_VENDOR_AMD:
5212 		brand = amd_cpubrand(cpi);
5213 		break;
5214 	case X86_VENDOR_Cyrix:
5215 		brand = cyrix_cpubrand(cpi, x86_type);
5216 		break;
5217 	case X86_VENDOR_NexGen:
5218 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5219 			brand = "NexGen Nx586";
5220 		break;
5221 	case X86_VENDOR_Centaur:
5222 		if (cpi->cpi_family == 5)
5223 			switch (cpi->cpi_model) {
5224 			case 4:
5225 				brand = "Centaur C6";
5226 				break;
5227 			case 8:
5228 				brand = "Centaur C2";
5229 				break;
5230 			case 9:
5231 				brand = "Centaur C3";
5232 				break;
5233 			default:
5234 				break;
5235 			}
5236 		break;
5237 	case X86_VENDOR_Rise:
5238 		if (cpi->cpi_family == 5 &&
5239 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5240 			brand = "Rise mP6";
5241 		break;
5242 	case X86_VENDOR_SiS:
5243 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5244 			brand = "SiS 55x";
5245 		break;
5246 	case X86_VENDOR_TM:
5247 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5248 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5249 		break;
5250 	case X86_VENDOR_NSC:
5251 	case X86_VENDOR_UMC:
5252 	default:
5253 		break;
5254 	}
5255 	if (brand) {
5256 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5257 		return;
5258 	}
5259 
5260 	/*
5261 	 * If all else fails ...
5262 	 */
5263 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5264 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5265 	    cpi->cpi_model, cpi->cpi_step);
5266 }
5267 
5268 /*
5269  * This routine is called just after kernel memory allocation
5270  * becomes available on cpu0, and as part of mp_startup() on
5271  * the other cpus.
5272  *
5273  * Fixup the brand string, and collect any information from cpuid
5274  * that requires dynamically allocated storage to represent.
5275  */
5276 
5277 static void
5278 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5279 {
5280 	int	i, max, shft, level, size;
5281 	struct cpuid_regs regs;
5282 	struct cpuid_regs *cp;
5283 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5284 
5285 	/*
5286 	 * Deterministic cache parameters
5287 	 *
5288 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5289 	 * values that are present are currently defined to be the same. This
5290 	 * means we can use the same logic to parse it as long as we use the
5291 	 * appropriate leaf to get the data. If you're updating this, make sure
5292 	 * you're careful about which vendor supports which aspect.
5293 	 *
5294 	 * Take this opportunity to detect the number of threads sharing the
5295 	 * last level cache, and construct a corresponding cache id. The
5296 	 * respective cpuid_info members are initialized to the default case of
5297 	 * "no last level cache sharing".
5298 	 */
5299 	cpi->cpi_ncpu_shr_last_cache = 1;
5300 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5301 
5302 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5303 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5304 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5305 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5306 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5307 		uint32_t leaf;
5308 
5309 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5310 			leaf = 4;
5311 		} else {
5312 			leaf = CPUID_LEAF_EXT_1d;
5313 		}
5314 
5315 		/*
5316 		 * Find the # of elements (size) returned by the leaf and along
5317 		 * the way detect last level cache sharing details.
5318 		 */
5319 		bzero(&regs, sizeof (regs));
5320 		cp = &regs;
5321 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5322 			cp->cp_eax = leaf;
5323 			cp->cp_ecx = i;
5324 
5325 			(void) __cpuid_insn(cp);
5326 
5327 			if (CPI_CACHE_TYPE(cp) == 0)
5328 				break;
5329 			level = CPI_CACHE_LVL(cp);
5330 			if (level > max) {
5331 				max = level;
5332 				cpi->cpi_ncpu_shr_last_cache =
5333 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5334 			}
5335 		}
5336 		cpi->cpi_cache_leaf_size = size = i;
5337 
5338 		/*
5339 		 * Allocate the cpi_cache_leaves array. The first element
5340 		 * references the regs for the corresponding leaf with %ecx set
5341 		 * to 0. This was gathered in cpuid_pass_extended().
5342 		 */
5343 		if (size > 0) {
5344 			cpi->cpi_cache_leaves =
5345 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5346 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5347 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5348 			} else {
5349 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5350 			}
5351 
5352 			/*
5353 			 * Allocate storage to hold the additional regs
5354 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5355 			 *
5356 			 * The regs for the leaf, %ecx == 0 has already
5357 			 * been allocated as indicated above.
5358 			 */
5359 			for (i = 1; i < size; i++) {
5360 				cp = cpi->cpi_cache_leaves[i] =
5361 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5362 				cp->cp_eax = leaf;
5363 				cp->cp_ecx = i;
5364 
5365 				(void) __cpuid_insn(cp);
5366 			}
5367 		}
5368 		/*
5369 		 * Determine the number of bits needed to represent
5370 		 * the number of CPUs sharing the last level cache.
5371 		 *
5372 		 * Shift off that number of bits from the APIC id to
5373 		 * derive the cache id.
5374 		 */
5375 		shft = 0;
5376 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5377 			shft++;
5378 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5379 	}
5380 
5381 	/*
5382 	 * Now fixup the brand string
5383 	 */
5384 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5385 		fabricate_brandstr(cpi);
5386 	} else {
5387 
5388 		/*
5389 		 * If we successfully extracted a brand string from the cpuid
5390 		 * instruction, clean it up by removing leading spaces and
5391 		 * similar junk.
5392 		 */
5393 		if (cpi->cpi_brandstr[0]) {
5394 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5395 			char *src, *dst;
5396 
5397 			dst = src = (char *)cpi->cpi_brandstr;
5398 			src[maxlen - 1] = '\0';
5399 			/*
5400 			 * strip leading spaces
5401 			 */
5402 			while (*src == ' ')
5403 				src++;
5404 			/*
5405 			 * Remove any 'Genuine' or "Authentic" prefixes
5406 			 */
5407 			if (strncmp(src, "Genuine ", 8) == 0)
5408 				src += 8;
5409 			if (strncmp(src, "Authentic ", 10) == 0)
5410 				src += 10;
5411 
5412 			/*
5413 			 * Now do an in-place copy.
5414 			 * Map (R) to (r) and (TM) to (tm).
5415 			 * The era of teletypes is long gone, and there's
5416 			 * -really- no need to shout.
5417 			 */
5418 			while (*src != '\0') {
5419 				if (src[0] == '(') {
5420 					if (strncmp(src + 1, "R)", 2) == 0) {
5421 						(void) strncpy(dst, "(r)", 3);
5422 						src += 3;
5423 						dst += 3;
5424 						continue;
5425 					}
5426 					if (strncmp(src + 1, "TM)", 3) == 0) {
5427 						(void) strncpy(dst, "(tm)", 4);
5428 						src += 4;
5429 						dst += 4;
5430 						continue;
5431 					}
5432 				}
5433 				*dst++ = *src++;
5434 			}
5435 			*dst = '\0';
5436 
5437 			/*
5438 			 * Finally, remove any trailing spaces
5439 			 */
5440 			while (--dst > cpi->cpi_brandstr)
5441 				if (*dst == ' ')
5442 					*dst = '\0';
5443 				else
5444 					break;
5445 		} else
5446 			fabricate_brandstr(cpi);
5447 	}
5448 }
5449 
/*
 * One entry of a table that translates a kernel x86 feature into the hwcap
 * bit that is reported when that feature is present.
 */
typedef struct {
	uint32_t avm_av;	/* hwcap (AV_386_*) bit to set */
	uint32_t avm_feat;	/* X86FSET_* feature that must be present */
} av_feat_map_t;
5454 
5455 /*
5456  * These arrays are used to map features that we should add based on x86
5457  * features that are present. As a large number depend on kernel features,
5458  * rather than rechecking and clearing CPUID everywhere, we simply map these.
5459  * There is an array of these for each hwcap word. Some features aren't tracked
5460  * in the kernel x86 featureset and that's ok. They will not show up in here.
5461  */
5462 static const av_feat_map_t x86fset_to_av1[] = {
5463 	{ AV_386_CX8, X86FSET_CX8 },
5464 	{ AV_386_SEP, X86FSET_SEP },
5465 	{ AV_386_AMD_SYSC, X86FSET_ASYSC },
5466 	{ AV_386_CMOV, X86FSET_CMOV },
5467 	{ AV_386_FXSR, X86FSET_SSE },
5468 	{ AV_386_SSE, X86FSET_SSE },
5469 	{ AV_386_SSE2, X86FSET_SSE2 },
5470 	{ AV_386_SSE3, X86FSET_SSE3 },
5471 	{ AV_386_CX16, X86FSET_CX16 },
5472 	{ AV_386_TSCP, X86FSET_TSCP },
5473 	{ AV_386_AMD_SSE4A, X86FSET_SSE4A },
5474 	{ AV_386_SSSE3, X86FSET_SSSE3 },
5475 	{ AV_386_SSE4_1, X86FSET_SSE4_1 },
5476 	{ AV_386_SSE4_2, X86FSET_SSE4_2 },
5477 	{ AV_386_AES, X86FSET_AES },
5478 	{ AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5479 	{ AV_386_XSAVE, X86FSET_XSAVE },
5480 	{ AV_386_AVX, X86FSET_AVX },
5481 	{ AV_386_VMX, X86FSET_VMX },
5482 	{ AV_386_AMD_SVM, X86FSET_SVM }
5483 };
5484 
5485 static const av_feat_map_t x86fset_to_av2[] = {
5486 	{ AV_386_2_F16C, X86FSET_F16C },
5487 	{ AV_386_2_RDRAND, X86FSET_RDRAND },
5488 	{ AV_386_2_BMI1, X86FSET_BMI1 },
5489 	{ AV_386_2_BMI2, X86FSET_BMI2 },
5490 	{ AV_386_2_FMA, X86FSET_FMA },
5491 	{ AV_386_2_AVX2, X86FSET_AVX2 },
5492 	{ AV_386_2_ADX, X86FSET_ADX },
5493 	{ AV_386_2_RDSEED, X86FSET_RDSEED },
5494 	{ AV_386_2_AVX512F, X86FSET_AVX512F },
5495 	{ AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5496 	{ AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5497 	{ AV_386_2_AVX512PF, X86FSET_AVX512PF },
5498 	{ AV_386_2_AVX512ER, X86FSET_AVX512ER },
5499 	{ AV_386_2_AVX512CD, X86FSET_AVX512CD },
5500 	{ AV_386_2_AVX512BW, X86FSET_AVX512BW },
5501 	{ AV_386_2_AVX512VL, X86FSET_AVX512VL },
5502 	{ AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5503 	{ AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5504 	{ AV_386_2_SHA, X86FSET_SHA },
5505 	{ AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5506 	{ AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5507 	{ AV_386_2_CLWB, X86FSET_CLWB },
5508 	{ AV_386_2_MONITORX, X86FSET_MONITORX },
5509 	{ AV_386_2_CLZERO, X86FSET_CLZERO },
5510 	{ AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5511 	{ AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5512 	{ AV_386_2_VAES, X86FSET_VAES },
5513 	{ AV_386_2_GFNI, X86FSET_GFNI },
5514 	{ AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5515 	{ AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5516 };
5517 
5518 static const av_feat_map_t x86fset_to_av3[] = {
5519 	{ AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5520 	{ AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5521 };
5522 
5523 /*
5524  * This routine is called out of bind_hwcap() much later in the life
5525  * of the kernel (post_startup()).  The job of this routine is to resolve
5526  * the hardware feature support and kernel support for those features into
5527  * what we're actually going to tell applications via the aux vector.
5528  *
5529  * Most of the aux vector is derived from the x86_featureset array vector where
5530  * a given feature indicates that an aux vector should be plumbed through. This
5531  * allows the kernel to use one tracking mechanism for these based on whether or
5532  * not it has the required hardware support (most often xsave). Most newer
5533  * features are added there in case we need them in the kernel. Otherwise,
5534  * features are evaluated based on looking at the cpuid features that remain. If
5535  * you find yourself wanting to clear out cpuid features for some reason, they
5536  * should instead be driven by the feature set so we have a consistent view.
5537  */
5538 
5539 static void
5540 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5541 {
5542 	uint_t *hwcap_out = (uint_t *)arg;
5543 	struct cpuid_info *cpi;
5544 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5545 
5546 	cpi = cpu->cpu_m.mcpu_cpi;
5547 
5548 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5549 		if (is_x86_feature(x86_featureset,
5550 		    x86fset_to_av1[i].avm_feat)) {
5551 			hwcap_flags |= x86fset_to_av1[i].avm_av;
5552 		}
5553 	}
5554 
5555 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5556 		if (is_x86_feature(x86_featureset,
5557 		    x86fset_to_av2[i].avm_feat)) {
5558 			hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5559 		}
5560 	}
5561 
5562 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5563 		if (is_x86_feature(x86_featureset,
5564 		    x86fset_to_av3[i].avm_feat)) {
5565 			hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5566 		}
5567 	}
5568 
5569 	/*
5570 	 * From here on out we're working through features that don't have
5571 	 * corresponding kernel feature flags for various reasons that are
5572 	 * mostly just due to the historical implementation.
5573 	 */
5574 	if (cpi->cpi_maxeax >= 1) {
5575 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5576 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5577 
5578 		*edx = CPI_FEATURES_EDX(cpi);
5579 		*ecx = CPI_FEATURES_ECX(cpi);
5580 
5581 		/*
5582 		 * [no explicit support required beyond x87 fp context]
5583 		 */
5584 		if (!fpu_exists)
5585 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5586 
5587 		/*
5588 		 * Now map the supported feature vector to things that we
5589 		 * think userland will care about.
5590 		 */
5591 		if (*ecx & CPUID_INTC_ECX_MOVBE)
5592 			hwcap_flags |= AV_386_MOVBE;
5593 
5594 		if (*ecx & CPUID_INTC_ECX_POPCNT)
5595 			hwcap_flags |= AV_386_POPCNT;
5596 		if (*edx & CPUID_INTC_EDX_FPU)
5597 			hwcap_flags |= AV_386_FPU;
5598 		if (*edx & CPUID_INTC_EDX_MMX)
5599 			hwcap_flags |= AV_386_MMX;
5600 		if (*edx & CPUID_INTC_EDX_TSC)
5601 			hwcap_flags |= AV_386_TSC;
5602 	}
5603 
5604 	/*
5605 	 * Check a few miscellaneous features.
5606 	 */
5607 	if (cpi->cpi_xmaxeax < 0x80000001)
5608 		goto resolve_done;
5609 
5610 	switch (cpi->cpi_vendor) {
5611 		uint32_t *edx, *ecx;
5612 
5613 	case X86_VENDOR_Intel:
5614 		/*
5615 		 * Seems like Intel duplicated what we necessary
5616 		 * here to make the initial crop of 64-bit OS's work.
5617 		 * Hopefully, those are the only "extended" bits
5618 		 * they'll add.
5619 		 */
5620 		/*FALLTHROUGH*/
5621 
5622 	case X86_VENDOR_AMD:
5623 	case X86_VENDOR_HYGON:
5624 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5625 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5626 
5627 		*edx = CPI_FEATURES_XTD_EDX(cpi);
5628 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5629 
5630 		/*
5631 		 * [no explicit support required beyond
5632 		 * x87 fp context and exception handlers]
5633 		 */
5634 		if (!fpu_exists)
5635 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
5636 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5637 
5638 		/*
5639 		 * Now map the supported feature vector to
5640 		 * things that we think userland will care about.
5641 		 */
5642 		if (*edx & CPUID_AMD_EDX_MMXamd)
5643 			hwcap_flags |= AV_386_AMD_MMX;
5644 		if (*edx & CPUID_AMD_EDX_3DNow)
5645 			hwcap_flags |= AV_386_AMD_3DNow;
5646 		if (*edx & CPUID_AMD_EDX_3DNowx)
5647 			hwcap_flags |= AV_386_AMD_3DNowx;
5648 
5649 		switch (cpi->cpi_vendor) {
5650 		case X86_VENDOR_AMD:
5651 		case X86_VENDOR_HYGON:
5652 			if (*ecx & CPUID_AMD_ECX_AHF64)
5653 				hwcap_flags |= AV_386_AHF;
5654 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5655 				hwcap_flags |= AV_386_AMD_LZCNT;
5656 			break;
5657 
5658 		case X86_VENDOR_Intel:
5659 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5660 				hwcap_flags |= AV_386_AMD_LZCNT;
5661 			/*
5662 			 * Aarrgh.
5663 			 * Intel uses a different bit in the same word.
5664 			 */
5665 			if (*ecx & CPUID_INTC_ECX_AHF64)
5666 				hwcap_flags |= AV_386_AHF;
5667 			break;
5668 		default:
5669 			break;
5670 		}
5671 		break;
5672 
5673 	default:
5674 		break;
5675 	}
5676 
5677 resolve_done:
5678 	if (hwcap_out != NULL) {
5679 		hwcap_out[0] = hwcap_flags;
5680 		hwcap_out[1] = hwcap_flags_2;
5681 		hwcap_out[2] = hwcap_flags_3;
5682 	}
5683 }
5684 
5685 
5686 /*
5687  * Simulate the cpuid instruction using the data we previously
5688  * captured about this CPU.  We try our best to return the truth
5689  * about the hardware, independently of kernel support.
5690  */
5691 uint32_t
5692 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5693 {
5694 	struct cpuid_info *cpi;
5695 	struct cpuid_regs *xcp;
5696 
5697 	if (cpu == NULL)
5698 		cpu = CPU;
5699 	cpi = cpu->cpu_m.mcpu_cpi;
5700 
5701 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5702 
5703 	/*
5704 	 * CPUID data is cached in two separate places: cpi_std for standard
5705 	 * CPUID leaves , and cpi_extd for extended CPUID leaves.
5706 	 */
5707 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5708 		xcp = &cpi->cpi_std[cp->cp_eax];
5709 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5710 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
5711 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5712 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5713 	} else {
5714 		/*
5715 		 * The caller is asking for data from an input parameter which
5716 		 * the kernel has not cached.  In this case we go fetch from
5717 		 * the hardware and return the data directly to the user.
5718 		 */
5719 		return (__cpuid_insn(cp));
5720 	}
5721 
5722 	cp->cp_eax = xcp->cp_eax;
5723 	cp->cp_ebx = xcp->cp_ebx;
5724 	cp->cp_ecx = xcp->cp_ecx;
5725 	cp->cp_edx = xcp->cp_edx;
5726 	return (cp->cp_eax);
5727 }
5728 
5729 boolean_t
5730 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
5731 {
5732 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5733 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5734 }
5735 
5736 int
5737 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5738 {
5739 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5740 
5741 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5742 }
5743 
5744 int
5745 cpuid_is_cmt(cpu_t *cpu)
5746 {
5747 	if (cpu == NULL)
5748 		cpu = CPU;
5749 
5750 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5751 
5752 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5753 }
5754 
5755 /*
5756  * AMD and Intel both implement the 64-bit variant of the syscall
5757  * instruction (syscallq), so if there's -any- support for syscall,
5758  * cpuid currently says "yes, we support this".
5759  *
5760  * However, Intel decided to -not- implement the 32-bit variant of the
5761  * syscall instruction, so we provide a predicate to allow our caller
5762  * to test that subtlety here.
5763  *
5764  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
5765  *	even in the case where the hardware would in fact support it.
5766  */
5767 /*ARGSUSED*/
5768 int
5769 cpuid_syscall32_insn(cpu_t *cpu)
5770 {
5771 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
5772 
5773 #if !defined(__xpv)
5774 	if (cpu == NULL)
5775 		cpu = CPU;
5776 
5777 	/*CSTYLED*/
5778 	{
5779 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5780 
5781 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5782 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5783 		    cpi->cpi_xmaxeax >= 0x80000001 &&
5784 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5785 			return (1);
5786 	}
5787 #endif
5788 	return (0);
5789 }
5790 
5791 int
5792 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5793 {
5794 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5795 
5796 	static const char fmt[] =
5797 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
5798 	static const char fmt_ht[] =
5799 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5800 
5801 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5802 
5803 	if (cpuid_is_cmt(cpu))
5804 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5805 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5806 		    cpi->cpi_family, cpi->cpi_model,
5807 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5808 	return (snprintf(s, n, fmt,
5809 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5810 	    cpi->cpi_family, cpi->cpi_model,
5811 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5812 }
5813 
5814 const char *
5815 cpuid_getvendorstr(cpu_t *cpu)
5816 {
5817 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5818 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5819 }
5820 
5821 uint_t
5822 cpuid_getvendor(cpu_t *cpu)
5823 {
5824 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5825 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5826 }
5827 
5828 uint_t
5829 cpuid_getfamily(cpu_t *cpu)
5830 {
5831 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5832 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
5833 }
5834 
5835 uint_t
5836 cpuid_getmodel(cpu_t *cpu)
5837 {
5838 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5839 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
5840 }
5841 
5842 uint_t
5843 cpuid_get_ncpu_per_chip(cpu_t *cpu)
5844 {
5845 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5846 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5847 }
5848 
5849 uint_t
5850 cpuid_get_ncore_per_chip(cpu_t *cpu)
5851 {
5852 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5853 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5854 }
5855 
5856 uint_t
5857 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5858 {
5859 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5860 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5861 }
5862 
5863 id_t
5864 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5865 {
5866 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
5867 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5868 }
5869 
5870 uint_t
5871 cpuid_getstep(cpu_t *cpu)
5872 {
5873 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5874 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
5875 }
5876 
5877 uint_t
5878 cpuid_getsig(struct cpu *cpu)
5879 {
5880 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5881 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5882 }
5883 
5884 uint32_t
5885 cpuid_getchiprev(struct cpu *cpu)
5886 {
5887 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5888 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5889 }
5890 
5891 const char *
5892 cpuid_getchiprevstr(struct cpu *cpu)
5893 {
5894 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5895 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5896 }
5897 
5898 uint32_t
5899 cpuid_getsockettype(struct cpu *cpu)
5900 {
5901 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5902 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5903 }
5904 
5905 const char *
5906 cpuid_getsocketstr(cpu_t *cpu)
5907 {
5908 	static const char *socketstr = NULL;
5909 	struct cpuid_info *cpi;
5910 
5911 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5912 	cpi = cpu->cpu_m.mcpu_cpi;
5913 
5914 	/* Assume that socket types are the same across the system */
5915 	if (socketstr == NULL)
5916 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5917 		    cpi->cpi_model, cpi->cpi_step);
5918 
5919 
5920 	return (socketstr);
5921 }
5922 
5923 x86_uarchrev_t
5924 cpuid_getuarchrev(cpu_t *cpu)
5925 {
5926 	return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
5927 }
5928 
5929 int
5930 cpuid_get_chipid(cpu_t *cpu)
5931 {
5932 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5933 
5934 	if (cpuid_is_cmt(cpu))
5935 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5936 	return (cpu->cpu_id);
5937 }
5938 
5939 id_t
5940 cpuid_get_coreid(cpu_t *cpu)
5941 {
5942 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5943 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5944 }
5945 
5946 int
5947 cpuid_get_pkgcoreid(cpu_t *cpu)
5948 {
5949 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5950 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5951 }
5952 
5953 int
5954 cpuid_get_clogid(cpu_t *cpu)
5955 {
5956 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5957 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5958 }
5959 
5960 int
5961 cpuid_get_cacheid(cpu_t *cpu)
5962 {
5963 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5964 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5965 }
5966 
5967 uint_t
5968 cpuid_get_procnodeid(cpu_t *cpu)
5969 {
5970 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5971 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5972 }
5973 
5974 uint_t
5975 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5976 {
5977 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5978 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5979 }
5980 
5981 uint_t
5982 cpuid_get_compunitid(cpu_t *cpu)
5983 {
5984 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5985 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5986 }
5987 
5988 uint_t
5989 cpuid_get_cores_per_compunit(cpu_t *cpu)
5990 {
5991 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5992 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5993 }
5994 
5995 uint32_t
5996 cpuid_get_apicid(cpu_t *cpu)
5997 {
5998 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5999 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6000 		return (UINT32_MAX);
6001 	} else {
6002 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6003 	}
6004 }
6005 
6006 void
6007 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6008 {
6009 	struct cpuid_info *cpi;
6010 
6011 	if (cpu == NULL)
6012 		cpu = CPU;
6013 	cpi = cpu->cpu_m.mcpu_cpi;
6014 
6015 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6016 
6017 	if (pabits)
6018 		*pabits = cpi->cpi_pabits;
6019 	if (vabits)
6020 		*vabits = cpi->cpi_vabits;
6021 }
6022 
6023 size_t
6024 cpuid_get_xsave_size()
6025 {
6026 	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6027 	    sizeof (struct xsave_state)));
6028 }
6029 
6030 /*
6031  * Return true if the CPUs on this system require 'pointer clearing' for the
6032  * floating point error pointer exception handling. In the past, this has been
6033  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6034  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6035  * feature bit and is reflected in the cpi_fp_amd_save member.
6036  */
6037 boolean_t
6038 cpuid_need_fp_excp_handling()
6039 {
6040 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6041 	    cpuid_info0.cpi_fp_amd_save != 0);
6042 }
6043 
6044 /*
6045  * Returns the number of data TLB entries for a corresponding
6046  * pagesize.  If it can't be computed, or isn't known, the
6047  * routine returns zero.  If you ask about an architecturally
6048  * impossible pagesize, the routine will panic (so that the
6049  * hat implementor knows that things are inconsistent.)
6050  */
6051 uint_t
6052 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6053 {
6054 	struct cpuid_info *cpi;
6055 	uint_t dtlb_nent = 0;
6056 
6057 	if (cpu == NULL)
6058 		cpu = CPU;
6059 	cpi = cpu->cpu_m.mcpu_cpi;
6060 
6061 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6062 
6063 	/*
6064 	 * Check the L2 TLB info
6065 	 */
6066 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6067 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6068 
6069 		switch (pagesize) {
6070 
6071 		case 4 * 1024:
6072 			/*
6073 			 * All zero in the top 16 bits of the register
6074 			 * indicates a unified TLB. Size is in low 16 bits.
6075 			 */
6076 			if ((cp->cp_ebx & 0xffff0000) == 0)
6077 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6078 			else
6079 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6080 			break;
6081 
6082 		case 2 * 1024 * 1024:
6083 			if ((cp->cp_eax & 0xffff0000) == 0)
6084 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6085 			else
6086 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6087 			break;
6088 
6089 		default:
6090 			panic("unknown L2 pagesize");
6091 			/*NOTREACHED*/
6092 		}
6093 	}
6094 
6095 	if (dtlb_nent != 0)
6096 		return (dtlb_nent);
6097 
6098 	/*
6099 	 * No L2 TLB support for this size, try L1.
6100 	 */
6101 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6102 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6103 
6104 		switch (pagesize) {
6105 		case 4 * 1024:
6106 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6107 			break;
6108 		case 2 * 1024 * 1024:
6109 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6110 			break;
6111 		default:
6112 			panic("unknown L1 d-TLB pagesize");
6113 			/*NOTREACHED*/
6114 		}
6115 	}
6116 
6117 	return (dtlb_nent);
6118 }
6119 
/*
 * Return 0 if the erratum is not present or not applicable, positive
 * if it is, and negative if the status of the erratum is unknown.
 *
 * 'erratum' is the erratum number being queried; numbers that are not
 * listed in the switch below yield -1.  Non-AMD processors, and AMD
 * families 4, 5 and 6, always yield 0.
 *
 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
 * Processors" #25759, Rev 3.57, August 2005
 */
int
cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
{
	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
	uint_t eax;

	/*
	 * Bail out if this CPU isn't an AMD CPU, or if it's
	 * a legacy (32-bit) AMD CPU.
	 */
	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
	    cpi->cpi_family == 6) {
		return (0);
	}

	/* The raw %eax from standard cpuid leaf 1 is what the macros match. */
	eax = cpi->cpi_std[1].cp_eax;

	/*
	 * Each macro below compares the leaf 1 %eax value against a fixed
	 * list of values; the macro names presumably follow the silicon
	 * revision names used in the revision guide cited above.  Note that
	 * none of these macros is #undef'd within this function.
	 */
#define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
#define	SH_B3(eax)	(eax == 0xf51)
#define	B(eax)		(SH_B0(eax) || SH_B3(eax))

#define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)

#define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
#define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
#define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
#define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))

#define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
#define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
#define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
#define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))

#define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
#define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
#define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
#define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
#define	BH_E4(eax)	(eax == 0x20fb1)
#define	SH_E5(eax)	(eax == 0x20f42)
#define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
#define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
#define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
			    DH_E6(eax) || JH_E6(eax))

#define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
#define	DR_B0(eax)	(eax == 0x100f20)
#define	DR_B1(eax)	(eax == 0x100f21)
#define	DR_BA(eax)	(eax == 0x100f2a)
#define	DR_B2(eax)	(eax == 0x100f22)
#define	DR_B3(eax)	(eax == 0x100f23)
#define	RB_C0(eax)	(eax == 0x100f40)

	/*
	 * Most cases answer by matching eax against the revision macros;
	 * a few answer from the family (and model) alone.
	 */
	switch (erratum) {
	case 1:
		return (cpi->cpi_family < 0x10);
	case 51:	/* what does the asterisk mean? */
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 52:
		return (B(eax));
	case 57:
		return (cpi->cpi_family <= 0x11);
	case 58:
		return (B(eax));
	case 60:
		return (cpi->cpi_family <= 0x11);
	case 61:
	case 62:
	case 63:
	case 64:
	case 65:
	case 66:
	case 68:
	case 69:
	case 70:
	case 71:
		return (B(eax));
	case 72:
		return (SH_B0(eax));
	case 74:
		return (B(eax));
	case 75:
		return (cpi->cpi_family < 0x10);
	case 76:
		return (B(eax));
	case 77:
		return (cpi->cpi_family <= 0x11);
	case 78:
		return (B(eax) || SH_C0(eax));
	case 79:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
	case 80:
	case 81:
	case 82:
		return (B(eax));
	case 83:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 85:
		return (cpi->cpi_family < 0x10);
	case 86:
		return (SH_C0(eax) || CG(eax));
	case 88:
		return (B(eax) || SH_C0(eax));
	case 89:
		return (cpi->cpi_family < 0x10);
	case 90:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 91:
	case 92:
		return (B(eax) || SH_C0(eax));
	case 93:
		return (SH_C0(eax));
	case 94:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 95:
		return (B(eax) || SH_C0(eax));
	case 96:
		return (B(eax) || SH_C0(eax) || CG(eax));
	case 97:
	case 98:
		return (SH_C0(eax) || CG(eax));
	case 99:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 100:
		return (B(eax) || SH_C0(eax));
	case 101:
	case 103:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 104:
		return (SH_C0(eax) || CG(eax) || D0(eax));
	case 105:
	case 106:
	case 107:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 108:
		return (DH_CG(eax));
	case 109:
		return (SH_C0(eax) || CG(eax) || D0(eax));
	case 110:
		return (D0(eax) || EX(eax));
	case 111:
		return (CG(eax));
	case 112:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
	case 113:
		return (eax == 0x20fc0);
	case 114:
		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
	case 115:
		return (SH_E0(eax) || JH_E1(eax));
	case 116:
		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
	case 117:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
	case 118:
		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
		    JH_E6(eax));
	case 121:
		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
	case 122:
		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
	case 123:
		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
	case 131:
		return (cpi->cpi_family < 0x10);
	case 6336786:

		/*
		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
		 * if this is a K8 family or newer processor. We're testing for
		 * this 'erratum' to determine whether or not we have a constant
		 * TSC.
		 *
		 * Our current fix for this is to disable the C1-Clock ramping.
		 * However, this doesn't work on newer processor families nor
		 * does it work when virtualized as those devices don't exist.
		 */
		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
			return (0);
		}

		/*
		 * Only for family 0xf: query leaf 0x80000007 directly and
		 * report the erratum when %edx bit 8 (0x100) is clear.
		 */
		if (CPI_FAMILY(cpi) == 0xf) {
			struct cpuid_regs regs;
			regs.cp_eax = 0x80000007;
			(void) __cpuid_insn(&regs);
			return (!(regs.cp_edx & 0x100));
		}
		return (0);
	case 147:
		/*
		 * This erratum (K8 #147) is not present on family 10 and newer.
		 */
		if (cpi->cpi_family >= 0x10) {
			return (0);
		}
		/*
		 * Rearrange bit fields of eax into a single comparable value
		 * and report the erratum when that value is below 0xf40.
		 */
		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);

	case 6671130:
		/*
		 * check for processors (pre-Shanghai) that do not provide
		 * optimal management of 1gb ptes in its tlb.
		 */
		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);

	case 298:
		/* DR_B3 is deliberately or otherwise absent from this list. */
		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
		    DR_B2(eax) || RB_C0(eax));

	case 721:
		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);

	default:
		/* Not an erratum this routine knows about. */
		return (-1);

	}
}
6345 
6346 /*
6347  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6348  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6349  */
6350 int
6351 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6352 {
6353 	struct cpuid_info	*cpi;
6354 	uint_t			osvwid;
6355 	static int		osvwfeature = -1;
6356 	uint64_t		osvwlength;
6357 
6358 
6359 	cpi = cpu->cpu_m.mcpu_cpi;
6360 
6361 	/* confirm OSVW supported */
6362 	if (osvwfeature == -1) {
6363 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6364 	} else {
6365 		/* assert that osvw feature setting is consistent on all cpus */
6366 		ASSERT(osvwfeature ==
6367 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6368 	}
6369 	if (!osvwfeature)
6370 		return (-1);
6371 
6372 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6373 
6374 	switch (erratum) {
6375 	case 298:	/* osvwid is 0 */
6376 		osvwid = 0;
6377 		if (osvwlength <= (uint64_t)osvwid) {
6378 			/* osvwid 0 is unknown */
6379 			return (-1);
6380 		}
6381 
6382 		/*
6383 		 * Check the OSVW STATUS MSR to determine the state
6384 		 * of the erratum where:
6385 		 *   0 - fixed by HW
6386 		 *   1 - BIOS has applied the workaround when BIOS
6387 		 *   workaround is available. (Or for other errata,
6388 		 *   OS workaround is required.)
6389 		 * For a value of 1, caller will confirm that the
6390 		 * erratum 298 workaround has indeed been applied by BIOS.
6391 		 *
6392 		 * A 1 may be set in cpus that have a HW fix
6393 		 * in a mixed cpu system. Regarding erratum 298:
6394 		 *   In a multiprocessor platform, the workaround above
6395 		 *   should be applied to all processors regardless of
6396 		 *   silicon revision when an affected processor is
6397 		 *   present.
6398 		 */
6399 
6400 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6401 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6402 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6403 
6404 	default:
6405 		return (-1);
6406 	}
6407 }
6408 
/*
 * Property-name suffixes.  add_cache_prop() joins a cache/tlb label and one
 * of these with a '-' to form the devinfo property name.
 */
static const char assoc_str[] = "associativity";
static const char line_str[] = "line-size";
static const char size_str[] = "size";
6412 
6413 static void
6414 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6415     uint32_t val)
6416 {
6417 	char buf[128];
6418 
6419 	/*
6420 	 * ndi_prop_update_int() is used because it is desirable for
6421 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6422 	 */
6423 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6424 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6425 }
6426 
6427 /*
6428  * Intel-style cache/tlb description
6429  *
6430  * Standard cpuid level 2 gives a randomly ordered
6431  * selection of tags that index into a table that describes
6432  * cache and tlb properties.
6433  */
6434 
/*
 * Labels used as the leading part of cache/tlb property names.  They are
 * referenced from the descriptor tables and the walkers below.
 */
static const char l1_icache_str[] = "l1-icache";
static const char l1_dcache_str[] = "l1-dcache";
static const char l2_cache_str[] = "l2-cache";
static const char l3_cache_str[] = "l3-cache";
static const char itlb4k_str[] = "itlb-4K";
static const char dtlb4k_str[] = "dtlb-4K";
static const char itlb2M_str[] = "itlb-2M";
static const char itlb4M_str[] = "itlb-4M";
static const char dtlb4M_str[] = "dtlb-4M";
static const char dtlb24_str[] = "dtlb0-2M-4M";
static const char itlb424_str[] = "itlb-4K-2M-4M";
static const char itlb24_str[] = "itlb-2M-4M";
static const char dtlb44_str[] = "dtlb-4K-4M";
static const char sl1_dcache_str[] = "sectored-l1-dcache";
static const char sl2_cache_str[] = "sectored-l2-cache";
static const char itrace_str[] = "itrace-cache";
static const char sl3_cache_str[] = "sectored-l3-cache";
static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6453 
/*
 * One row per cpuid leaf 2 descriptor byte.  The table is terminated by an
 * entry whose ct_code is 0, and find_cacheent() depends on the rows being
 * sorted by descending ct_code.
 */
static const struct cachetab {
	uint8_t		ct_code;	/* descriptor byte being described */
	uint8_t		ct_assoc;	/* associativity (ways) */
	uint16_t	ct_line_size;	/* line size; 0 means none reported */
	size_t		ct_size;	/* size (presumably entries for tlbs) */
	const char	*ct_label;	/* property-name prefix */
} intel_ctab[] = {
	/*
	 * maintain descending order!
	 *
	 * Codes ignored - Reason
	 * ----------------------
	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
	 * f0H/f1H - Currently we do not interpret prefetch size by design
	 */
	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
	{ 0xc0, 4, 0, 8, dtlb44_str },
	{ 0xba, 4, 0, 64, dtlb4k_str },
	{ 0xb4, 4, 0, 256, dtlb4k_str },
	{ 0xb3, 4, 0, 128, dtlb4k_str },
	{ 0xb2, 4, 0, 64, itlb4k_str },
	{ 0xb0, 4, 0, 128, itlb4k_str },
	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
	{ 0x86, 4, 64, 512*1024, l2_cache_str},
	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
	{ 0x83, 8, 32, 512*1024, l2_cache_str},
	{ 0x82, 8, 32, 256*1024, l2_cache_str},
	{ 0x80, 8, 64, 512*1024, l2_cache_str},
	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
	{ 0x73, 8, 0, 64*1024, itrace_str},
	{ 0x72, 8, 0, 32*1024, itrace_str},
	{ 0x71, 8, 0, 16*1024, itrace_str},
	{ 0x70, 8, 0, 12*1024, itrace_str},
	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
	{ 0x5d, 0, 0, 256, dtlb44_str},
	{ 0x5c, 0, 0, 128, dtlb44_str},
	{ 0x5b, 0, 0, 64, dtlb44_str},
	{ 0x5a, 4, 0, 32, dtlb24_str},
	{ 0x59, 0, 0, 16, dtlb4k_str},
	{ 0x57, 4, 0, 16, dtlb4k_str},
	{ 0x56, 4, 0, 16, dtlb4M_str},
	{ 0x55, 0, 0, 7, itlb24_str},
	{ 0x52, 0, 0, 256, itlb424_str},
	{ 0x51, 0, 0, 128, itlb424_str},
	{ 0x50, 0, 0, 64, itlb424_str},
	{ 0x4f, 0, 0, 32, itlb4k_str},
	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
	{ 0x43, 4, 32, 512*1024, l2_cache_str},
	{ 0x42, 4, 32, 256*1024, l2_cache_str},
	{ 0x41, 4, 32, 128*1024, l2_cache_str},
	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
	{ 0x30, 8, 64, 32*1024, l1_icache_str},
	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
	{ 0x0b, 4, 0, 4, itlb4M_str},
	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
	{ 0x08, 4, 32, 16*1024, l1_icache_str},
	{ 0x06, 4, 32, 8*1024, l1_icache_str},
	{ 0x05, 4, 0, 32, dtlb4M_str},
	{ 0x04, 4, 0, 8, dtlb4M_str},
	{ 0x03, 4, 0, 64, dtlb4k_str},
	{ 0x02, 4, 0, 2, itlb4M_str},
	{ 0x01, 4, 0, 32, itlb4k_str},
	{ 0 }
};
6562 
6563 static const struct cachetab cyrix_ctab[] = {
6564 	{ 0x70, 4, 0, 32, "tlb-4K" },
6565 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6566 	{ 0 }
6567 };
6568 
6569 /*
6570  * Search a cache table for a matching entry
6571  */
6572 static const struct cachetab *
6573 find_cacheent(const struct cachetab *ct, uint_t code)
6574 {
6575 	if (code != 0) {
6576 		for (; ct->ct_code != 0; ct++)
6577 			if (ct->ct_code <= code)
6578 				break;
6579 		if (ct->ct_code == code)
6580 			return (ct);
6581 	}
6582 	return (NULL);
6583 }
6584 
6585 /*
6586  * Populate cachetab entry with L2 or L3 cache-information using
6587  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6588  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6589  * information is found.
6590  */
6591 static int
6592 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6593 {
6594 	uint32_t level, i;
6595 	int ret = 0;
6596 
6597 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6598 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6599 
6600 		if (level == 2 || level == 3) {
6601 			ct->ct_assoc =
6602 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6603 			ct->ct_line_size =
6604 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6605 			ct->ct_size = ct->ct_assoc *
6606 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6607 			    ct->ct_line_size *
6608 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6609 
6610 			if (level == 2) {
6611 				ct->ct_label = l2_cache_str;
6612 			} else if (level == 3) {
6613 				ct->ct_label = l3_cache_str;
6614 			}
6615 			ret = 1;
6616 		}
6617 	}
6618 
6619 	return (ret);
6620 }
6621 
6622 /*
6623  * Walk the cacheinfo descriptor, applying 'func' to every valid element
6624  * The walk is terminated if the walker returns non-zero.
6625  */
6626 static void
6627 intel_walk_cacheinfo(struct cpuid_info *cpi,
6628     void *arg, int (*func)(void *, const struct cachetab *))
6629 {
6630 	const struct cachetab *ct;
6631 	struct cachetab des_49_ct, des_b1_ct;
6632 	uint8_t *dp;
6633 	int i;
6634 
6635 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6636 		return;
6637 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6638 		/*
6639 		 * For overloaded descriptor 0x49 we use cpuid function 4
6640 		 * if supported by the current processor, to create
6641 		 * cache information.
6642 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
6643 		 * to disambiguate the cache information.
6644 		 */
6645 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6646 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6647 				ct = &des_49_ct;
6648 		} else if (*dp == 0xb1) {
6649 			des_b1_ct.ct_code = 0xb1;
6650 			des_b1_ct.ct_assoc = 4;
6651 			des_b1_ct.ct_line_size = 0;
6652 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6653 				des_b1_ct.ct_size = 8;
6654 				des_b1_ct.ct_label = itlb2M_str;
6655 			} else {
6656 				des_b1_ct.ct_size = 4;
6657 				des_b1_ct.ct_label = itlb4M_str;
6658 			}
6659 			ct = &des_b1_ct;
6660 		} else {
6661 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6662 				continue;
6663 			}
6664 		}
6665 
6666 		if (func(arg, ct) != 0) {
6667 			break;
6668 		}
6669 	}
6670 }
6671 
6672 /*
6673  * (Like the Intel one, except for Cyrix CPUs)
6674  */
6675 static void
6676 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6677     void *arg, int (*func)(void *, const struct cachetab *))
6678 {
6679 	const struct cachetab *ct;
6680 	uint8_t *dp;
6681 	int i;
6682 
6683 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6684 		return;
6685 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6686 		/*
6687 		 * Search Cyrix-specific descriptor table first ..
6688 		 */
6689 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6690 			if (func(arg, ct) != 0)
6691 				break;
6692 			continue;
6693 		}
6694 		/*
6695 		 * .. else fall back to the Intel one
6696 		 */
6697 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6698 			if (func(arg, ct) != 0)
6699 				break;
6700 			continue;
6701 		}
6702 	}
6703 }
6704 
6705 /*
6706  * A cacheinfo walker that adds associativity, line-size, and size properties
6707  * to the devinfo node it is passed as an argument.
6708  */
6709 static int
6710 add_cacheent_props(void *arg, const struct cachetab *ct)
6711 {
6712 	dev_info_t *devi = arg;
6713 
6714 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6715 	if (ct->ct_line_size != 0)
6716 		add_cache_prop(devi, ct->ct_label, line_str,
6717 		    ct->ct_line_size);
6718 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6719 	return (0);
6720 }
6721 
6722 
/*
 * Property-name suffix used by the AMD helpers below in place of an
 * associativity count when the hardware reports full associativity.
 */
static const char fully_assoc[] = "fully-associative?";
6724 
6725 /*
6726  * AMD style cache/tlb description
6727  *
6728  * Extended functions 5 and 6 directly describe properties of
6729  * tlbs and various cache levels.
6730  */
6731 static void
6732 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6733 {
6734 	switch (assoc) {
6735 	case 0:	/* reserved; ignore */
6736 		break;
6737 	default:
6738 		add_cache_prop(devi, label, assoc_str, assoc);
6739 		break;
6740 	case 0xff:
6741 		add_cache_prop(devi, label, fully_assoc, 1);
6742 		break;
6743 	}
6744 }
6745 
6746 static void
6747 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6748 {
6749 	if (size == 0)
6750 		return;
6751 	add_cache_prop(devi, label, size_str, size);
6752 	add_amd_assoc(devi, label, assoc);
6753 }
6754 
6755 static void
6756 add_amd_cache(dev_info_t *devi, const char *label,
6757     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6758 {
6759 	if (size == 0 || line_size == 0)
6760 		return;
6761 	add_amd_assoc(devi, label, assoc);
6762 	/*
6763 	 * Most AMD parts have a sectored cache. Multiple cache lines are
6764 	 * associated with each tag. A sector consists of all cache lines
6765 	 * associated with a tag. For example, the AMD K6-III has a sector
6766 	 * size of 2 cache lines per tag.
6767 	 */
6768 	if (lines_per_tag != 0)
6769 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6770 	add_cache_prop(devi, label, line_str, line_size);
6771 	add_cache_prop(devi, label, size_str, size * 1024);
6772 }
6773 
6774 static void
6775 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6776 {
6777 	switch (assoc) {
6778 	case 0:	/* off */
6779 		break;
6780 	case 1:
6781 	case 2:
6782 	case 4:
6783 		add_cache_prop(devi, label, assoc_str, assoc);
6784 		break;
6785 	case 6:
6786 		add_cache_prop(devi, label, assoc_str, 8);
6787 		break;
6788 	case 8:
6789 		add_cache_prop(devi, label, assoc_str, 16);
6790 		break;
6791 	case 0xf:
6792 		add_cache_prop(devi, label, fully_assoc, 1);
6793 		break;
6794 	default: /* reserved; ignore */
6795 		break;
6796 	}
6797 }
6798 
6799 static void
6800 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6801 {
6802 	if (size == 0 || assoc == 0)
6803 		return;
6804 	add_amd_l2_assoc(devi, label, assoc);
6805 	add_cache_prop(devi, label, size_str, size);
6806 }
6807 
6808 static void
6809 add_amd_l2_cache(dev_info_t *devi, const char *label,
6810     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6811 {
6812 	if (size == 0 || assoc == 0 || line_size == 0)
6813 		return;
6814 	add_amd_l2_assoc(devi, label, assoc);
6815 	if (lines_per_tag != 0)
6816 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6817 	add_cache_prop(devi, label, line_str, line_size);
6818 	add_cache_prop(devi, label, size_str, size * 1024);
6819 }
6820 
6821 static void
6822 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6823 {
6824 	struct cpuid_regs *cp;
6825 
6826 	if (cpi->cpi_xmaxeax < 0x80000005)
6827 		return;
6828 	cp = &cpi->cpi_extd[5];
6829 
6830 	/*
6831 	 * 4M/2M L1 TLB configuration
6832 	 *
6833 	 * We report the size for 2M pages because AMD uses two
6834 	 * TLB entries for one 4M page.
6835 	 */
6836 	add_amd_tlb(devi, "dtlb-2M",
6837 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6838 	add_amd_tlb(devi, "itlb-2M",
6839 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6840 
6841 	/*
6842 	 * 4K L1 TLB configuration
6843 	 */
6844 
6845 	switch (cpi->cpi_vendor) {
6846 		uint_t nentries;
6847 	case X86_VENDOR_TM:
6848 		if (cpi->cpi_family >= 5) {
6849 			/*
6850 			 * Crusoe processors have 256 TLB entries, but
6851 			 * cpuid data format constrains them to only
6852 			 * reporting 255 of them.
6853 			 */
6854 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6855 				nentries = 256;
6856 			/*
6857 			 * Crusoe processors also have a unified TLB
6858 			 */
6859 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6860 			    nentries);
6861 			break;
6862 		}
6863 		/*FALLTHROUGH*/
6864 	default:
6865 		add_amd_tlb(devi, itlb4k_str,
6866 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6867 		add_amd_tlb(devi, dtlb4k_str,
6868 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6869 		break;
6870 	}
6871 
6872 	/*
6873 	 * data L1 cache configuration
6874 	 */
6875 
6876 	add_amd_cache(devi, l1_dcache_str,
6877 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6878 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6879 
6880 	/*
6881 	 * code L1 cache configuration
6882 	 */
6883 
6884 	add_amd_cache(devi, l1_icache_str,
6885 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6886 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6887 
6888 	if (cpi->cpi_xmaxeax < 0x80000006)
6889 		return;
6890 	cp = &cpi->cpi_extd[6];
6891 
6892 	/* Check for a unified L2 TLB for large pages */
6893 
6894 	if (BITX(cp->cp_eax, 31, 16) == 0)
6895 		add_amd_l2_tlb(devi, "l2-tlb-2M",
6896 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6897 	else {
6898 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
6899 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6900 		add_amd_l2_tlb(devi, "l2-itlb-2M",
6901 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6902 	}
6903 
6904 	/* Check for a unified L2 TLB for 4K pages */
6905 
6906 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
6907 		add_amd_l2_tlb(devi, "l2-tlb-4K",
6908 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6909 	} else {
6910 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
6911 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6912 		add_amd_l2_tlb(devi, "l2-itlb-4K",
6913 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6914 	}
6915 
6916 	add_amd_l2_cache(devi, l2_cache_str,
6917 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6918 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6919 }
6920 
6921 /*
6922  * There are two basic ways that the x86 world describes it cache
6923  * and tlb architecture - Intel's way and AMD's way.
6924  *
6925  * Return which flavor of cache architecture we should use
6926  */
6927 static int
6928 x86_which_cacheinfo(struct cpuid_info *cpi)
6929 {
6930 	switch (cpi->cpi_vendor) {
6931 	case X86_VENDOR_Intel:
6932 		if (cpi->cpi_maxeax >= 2)
6933 			return (X86_VENDOR_Intel);
6934 		break;
6935 	case X86_VENDOR_AMD:
6936 		/*
6937 		 * The K5 model 1 was the first part from AMD that reported
6938 		 * cache sizes via extended cpuid functions.
6939 		 */
6940 		if (cpi->cpi_family > 5 ||
6941 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6942 			return (X86_VENDOR_AMD);
6943 		break;
6944 	case X86_VENDOR_HYGON:
6945 		return (X86_VENDOR_AMD);
6946 	case X86_VENDOR_TM:
6947 		if (cpi->cpi_family >= 5)
6948 			return (X86_VENDOR_AMD);
6949 		/*FALLTHROUGH*/
6950 	default:
6951 		/*
6952 		 * If they have extended CPU data for 0x80000005
6953 		 * then we assume they have AMD-format cache
6954 		 * information.
6955 		 *
6956 		 * If not, and the vendor happens to be Cyrix,
6957 		 * then try our-Cyrix specific handler.
6958 		 *
6959 		 * If we're not Cyrix, then assume we're using Intel's
6960 		 * table-driven format instead.
6961 		 */
6962 		if (cpi->cpi_xmaxeax >= 0x80000005)
6963 			return (X86_VENDOR_AMD);
6964 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6965 			return (X86_VENDOR_Cyrix);
6966 		else if (cpi->cpi_maxeax >= 2)
6967 			return (X86_VENDOR_Intel);
6968 		break;
6969 	}
6970 	return (-1);
6971 }
6972 
6973 void
6974 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6975     struct cpuid_info *cpi)
6976 {
6977 	dev_info_t *cpu_devi;
6978 	int create;
6979 
6980 	cpu_devi = (dev_info_t *)dip;
6981 
6982 	/* device_type */
6983 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6984 	    "device_type", "cpu");
6985 
6986 	/* reg */
6987 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6988 	    "reg", cpu_id);
6989 
6990 	/* cpu-mhz, and clock-frequency */
6991 	if (cpu_freq > 0) {
6992 		long long mul;
6993 
6994 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6995 		    "cpu-mhz", cpu_freq);
6996 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6997 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6998 			    "clock-frequency", (int)mul);
6999 	}
7000 
7001 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7002 
7003 	/* vendor-id */
7004 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7005 	    "vendor-id", cpi->cpi_vendorstr);
7006 
7007 	if (cpi->cpi_maxeax == 0) {
7008 		return;
7009 	}
7010 
7011 	/*
7012 	 * family, model, and step
7013 	 */
7014 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7015 	    "family", CPI_FAMILY(cpi));
7016 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7017 	    "cpu-model", CPI_MODEL(cpi));
7018 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7019 	    "stepping-id", CPI_STEP(cpi));
7020 
7021 	/* type */
7022 	switch (cpi->cpi_vendor) {
7023 	case X86_VENDOR_Intel:
7024 		create = 1;
7025 		break;
7026 	default:
7027 		create = 0;
7028 		break;
7029 	}
7030 	if (create)
7031 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7032 		    "type", CPI_TYPE(cpi));
7033 
7034 	/* ext-family */
7035 	switch (cpi->cpi_vendor) {
7036 	case X86_VENDOR_Intel:
7037 	case X86_VENDOR_AMD:
7038 		create = cpi->cpi_family >= 0xf;
7039 		break;
7040 	case X86_VENDOR_HYGON:
7041 		create = 1;
7042 		break;
7043 	default:
7044 		create = 0;
7045 		break;
7046 	}
7047 	if (create)
7048 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7049 		    "ext-family", CPI_FAMILY_XTD(cpi));
7050 
7051 	/* ext-model */
7052 	switch (cpi->cpi_vendor) {
7053 	case X86_VENDOR_Intel:
7054 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7055 		break;
7056 	case X86_VENDOR_AMD:
7057 		create = CPI_FAMILY(cpi) == 0xf;
7058 		break;
7059 	case X86_VENDOR_HYGON:
7060 		create = 1;
7061 		break;
7062 	default:
7063 		create = 0;
7064 		break;
7065 	}
7066 	if (create)
7067 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7068 		    "ext-model", CPI_MODEL_XTD(cpi));
7069 
7070 	/* generation */
7071 	switch (cpi->cpi_vendor) {
7072 	case X86_VENDOR_AMD:
7073 	case X86_VENDOR_HYGON:
7074 		/*
7075 		 * AMD K5 model 1 was the first part to support this
7076 		 */
7077 		create = cpi->cpi_xmaxeax >= 0x80000001;
7078 		break;
7079 	default:
7080 		create = 0;
7081 		break;
7082 	}
7083 	if (create)
7084 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7085 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7086 
7087 	/* brand-id */
7088 	switch (cpi->cpi_vendor) {
7089 	case X86_VENDOR_Intel:
7090 		/*
7091 		 * brand id first appeared on Pentium III Xeon model 8,
7092 		 * and Celeron model 8 processors and Opteron
7093 		 */
7094 		create = cpi->cpi_family > 6 ||
7095 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7096 		break;
7097 	case X86_VENDOR_AMD:
7098 		create = cpi->cpi_family >= 0xf;
7099 		break;
7100 	case X86_VENDOR_HYGON:
7101 		create = 1;
7102 		break;
7103 	default:
7104 		create = 0;
7105 		break;
7106 	}
7107 	if (create && cpi->cpi_brandid != 0) {
7108 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7109 		    "brand-id", cpi->cpi_brandid);
7110 	}
7111 
7112 	/* chunks, and apic-id */
7113 	switch (cpi->cpi_vendor) {
7114 		/*
7115 		 * first available on Pentium IV and Opteron (K8)
7116 		 */
7117 	case X86_VENDOR_Intel:
7118 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7119 		break;
7120 	case X86_VENDOR_AMD:
7121 		create = cpi->cpi_family >= 0xf;
7122 		break;
7123 	case X86_VENDOR_HYGON:
7124 		create = 1;
7125 		break;
7126 	default:
7127 		create = 0;
7128 		break;
7129 	}
7130 	if (create) {
7131 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7132 		    "chunks", CPI_CHUNKS(cpi));
7133 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7134 		    "apic-id", cpi->cpi_apicid);
7135 		if (cpi->cpi_chipid >= 0) {
7136 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7137 			    "chip#", cpi->cpi_chipid);
7138 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7139 			    "clog#", cpi->cpi_clogid);
7140 		}
7141 	}
7142 
7143 	/* cpuid-features */
7144 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7145 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7146 
7147 
7148 	/* cpuid-features-ecx */
7149 	switch (cpi->cpi_vendor) {
7150 	case X86_VENDOR_Intel:
7151 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7152 		break;
7153 	case X86_VENDOR_AMD:
7154 		create = cpi->cpi_family >= 0xf;
7155 		break;
7156 	case X86_VENDOR_HYGON:
7157 		create = 1;
7158 		break;
7159 	default:
7160 		create = 0;
7161 		break;
7162 	}
7163 	if (create)
7164 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7165 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7166 
7167 	/* ext-cpuid-features */
7168 	switch (cpi->cpi_vendor) {
7169 	case X86_VENDOR_Intel:
7170 	case X86_VENDOR_AMD:
7171 	case X86_VENDOR_HYGON:
7172 	case X86_VENDOR_Cyrix:
7173 	case X86_VENDOR_TM:
7174 	case X86_VENDOR_Centaur:
7175 		create = cpi->cpi_xmaxeax >= 0x80000001;
7176 		break;
7177 	default:
7178 		create = 0;
7179 		break;
7180 	}
7181 	if (create) {
7182 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7183 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7184 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7185 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7186 	}
7187 
7188 	/*
7189 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7190 	 * model 1, and Cyrix GXm.  On earlier models we try and
7191 	 * simulate something similar .. so this string should always
7192 	 * same -something- about the processor, however lame.
7193 	 */
7194 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7195 	    "brand-string", cpi->cpi_brandstr);
7196 
7197 	/*
7198 	 * Finally, cache and tlb information
7199 	 */
7200 	switch (x86_which_cacheinfo(cpi)) {
7201 	case X86_VENDOR_Intel:
7202 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7203 		break;
7204 	case X86_VENDOR_Cyrix:
7205 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7206 		break;
7207 	case X86_VENDOR_AMD:
7208 		amd_cache_info(cpi, cpu_devi);
7209 		break;
7210 	default:
7211 		break;
7212 	}
7213 }
7214 
/*
 * Request/result record for an L2 cache query; filled in by getl2cacheinfo()
 * and handed to the vendor-specific lookup code.  Any of the three pointers
 * may be NULL, in which case that attribute is simply not stored.
 */
struct l2info {
	int *l2i_csz;		/* where to store the cache size, or NULL */
	int *l2i_lsz;		/* where to store the line size, or NULL */
	int *l2i_assoc;		/* where to store the associativity, or NULL */
	int l2i_ret;		/* cache size found; preset to -1 by caller */
};
7221 
7222 /*
7223  * A cacheinfo walker that fetches the size, line-size and associativity
7224  * of the L2 cache
7225  */
7226 static int
7227 intel_l2cinfo(void *arg, const struct cachetab *ct)
7228 {
7229 	struct l2info *l2i = arg;
7230 	int *ip;
7231 
7232 	if (ct->ct_label != l2_cache_str &&
7233 	    ct->ct_label != sl2_cache_str)
7234 		return (0);	/* not an L2 -- keep walking */
7235 
7236 	if ((ip = l2i->l2i_csz) != NULL)
7237 		*ip = ct->ct_size;
7238 	if ((ip = l2i->l2i_lsz) != NULL)
7239 		*ip = ct->ct_line_size;
7240 	if ((ip = l2i->l2i_assoc) != NULL)
7241 		*ip = ct->ct_assoc;
7242 	l2i->l2i_ret = ct->ct_size;
7243 	return (1);		/* was an L2 -- terminate walk */
7244 }
7245 
/*
 * AMD L2/L3 Cache and TLB Associativity Field Definition:
 *
 *	Unlike the associativity for the L1 cache and tlb where the 8 bit
 *	value is the associativity, the associativity for the L2 cache and
 *	tlb is encoded in the following table. The 4 bit L2 value serves as
 *	an index into the amd_afd[] array to determine the associativity.
 *	-1 is undefined. 0 is fully associative.
 *
 *	The table has 16 entries, one for every possible 4 bit value.  The
 *	entry at index 0 is -1; amd_l2cacheinfo() treats an encoding of 0 as
 *	"no L2 cache" and never looks it up.
 */

static int amd_afd[] =
	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
7258 
7259 static void
7260 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7261 {
7262 	struct cpuid_regs *cp;
7263 	uint_t size, assoc;
7264 	int i;
7265 	int *ip;
7266 
7267 	if (cpi->cpi_xmaxeax < 0x80000006)
7268 		return;
7269 	cp = &cpi->cpi_extd[6];
7270 
7271 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7272 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7273 		uint_t cachesz = size * 1024;
7274 		assoc = amd_afd[i];
7275 
7276 		ASSERT(assoc != -1);
7277 
7278 		if ((ip = l2i->l2i_csz) != NULL)
7279 			*ip = cachesz;
7280 		if ((ip = l2i->l2i_lsz) != NULL)
7281 			*ip = BITX(cp->cp_ecx, 7, 0);
7282 		if ((ip = l2i->l2i_assoc) != NULL)
7283 			*ip = assoc;
7284 		l2i->l2i_ret = cachesz;
7285 	}
7286 }
7287 
7288 int
7289 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7290 {
7291 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7292 	struct l2info __l2info, *l2i = &__l2info;
7293 
7294 	l2i->l2i_csz = csz;
7295 	l2i->l2i_lsz = lsz;
7296 	l2i->l2i_assoc = assoc;
7297 	l2i->l2i_ret = -1;
7298 
7299 	switch (x86_which_cacheinfo(cpi)) {
7300 	case X86_VENDOR_Intel:
7301 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7302 		break;
7303 	case X86_VENDOR_Cyrix:
7304 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7305 		break;
7306 	case X86_VENDOR_AMD:
7307 		amd_l2cacheinfo(cpi, l2i);
7308 		break;
7309 	default:
7310 		break;
7311 	}
7312 	return (l2i->l2i_ret);
7313 }
7314 
7315 #if !defined(__xpv)
7316 
7317 uint32_t *
7318 cpuid_mwait_alloc(cpu_t *cpu)
7319 {
7320 	uint32_t	*ret;
7321 	size_t		mwait_size;
7322 
7323 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7324 
7325 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7326 	if (mwait_size == 0)
7327 		return (NULL);
7328 
7329 	/*
7330 	 * kmem_alloc() returns cache line size aligned data for mwait_size
7331 	 * allocations.  mwait_size is currently cache line sized.  Neither
7332 	 * of these implementation details are guarantied to be true in the
7333 	 * future.
7334 	 *
7335 	 * First try allocating mwait_size as kmem_alloc() currently returns
7336 	 * correctly aligned memory.  If kmem_alloc() does not return
7337 	 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7338 	 *
7339 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7340 	 * decide to free this memory.
7341 	 */
7342 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7343 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7344 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7345 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7346 		*ret = MWAIT_RUNNING;
7347 		return (ret);
7348 	} else {
7349 		kmem_free(ret, mwait_size);
7350 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7351 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7352 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7353 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7354 		*ret = MWAIT_RUNNING;
7355 		return (ret);
7356 	}
7357 }
7358 
7359 void
7360 cpuid_mwait_free(cpu_t *cpu)
7361 {
7362 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7363 		return;
7364 	}
7365 
7366 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7367 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7368 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7369 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7370 	}
7371 
7372 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7373 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7374 }
7375 
7376 void
7377 patch_tsc_read(int flag)
7378 {
7379 	size_t cnt;
7380 
7381 	switch (flag) {
7382 	case TSC_NONE:
7383 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7384 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7385 		break;
7386 	case TSC_RDTSC_LFENCE:
7387 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7388 		(void) memcpy((void *)tsc_read,
7389 		    (void *)&_tsc_lfence_start, cnt);
7390 		break;
7391 	case TSC_TSCP:
7392 		cnt = &_tscp_end - &_tscp_start;
7393 		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7394 		break;
7395 	default:
7396 		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7397 		cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag);
7398 		break;
7399 	}
7400 	tsc_type = flag;
7401 }
7402 
7403 int
7404 cpuid_deep_cstates_supported(void)
7405 {
7406 	struct cpuid_info *cpi;
7407 	struct cpuid_regs regs;
7408 
7409 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7410 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7411 
7412 	cpi = CPU->cpu_m.mcpu_cpi;
7413 
7414 	switch (cpi->cpi_vendor) {
7415 	case X86_VENDOR_Intel:
7416 		if (cpi->cpi_xmaxeax < 0x80000007)
7417 			return (0);
7418 
7419 		/*
7420 		 * Does TSC run at a constant rate in all C-states?
7421 		 */
7422 		regs.cp_eax = 0x80000007;
7423 		(void) __cpuid_insn(&regs);
7424 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7425 
7426 	default:
7427 		return (0);
7428 	}
7429 }
7430 
7431 #endif	/* !__xpv */
7432 
/*
 * Apply processor fixups on the current CPU after startup.  At present the
 * only fixup is turning off the AMD C1E state, and the whole body is
 * compiled out when building for __xpv.
 */
void
post_startup_cpu_fixups(void)
{
#ifndef __xpv
	/*
	 * Some AMD processors support C1E state. Entering this state will
	 * cause the local APIC timer to stop, which we can't deal with at
	 * this time.
	 */
	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
		on_trap_data_t otd;
		uint64_t reg;

		/*
		 * The MSR accesses are made under on_trap() protection.
		 * NOTE(review): presumably so that a part which does not
		 * implement this MSR faults harmlessly instead of panicking;
		 * confirm against the on_trap() documentation.
		 */
		if (!on_trap(&otd, OT_DATA_ACCESS)) {
			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
			/* Disable C1E state if it is enabled by BIOS */
			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
			    AMD_ACTONCMPHALT_MASK) {
				/* clear only the ActOnCmpHalt field */
				reg &= ~(AMD_ACTONCMPHALT_MASK <<
				    AMD_ACTONCMPHALT_SHIFT);
				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
			}
		}
		/* Always drop the trap protection, trapped or not. */
		no_trap();
	}
#endif	/* !__xpv */
}
7460 
7461 void
7462 enable_pcid(void)
7463 {
7464 	if (x86_use_pcid == -1)
7465 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7466 
7467 	if (x86_use_invpcid == -1) {
7468 		x86_use_invpcid = is_x86_feature(x86_featureset,
7469 		    X86FSET_INVPCID);
7470 	}
7471 
7472 	if (!x86_use_pcid)
7473 		return;
7474 
7475 	/*
7476 	 * Intel say that on setting PCIDE, it immediately starts using the PCID
7477 	 * bits; better make sure there's nothing there.
7478 	 */
7479 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7480 
7481 	setcr4(getcr4() | CR4_PCIDE);
7482 }
7483 
7484 /*
7485  * Setup necessary registers to enable XSAVE feature on this processor.
7486  * This function needs to be called early enough, so that no xsave/xrstor
7487  * ops will execute on the processor before the MSRs are properly set up.
7488  *
7489  * Current implementation has the following assumption:
7490  * - cpuid_pass_basic() is done, so that X86 features are known.
7491  * - fpu_probe() is done, so that fp_save_mech is chosen.
7492  */
7493 void
7494 xsave_setup_msr(cpu_t *cpu)
7495 {
7496 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7497 	ASSERT(fp_save_mech == FP_XSAVE);
7498 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7499 
7500 	/* Enable OSXSAVE in CR4. */
7501 	setcr4(getcr4() | CR4_OSXSAVE);
7502 	/*
7503 	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7504 	 * correct value.
7505 	 */
7506 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7507 	setup_xfem();
7508 }
7509 
7510 /*
7511  * Starting with the Westmere processor the local
7512  * APIC timer will continue running in all C-states,
7513  * including the deepest C-states.
7514  */
7515 int
7516 cpuid_arat_supported(void)
7517 {
7518 	struct cpuid_info *cpi;
7519 	struct cpuid_regs regs;
7520 
7521 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7522 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7523 
7524 	cpi = CPU->cpu_m.mcpu_cpi;
7525 
7526 	switch (cpi->cpi_vendor) {
7527 	case X86_VENDOR_Intel:
7528 		/*
7529 		 * Always-running Local APIC Timer is
7530 		 * indicated by CPUID.6.EAX[2].
7531 		 */
7532 		if (cpi->cpi_maxeax >= 6) {
7533 			regs.cp_eax = 6;
7534 			(void) cpuid_insn(NULL, &regs);
7535 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7536 		} else {
7537 			return (0);
7538 		}
7539 	default:
7540 		return (0);
7541 	}
7542 }
7543 
7544 /*
7545  * Check support for Intel ENERGY_PERF_BIAS feature
7546  */
7547 int
7548 cpuid_iepb_supported(struct cpu *cp)
7549 {
7550 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7551 	struct cpuid_regs regs;
7552 
7553 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7554 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7555 
7556 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7557 		return (0);
7558 	}
7559 
7560 	/*
7561 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
7562 	 * capability bit CPUID.6.ECX.3
7563 	 */
7564 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7565 		return (0);
7566 
7567 	regs.cp_eax = 0x6;
7568 	(void) cpuid_insn(NULL, &regs);
7569 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7570 }
7571 
7572 /*
7573  * Check support for TSC deadline timer
7574  *
7575  * TSC deadline timer provides a superior software programming
7576  * model over local APIC timer that eliminates "time drifts".
7577  * Instead of specifying a relative time, software specifies an
7578  * absolute time as the target at which the processor should
7579  * generate a timer event.
7580  */
7581 int
7582 cpuid_deadline_tsc_supported(void)
7583 {
7584 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7585 	struct cpuid_regs regs;
7586 
7587 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7588 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7589 
7590 	switch (cpi->cpi_vendor) {
7591 	case X86_VENDOR_Intel:
7592 		if (cpi->cpi_maxeax >= 1) {
7593 			regs.cp_eax = 1;
7594 			(void) cpuid_insn(NULL, &regs);
7595 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7596 		} else {
7597 			return (0);
7598 		}
7599 	default:
7600 		return (0);
7601 	}
7602 }
7603 
7604 #if !defined(__xpv)
7605 /*
7606  * Patch in versions of bcopy for high performance Intel Nhm processors
7607  * and later...
7608  */
7609 void
7610 patch_memops(uint_t vendor)
7611 {
7612 	size_t cnt, i;
7613 	caddr_t to, from;
7614 
7615 	if ((vendor == X86_VENDOR_Intel) &&
7616 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7617 		cnt = &bcopy_patch_end - &bcopy_patch_start;
7618 		to = &bcopy_ck_size;
7619 		from = &bcopy_patch_start;
7620 		for (i = 0; i < cnt; i++) {
7621 			*to++ = *from++;
7622 		}
7623 	}
7624 }
7625 #endif  /*  !__xpv */
7626 
7627 /*
7628  * We're being asked to tell the system how many bits are required to represent
7629  * the various thread and strand IDs. While it's tempting to derive this based
7630  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7631  * correct. Instead, this needs to be based on the number of bits that the APIC
7632  * allows for these different configurations. We only update these to a larger
7633  * value if we find one.
7634  */
7635 void
7636 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7637 {
7638 	struct cpuid_info *cpi;
7639 
7640 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7641 	cpi = cpu->cpu_m.mcpu_cpi;
7642 
7643 	if (cpi->cpi_ncore_bits > *core_nbits) {
7644 		*core_nbits = cpi->cpi_ncore_bits;
7645 	}
7646 
7647 	if (cpi->cpi_nthread_bits > *strand_nbits) {
7648 		*strand_nbits = cpi->cpi_nthread_bits;
7649 	}
7650 }
7651 
7652 void
7653 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7654 {
7655 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7656 	struct cpuid_regs cp;
7657 
7658 	/*
7659 	 * Reread the CPUID portions that we need for various security
7660 	 * information.
7661 	 */
7662 	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7663 		/*
7664 		 * Check if we now have leaf 7 available to us.
7665 		 */
7666 		if (cpi->cpi_maxeax < 7) {
7667 			bzero(&cp, sizeof (cp));
7668 			cp.cp_eax = 0;
7669 			cpi->cpi_maxeax = __cpuid_insn(&cp);
7670 			if (cpi->cpi_maxeax < 7)
7671 				return;
7672 		}
7673 
7674 		bzero(&cp, sizeof (cp));
7675 		cp.cp_eax = 7;
7676 		cp.cp_ecx = 0;
7677 		(void) __cpuid_insn(&cp);
7678 		cpi->cpi_std[7] = cp;
7679 	} else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
7680 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
7681 		/* No xcpuid support */
7682 		if (cpi->cpi_family < 5 ||
7683 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7684 			return;
7685 
7686 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7687 			bzero(&cp, sizeof (cp));
7688 			cp.cp_eax = CPUID_LEAF_EXT_0;
7689 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7690 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7691 				return;
7692 			}
7693 		}
7694 
7695 		bzero(&cp, sizeof (cp));
7696 		cp.cp_eax = CPUID_LEAF_EXT_8;
7697 		(void) __cpuid_insn(&cp);
7698 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7699 		cpi->cpi_extd[8] = cp;
7700 	} else {
7701 		/*
7702 		 * Nothing to do here. Return an empty set which has already
7703 		 * been zeroed for us.
7704 		 */
7705 		return;
7706 	}
7707 	cpuid_scan_security(cpu, fset);
7708 }
7709 
7710 /* ARGSUSED */
7711 static int
7712 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7713 {
7714 	uchar_t *fset;
7715 	boolean_t first_pass = (boolean_t)arg1;
7716 
7717 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7718 	if (first_pass && CPU->cpu_id != 0)
7719 		return (0);
7720 	if (!first_pass && CPU->cpu_id == 0)
7721 		return (0);
7722 	cpuid_pass_ucode(CPU, fset);
7723 
7724 	return (0);
7725 }
7726 
/*
 * After a microcode update where the version has changed, then we need to
 * rescan CPUID. To do this we check every CPU to make sure that they have the
 * same microcode. Then we perform a cross call to all such CPUs. It's the
 * caller's job to make sure that no one else can end up doing an update while
 * this is going on.
 *
 * We assume that the system is microcode capable if we're called.
 *
 * Any mismatch between CPUs, either in microcode revision or in the rescanned
 * feature set, results in a panic.  Features found by the rescan are only
 * ever added to the global x86_featureset; nothing is removed from it.
 */
void
cpuid_post_ucodeadm(void)
{
	uint32_t rev;
	int i;
	struct cpu *cpu;
	cpuset_t cpuset;
	void *argdata;
	uchar_t *f0;

	/*
	 * One zeroed feature set per possible CPU id; each cross-called CPU
	 * fills in the slot at its own cpu_id.
	 * NOTE(review): this is sized by NCPU while the loops below are
	 * bounded by max_ncpus -- presumably max_ncpus <= NCPU; confirm.
	 */
	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);

	mutex_enter(&cpu_lock);
	/* CPU 0 supplies the reference microcode revision. */
	cpu = cpu_get(0);
	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
	CPUSET_ONLY(cpuset, 0);
	/* Collect every other CPU that cpu_get() knows about. */
	for (i = 1; i < max_ncpus; i++) {
		if ((cpu = cpu_get(i)) == NULL)
			continue;

		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
			panic("post microcode update CPU %d has differing "
			    "microcode revision (%u) from CPU 0 (%u)",
			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
		}
		CPUSET_ADD(cpuset, i);
	}

	/*
	 * We do the cross calls in two passes. The first pass is only for the
	 * boot CPU. The second pass is for all of the other CPUs. This allows
	 * the boot CPU to go through and change behavior related to patching or
	 * whether or not Enhanced IBRS needs to be enabled and then allow all
	 * other CPUs to follow suit.
	 *
	 * Both calls target the whole set; the handler itself works out
	 * whether it should act, based on the pass flag and its CPU id.
	 */
	kpreempt_disable();
	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
	    cpuid_post_ucodeadm_xc);
	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
	    cpuid_post_ucodeadm_xc);
	kpreempt_enable();

	/*
	 * OK, now look at each CPU and see if their feature sets are equal.
	 * CPU 0's set, at the start of the buffer, is the reference.
	 */
	f0 = argdata;
	for (i = 1; i < max_ncpus; i++) {
		uchar_t *fset;
		if (!CPU_IN_SET(cpuset, i))
			continue;

		fset = (uchar_t *)((uintptr_t)argdata +
		    sizeof (x86_featureset) * i);

		if (!compare_x86_featureset(f0, fset)) {
			panic("Post microcode update CPU %d has "
			    "differing security feature (%p) set from CPU 0 "
			    "(%p), not appending to feature set", i,
			    (void *)fset, (void *)f0);
		}
	}

	mutex_exit(&cpu_lock);

	/*
	 * Merge the features that CPU 0 found into the global feature set.
	 * NOTE(review): the message below is emitted for every feature name,
	 * whether or not the rescan found that feature -- it may have been
	 * intended to sit inside the if; confirm before changing.
	 */
	for (i = 0; i < NUM_X86_FEATURES; i++) {
		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
		    x86_feature_names[i]);
		if (is_x86_feature(f0, i)) {
			add_x86_feature(x86_featureset, i);
		}
	}
	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
}
7809 
/*
 * Signature shared by the functions that implement a cpuid pass: the CPU
 * being processed and an opaque, pass-specific argument.
 */
typedef void (*cpuid_pass_f)(cpu_t *, void *);

/*
 * Associates a pass identifier with the function that implements it.
 */
typedef struct cpuid_pass_def {
	cpuid_pass_t cpd_pass;	/* which pass this entry describes */
	cpuid_pass_f cpd_func;	/* function that carries the pass out */
} cpuid_pass_def_t;
7816 
/*
 * Table of the passes that cpuid_execpass() is able to run, searched by pass
 * identifier.
 *
 * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
 * normal sense and should not appear here.
 */
static const cpuid_pass_def_t cpuid_pass_defs[] = {
	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
	{ CPUID_PASS_IDENT, cpuid_pass_ident },
	{ CPUID_PASS_BASIC, cpuid_pass_basic },
	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
};
7829 
7830 void
7831 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
7832 {
7833 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
7834 
7835 	if (cp == NULL)
7836 		cp = CPU;
7837 
7838 	/*
7839 	 * Space statically allocated for BSP, ensure pointer is set
7840 	 */
7841 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
7842 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
7843 
7844 	ASSERT(cpuid_checkpass(cp, pass - 1));
7845 
7846 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
7847 		if (cpuid_pass_defs[i].cpd_pass == pass) {
7848 			cpuid_pass_defs[i].cpd_func(cp, arg);
7849 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
7850 			return;
7851 		}
7852 	}
7853 
7854 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
7855 	    pass, cp->cpu_id);
7856 }
7857 
7858 /*
7859  * Extract the processor family from a chiprev.  Processor families are not the
7860  * same as cpuid families; see comments above and in x86_archext.h.
7861  */
7862 x86_processor_family_t
7863 chiprev_family(const x86_chiprev_t cr)
7864 {
7865 	return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
7866 }
7867 
7868 /*
7869  * A chiprev matches its template if the vendor and family are identical and the
7870  * revision of the chiprev matches one of the bits set in the template.  Callers
7871  * may bitwise-OR together chiprevs of the same vendor and family to form the
7872  * template, or use the _ANY variant.  It is not possible to match chiprevs of
7873  * multiple vendors or processor families with a single call.  Note that this
7874  * function operates on processor families, not cpuid families.
7875  */
7876 boolean_t
7877 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
7878 {
7879 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
7880 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
7881 	    (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
7882 }
7883 
7884 /*
7885  * A chiprev is at least min if the vendor and family are identical and the
7886  * revision of the chiprev is at least as recent as that of min.  Processor
7887  * families are considered unordered and cannot be compared using this function.
7888  * Note that this function operates on processor families, not cpuid families.
7889  * Use of the _ANY chiprev variant with this function is not useful; it will
7890  * always return B_FALSE if the _ANY variant is supplied as the minimum
7891  * revision.  To determine only whether a chiprev is of a given processor
7892  * family, test the return value of chiprev_family() instead.
7893  */
7894 boolean_t
7895 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
7896 {
7897 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
7898 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
7899 	    _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
7900 }
7901 
7902 /*
7903  * The uarch functions operate in a manner similar to the chiprev functions
7904  * above.  While it is tempting to allow these to operate on microarchitectures
7905  * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
7906  * than ZEN2), we elect not to do so because a manufacturer may supply
7907  * processors of multiple different microarchitecture families each of which may
7908  * be internally ordered but unordered with respect to those of other families.
7909  */
7910 x86_uarch_t
7911 uarchrev_uarch(const x86_uarchrev_t ur)
7912 {
7913 	return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
7914 }
7915 
7916 boolean_t
7917 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
7918 {
7919 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
7920 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
7921 	    (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
7922 }
7923 
7924 boolean_t
7925 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
7926 {
7927 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
7928 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
7929 	    _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
7930 }
7931