xref: /illumos-gate/usr/src/uts/intel/os/cpuid.c (revision 5cd084ed)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25  * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2023 Oxide Computer Company
28  * Copyright 2024 MNX Cloud, Inc.
29  */
30 /*
31  * Copyright (c) 2010, Intel Corporation.
32  * All rights reserved.
33  */
34 /*
35  * Portions Copyright 2009 Advanced Micro Devices, Inc.
36  */
37 
38 /*
39  * CPU Identification logic
40  *
41  * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42  * with the identification of CPUs, their features, and their topologies. More
43  * specifically, this file helps drive the following:
44  *
45  * 1. Enumeration of features of the processor which are used by the kernel to
46  *    determine what features to enable or disable. These may be instruction set
47  *    enhancements or other features that the kernel itself makes use of.
48  *
49  * 2. Enumeration of instruction set architecture (ISA) additions that userland
50  *    will be told about through the auxiliary vector.
51  *
52  * 3. Understanding the physical topology of the CPU such as the number of
53  *    caches, how many cores it has, whether or not it supports simultaneous
54  *    multi-threading (SMT), etc.
55  *
56  * ------------------------
57  * CPUID History and Basics
58  * ------------------------
59  *
60  * The cpuid instruction was added by Intel roughly around the time that the
61  * original Pentium was introduced. The purpose of cpuid was to report in a
62  * programmatic fashion information about the CPU that previously was guessed
63  * at. For example, an important part of cpuid is that we can know what
64  * extensions to the ISA exist. If you use an invalid opcode you would get a
65  * #UD, so this method allows a program (whether a user program or the kernel)
66  * to determine what exists without crashing or getting a SIGILL. Of course,
67  * this was also during the era of the clones and the AMD Am5x86. The vendor
68  * name shows up first in cpuid for a reason.
69  *
70  * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71  * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72  * its own meaning. The different leaves are broken down into different regions:
73  *
74  *	[ 0, 7fffffff ]			This region is called the 'basic'
75  *					region. This region is generally defined
76  *					by Intel, though some of the original
77  *					portions have different meanings based
78  *					on the manufacturer. These days, Intel
79  *					adds most new features to this region.
80  *					AMD adds non-Intel compatible
81  *					information in the third, extended
82  *					region. Intel uses this for everything
83  *					including ISA extensions, CPU
84  *					features, cache information, topology,
85  *					and more.
86  *
87  *					There is a hole carved out of this
88  *					region which is reserved for
89  *					hypervisors.
90  *
91  *	[ 40000000, 4fffffff ]		This region, which is found in the
92  *					middle of the previous region, is
93  *					explicitly promised to never be used by
94  *					CPUs. Instead, it is used by hypervisors
95  *					to communicate information about
96  *					themselves to the operating system. The
97  *					values and details are unique for each
98  *					hypervisor.
99  *
100  *	[ 80000000, ffffffff ]		This region is called the 'extended'
101  *					region. Some of the low leaves mirror
102  *					parts of the basic leaves. This region
103  *					has generally been used by AMD for
104  *					various extensions. For example, AMD-
105  *					specific information about caches,
106  *					features, and topology are found in this
107  *					region.
108  *
109  * To query a leaf, you place it into %eax, zero %ebx, %ecx,
110  * and %edx, and then issue the cpuid instruction. At the first leaf in each of
111  * the ranges, one of the primary things returned is the maximum valid leaf in
112  * that range. This allows for discovery of what range of CPUID is valid.
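 *
 * As a minimal sketch of the mechanics (illustrative only; cpuid_query() is a
 * made-up helper for this comment, not an interface used elsewhere in the
 * kernel, and its second argument is the sub-leaf discussed below, zero here),
 * querying a leaf and discovering the maximum valid basic and extended leaves
 * might look like:
 *
 *	static void
 *	cpuid_query(uint32_t leaf, uint32_t subleaf, uint32_t *eax,
 *	    uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
 *	{
 *		uint32_t a = leaf, b = 0, c = subleaf, d = 0;
 *
 *		__asm__ __volatile__("cpuid" :
 *		    "+a" (a), "+b" (b), "+c" (c), "+d" (d));
 *		*eax = a;
 *		*ebx = b;
 *		*ecx = c;
 *		*edx = d;
 *	}
 *
 *	uint32_t eax, ebx, ecx, edx;
 *
 *	cpuid_query(0x0, 0, &eax, &ebx, &ecx, &edx);
 *	uint32_t max_basic = eax;
 *	cpuid_query(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 *	uint32_t max_extended = eax;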
113  *
114  * The CPUs have potentially surprising behavior when using an invalid leaf or
115  * unimplemented leaf. If the requested leaf is within the valid basic or
116  * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117  * set to zero. However, if you specify a leaf that is outside of a valid range,
118  * then instead it will be filled with the last valid _basic_ leaf. For example,
119  * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120  * an invalid extended leaf will return the information for leaf 3.
121  *
122  * Some leaves are broken down into sub-leaves. This means that the value
123  * depends on both the leaf asked for in %eax and a secondary register. For
124  * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125  * additional information. Or when getting topology information in leaf 0xb, the
126  * initial value in %ecx changes which level of the topology you are
127  * getting information about.
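 *
 * Using the hypothetical cpuid_query() helper sketched above, the sub-leaf is
 * simply the value loaded into %ecx before executing cpuid, e.g.:
 *
 *	cpuid_query(0x7, 0, &eax, &ebx, &ecx, &edx);
 *	cpuid_query(0xb, 1, &eax, &ebx, &ecx, &edx);
 *
 * The first call reads sub-leaf 0 of the structured extended feature leaf;
 * the second reads level 1 of the leaf 0xb topology hierarchy.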
128  *
129  * cpuid values are always kept to 32 bits regardless of whether or not the
130  * program is in 64-bit mode. When executing in 64-bit mode, the upper
131  * 32 bits of the register are always set to zero so that the values are the
132  * same regardless of execution mode.
133  *
134  * ----------------------
135  * Identifying Processors
136  * ----------------------
137  *
138  * We can identify a processor in two steps. The first step looks at cpuid leaf
139  * 0. Leaf 0 contains the processor's vendor information. This is done by
140  * putting a 12 character string in %ebx, %edx, and %ecx (in that order). On
141  * AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
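 *
 * As an illustration (again using the hypothetical cpuid_query() helper from
 * above, not the code used later in this file), the vendor string can be
 * assembled from %ebx, %edx, and %ecx like so:
 *
 *	char vendor[13];
 *	uint32_t eax, ebx, ecx, edx;
 *
 *	cpuid_query(0x0, 0, &eax, &ebx, &ecx, &edx);
 *	bcopy(&ebx, &vendor[0], 4);
 *	bcopy(&edx, &vendor[4], 4);
 *	bcopy(&ecx, &vendor[8], 4);
 *	vendor[12] = '\0';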
142  *
143  * From there, a processor is identified by a combination of three different
144  * values:
145  *
146  *  1. Family
147  *  2. Model
148  *  3. Stepping
149  *
150  * Each vendor uses the family and model to uniquely identify a processor. The
151  * way that family and model are changed depends on the vendor. For example,
152  * Intel has been using family 0x6 for almost all of their processors since the
153  * Pentium Pro/Pentium II era, often called the P6. The model is used to
154  * identify the exact processor. Different models are often used for the client
155  * (consumer) and server parts. Even though these processors often have major
156  * architectural differences, they are still considered the same family by
157  * Intel.
158  *
159  * On the other hand, each major AMD architecture generally has its own family.
160  * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
161  * the model number is used to help identify specific processors.  As AMD's
162  * product lines have expanded, they have started putting a mixed bag of
163  * processors into the same family, with each processor under a single
164  * identifying banner (e.g., Milan, Cezanne) using a range of model numbers.  We
165  * refer to each such collection as a processor family, distinct from cpuid
166  * family.  Importantly, each processor family has a BIOS and Kernel Developer's
167  * Guide (BKDG, older parts) or Processor Programming Reference (PPR) that
168  * defines the processor family's non-architectural features.  In general, we'll
169  * use "family" here to mean the family number reported by the cpuid instruction
170  * and distinguish the processor family from it where appropriate.
171  *
172  * The stepping is used to refer to a revision of a specific microprocessor. The
173  * term comes from equipment used to produce masks that are used to create
174  * integrated circuits.
175  *
176  * The information is present in leaf 1, %eax. In technical documentation you
177  * will see the terms extended model and extended family. The original family,
178  * model, and stepping fields were each 4 bits wide. When the base family is
179  * 0xf, one must also consult the extended family and extended model fields,
180  * which occupy previously reserved bits: the extended family is added to the
181  * base family and the extended model supplies the upper bits of the model.
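 *
 * A sketch of that computation from leaf 1 %eax follows (bit positions are
 * architectural; 'is_intel' stands in for the vendor check, and the exact
 * extended-model condition differs slightly between vendors, with Intel also
 * applying it to family 0x6):
 *
 *	cpuid_query(0x1, 0, &eax, &ebx, &ecx, &edx);
 *	uint_t base_family = (eax >> 8) & 0xf;
 *	uint_t base_model = (eax >> 4) & 0xf;
 *	uint_t xfamily = (eax >> 20) & 0xff;
 *	uint_t xmodel = (eax >> 16) & 0xf;
 *	uint_t family = base_family;
 *	uint_t model = base_model;
 *	uint_t step = eax & 0xf;
 *
 *	if (base_family == 0xf)
 *		family += xfamily;
 *	if (base_family == 0xf || (is_intel && base_family == 0x6))
 *		model |= xmodel << 4;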
182  *
183  * When we process this information, we store the full family, model, and
184  * stepping in the struct cpuid_info members cpi_family, cpi_model, and
185  * cpi_step, respectively. Whenever you are performing comparisons with the
186  * family, model, and stepping, you should use these members and not the raw
187  * values from cpuid. If you must use the raw values from cpuid directly, you
188  * must make sure that you add the extended model and family to the base model
189  * and family.
190  *
191  * In general, we do not use information about the family, model, and stepping
192  * to determine whether or not a feature is present; that is generally driven by
193  * specific leaves. However, when something we care about on the processor is
194  * not considered 'architectural' meaning that it is specific to a set of
195  * processors and not promised in the architecture model to be consistent from
196  * generation to generation, then we will fall back on this information. The
197  * most common cases where this comes up is when we have to workaround errata in
198  * the processor, are dealing with processor-specific features such as CPU
199  * performance counters, or we want to provide additional information for things
200  * such as fault management.
201  *
202  * While processors also do have a brand string, which is the name that people
203  * are familiar with when buying the processor, it is not meant for
204  * programmatic consumption. That is what the family, model, and stepping are
205  * for.
206  *
207  * We use the x86_chiprev_t to encode a combination of vendor, processor family,
208  * and stepping(s) that refer to a single or very closely related set of silicon
209  * implementations; while there are sometimes more specific ways to learn of the
210  * presence or absence of a particular erratum or workaround, one may generally
211  * assume that all processors of the same chiprev have the same errata and we
212  * have chosen to represent them this way precisely because that is how AMD
213  * groups them in their revision guides (errata documentation).  The processor
214  * family (x86_processor_family_t) may be extracted from the chiprev if that
215  * level of detail is not needed.  Processor families are considered unordered
216  * but revisions within a family may be compared for either an exact match or at
217  * least as recent as a reference revision.  See the chiprev_xxx() functions
218  * below.
219  *
220  * Similarly, each processor family implements a particular microarchitecture,
221  * which itself may have multiple revisions.  In general, non-architectural
222  * features are specific to a processor family, but some may exist across
223  * families containing cores that implement the same microarchitectural revision
224  * (and, such cores share common bugs, too).  We provide utility routines
225  * analogous to those for extracting and comparing chiprevs for
226  * microarchitectures as well; see the uarch_xxx() functions.
227  *
228  * Both chiprevs and uarchrevs are defined in x86_archext.h and both are at
229  * present used and available only for AMD and AMD-like processors.
230  *
231  * ------------
232  * CPUID Passes
233  * ------------
234  *
235  * As part of performing feature detection, we break this into several different
236  * passes. There used to be a pass 0 that was done from assembly in locore.s to
237  * support processors that have a missing or broken cpuid instruction (notably
238  * certain Cyrix processors) but those were all 32-bit processors which are no
239  * longer supported. Passes are no longer numbered explicitly to make it easier
240  * to break them up or move them around as needed; however, they still have a
241  * well-defined execution ordering enforced by the definition of cpuid_pass_t in
242  * x86_archext.h. The external interface to execute a cpuid pass or determine
243  * whether a pass has been completed consists of cpuid_execpass() and
244  * cpuid_checkpass() respectively.  The passes now, in that execution order,
245  * are as follows:
246  *
247  *	PRELUDE		This pass does not have any dependencies on system
248  *			setup; in particular, unlike all subsequent passes it is
249  *			guaranteed not to require PCI config space access.  It
250  *			sets the flag indicating that the processor we are
251  *			running on supports the cpuid instruction, which all
252  *			64-bit processors do.  This would also be the place to
253  *			add any other basic state that is required later on and
254  *			can be learned without dependencies.
255  *
256  *	IDENT		Determine which vendor manufactured the CPU, the family,
257  *			model, and stepping information, and compute basic
258  *			identifying tags from those values.  This is done first
259  *			so that machine-dependent code can control the features
260  *			the cpuid instruction will report during subsequent
261  *			passes if needed, and so that any intervening
262  *			machine-dependent code that needs basic identity will
263  *			have it available.  This includes synthesised
264  *			identifiers such as chiprev and uarchrev as well as the
265  *			values obtained directly from cpuid.  Prior to executing
266  *			this pass, machine-dependent boot code is responsible for
267  *			ensuring that the PCI configuration space access
268  *			functions have been set up and, if necessary, that
269  *			determine_platform() has been called.
270  *
271  *	BASIC		This is the primary pass and is responsible for doing a
272  *			large number of different things:
273  *
274  *			1. Gathering a large number of feature flags to
275  *			determine which features the CPU supports and which
276  *			require the OS to do additional work to enable them.
277  *			Features detected this way are added to the
278  *			x86_featureset which can be queried to
279  *			determine what we should do. This includes processing
280  *			all of the basic and extended CPU features that we care
281  *			about.
282  *
283  *			2. Determining the CPU's topology. This includes
284  *			information about how many cores and threads are present
285  *			in the package. It also is responsible for figuring out
286  *			which logical CPUs are potentially part of the same core
287  *			and what other resources they might share. For more
288  *			information see the 'Topology' section.
289  *
290  *			3. Determining the set of CPU security-specific features
291  *			that we need to worry about and determining the
292  *			appropriate set of workarounds.
293  *
294  *			On the boot CPU, this pass runs before KMDB is started.
295  *
296  *	EXTENDED	The second pass is done after startup(). Here, we check
297  *			other miscellaneous features. Most of this is gathering
298  *			additional basic and extended features that we'll use in
299  *			later passes or for debugging support.
300  *
301  *	DYNAMIC		The third pass occurs after the kernel memory allocator
302  *			has been fully initialized. This gathers information
303  *			where we might need dynamic memory available for our
304  *			uses. This includes several varying width leaves that
305  *			have cache information and the processor's brand string.
306  *
307  *	RESOLVE		The fourth and final normal pass is performed after the
308  *			kernel has brought most everything online. This is
309  *			invoked from post_startup(). In this pass, we go through
310  *			the set of features that we have enabled and turn that
311  *			into the hardware auxiliary vector features that
312  *			userland receives. This is used by userland, primarily
313  *			by the run-time link-editor (RTLD), though userland
314  *			software could also refer to it directly.
315  *
316  * The function that performs a pass is currently assumed to be infallible, and
317  * all existing implementations are.  This simplifies callers by allowing
318  * cpuid_execpass() to return void. Similarly, implementers do not need to check
319  * for a NULL CPU argument; the current CPU's cpu_t is substituted if necessary.
320  * Both of these assumptions can be relaxed if needed by future developments.
321  * Tracking of completed states is handled by cpuid_execpass(). It is programmer
322  * error to attempt to execute a pass before all previous passes have been
323  * completed on the specified CPU, or to request cpuid information before the
324  * pass that captures it has been executed.  These conditions can be tested
325  * using cpuid_checkpass().
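 *
 * A rough sketch of how a caller drives and then checks a pass (argument
 * details are simplified here; the authoritative prototypes and the
 * cpuid_pass_t enumeration live in x86_archext.h):
 *
 *	cpuid_execpass(cp, CPUID_PASS_BASIC, NULL);
 *	...
 *	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));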
326  *
327  * The Microcode Pass
328  *
329  * After a microcode update, we do a selective rescan of the cpuid leaves to
330  * determine what features have changed. Microcode updates can provide more
331  * details about security related features to deal with issues like Spectre and
332  * L1TF. On occasion, vendors have violated their contract and removed bits.
333  * However, we don't try to detect that because that puts us in a situation that
334  * we really can't deal with. As such, the only things we rescan today are
335  * security related features. See cpuid_pass_ucode().  This pass may be run in a
336  * different sequence on APs and therefore is not part of the sequential order;
337  * it is invoked directly instead of by cpuid_execpass() and its completion
338  * status cannot be checked by cpuid_checkpass().  This could be integrated with
339  * a more complex dependency mechanism if warranted by future developments.
340  *
341  * All of the passes are run on all CPUs. However, for the most part we only
342  * care about what the boot CPU says about this information and use the other
343  * CPUs as a rough guide to sanity check that we have the same feature set.
344  *
345  * We do not support running multiple logical CPUs with disjoint, let alone
346  * different, feature sets.
347  *
348  * ------------------
349  * Processor Topology
350  * ------------------
351  *
352  * One of the important things that we need to do is to understand the topology
353  * of the underlying processor. When we say topology in this case, we're trying
354  * to understand the relationship between the logical CPUs that the operating
355  * system sees and the underlying physical layout. Different logical CPUs may
356  * share different resources which can have important consequences for the
357  * performance of the system. For example, they may share caches, execution
358  * units, and more.
359  *
360  * The topology of the processor changes from generation to generation and
361  * vendor to vendor.  Along with that, different vendors use different
362  * terminology, and the operating system itself uses occasionally overlapping
363  * terminology. It's important to understand what this topology looks like so
364  * one can understand the different things that we try to calculate and
365  * determine.
366  *
367  * To get started, let's talk about a little bit of terminology that we've used
368  * so far, is used throughout this file, and is fairly generic across multiple
369  * vendors:
370  *
371  * CPU
372  *	A central processing unit (CPU) refers to a logical and/or virtual
373  *	entity that the operating system can execute instructions on. The
374  *	underlying resources for this CPU may be shared between multiple
375  *	entities; however, to the operating system it is a discrete unit.
376  *
377  * PROCESSOR and PACKAGE
378  *
379  *	Generally, when we use the term 'processor' on its own, we are referring
380  *	to the physical entity that one buys and plugs into a board. However,
381  *	because processor has been overloaded and one might see it used to mean
382  *	multiple different levels, we will instead use the term 'package' for
383  *	the rest of this file. The term package comes from the electrical
384  *	engineering side and refers to the physical entity that encloses the
385  *	electronics inside. Strictly speaking the package can contain more than
386  *	just the CPU, for example, on many processors it may also have what's
387  *	called an 'integrated graphical processing unit (GPU)'. Because the
388  *	package can encapsulate multiple units, it is the largest physical unit
389  *	that we refer to.
390  *
391  * SOCKET
392  *
393  *	A socket refers to a unit on a system board (generally the motherboard)
394  *	that can receive a package. A single package, or processor, is plugged
395  *	into a single socket. A system may have multiple sockets. Often times,
396  *	the term socket is used interchangeably with package and refers to the
397  *	electrical component that is plugged in, and not the receptacle itself.
398  *
399  * CORE
400  *
401  *	A core refers to the physical instantiation of a CPU, generally, with a
402  *	full set of hardware resources available to it. A package may contain
403  *	multiple cores inside of it or it may just have a single one. A
404  *	processor with more than one core is often referred to as 'multi-core'.
405  *	In illumos, we will use the feature X86FSET_CMP to refer to a system
406  *	that has 'multi-core' processors.
407  *
408  *	A core may expose a single logical CPU to the operating system, or it
409  *	may expose multiple CPUs, which we call threads, defined below.
410  *
411  *	Some resources may still be shared by cores in the same package. For
412  *	example, many processors will share the level 3 cache between cores.
413  *	Some AMD generations share hardware resources between cores. For more
414  *	information on that see the section 'AMD Topology'.
415  *
416  * THREAD and STRAND
417  *
418  *	In this file, generally a thread refers to a hardware resource and not
419  *	the operating system's logical abstraction. A thread is always exposed
420  *	as an independent logical CPU to the operating system. A thread belongs
421  *	to a specific core. A core may have more than one thread. When that is
422  *	the case, the threads that are part of the same core are often referred
423  *	to as 'siblings'.
424  *
425  *	When multiple threads exist, this is generally referred to as
426  *	simultaneous multi-threading (SMT). When Intel introduced this in their
427  *	processors they called it hyper-threading (HT). When multiple threads
428  *	are active in a core, they split the resources of the core. For example,
429  *	two threads may share the same set of hardware execution units.
430  *
431  *	The operating system often uses the term 'strand' to refer to a thread.
432  *	This helps disambiguate it from the software concept.
433  *
434  * CHIP
435  *
436  *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
437  *	base meaning, it is used to refer to a single integrated circuit, which
438  *	may or may not be the only thing in the package. In illumos, when you
439  *	see the term 'chip' it is almost always referring to the same thing as
440  *	the 'package'. However, many vendors may use chip to refer to one of
441  *	many integrated circuits that have been placed in the package. As an
442  *	example, see the subsequent definition.
443  *
444  *	To try and keep things consistent, we will only use chip when referring
445  *	to the entire integrated circuit package, with the exception of the
446  *	definition of multi-chip module (because it is in the name) and use the
447  *	term 'die' when we want the more general, potential sub-component
448  *	definition.
449  *
450  * DIE
451  *
452  *	A die refers to an integrated circuit. Inside of the package there may
453  *	be a single die or multiple dies. This is sometimes called a 'chip' in
454  *	vendor's parlance, but in this file, we use the term die to refer to a
455  *	subcomponent.
456  *
457  * MULTI-CHIP MODULE
458  *
459  *	A multi-chip module (MCM) refers to putting multiple distinct chips that
460  *	are connected together in the same package. When a multi-chip design is
461  *	used, generally each chip is manufactured independently and then joined
462  *	together in the package. For example, on AMD's Zen microarchitecture
463  *	(family 0x17), the package contains several dies (the second meaning of
464  *	chip from above) that are connected together.
465  *
466  * CACHE
467  *
468  *	A cache is a part of the processor that maintains copies of recently
469  *	accessed memory. Caches are split into levels and then into types.
470  *	Commonly there are one to three levels, called level one, two, and
471  *	three. The lower the level, the smaller it is, the closer it is to the
472  *	execution units of the CPU, and the faster it is to access. The layout
473  *	and design of the cache come in many different flavors, consult other
474  *	resources for a discussion of those.
475  *
476  *	Caches are generally split into two types, the instruction and data
477  *	cache. The caches contain what their names suggest, the instruction
478  *	cache has executable program text, while the data cache has all other
479  *	memory that the processor accesses. As of this writing, data is kept
480  *	coherent between all of the caches on x86, so if one modifies program
481  *	text before it is executed, that will be in the data cache, and the
482  *	instruction cache will be synchronized with that change when the
483  *	processor actually executes those instructions. This coherency also
484  *	covers the fact that data could show up in multiple caches.
485  *
486  *	Generally, the lowest level caches are specific to a core. However, the
487  *	last level cache is shared between some number of cores. The number of
488  *	CPUs sharing this last level cache is important. This has implications
489  *	for the choices that the scheduler makes, as accessing memory that might
490  *	be in a remote cache after thread migration can be quite expensive.
491  *
492  *	Sometimes, the word cache is abbreviated with a '$', because in US
493  *	English the word cache is pronounced the same as cash. So L1D$ refers to
494  *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
495  *	in the rest of this theory statement for clarity.
496  *
497  * MEMORY CONTROLLER
498  *
499  *	The memory controller is a component that provides access to DRAM. Each
500  *	memory controller can access a set number of DRAM channels. Each channel
501  *	can have a number of DIMMs (sticks of memory) associated with it. A
502  *	given package may have more than one memory controller. The association
503  *	of the memory controller to a group of cores is important as it is
504  *	cheaper to access memory on the controller that you are associated with.
505  *
506  * NUMA
507  *
508  *	NUMA, or non-uniform memory access, describes a way that systems are
509  *	built. On x86, any processor core can address all of the memory in the
510  *	system. However, when using multiple sockets or possibly within a
511  *	multi-chip module, some of that memory is physically closer and some of
512  *	it is further. Memory that is further away is more expensive to access.
513  *	Consider the following image of multiple sockets with memory:
514  *
515  *	+--------+                                                +--------+
516  *	| DIMM A |         +----------+      +----------+         | DIMM D |
517  *	+--------+-+       |          |      |          |       +-+------+-+
518  *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
519  *	  +--------+-+     |          |      |          |     +-+------+-+
520  *	    | DIMM C |     +----------+      +----------+     | DIMM F |
521  *	    +--------+                                        +--------+
522  *
523  *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
524  *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
525  *	access DIMMs A-C and more expensive to access D-F as it has to go
526  *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
527  *	D-F are cheaper than A-C. While the socket form is the most common, when
528  *	using multi-chip modules, this can also sometimes occur. For another
529  *	example of this that's more involved, see the AMD topology section.
530  *
531  *
532  * Intel Topology
533  * --------------
534  *
535  * Most Intel processors since Nehalem (as of this writing the current gen
536  * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
537  * the package is a single monolithic die. MCMs currently aren't used. Most
538  * parts have three levels of caches, with the L3 cache being shared between
539  * all of the cores on the package. The L1/L2 cache is generally specific to
540  * an individual core. The following image shows at a simplified level what
541  * this looks like. The memory controller is commonly part of something called
542  * the 'Uncore', which used to be separate physical chips that were not a part of
543  * the package, but are now part of the same chip.
544  *
545  *  +-----------------------------------------------------------------------+
546  *  | Package                                                               |
547  *  |  +-------------------+  +-------------------+  +-------------------+  |
548  *  |  | Core              |  | Core              |  | Core              |  |
549  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
550  *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
551  *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
552  *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
553  *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
554  *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
555  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
556  *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
557  *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
558  *  |  +-------------------+  +-------------------+  +-------------------+  |
559  *  | +-------------------------------------------------------------------+ |
560  *  | |                         Shared L3 Cache                           | |
561  *  | +-------------------------------------------------------------------+ |
562  *  | +-------------------------------------------------------------------+ |
563  *  | |                        Memory Controller                          | |
564  *  | +-------------------------------------------------------------------+ |
565  *  +-----------------------------------------------------------------------+
566  *
567  * A side effect of this current architecture is that what we care about from a
568  * scheduling and topology perspective is simplified. In general we care about
569  * understanding which logical CPUs are part of the same core and socket.
570  *
571  * To determine the relationship between threads and cores, Intel initially used
572  * the identifier in the advanced programmable interrupt controller (APIC). They
573  * also added cpuid leaf 4 to give additional information about the number of
574  * threads and CPUs in the processor. With the addition of x2apic (which
575  * increased the width of the APIC ID from 8 bits to 32 bits), an
576  * additional cpuid topology leaf 0xB was added.
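 *
 * As a sketch of how leaf 0xb is consumed (using the hypothetical
 * cpuid_query() helper from earlier; field positions follow the architectural
 * definition): each sub-leaf describes one level of the hierarchy, %ecx bits
 * 15:8 give the level type (1 is SMT, 2 is core, 0 terminates the list), %eax
 * bits 4:0 give the number of low-order x2apic ID bits covered by that level
 * and those below it, and %edx returns the x2apic ID itself.
 *
 *	uint32_t eax, ebx, ecx, edx;
 *	uint_t level, thread_bits = 0, core_bits = 0;
 *
 *	for (level = 0; ; level++) {
 *		uint_t type;
 *
 *		cpuid_query(0xb, level, &eax, &ebx, &ecx, &edx);
 *		type = (ecx >> 8) & 0xff;
 *		if (type == 0)
 *			break;
 *		if (type == 1)
 *			thread_bits = eax & 0x1f;
 *		else if (type == 2)
 *			core_bits = eax & 0x1f;
 *	}
 *
 * Here core_bits covers both the thread and core portions of the x2apic ID;
 * the package ID is what remains after shifting those bits away.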
577  *
578  * AMD Topology
579  * ------------
580  *
581  * When discussing AMD topology, we want to break this into three distinct
582  * generations of topology. There's the basic topology that has been used in
583  * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
584  * with family 0x15 (Bulldozer), and there's the topology that was introduced
585  * with family 0x17 (Zen), evolved more dramatically in Zen 2 (still family
586  * 0x17), and tweaked slightly in Zen 3 (family 0x19). AMD also has some
587  * additional terminology that's worth talking about.
588  *
589  * Until the introduction of family 0x17 (Zen), AMD did not implement something
590  * that they considered SMT. Whether or not the AMD processors have SMT
591  * influences many things including scheduling and reliability, availability,
592  * and serviceability (RAS) features.
593  *
594  * NODE
595  *
596  *	AMD uses the term node to refer to a die that contains a number of cores
597  *	and I/O resources. Depending on the processor family and model, more
598  *	than one node can be present in the package. When there is more than one
599  *	node this indicates a multi-chip module. Usually each node has its own
600  *	access to memory and I/O devices. This is important and generally
601  *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
602  *	result, we track this relationship in the operating system.
603  *
604  *	In processors with an L3 cache, the L3 cache is generally shared across
605  *	the entire node, though the way this is carved up varies from generation
606  *	to generation.
607  *
608  * BULLDOZER
609  *
610  *	Starting with the Bulldozer family (0x15) and continuing until the
611  *	introduction of the Zen microarchitecture, AMD introduced the idea of a
612  *	compute unit. In a compute unit, two traditional cores share a number of
613  *	hardware resources. Critically, they share the FPU, L1 instruction
614  *	cache, and the L2 cache. Several compute units were then combined inside
615  *	of a single node.  Because the integer execution units, L1 data cache,
616  *	and some other resources were not shared between the cores, AMD never
617  *	considered this to be SMT.
618  *
619  * ZEN
620  *
621  *	The Zen family (0x17) uses a multi-chip module (MCM) design; the module
622  *	is called Zeppelin. These modules are similar to the idea of nodes used
623  *	previously. Each of these nodes has two DRAM channels which all of the
624  *	cores in the node can access uniformly. These nodes are linked together
625  *	in the package, creating a NUMA environment.
626  *
627  *	The Zeppelin die itself contains two different 'core complexes'. Each
628  *	core complex consists of four cores which each have two threads, for a
629  *	total of 8 logical CPUs per complex. Unlike other generations,
630  *	where all the logical CPUs in a given node share the L3 cache, here each
631  *	core complex has its own shared L3 cache.
632  *
633  *	A further thing that we need to consider is that in some configurations,
634  *	particularly with the Threadripper line of processors, not every die
635  *	actually has its memory controllers wired up to actual memory channels.
636  *	This means that some cores have memory attached to them and others
637  *	don't.
638  *
639  *	To put Zen in perspective, consider the following images:
640  *
641  *      +--------------------------------------------------------+
642  *      | Core Complex                                           |
643  *      | +-------------------+    +-------------------+  +---+  |
644  *      | | Core       +----+ |    | Core       +----+ |  |   |  |
645  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
646  *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
647  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
648  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
649  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
650  *      | +-------------------+    +-------------------+  | C |  |
651  *      | +-------------------+    +-------------------+  | a |  |
652  *      | | Core       +----+ |    | Core       +----+ |  | c |  |
653  *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
654  *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
655  *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
656  *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
657  *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
658  *      | +-------------------+    +-------------------+  +---+  |
659  *      |                                                        |
660  *	+--------------------------------------------------------+
661  *
662  *  This first image represents a single Zen core complex that consists of four
663  *  cores.
664  *
665  *
666  *	+--------------------------------------------------------+
667  *	| Zeppelin Die                                           |
668  *	|  +--------------------------------------------------+  |
669  *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
670  *	|  +--------------------------------------------------+  |
671  *      |                           HH                           |
672  *	|          +-----------+    HH    +-----------+          |
673  *	|          |           |    HH    |           |          |
674  *	|          |    Core   |==========|    Core   |          |
675  *	|          |  Complex  |==========|  Complex  |          |
676  *	|          |           |    HH    |           |          |
677  *	|          +-----------+    HH    +-----------+          |
678  *      |                           HH                           |
679  *	|  +--------------------------------------------------+  |
680  *	|  |                Memory Controller                 |  |
681  *	|  +--------------------------------------------------+  |
682  *      |                                                        |
683  *	+--------------------------------------------------------+
684  *
685  *  This image represents a single Zeppelin Die. Note how both cores are
686  *  connected to the same memory controller and I/O units. While each core
687  *  complex has its own L3 cache as seen in the first image, they both have
688  *  uniform access to memory.
689  *
690  *
691  *                      PP                     PP
692  *                      PP                     PP
693  *           +----------PP---------------------PP---------+
694  *           |          PP                     PP         |
695  *           |    +-----------+          +-----------+    |
696  *           |    |           |          |           |    |
697  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
698  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
699  *           |    |           |          |           |    |
700  *           |    +-----------+ooo    ...+-----------+    |
701  *           |          HH      ooo  ...       HH         |
702  *           |          HH        oo..         HH         |
703  *           |          HH        ..oo         HH         |
704  *           |          HH      ...  ooo       HH         |
705  *           |    +-----------+...    ooo+-----------+    |
706  *           |    |           |          |           |    |
707  *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
708  *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
709  *           |    |           |          |           |    |
710  *           |    +-----------+          +-----------+    |
711  *           |          PP                     PP         |
712  *           +----------PP---------------------PP---------+
713  *                      PP                     PP
714  *                      PP                     PP
715  *
716  *  This image represents a single Zen package. In this example, it has four
717  *  Zeppelin dies, though some configurations only have a single one. Here,
718  *  each die is directly connected to the next. Also, each die is
719  *  represented as being connected to memory by the 'M' character and connected
720  *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
721  *  die is made up of two core complexes, we have multiple different NUMA
722  *  domains that we care about for these systems.
723  *
724  * ZEN 2
725  *
726  *	Zen 2 changes things in a dramatic way from Zen 1. Whereas in Zen 1
727  *	each Zeppelin die had its own memory controller and I/O, those have been
728  *	moved onto a separate I/O die in Zen 2. The core complex itself looks
729  *	pretty similar, but the die now looks much simpler:
730  *
731  *      +--------------------------------------------------------+
732  *      | Zen 2 Core Complex Die    HH                           |
733  *      |                           HH                           |
734  *      |          +-----------+    HH    +-----------+          |
735  *      |          |           |    HH    |           |          |
736  *      |          |    Core   |==========|    Core   |          |
737  *      |          |  Complex  |==========|  Complex  |          |
738  *      |          |           |    HH    |           |          |
739  *      |          +-----------+    HH    +-----------+          |
740  *      |                           HH                           |
741  *      |                           HH                           |
742  *      +--------------------------------------------------------+
743  *
744  *	From here, when we add the central I/O die, this changes things a bit.
745  *	Each die is connected to the I/O die, rather than trying to interconnect
746  *	them directly. The following image takes the same Zen 1 image that we
747  *	had earlier and shows what it looks like with the I/O die instead:
748  *
749  *                                 PP    PP
750  *                                 PP    PP
751  *           +---------------------PP----PP---------------------+
752  *           |                     PP    PP                     |
753  *           |  +-----------+      PP    PP      +-----------+  |
754  *           |  |           |      PP    PP      |           |  |
755  *           |  |   Zen 2   |    +-PP----PP-+    |   Zen 2   |  |
756  *           |  |    Die   _|    | PP    PP |    |_   Die    |  |
757  *           |  |         |o|oooo|          |oooo|o|         |  |
758  *           |  +-----------+    |          |    +-----------+  |
759  *           |                   |   I/O    |                   |
760  *       MMMMMMMMMMMMMMMMMMMMMMMMMM  Die   MMMMMMMMMMMMMMMMMMMMMMMMMM
761  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
762  *           |                   |          |                   |
763  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
764  *       MMMMMMMMMMMMMMMMMMMMMMMMMM        MMMMMMMMMMMMMMMMMMMMMMMMMM
765  *           |                   |          |                   |
766  *           |  +-----------+    |          |    +-----------+  |
767  *           |  |         |o|oooo| PP    PP |oooo|o|         |  |
768  *           |  |   Zen 2  -|    +-PP----PP-+    |-  Zen 2   |  |
769  *           |  |    Die    |      PP    PP      |    Die    |  |
770  *           |  |           |      PP    PP      |           |  |
771  *           |  +-----------+      PP    PP      +-----------+  |
772  *           |                     PP    PP                     |
773  *           +---------------------PP----PP---------------------+
774  *                                 PP    PP
775  *                                 PP    PP
776  *
777  *	The above has four core complex dies installed, though the Zen 2 EPYC
778  *	and ThreadRipper parts allow for up to eight, while the Ryzen parts
779  *	generally only have one or two. The most notable difference here is how
780  *	everything communicates. Note that memory and PCIe come out of the
781  *	central die. This changes the way that one die accesses a resource. It
782  *	basically always has to go to the I/O die, whereas in Zen 1 it may have
783  *	satisfied it locally. In general, this ends up being a better strategy
784  *	for most things, though it is still possible to treat the system as four
785  *	distinct NUMA domains with each Zen 2 die slightly closer to some memory
786  *	and PCIe than otherwise. This also impacts the 'amdzen' nexus driver as
787  *	now there is only one 'node' present.
788  *
789  * ZEN 3
790  *
791  *	From an architectural perspective, Zen 3 is a much smaller change from
792  *	Zen 2 than Zen 2 was from Zen 1, though it makes up for most of that in
793  *	its microarchitectural changes. The biggest thing for us is how the die
794  *	changes. In Zen 1 and Zen 2, each core complex still had its own L3
795  *	cache. However, in Zen 3, the L3 is now shared between the entire core
796  *	complex die and is no longer partitioned between each core complex. This
797  *	means that all cores on the die can share the same L3 cache. Otherwise,
798  *	the general layout of the overall package with various core complexes
799  *	and an I/O die stays the same. Here's what the Core Complex Die looks
800  *	like in a bit more detail:
801  *
802  *               +-------------------------------------------------+
803  *               | Zen 3 Core Complex Die                          |
804  *               | +-------------------+    +-------------------+  |
805  *               | | Core       +----+ |    | Core       +----+ |  |
806  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
807  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
808  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
809  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
810  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
811  *               | +-------------------+    +-------------------+  |
812  *               | +-------------------+    +-------------------+  |
813  *               | | Core       +----+ |    | Core       +----+ |  |
814  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
815  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
816  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
817  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
818  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
819  *               | +-------------------+    +-------------------+  |
820  *               |                                                 |
821  *               | +--------------------------------------------+  |
822  *               | |                 L3 Cache                   |  |
823  *               | +--------------------------------------------+  |
824  *               |                                                 |
825  *               | +-------------------+    +-------------------+  |
826  *               | | Core       +----+ |    | Core       +----+ |  |
827  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
828  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
829  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
830  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
831  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
832  *               | +-------------------+    +-------------------+  |
833  *               | +-------------------+    +-------------------+  |
834  *               | | Core       +----+ |    | Core       +----+ |  |
835  *               | | +--------+ | L2 | |    | +--------+ | L2 | |  |
836  *               | | | Thread | +----+ |    | | Thread | +----+ |  |
837  *               | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |
838  *               | |   | Thread | |L1| |    |   | Thread | |L1| |  |
839  *               | |   +--------+ +--+ |    |   +--------+ +--+ |  |
840  *               | +-------------------+    +-------------------+  |
841  *               +-------------------------------------------------+
842  *
843  *	While it is not pictured, there are connections from the die to the
844  *	broader data fabric and additional functional blocks to support that
845  *	communication and coherency.
846  *
847  * CPUID LEAVES
848  *
849  * There are a few different CPUID leaves that we can use to try and understand
850  * the actual state of the world. As part of the introduction of family 0xf, AMD
851  * added CPUID leaf 0x80000008. This leaf tells us the number of logical
852  * processors that are in the system. Because families before Zen didn't have
853  * SMT, this was always the number of cores that were in the system. However, it
854  * should always be thought of as the number of logical threads to be consistent
855  * between generations. In addition we also get the size of the APIC ID that is
856  * used to represent the number of logical processors. This is important for
857  * deriving topology information.
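 *
 * The relevant fields are in %ecx of leaf 0x80000008 (a sketch using the
 * hypothetical helper from earlier; field positions follow AMD's
 * documentation):
 *
 *	uint32_t eax, ebx, ecx, edx;
 *
 *	cpuid_query(0x80000008, 0, &eax, &ebx, &ecx, &edx);
 *	uint_t nthreads = (ecx & 0xff) + 1;
 *	uint_t apic_id_bits = (ecx >> 12) & 0xf;
 *
 * Here nthreads is the number of logical processors present and apic_id_bits
 * is the number of APIC ID bits used to number them; when the latter is
 * reported as zero, it must instead be derived from nthreads.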
858  *
859  * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
860  * bit between Bulldozer and later families, but it is quite useful in
861  * determining the topology information. Because this information has changed
862  * across family generations, it's worth calling out what these mean
863  * explicitly. The registers have the following meanings:
864  *
865  *	%eax	The APIC ID. The entire register is defined to have a 32-bit
866  *		APIC ID, even though on systems without x2apic support, it will
867  *		be limited to 8 bits.
868  *
869  *	%ebx	On Bulldozer-era systems this contains information about the
870  *		number of cores that are in a compute unit (cores that share
871  *		resources). It also contains a per-package compute unit ID that
872  *		identifies which compute unit the logical CPU is a part of.
873  *
874  *		On Zen-era systems this instead contains the number of threads
875  *		per core and the ID of the core that the logical CPU is a part
876  *		of. Note, this ID is unique only to the package, it is not
877  *		globally unique across the entire system.
878  *
879  *	%ecx	This contains the number of nodes that exist in the package. It
880  *		also contains an ID that identifies which node the logical CPU
881  *		is a part of.
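 *
 * As a sketch of the Zen-era decoding of this leaf (Bulldozer-era parts lay
 * out %ebx differently, as described above; field positions follow AMD's
 * documentation and the hypothetical helper from earlier is used again):
 *
 *	uint32_t eax, ebx, ecx, edx;
 *
 *	cpuid_query(0x8000001e, 0, &eax, &ebx, &ecx, &edx);
 *	uint32_t apicid = eax;
 *	uint_t coreid = ebx & 0xff;
 *	uint_t threads_per_core = ((ebx >> 8) & 0xff) + 1;
 *	uint_t nodeid = ecx & 0xff;
 *	uint_t nnodes = ((ecx >> 8) & 0x7) + 1;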
882  *
883  * Finally, we also use cpuid leaf 0x8000001D to determine information about the
884  * cache layout to determine which logical CPUs are sharing which caches.
885  *
886  * illumos Topology
887  * ----------------
888  *
889  * Based on the above we synthesize the information into several different
890  * variables that we store in the 'struct cpuid_info'. We'll go into the details
891  * of what each member is supposed to represent and their uniqueness. In
892  * general, there are two levels of uniqueness that we care about. We care about
893  * an ID that is globally unique. That means that it will be unique across all
894  * entities in the system. For example, the default logical CPU ID is globally
895  * unique. On the other hand, there is some information that we only care about
896  * being unique within the context of a single package / socket. Here are the
897  * variables that we keep track of and their meaning.
898  *
899  * Several of the values that serve as identifiers, with the exception
900  * of cpi_apicid, are allowed to be synthetic.
901  *
902  *
903  * cpi_apicid
904  *
905  *	This is the value of the CPU's APIC id. This should be the full 32-bit
906  *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
907  *	APIC ID. This value is globally unique between all logical CPUs across
908  *	all packages. This is usually required by the APIC.
909  *
910  * cpi_chipid
911  *
912  *	This value indicates the ID of the package that the logical CPU is a
913  *	part of. This value is allowed to be synthetic. It is usually derived by
914  *	taking the CPU's APIC ID and determining how many bits are used to
915  *	represent CPU cores in the package. All logical CPUs that are part of
916  *	the same package must have the same value.
917  *
918  * cpi_coreid
919  *
920  *	This represents the ID of a CPU core. Two logical CPUs should only have
921  *	the same cpi_coreid value if they are part of the same core. These
922  *	values may be synthetic. On systems that support SMT, this value is
923  *	usually derived from the APIC ID, otherwise it is often synthetic and
924  *	just set to the value of the cpu_id in the cpu_t.
925  *
926  * cpi_pkgcoreid
927  *
928  *	This is similar to the cpi_coreid in that logical CPUs that are part of
929  *	the same core should have the same ID. The main difference is that these
930  *	values are only required to be unique to a given socket.
931  *
932  * cpi_clogid
933  *
934  *	This represents the logical ID of a logical CPU. This value should be
935  *	unique within a given socket for each logical CPU. This is allowed to be
936  *	synthetic, though it is usually based off of the CPU's apic ID. The
937  *	broader system expects that logical CPUs that are part of the same core
938  *	have contiguous numbers. For example, with two threads per core, the two
939  *	IDs divided by two should be the same, the first ID modulo two should be
940  *	zero, and the second should be one; see the sketch below. Thus, IDs 4 and 5
941  *	indicate two logical CPUs that are part of the same core. But IDs 5 and
942  *	6 represent two logical CPUs that are part of different cores.
943  *
944  *	While it is common for the cpi_coreid and the cpi_clogid to be derived
945  *	from the same source, strictly speaking, they don't have to be and the
946  *	two values should be considered logically independent. One should not
947  *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
948  *	some kind of relationship. While this is tempting, we've seen cases on
949  *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
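 *
 *	To make the contiguity rule above concrete, with two threads per core
 *	the sibling relationship between two clogid values can be tested as
 *	follows (a sketch; both IDs must come from the same package):
 *
 *		boolean_t same_core = (id_a >> 1) == (id_b >> 1);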
950  *
951  * cpi_ncpu_per_chip
952  *
953  *	This value indicates the total number of logical CPUs that exist in the
954  *	physical package. Critically, this is not the number of logical CPUs
955  *	that exist for just the single core.
956  *
957  *	This value should be the same for all logical CPUs in the same package.
958  *
959  * cpi_ncore_per_chip
960  *
961  *	This value indicates the total number of physical CPU cores that exist
962  *	in the package. The system compares this value with cpi_ncpu_per_chip to
963  *	determine if simultaneous multi-threading (SMT) is enabled. When
964  *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
965  *	the X86FSET_HTT feature is not set. If this value is greater than one,
966  *	then we consider the processor to have the feature X86FSET_CMP, to
967  *	indicate that there is support for more than one core.
968  *
969  *	This value should be the same for all logical CPUs in the same package.
970  *
971  * cpi_procnodes_per_pkg
972  *
973  *	This value indicates the number of 'nodes' that exist in the package.
974  *	When processors are actually a multi-chip module, this represents the
975  *	number of such modules that exist in the package. Currently, on Intel
976  *	based systems this member is always set to 1.
977  *
978  *	This value should be the same for all logical CPUs in the same package.
979  *
980  * cpi_procnodeid
981  *
982  *	This value indicates the ID of the node that the logical CPU is a part
983  *	of. All logical CPUs that are in the same node must have the same value
984  *	here. This value must be unique across all of the packages in the
985  *	system.  On Intel based systems, this is currently set to the value in
986  *	cpi_chipid because there is only one node.
987  *
988  * cpi_cores_per_compunit
989  *
990  *	This value indicates the number of cores that are part of a compute
991  *	unit. See the AMD topology section for this. This member only has real
992  *	meaning currently for AMD Bulldozer family processors. For all other
993  *	processors, this should currently be set to 1.
994  *
995  * cpi_compunitid
996  *
997  *	This indicates the compute unit that the logical CPU belongs to. For
998  *	processors without AMD Bulldozer-style compute units this should be set
999  *	to the value of cpi_coreid.
1000  *
1001  * cpi_ncpu_shr_last_cache
1002  *
1003  *	This indicates the number of logical CPUs that are sharing the same last
1004  *	level cache. This value should be the same for all CPUs that are sharing
1005  *	that cache. The last cache refers to the cache that is closest to memory
1006  *	and furthest away from the CPU.
1007  *
1008  * cpi_last_lvl_cacheid
1009  *
1010  *	This indicates the ID of the last cache that the logical CPU uses. This
1011  *	cache is often shared between multiple logical CPUs and is the cache
1012  *	that is closest to memory and furthest away from the CPU. This value
1013  *	should be the same for a group of logical CPUs only if they actually
1014  *	share the same last level cache. IDs should not overlap between
1015  *	packages.
1016  *
1017  * cpi_ncore_bits
1018  *
1019  *	This indicates the number of bits that are required to represent all of
1020  *	the cores in the system. As cores are derived based on their APIC IDs,
1021  *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
1022  *	this value to be larger than the actual number of IDs that are present
1023  *	in the system. This is used to size tables by the CMI framework. It is
1024  *	only filled in for Intel and AMD CPUs.
1025  *
1026  * cpi_nthread_bits
1027  *
1028  *	This indicates the number of bits required to represent all of the IDs
1029  *	that cover the logical CPUs that exist on a given core. It's OK for this
1030  *	value to be larger than the actual number of IDs that are present in the
1031  *	system.  This is used to size tables by the CMI framework. It is
1032  *	only filled in for Intel and AMD CPUs.
1033  *
1034  * -----------
1035  * Hypervisors
1036  * -----------
1037  *
1038  * If trying to manage the differences between vendors wasn't bad enough, it can
1039  * get worse thanks to our friend hardware virtualization. Hypervisors are given
1040  * the ability to interpose on all cpuid instructions and change them to suit
1041  * their purposes. In general, this is necessary as the hypervisor wants to be
1042  * able to present a more uniform set of features or not necessarily give the
1043  * guest operating system kernel knowledge of all features so it can be
1044  * more easily migrated between systems.
1045  *
1046  * When it comes to trying to determine topology information, this can be a
1047  * double-edged sword. When a hypervisor doesn't actually implement a cpuid
1048  * leaf, it'll often return all zeros. Because of that, you'll often see various
1049  * checks scattered about verifying that fields are non-zero before we assume
1050  * we can use them.
1051  *
1052  * When it comes to topology information, the hypervisor is often incentivized
1053  * to lie to you about topology. This is because it doesn't always actually
1054  * guarantee that topology at all. The topology path we take in the system
1055  * depends on how the CPU advertises itself. If it advertises itself as an Intel
1056  * or AMD CPU, then we basically do our normal path. However, when it doesn't
1057  * advertise an actual vendor, we usually end up enumerating multiple one-core
1058  * CPUs that are often on different sockets. The actual behavior
1059  * depends greatly on what the hypervisor actually exposes to us.
1060  *
1061  * --------------------
1062  * Exposing Information
1063  * --------------------
1064  *
1065  * We expose CPUID information in three different forms in the system.
1066  *
1067  * The first is through the x86_featureset variable. This is used in conjunction
1068  * with the is_x86_feature() function. This is queried by x86-specific functions
1069  * to determine which features are or aren't present in the system and to make
1070  * decisions based upon them. For example, users of this include everything from
1071  * parts of the system dedicated to reliability, availability, and
1072  * serviceability (RAS), to making decisions about how to handle security
1073  * mitigations, to various x86-specific drivers. General purpose or
1074  * architecture independent drivers should never be calling this function.
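 *
 * As a minimal sketch, an x86-specific consumer might gate a code path on a
 * feature bit like this (do_smt_specific_setup() is a hypothetical callee
 * used only for illustration):
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_HTT))
 *		do_smt_specific_setup();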
1075  *
1076  * The second means is through the auxiliary vector. The auxiliary vector is a
1077  * series of tagged data that the kernel passes down to a user program when it
1078  * begins executing. This information is used to indicate to programs what
1079  * instruction set extensions are present. For example, information about the
1080  * CPU supporting the machine check architecture (MCA) wouldn't be passed down
1081  * since user programs cannot make use of it. However, things like the AVX
1082  * instruction sets are. Programs use this information to make run-time
1083  * decisions about what features they should use. As an example, the run-time
1084  * link-editor (rtld) can relocate different functions depending on the hardware
1085  * support available.
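 *
 * For reference, userland typically consumes this information through
 * getisax(3C). A minimal sketch, assuming the AV_386_AVX bit from
 * <sys/auxv_386.h>; real consumers such as rtld use their own plumbing:
 *
 *	#include <sys/auxv.h>
 *
 *	static int
 *	have_avx(void)
 *	{
 *		uint32_t ui = 0;
 *
 *		(void) getisax(&ui, 1);
 *		return ((ui & AV_386_AVX) != 0);
 *	}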
1086  *
1087  * The final form is through a series of accessor functions that all have the
1088  * form cpuid_get*. This is used by a number of different subsystems in the
1089  * kernel to determine more detailed information about what we're running on,
1090  * topology information, etc. Some of these subsystems include processor groups
1091  * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
1092  * microcode, and performance monitoring. These functions all ASSERT that the
1093  * CPU they're being called on has reached a certain cpuid pass. If the passes
1094  * are rearranged, then this needs to be adjusted.
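 *
 * A minimal sketch of this style of consumer, where handle_intel_quirk() is a
 * hypothetical callee used only for illustration:
 *
 *	if (cpuid_getvendor(CPU) == X86_VENDOR_Intel &&
 *	    cpuid_getfamily(CPU) == 6)
 *		handle_intel_quirk();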
1095  *
1096  * -----------------------------------------------
1097  * Speculative Execution CPU Side Channel Security
1098  * -----------------------------------------------
1099  *
1100  * With the advent of the Spectre and Meltdown attacks which exploit speculative
1101  * execution in the CPU to create side channels there have been a number of
1102  * different attacks and corresponding issues that the operating system needs to
1103  * mitigate against. The following is a common, but not exhaustive, list of
1104  * issues that we know about and for which we have either done some mitigation
1105  * work in the system or still need to do more:
1106  *
1107  *   - Spectre v1
1108  *   - swapgs (Spectre v1 variant)
1109  *   - Spectre v2
1110  *   - Meltdown (Spectre v3)
1111  *   - Rogue Register Read (Spectre v3a)
1112  *   - Speculative Store Bypass (Spectre v4)
1113  *   - ret2spec, SpectreRSB
1114  *   - L1 Terminal Fault (L1TF)
1115  *   - Microarchitectural Data Sampling (MDS)
1116  *   - Register File Data Sampling (RFDS)
1117  *
1118  * Each of these requires different sets of mitigations and has different attack
1119  * surfaces. For the most part, this discussion is about protecting the kernel
1120  * from non-kernel executing environments such as user processes and hardware
1121  * virtual machines. Unfortunately, there are a number of user vs. user
1122  * scenarios that exist with these. The rest of this section will describe the
1123  * overall approach that the system has taken to address these as well as their
1124  * shortcomings. Unfortunately, not all of the above have been handled today.
1125  *
1126  * SPECTRE v2, ret2spec, SpectreRSB
1127  *
1128  * The second variant of the spectre attack focuses on performing branch target
1129  * injection. This generally impacts indirect call instructions in the system.
1130  * There are four different ways to mitigate this issue that are commonly
1131  * described today:
1132  *
1133  *  1. Using Indirect Branch Restricted Speculation (IBRS).
1134  *  2. Using Retpolines and RSB Stuffing
1135  *  3. Using Enhanced Indirect Branch Restricted Speculation (eIBRS)
1136  *  4. Using Automated Indirect Branch Restricted Speculation (AIBRS)
1137  *
1138  * IBRS uses a feature added to microcode to restrict speculation, among other
1139  * things. This form of mitigation has not been used as it has been generally
1140  * seen as too expensive and requires reactivation upon various transitions in
1141  * the system.
1142  *
1143  * As a less impactful alternative to IBRS, retpolines were developed by
1144  * Google. These basically require one to replace indirect calls with a specific
1145  * trampoline that will cause speculation to fail and break the attack.
1146  * Retpolines require compiler support. We always build with retpolines in the
1147  * external thunk mode. This means that a traditional indirect call is replaced
1148  * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
1149  * of this is that all indirect function calls are performed through a register.
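 *
 * As a rough illustration (not the literal generated code), the compiler turns
 *
 *	call	*%r11
 *
 * into
 *
 *	call	__x86_indirect_thunk_r11
 *
 * with the thunk itself supplying the speculation-safe return sequence.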
1150  *
1151  * We have to use a common external location of the thunk and not inline it into
1152  * the callsite so that we have a single place to patch these functions.
1153  * As it turns out, we currently have two different forms of retpolines that
1154  * exist in the system:
1155  *
1156  *  1. A full retpoline
1157  *  2. A no-op version
1158  *
1159  * The first one is used in the general case. Historically, there was an
1160  * AMD-specific optimized retpoline variant that was based around using a
1161  * serializing lfence instruction; however, in March 2022 it was announced that
1162  * this was actually still vulnerable to Spectre v2 and therefore we no longer
1163  * use it and it is no longer available in the system.
1164  *
1165  * The third mitigation described above, eIBRS, is the most curious. It turns
1166  * out that the way retpolines are implemented relies on how speculation is
1167  * performed on a 'ret' instruction. Intel has continued to optimize this
1168  * process (which is partly why we need to have return stack buffer stuffing,
1169  * but more on that in a bit) and in processors starting with Cascade Lake
1170  * on the server side, it's dangerous to rely on retpolines. Instead, a new
1171  * mechanism has been introduced called Enhanced IBRS (eIBRS).
1172  *
1173  * Unlike IBRS, eIBRS is designed to be enabled once at boot and left on each
1174  * physical core. However, if this is the case, we don't want to use retpolines
1175  * any more. Therefore if eIBRS is present, we end up turning each retpoline
1176  * function (called a thunk) into a jmp instruction. This means that we're still
1177  * paying the cost of an extra jump to the external thunk, but it gives us
1178  * flexibility and the ability to have a single kernel image that works across a
1179  * wide variety of systems and hardware features.
1180  *
1181  * Unfortunately, this alone is insufficient. First, Skylake systems have
1182  * additional speculation for the Return Stack Buffer (RSB) which is used to
1183  * return from call instructions which retpolines take advantage of. However,
1184  * this problem is not just limited to Skylake and is actually more pernicious.
1185  * The SpectreRSB paper introduces several more problems that can arise with
1186  * dealing with this. The RSB can be poisoned just like the indirect branch
1187  * predictor. This means that one needs to clear the RSB when transitioning
1188  * between two different privilege domains. Some examples include:
1189  *
1190  *  - Switching between two different user processes
1191  *  - Going between user land and the kernel
1192  *  - Returning to the kernel from a hardware virtual machine
1193  *
1194  * Mitigating this involves combining a couple of different things. The first is
1195  * SMEP (supervisor mode execution protection) which was introduced in Ivy
1196  * Bridge. When an RSB entry refers to a user address and we're executing in the
1197  * kernel, speculation through it will be stopped when SMEP is enabled. This
1198  * protects against a number of the different cases that we would normally be
1199  * worried about such as when we enter the kernel from user land.
1200  *
1201  * To prevent against additional manipulation of the RSB from other contexts
1202  * such as a non-root VMX context attacking the kernel we first look to
1203  * enhanced IBRS. When eIBRS is present and enabled, then there should be
1204  * nothing else that we need to do to protect the kernel at this time.
1205  *
1206  * Unfortunately, eIBRS or not, we need to manually overwrite the contents of
1207  * the return stack buffer. We do this through the x86_rsb_stuff() function.
1208  * Currently this is employed on context switch and vmx_exit. The
1209  * x86_rsb_stuff() function is disabled only when mitigations in general are.
1210  *
1211  * If SMEP is not present, then we would have to stuff the RSB every time we
1212  * transitioned from user mode to the kernel, which isn't very practical right
1213  * now.
1214  *
1215  * To fully protect user to user and vmx to vmx attacks from these classes of
1216  * issues, we would also need to allow them to opt into performing an Indirect
1217  * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1218  *
1219  * The fourth form of mitigation here is specific to AMD and is called Automated
1220  * IBRS (AIBRS). This is similar in spirit to eIBRS; however rather than set the
1221  * IBRS bit in MSR_IA32_SPEC_CTRL (0x48) we instead set a bit in the EFER
1222  * (extended feature enable register) MSR. This bit basically says that IBRS
1223  * acts as though it is always active when executing at CPL0 and when executing
1224  * in the 'host' context when SEV-SNP is enabled.
1225  *
1226  * When this is active, AMD states that the RSB is cleared on VMEXIT and
1227  * therefore it is unnecessary. While this handles RSB stuffing attacks from SVM
1228  * to the kernel, we must still consider the remaining cases that exist, just
1229  * like above. While traditionally AMD employed a 32 entry RSB allowing the
1230  * traditional technique to work, this is not true on all CPUs. While a write to
1231  * IBRS would clear the RSB if the processor supports more than 32 entries (but
1232  * not otherwise), AMD states that as long as at leat a single 4 KiB unmapped
1233  * guard page is present between user and kernel address spaces and SMEP is
1234  * enabled, then there is no need to clear the RSB at all.
1235  *
1236  * By default, the system will enable RSB stuffing and the required variant of
1237  * retpolines and store that information in the x86_spectrev2_mitigation value.
1238  * This will be evaluated after a microcode update as well, though it is
1239  * expected that microcode updates will not take away features. This may mean
1240  * that a late loaded microcode may not end up in the optimal configuration
1241  * (though this should be rare).
1242  *
1243  * Currently we do not build kmdb with retpolines or perform any additional side
1244  * channel security mitigations for it. One complication with kmdb is that it
1245  * requires its own retpoline thunks and it would need to adjust itself based on
1246  * what the kernel does. The threat model of kmdb is more limited and therefore
1247  * it may make more sense to investigate using prediction barriers as the whole
1248  * system is only executing a single instruction at a time while in kmdb.
1249  *
1250  * SPECTRE v1, v4
1251  *
1252  * The v1 and v4 variants of spectre are not currently mitigated in the
1253  * system and require other classes of changes to occur in the code.
1254  *
1255  * SPECTRE v1 (SWAPGS VARIANT)
1256  *
1257  * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1258  * can generally affect any branch-dependent code. The swapgs issue is one
1259  * variant of this. If we are coming in from userspace, we can have code like
1260  * this:
1261  *
1262  *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1263  *	je	1f
1264  *	movq	$0, REGOFF_SAVFP(%rsp)
1265  *	swapgs
1266  *	1:
1267  *	movq	%gs:CPU_THREAD, %rax
1268  *
1269  * If an attacker can cause a mis-speculation of the branch here, we could skip
1270  * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1271  * load. If subsequent code can act as the usual Spectre cache gadget, this
1272  * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1273  * any use of the %gs override.
1274  *
1275  * The other case is also an issue: if we're coming into a trap from kernel
1276  * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1277  * using it. AMD systems are not vulnerable to this version, as a swapgs is
1278  * serializing with respect to subsequent uses. But as AMD /does/ need the other
1279  * case, and the fix is the same in both cases (an lfence at the branch target
1280  * 1: in this example), we'll just do it unconditionally.
1281  *
1282  * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
1283  * harder for user-space to actually set a useful %gsbase value: although it's
1284  * not clear, it might still be feasible via lwp_setprivate(), so we
1285  * mitigate anyway.
1286  *
1287  * MELTDOWN
1288  *
1289  * Meltdown, or spectre v3, allowed a user process to read any data in their
1290  * address space regardless of whether or not the page tables in question
1291  * allowed the user to have the ability to read them. The solution to meltdown
1292  * is kernel page table isolation. In this world, there are two page tables that
1293  * are used for a process, one in user land and one in the kernel. To implement
1294  * this we use per-CPU page tables and switch between the user and kernel
1295  * variants when entering and exiting the kernel.  For more information about
1296  * this process and how the trampolines work, please see the big theory
1297  * statements and additional comments in:
1298  *
1299  *  - uts/i86pc/ml/kpti_trampolines.s
1300  *  - uts/i86pc/vm/hat_i86.c
1301  *
1302  * While Meltdown only impacted Intel systems and there are also Intel systems
1303  * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1304  * kernel page table isolation enabled. While this may at first seem weird, an
1305  * important thing to remember is that you can't speculatively read an address
1306  * if it's never in your page table at all. Having user processes without kernel
1307  * pages present provides us with an important layer of defense in the kernel
1308  * against any other side channel attacks that exist and have yet to be
1309  * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1310  * default, no matter the x86 system.
1311  *
1312  * L1 TERMINAL FAULT
1313  *
1314  * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1315  * execution uses page table entries. Effectively, it is two different problems.
1316  * The first is that it ignores the not present bit in the page table entries
1317  * when performing speculative execution. This means that something can
1318  * speculatively read the listed physical address if it's present in the L1
1319  * cache under certain conditions (see Intel's documentation for the full set of
1320  * conditions). Secondly, this can be used to bypass hardware virtualization
1321  * extended page tables (EPT) that are part of Intel's hardware virtual machine
1322  * instructions.
1323  *
1324  * For the non-hardware virtualized case, this is relatively easy to deal with.
1325  * We must make sure that all unmapped pages have an address of zero. This means
1326  * that they could read the first 4k of physical memory; however, we never use
1327  * that first page in the operating system and always skip putting it in our
1328  * memory map, even if firmware tells us we can use it in our memory map. While
1329  * other systems try to put extra metadata in the address and reserved bits,
1330  * which led to this being problematic in those cases, we do not.
1331  *
1332  * For hardware virtual machines things are more complicated. Because they can
1333  * construct their own page tables, it isn't hard for them to perform this
1334  * attack against any physical address. The one wrinkle is that this physical
1335  * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1336  * to flush the L1 data cache. We wrap this up in the function
1337  * spec_uarch_flush(). This function is also used in the mitigation of
1338  * microarchitectural data sampling (MDS) discussed later on. Kernel based
1339  * hypervisors such as KVM or bhyve are responsible for performing this before
1340  * entering the guest.
1341  *
1342  * Because this attack takes place in the L1 cache, there's another wrinkle
1343  * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1344  * designs. This means that when a thread enters a hardware virtualized context
1345  * and flushes the L1 data cache, the other thread on the processor may then go
1346  * ahead and put new data in it that can be potentially attacked. While one
1347  * solution is to disable SMT on the system, another option that is available is
1348  * to use a feature for hardware virtualization called 'SMT exclusion'. This
1349  * goes through and makes sure that if an HVM is being scheduled on one thread,
1350  * then the thing on the other thread is from the same hardware virtual machine.
1351  * If an interrupt comes in or the guest exits to the broader system, then the
1352  * other SMT thread will be kicked out.
1353  *
1354  * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1355  * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1356  * perform L1TF related mitigations.
1357  *
1358  * MICROARCHITECTURAL DATA SAMPLING
1359  *
1360  * Microarchitectural data sampling (MDS) is a combination of four discrete
1361  * vulnerabilities that are similar issues affecting various parts of the CPU's
1362  * microarchitectural implementation around load, store, and fill buffers.
1363  * Specifically it is made up of the following subcomponents:
1364  *
1365  *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1366  *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1367  *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1368  *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1369  *
1370  * To begin addressing these, Intel has introduced another feature in microcode
1371  * called MD_CLEAR. This changes the verw instruction to operate in a different
1372  * way. This allows us to execute the verw instruction in a particular way to
1373  * flush the state of the affected parts. The L1TF L1D flush mechanism is also
1374  * updated when this microcode is present to flush this state.
1375  *
1376  * Primarily we need to flush this state whenever we transition from the kernel
1377  * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1378  * little bit different. Here the structures are statically sized when a logical
1379  * CPU is in use and resized when it goes to sleep. Therefore, we also need to
1380  * flush the microarchitectural state before the CPU goes idle by calling hlt,
1381  * mwait, or another ACPI method. To perform these flushes, we call
1382  * x86_md_clear() at all of these transition points.
1383  *
1384  * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1385  * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1386  * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1387  * a no-op.
1388  *
1389  * Unfortunately, with this issue hyperthreading rears its ugly head. In
1390  * particular, everything we've discussed above is only valid for a single
1391  * thread executing on a core. In the case where you have hyper-threading
1392  * present, this attack can be performed between threads. The theoretical fix
1393  * for this is to ensure that both threads are always in the same security
1394  * domain. This means that they are executing in the same ring and mutually
1395  * trust each other. Practically speaking, this would mean that a system call
1396  * would have to issue an inter-processor interrupt (IPI) to the other thread.
1397  * Rather than implement this, we recommend that one disables hyper-threading
1398  * through the use of psradm -aS.
1399  *
1400  * TSX ASYNCHRONOUS ABORT
1401  *
1402  * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1403  * behaves like MDS, but leverages Intel's transactional instructions as another
1404  * vector. Effectively, when a transaction hits one of these cases (unmapped
1405  * page, various cache snoop activity, etc.) then the same data can be exposed
1406  * as in the case of MDS. This means that you can attack your twin.
1407  *
1408  * Intel has described that there are two different ways that we can mitigate
1409  * this problem on affected processors:
1410  *
1411  *   1) We can use the same techniques used to deal with MDS. Flushing the
1412  *      microarchitectural buffers and disabling hyperthreading will mitigate
1413  *      this in the same way.
1414  *
1415  *   2) Using microcode to disable TSX.
1416  *
1417  * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1418  * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1419  * That's OK as we're already doing all such mitigations. On the other hand,
1420  * processors with MDS_NO are all supposed to receive microcode updates that
1421  * enumerate support for disabling TSX. In general, we'd rather use this method
1422  * when available as it doesn't require disabling hyperthreading to be
1423  * effective. Currently we basically are relying on microcode for processors
1424  * that enumerate MDS_NO.
1425  *
1426  * Another MDS-variant in a few select Intel Atom CPUs is Register File Data
1427  * Sampling: RFDS. This allows an attacker to sample values that were in any
1428  * of integer, floating point, or vector registers. This was discovered by
1429  * Intel during internal validation work.  The existence of the RFDS_NO
1430  * capability, or the LACK of a RFDS_CLEAR capability, means we do not have to
1431  * act. Intel has said some CPU models immune to RFDS MAY NOT enumerate
1432  * RFDS_NO. If RFDS_NO is not set, but RFDS_CLEAR is, we must set x86_md_clear,
1433  * and make sure it's using VERW. Unlike MDS, RFDS can't be helped by the
1434  * MSR that L1D uses.
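 *
 * A minimal sketch of that decision, assuming feature constants
 * X86FSET_RFDS_NO and X86FSET_RFDS_CLEAR corresponding to the "rfds_no" and
 * "rfds_clear" names below, and a hypothetical local need_verw; this is not
 * the literal logic used during the security scan:
 *
 *	boolean_t need_verw = B_FALSE;
 *
 *	if (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
 *	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR))
 *		need_verw = B_TRUE;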
1435  *
1436  * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1437  * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1438  * different powers. The first allows us to cause all transactions to
1439  * immediately abort. The second gives us a means of disabling TSX completely,
1440  * which includes removing it from cpuid. If we have support for this in
1441  * microcode during the first cpuid pass, then we'll disable TSX completely such
1442  * that user land never has a chance to observe the bit. However, if we are late
1443  * loading the microcode, then we must use the functionality to cause
1444  * transactions to automatically abort. This is necessary for user land's sake.
1445  * Once a program sees a cpuid bit, it must not be taken away.
1446  *
1447  * We track whether or not we should do this based on what cpuid pass we're in.
1448  * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1449  * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1450  * should happen twice. Once in the normal cpuid_pass_basic() code and then a
1451  * second time after we do the initial microcode update.  As a result we need to
1452  * be careful in cpuid_apply_tsx() to only use the MSR if we've loaded a
1453  * suitable microcode on the current CPU (which happens prior to
1454  * cpuid_pass_ucode()).
1455  *
1456  * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1457  * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1458  * unfortunate feature in a number of ways, and taking the opportunity to
1459  * finally be able to turn it off is likely to be of benefit in the future.
1460  *
1461  * SUMMARY
1462  *
1463  * The following table attempts to summarize the mitigations for various issues
1464  * and what's done in various places:
1465  *
1466  *  - Spectre v1: Not currently mitigated
1467  *  - swapgs: lfences after swapgs paths
1468  *  - Spectre v2: Retpolines/RSB Stuffing or eIBRS/AIBRS if HW support
1469  *  - Meltdown: Kernel Page Table Isolation
1470  *  - Spectre v3a: Updated CPU microcode
1471  *  - Spectre v4: Not currently mitigated
1472  *  - SpectreRSB: SMEP and RSB Stuffing
1473  *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1474  *  - MDS: x86_md_clear, requires microcode, disabling SMT
1475  *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1476  *  - RFDS: microcode with x86_md_clear if RFDS_CLEAR set and RFDS_NO not.
1477  *
1478  * The following table indicates the x86 feature set bits that indicate that a
1479  * given problem has been solved or a notable feature is present:
1480  *
1481  *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1482  *  - MDS_NO: All forms of MDS
1483  *  - TAA_NO: TAA
1484  *  - RFDS_NO: RFDS
1485  */
1486 
1487 #include <sys/types.h>
1488 #include <sys/archsystm.h>
1489 #include <sys/x86_archext.h>
1490 #include <sys/kmem.h>
1491 #include <sys/systm.h>
1492 #include <sys/cmn_err.h>
1493 #include <sys/sunddi.h>
1494 #include <sys/sunndi.h>
1495 #include <sys/cpuvar.h>
1496 #include <sys/processor.h>
1497 #include <sys/sysmacros.h>
1498 #include <sys/pg.h>
1499 #include <sys/fp.h>
1500 #include <sys/controlregs.h>
1501 #include <sys/bitmap.h>
1502 #include <sys/auxv_386.h>
1503 #include <sys/memnode.h>
1504 #include <sys/pci_cfgspace.h>
1505 #include <sys/comm_page.h>
1506 #include <sys/mach_mmu.h>
1507 #include <sys/ucode.h>
1508 #include <sys/tsc.h>
1509 #include <sys/kobj.h>
1510 #include <sys/asm_misc.h>
1511 #include <sys/bitmap.h>
1512 
1513 #ifdef __xpv
1514 #include <sys/hypervisor.h>
1515 #else
1516 #include <sys/ontrap.h>
1517 #endif
1518 
1519 uint_t x86_vendor = X86_VENDOR_IntelClone;
1520 uint_t x86_type = X86_TYPE_OTHER;
1521 uint_t x86_clflush_size = 0;
1522 
1523 #if defined(__xpv)
1524 int x86_use_pcid = 0;
1525 int x86_use_invpcid = 0;
1526 #else
1527 int x86_use_pcid = -1;
1528 int x86_use_invpcid = -1;
1529 #endif
1530 
1531 typedef enum {
1532 	X86_SPECTREV2_RETPOLINE,
1533 	X86_SPECTREV2_ENHANCED_IBRS,
1534 	X86_SPECTREV2_AUTO_IBRS,
1535 	X86_SPECTREV2_DISABLED
1536 } x86_spectrev2_mitigation_t;
1537 
1538 uint_t x86_disable_spectrev2 = 0;
1539 static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1540     X86_SPECTREV2_RETPOLINE;
1541 
1542 /*
1543  * The mitigation status for TAA:
1544  * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1545  * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1546  * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1547  * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1548  * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1549  * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1550  */
1551 typedef enum {
1552 	X86_TAA_NOTHING,
1553 	X86_TAA_DISABLED,
1554 	X86_TAA_MD_CLEAR,
1555 	X86_TAA_TSX_FORCE_ABORT,
1556 	X86_TAA_TSX_DISABLE,
1557 	X86_TAA_HW_MITIGATED
1558 } x86_taa_mitigation_t;
1559 
1560 uint_t x86_disable_taa = 0;
1561 static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1562 
1563 uint_t pentiumpro_bug4046376;
1564 
1565 uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1566 
1567 static char *x86_feature_names[NUM_X86_FEATURES] = {
1568 	"lgpg",
1569 	"tsc",
1570 	"msr",
1571 	"mtrr",
1572 	"pge",
1573 	"de",
1574 	"cmov",
1575 	"mmx",
1576 	"mca",
1577 	"pae",
1578 	"cv8",
1579 	"pat",
1580 	"sep",
1581 	"sse",
1582 	"sse2",
1583 	"htt",
1584 	"asysc",
1585 	"nx",
1586 	"sse3",
1587 	"cx16",
1588 	"cmp",
1589 	"tscp",
1590 	"mwait",
1591 	"sse4a",
1592 	"cpuid",
1593 	"ssse3",
1594 	"sse4_1",
1595 	"sse4_2",
1596 	"1gpg",
1597 	"clfsh",
1598 	"64",
1599 	"aes",
1600 	"pclmulqdq",
1601 	"xsave",
1602 	"avx",
1603 	"vmx",
1604 	"svm",
1605 	"topoext",
1606 	"f16c",
1607 	"rdrand",
1608 	"x2apic",
1609 	"avx2",
1610 	"bmi1",
1611 	"bmi2",
1612 	"fma",
1613 	"smep",
1614 	"smap",
1615 	"adx",
1616 	"rdseed",
1617 	"mpx",
1618 	"avx512f",
1619 	"avx512dq",
1620 	"avx512pf",
1621 	"avx512er",
1622 	"avx512cd",
1623 	"avx512bw",
1624 	"avx512vl",
1625 	"avx512fma",
1626 	"avx512vbmi",
1627 	"avx512_vpopcntdq",
1628 	"avx512_4vnniw",
1629 	"avx512_4fmaps",
1630 	"xsaveopt",
1631 	"xsavec",
1632 	"xsaves",
1633 	"sha",
1634 	"umip",
1635 	"pku",
1636 	"ospke",
1637 	"pcid",
1638 	"invpcid",
1639 	"ibrs",
1640 	"ibpb",
1641 	"stibp",
1642 	"ssbd",
1643 	"ssbd_virt",
1644 	"rdcl_no",
1645 	"ibrs_all",
1646 	"rsba",
1647 	"ssb_no",
1648 	"stibp_all",
1649 	"flush_cmd",
1650 	"l1d_vmentry_no",
1651 	"fsgsbase",
1652 	"clflushopt",
1653 	"clwb",
1654 	"monitorx",
1655 	"clzero",
1656 	"xop",
1657 	"fma4",
1658 	"tbm",
1659 	"avx512_vnni",
1660 	"amd_pcec",
1661 	"md_clear",
1662 	"mds_no",
1663 	"core_thermal",
1664 	"pkg_thermal",
1665 	"tsx_ctrl",
1666 	"taa_no",
1667 	"ppin",
1668 	"vaes",
1669 	"vpclmulqdq",
1670 	"lfence_serializing",
1671 	"gfni",
1672 	"avx512_vp2intersect",
1673 	"avx512_bitalg",
1674 	"avx512_vbmi2",
1675 	"avx512_bf16",
1676 	"auto_ibrs",
1677 	"rfds_no",
1678 	"rfds_clear"
1679 };
1680 
1681 boolean_t
1682 is_x86_feature(void *featureset, uint_t feature)
1683 {
1684 	ASSERT(feature < NUM_X86_FEATURES);
1685 	return (BT_TEST((ulong_t *)featureset, feature));
1686 }
1687 
1688 void
1689 add_x86_feature(void *featureset, uint_t feature)
1690 {
1691 	ASSERT(feature < NUM_X86_FEATURES);
1692 	BT_SET((ulong_t *)featureset, feature);
1693 }
1694 
1695 void
1696 remove_x86_feature(void *featureset, uint_t feature)
1697 {
1698 	ASSERT(feature < NUM_X86_FEATURES);
1699 	BT_CLEAR((ulong_t *)featureset, feature);
1700 }
1701 
1702 boolean_t
1703 compare_x86_featureset(void *setA, void *setB)
1704 {
1705 	/*
1706 	 * We assume that the unused bits of the bitmap are always zero.
1707 	 */
1708 	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1709 		return (B_TRUE);
1710 	} else {
1711 		return (B_FALSE);
1712 	}
1713 }
1714 
1715 void
1716 print_x86_featureset(void *featureset)
1717 {
1718 	uint_t i;
1719 
1720 	for (i = 0; i < NUM_X86_FEATURES; i++) {
1721 		if (is_x86_feature(featureset, i)) {
1722 			cmn_err(CE_CONT, "?x86_feature: %s\n",
1723 			    x86_feature_names[i]);
1724 		}
1725 	}
1726 }
1727 
1728 /* Note: This is the maximum size for the CPU, not the size of the structure. */
1729 static size_t xsave_state_size = 0;
1730 uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1731 boolean_t xsave_force_disable = B_FALSE;
1732 extern int disable_smap;
1733 
1734 /*
1735  * This is set to platform type we are running on.
1736  */
1737 static int platform_type = -1;
1738 
1739 #if !defined(__xpv)
1740 /*
1741  * Variable to patch if hypervisor platform detection needs to be
1742  * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1743  */
1744 int enable_platform_detection = 1;
1745 #endif
1746 
1747 /*
1748  * monitor/mwait info.
1749  *
1750  * size_actual and buf_actual are the real address and size allocated to get
1751  * proper mwait_buf alignment.  buf_actual and size_actual should be passed
1752  * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
1753  * processor cache-line alignment, but this is not guaranteed in the future.
1754  */
1755 struct mwait_info {
1756 	size_t		mon_min;	/* min size to avoid missed wakeups */
1757 	size_t		mon_max;	/* size to avoid false wakeups */
1758 	size_t		size_actual;	/* size actually allocated */
1759 	void		*buf_actual;	/* memory actually allocated */
1760 	uint32_t	support;	/* processor support of monitor/mwait */
1761 };
1762 
1763 /*
1764  * xsave/xrestor info.
1765  *
1766  * This structure contains HW feature bits and the size of the xsave save area.
1767  * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1768  * (xsave_state) to describe the xsave layout. However, at runtime the
1769  * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1770  * xsave_state structure simply represents the legacy layout of the beginning
1771  * of the xsave area.
1772  */
1773 struct xsave_info {
1774 	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1775 	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1776 	size_t		xsav_max_size;  /* max size save area for HW features */
1777 	size_t		ymm_size;	/* AVX: size of ymm save area */
1778 	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1779 	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1780 	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1781 	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1782 	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1783 	size_t		opmask_size;	/* AVX512: size of opmask save */
1784 	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1785 	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1786 	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1787 	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1788 	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1789 };
1790 
1791 
1792 /*
1793  * These constants determine how many of the elements of the
1794  * cpuid we cache in the cpuid_info data structure; the
1795  * remaining elements are accessible via the cpuid instruction.
1796  */
1797 
1798 #define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1799 #define	NMAX_CPI_EXTD	0x22		/* eax = 0x80000000 .. 0x80000021 */
1800 #define	NMAX_CPI_TOPO	0x10		/* Sanity check on leaf 8X26, 1F */
1801 
1802 /*
1803  * See the big theory statement for a more detailed explanation of what some of
1804  * these members mean.
1805  */
1806 struct cpuid_info {
1807 	uint_t cpi_pass;		/* last pass completed */
1808 	/*
1809 	 * standard function information
1810 	 */
1811 	uint_t cpi_maxeax;		/* fn 0: %eax */
1812 	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1813 	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1814 
1815 	uint_t cpi_family;		/* fn 1: extended family */
1816 	uint_t cpi_model;		/* fn 1: extended model */
1817 	uint_t cpi_step;		/* fn 1: stepping */
1818 	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1819 					/*		AMD: package/socket # */
1820 	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1821 	int cpi_clogid;			/* fn 1: %ebx: thread # */
1822 	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1823 	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1824 	uint_t cpi_ncache;		/* fn 2: number of elements */
1825 	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1826 	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1827 	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1828 					/* Intel fn: 4, AMD fn: 8000001d */
1829 	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1830 	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1831 	struct cpuid_regs cpi_sub7[1];	/* Leaf 7, sub-leaf 1 */
1832 	/*
1833 	 * extended function information
1834 	 */
1835 	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1836 	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1837 	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1838 	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1839 	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1840 	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1841 
1842 	id_t cpi_coreid;		/* same coreid => strands share core */
1843 	int cpi_pkgcoreid;		/* core number within single package */
1844 	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1845 					/* Intel: fn 4: %eax[31-26] */
1846 
1847 	/*
1848 	 * These values represent the number of bits that are required to store
1849 	 * information about the number of cores and threads.
1850 	 */
1851 	uint_t cpi_ncore_bits;
1852 	uint_t cpi_nthread_bits;
1853 	/*
1854 	 * supported feature information
1855 	 */
1856 	uint32_t cpi_support[6];
1857 #define	STD_EDX_FEATURES	0
1858 #define	AMD_EDX_FEATURES	1
1859 #define	TM_EDX_FEATURES		2
1860 #define	STD_ECX_FEATURES	3
1861 #define	AMD_ECX_FEATURES	4
1862 #define	STD_EBX_FEATURES	5
1863 	/*
1864 	 * Synthesized information, where known.
1865 	 */
1866 	x86_chiprev_t cpi_chiprev;	/* See X86_CHIPREV_* in x86_archext.h */
1867 	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1868 	uint32_t cpi_socket;		/* Chip package/socket type */
1869 	x86_uarchrev_t cpi_uarchrev;	/* Microarchitecture and revision */
1870 
1871 	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1872 	uint32_t cpi_apicid;
1873 	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1874 	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1875 					/* Intel: 1 */
1876 	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1877 	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1878 
1879 	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1880 
1881 	/*
1882 	 * AMD and Intel extended topology information. Leaf 8X26 (AMD) and
1883 	 * eventually leaf 0x1F (Intel).
1884 	 */
1885 	uint_t cpi_topo_nleaves;
1886 	struct cpuid_regs cpi_topo[NMAX_CPI_TOPO];
1887 };
1888 
1889 
1890 static struct cpuid_info cpuid_info0;
1891 
1892 /*
1893  * These bit fields are defined by the Intel Application Note AP-485
1894  * "Intel Processor Identification and the CPUID Instruction"
1895  */
1896 #define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1897 #define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1898 #define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1899 #define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1900 #define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1901 #define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1902 
1903 #define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1904 #define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1905 #define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1906 #define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1907 #define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1908 #define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1909 #define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1910 #define	CPI_FEATURES_7_1_EAX(cpi)	((cpi)->cpi_sub7[0].cp_eax)
1911 
1912 #define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1913 #define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1914 #define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1915 #define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1916 
1917 #define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1918 #define	CPI_XMAXEAX_MAX		0x80000100
1919 #define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1920 #define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1921 
1922 /*
1923  * Function 4 (Deterministic Cache Parameters) macros
1924  * Defined by Intel Application Note AP-485
1925  */
1926 #define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1927 #define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1928 #define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1929 #define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1930 #define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1931 #define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1932 #define	CPI_CACHE_TYPE_DONE	0
1933 #define	CPI_CACHE_TYPE_DATA	1
1934 #define	CPI_CACHE_TYPE_INSTR	2
1935 #define	CPI_CACHE_TYPE_UNIFIED	3
1936 #define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1937 
1938 #define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1939 #define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1940 #define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1941 
1942 #define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1943 
1944 #define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1945 
1946 
1947 /*
1948  * A couple of shorthand macros to identify "later" P6-family chips
1949  * like the Pentium M and Core.  First, the "older" P6-based stuff
1950  * (loosely defined as "pre-Pentium-4"):
1951  * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1952  */
1953 #define	IS_LEGACY_P6(cpi) (			\
1954 	cpi->cpi_family == 6 &&			\
1955 		(cpi->cpi_model == 1 ||		\
1956 		cpi->cpi_model == 3 ||		\
1957 		cpi->cpi_model == 5 ||		\
1958 		cpi->cpi_model == 6 ||		\
1959 		cpi->cpi_model == 7 ||		\
1960 		cpi->cpi_model == 8 ||		\
1961 		cpi->cpi_model == 0xA ||	\
1962 		cpi->cpi_model == 0xB)		\
1963 )
1964 
1965 /* A "new F6" is everything with family 6 that's not the above */
1966 #define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1967 
1968 /* Extended family/model support */
1969 #define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1970 	cpi->cpi_family >= 0xf)
1971 
1972 /*
1973  * Info for monitor/mwait idle loop.
1974  *
1975  * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1976  * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1977  * 2006.
1978  * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1979  * Documentation Updates" #33633, Rev 2.05, December 2006.
1980  */
1981 #define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
1982 #define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
1983 #define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1984 #define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1985 #define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1986 #define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1987 #define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1988 #define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1989 /*
1990  * Number of sub-cstates for a given c-state.
1991  */
1992 #define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1993 	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1994 
1995 /*
1996  * XSAVE leaf 0xD enumeration
1997  */
1998 #define	CPUID_LEAFD_2_YMM_OFFSET	576
1999 #define	CPUID_LEAFD_2_YMM_SIZE		256
2000 
2001 /*
2002  * Common extended leaf names to cut down on typos.
2003  */
2004 #define	CPUID_LEAF_EXT_0		0x80000000
2005 #define	CPUID_LEAF_EXT_8		0x80000008
2006 #define	CPUID_LEAF_EXT_1d		0x8000001d
2007 #define	CPUID_LEAF_EXT_1e		0x8000001e
2008 #define	CPUID_LEAF_EXT_21		0x80000021
2009 #define	CPUID_LEAF_EXT_26		0x80000026
2010 
2011 /*
2012  * Functions we consume from cpuid_subr.c;  don't publish these in a header
2013  * file to try and keep people using the expected cpuid_* interfaces.
2014  */
2015 extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
2016 extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
2017 extern x86_chiprev_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
2018 extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
2019 extern x86_uarchrev_t _cpuid_uarchrev(uint_t, uint_t, uint_t, uint_t);
2020 extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
2021 
2022 /*
2023  * Apply various platform-dependent restrictions where the
2024  * underlying platform restrictions mean the CPU can be marked
2025  * as less capable than its cpuid instruction would imply.
2026  */
2027 #if defined(__xpv)
2028 static void
2029 platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
2030 {
2031 	switch (eax) {
2032 	case 1: {
2033 		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
2034 		    0 : CPUID_INTC_EDX_MCA;
2035 		cp->cp_edx &=
2036 		    ~(mcamask |
2037 		    CPUID_INTC_EDX_PSE |
2038 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2039 		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
2040 		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
2041 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2042 		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
2043 		break;
2044 	}
2045 
2046 	case 0x80000001:
2047 		cp->cp_edx &=
2048 		    ~(CPUID_AMD_EDX_PSE |
2049 		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
2050 		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
2051 		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
2052 		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
2053 		    CPUID_AMD_EDX_TSCP);
2054 		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
2055 		break;
2056 	default:
2057 		break;
2058 	}
2059 
2060 	switch (vendor) {
2061 	case X86_VENDOR_Intel:
2062 		switch (eax) {
2063 		case 4:
2064 			/*
2065 			 * Zero out the (ncores-per-chip - 1) field
2066 			 */
2067 			cp->cp_eax &= 0x03fffffff;
2068 			break;
2069 		default:
2070 			break;
2071 		}
2072 		break;
2073 	case X86_VENDOR_AMD:
2074 	case X86_VENDOR_HYGON:
2075 		switch (eax) {
2076 
2077 		case 0x80000001:
2078 			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
2079 			break;
2080 
2081 		case CPUID_LEAF_EXT_8:
2082 			/*
2083 			 * Zero out the (ncores-per-chip - 1) field
2084 			 */
2085 			cp->cp_ecx &= 0xffffff00;
2086 			break;
2087 		default:
2088 			break;
2089 		}
2090 		break;
2091 	default:
2092 		break;
2093 	}
2094 }
2095 #else
2096 #define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
2097 #endif
2098 
2099 /*
2100  *  Some undocumented ways of patching the results of the cpuid
2101  *  instruction to permit running Solaris 10 on future cpus that
2102  *  we don't currently support.  Could be set to non-zero values
2103  *  via settings in eeprom.
2104  */
2105 
2106 uint32_t cpuid_feature_ecx_include;
2107 uint32_t cpuid_feature_ecx_exclude;
2108 uint32_t cpuid_feature_edx_include;
2109 uint32_t cpuid_feature_edx_exclude;
2110 
2111 /*
2112  * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
2113  */
2114 void
2115 cpuid_alloc_space(cpu_t *cpu)
2116 {
2117 	/*
2118 	 * By convention, cpu0 is the boot cpu, which is set up
2119 	 * before memory allocation is available.  All other cpus get
2120 	 * their cpuid_info struct allocated here.
2121 	 */
2122 	ASSERT(cpu->cpu_id != 0);
2123 	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
2124 	cpu->cpu_m.mcpu_cpi =
2125 	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
2126 }
2127 
2128 void
2129 cpuid_free_space(cpu_t *cpu)
2130 {
2131 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2132 	int i;
2133 
2134 	ASSERT(cpi != NULL);
2135 	ASSERT(cpi != &cpuid_info0);
2136 
2137 	/*
2138 	 * Free up any cache leaf related dynamic storage. The first entry was
2139 	 * cached from the standard cpuid storage, so we should not free it.
2140 	 */
2141 	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
2142 		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
2143 	if (cpi->cpi_cache_leaf_size > 0)
2144 		kmem_free(cpi->cpi_cache_leaves,
2145 		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
2146 
2147 	kmem_free(cpi, sizeof (*cpi));
2148 	cpu->cpu_m.mcpu_cpi = NULL;
2149 }
2150 
2151 #if !defined(__xpv)
2152 /*
2153  * Determine the type of the underlying platform. This is used to customize
2154  * initialization of various subsystems (e.g. TSC). determine_platform() must
2155  * only ever be called once to prevent two processors from seeing different
2156  * values of platform_type. Must be called before cpuid_pass_ident(), the
2157  * earliest consumer to execute; the identification pass will call
2158  * synth_amd_info() to compute the chiprev, which in turn calls get_hwenv().
2159  */
2160 void
2161 determine_platform(void)
2162 {
2163 	struct cpuid_regs cp;
2164 	uint32_t base;
2165 	uint32_t regs[4];
2166 	char *hvstr = (char *)regs;
2167 
2168 	ASSERT(platform_type == -1);
2169 
2170 	platform_type = HW_NATIVE;
2171 
2172 	if (!enable_platform_detection)
2173 		return;
2174 
2175 	/*
2176 	 * If Hypervisor CPUID bit is set, try to determine hypervisor
2177 	 * vendor signature, and set platform type accordingly.
2178 	 *
2179 	 * References:
2180 	 * http://lkml.org/lkml/2008/10/1/246
2181 	 * http://kb.vmware.com/kb/1009458
2182 	 */
2183 	cp.cp_eax = 0x1;
2184 	(void) __cpuid_insn(&cp);
2185 	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
2186 		cp.cp_eax = 0x40000000;
2187 		(void) __cpuid_insn(&cp);
2188 		regs[0] = cp.cp_ebx;
2189 		regs[1] = cp.cp_ecx;
2190 		regs[2] = cp.cp_edx;
2191 		regs[3] = 0;
2192 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
2193 			platform_type = HW_XEN_HVM;
2194 			return;
2195 		}
2196 		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
2197 			platform_type = HW_VMWARE;
2198 			return;
2199 		}
2200 		if (strcmp(hvstr, HVSIG_KVM) == 0) {
2201 			platform_type = HW_KVM;
2202 			return;
2203 		}
2204 		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
2205 			platform_type = HW_BHYVE;
2206 			return;
2207 		}
2208 		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0) {
2209 			platform_type = HW_MICROSOFT;
2210 			return;
2211 		}
2212 		if (strcmp(hvstr, HVSIG_QEMU_TCG) == 0) {
2213 			platform_type = HW_QEMU_TCG;
2214 			return;
2215 		}
2216 	} else {
2217 		/*
2218 		 * Check older VMware hardware versions. VMware hypervisor is
2219 		 * detected by performing an IN operation to VMware hypervisor
2220 		 * port and checking that value returned in %ebx is VMware
2221 		 * hypervisor magic value.
2222 		 *
2223 		 * References: http://kb.vmware.com/kb/1009458
2224 		 */
2225 		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
2226 		if (regs[1] == VMWARE_HVMAGIC) {
2227 			platform_type = HW_VMWARE;
2228 			return;
2229 		}
2230 	}
2231 
2232 	/*
2233 	 * Check Xen hypervisor. In a fully virtualized domain,
2234 	 * Xen's pseudo-cpuid function returns a string representing the
2235 	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
2236 	 * supported cpuid function. We need at least a (base + 2) leaf value
2237 	 * to do what we want to do. Try different base values, since the
2238 	 * hypervisor might use a different one depending on whether Hyper-V
2239 	 * emulation is switched on by default or not.
2240 	 */
2241 	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
2242 		cp.cp_eax = base;
2243 		(void) __cpuid_insn(&cp);
2244 		regs[0] = cp.cp_ebx;
2245 		regs[1] = cp.cp_ecx;
2246 		regs[2] = cp.cp_edx;
2247 		regs[3] = 0;
2248 		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
2249 		    cp.cp_eax >= (base + 2)) {
2250 			platform_type &= ~HW_NATIVE;
2251 			platform_type |= HW_XEN_HVM;
2252 			return;
2253 		}
2254 	}
2255 }
2256 
2257 int
2258 get_hwenv(void)
2259 {
2260 	ASSERT(platform_type != -1);
2261 	return (platform_type);
2262 }
2263 
2264 int
2265 is_controldom(void)
2266 {
2267 	return (0);
2268 }
2269 
2270 #else
2271 
2272 int
2273 get_hwenv(void)
2274 {
2275 	return (HW_XEN_PV);
2276 }
2277 
2278 int
2279 is_controldom(void)
2280 {
2281 	return (DOMAIN_IS_INITDOMAIN(xen_info));
2282 }
2283 
2284 #endif	/* __xpv */
2285 
2286 /*
2287  * Gather the extended topology information. This should be the same for both
2288  * AMD leaf 8X26 and Intel leaf 0x1F (though the data interpretation varies).
2289  */
2290 static void
2291 cpuid_gather_ext_topo_leaf(struct cpuid_info *cpi, uint32_t leaf)
2292 {
2293 	uint_t i;
2294 
2295 	for (i = 0; i < ARRAY_SIZE(cpi->cpi_topo); i++) {
2296 		struct cpuid_regs *regs = &cpi->cpi_topo[i];
2297 
2298 		bzero(regs, sizeof (struct cpuid_regs));
2299 		regs->cp_eax = leaf;
2300 		regs->cp_ecx = i;
2301 
2302 		(void) __cpuid_insn(regs);
2303 		if (CPUID_AMD_8X26_ECX_TYPE(regs->cp_ecx) ==
2304 		    CPUID_AMD_8X26_TYPE_DONE) {
2305 			break;
2306 		}
2307 	}
2308 
2309 	cpi->cpi_topo_nleaves = i;
2310 }
2311 
2312 /*
2313  * Make sure that we have gathered all of the CPUID leaves that we might need to
2314  * determine topology. We assume that the standard leaf 1 has already been done
2315  * and that xmaxeax has already been calculated.
2316  */
2317 static void
2318 cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2319 {
2320 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2321 
2322 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2323 		struct cpuid_regs *cp;
2324 
2325 		cp = &cpi->cpi_extd[8];
2326 		cp->cp_eax = CPUID_LEAF_EXT_8;
2327 		(void) __cpuid_insn(cp);
2328 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2329 	}
2330 
2331 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2332 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2333 		struct cpuid_regs *cp;
2334 
2335 		cp = &cpi->cpi_extd[0x1e];
2336 		cp->cp_eax = CPUID_LEAF_EXT_1e;
2337 		(void) __cpuid_insn(cp);
2338 	}
2339 
2340 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_26) {
2341 		cpuid_gather_ext_topo_leaf(cpi, CPUID_LEAF_EXT_26);
2342 	}
2343 }
2344 
2345 /*
2346  * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2347  * it to everything else. If not, and we're on an AMD system where 8000001e is
2348  * valid, then we use that. Otherwise, we fall back to the default value for the
2349  * APIC ID in leaf 1.
2350  */
2351 static uint32_t
2352 cpuid_gather_apicid(struct cpuid_info *cpi)
2353 {
2354 	/*
2355 	 * Leaf B changes based on the arguments to it. Because we don't cache
2356 	 * it, we need to gather it again.
2357 	 */
2358 	if (cpi->cpi_maxeax >= 0xB) {
2359 		struct cpuid_regs regs;
2360 		struct cpuid_regs *cp;
2361 
2362 		cp = &regs;
2363 		cp->cp_eax = 0xB;
2364 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2365 		(void) __cpuid_insn(cp);
2366 
2367 		if (cp->cp_ebx != 0) {
2368 			return (cp->cp_edx);
2369 		}
2370 	}
2371 
2372 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
2373 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
2374 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2375 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2376 		return (cpi->cpi_extd[0x1e].cp_eax);
2377 	}
2378 
2379 	return (CPI_APIC_ID(cpi));
2380 }
2381 
2382 /*
2383  * For AMD processors, attempt to calculate the number of chips and cores that
2384  * exist. The way that we do this varies based on the generation, because the
2385  * generations themselves have changed dramatically.
2386  *
2387  * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2388  * However, with the advent of family 17h (Zen) it actually tells us the number
2389  * of threads, so we need to look at leaf 0x8000001e if available to determine
2390  * its value. Otherwise, for all prior families, the number of enabled cores is
2391  * the same as threads.
2392  *
2393  * If we do not have leaf 0x80000008, then we assume that this processor does
2394  * not have anything. AMD's older CPUID specification says there's no reason to
2395  * fall back to leaf 1.
2396  *
2397  * In some virtualization cases we will not have leaf 8000001e or it will be
2398  * zero. When that happens we assume the number of threads is one.
2399  */
2400 static void
2401 cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2402 {
2403 	uint_t nthreads, nthread_per_core;
2404 
2405 	nthreads = nthread_per_core = 1;
2406 
2407 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2408 		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2409 	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2410 		nthreads = CPI_CPU_COUNT(cpi);
2411 	}
2412 
2413 	/*
2414 	 * For us to have threads, and know about it, we have to be at least at
2415 	 * family 17h and have the cpuid bit that says we have extended
2416 	 * topology.
2417 	 */
2418 	if (cpi->cpi_family >= 0x17 &&
2419 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2420 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2421 		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2422 	}
2423 
2424 	*ncpus = nthreads;
2425 	*ncores = nthreads / nthread_per_core;
2426 }
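/*
 * Worked example with hypothetical values for the calculation above: on an
 * 8-core/16-thread Zen-style part, leaf 0x80000008 %ecx[7:0] would be 15, so
 * nthreads = 16; leaf 0x8000001e %ebx[15:8] would be 1, so nthread_per_core =
 * 2, and we report ncpus = 16 and ncores = 8.
 */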
2427 
2428 /*
2429  * Seed the initial values for the cores and threads for an Intel based
2430  * processor. These values will be overwritten if we detect that the processor
2431  * supports CPUID leaf 0xb.
2432  */
2433 static void
2434 cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2435 {
2436 	/*
2437 	 * Only seed the number of physical cores from the first level leaf 4
2438 	 * information. The number of threads there indicates how many share the
2439 	 * L1 cache, which may or may not have anything to do with the number of
2440 	 * logical CPUs per core.
2441 	 */
2442 	if (cpi->cpi_maxeax >= 4) {
2443 		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2444 	} else {
2445 		*ncores = 1;
2446 	}
2447 
2448 	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2449 		*ncpus = CPI_CPU_COUNT(cpi);
2450 	} else {
2451 		*ncpus = *ncores;
2452 	}
2453 }
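/*
 * Worked example with hypothetical values for the seeding above: if leaf 4
 * %eax[31:26] is 3, we seed *ncores = 4; if leaf 1 reports HTT with a CPU
 * count of 8, we seed *ncpus = 8, implying two logical CPUs per core until
 * leaf 0xB refines this.
 */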
2454 
2455 static boolean_t
2456 cpuid_leafB_getids(cpu_t *cpu)
2457 {
2458 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2459 	struct cpuid_regs regs;
2460 	struct cpuid_regs *cp;
2461 
2462 	if (cpi->cpi_maxeax < 0xB)
2463 		return (B_FALSE);
2464 
2465 	cp = &regs;
2466 	cp->cp_eax = 0xB;
2467 	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2468 
2469 	(void) __cpuid_insn(cp);
2470 
2471 	/*
2472 	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2473 	 * indicates that the extended topology enumeration leaf is
2474 	 * available.
2475 	 */
2476 	if (cp->cp_ebx != 0) {
2477 		uint32_t x2apic_id = 0;
2478 		uint_t coreid_shift = 0;
2479 		uint_t ncpu_per_core = 1;
2480 		uint_t chipid_shift = 0;
2481 		uint_t ncpu_per_chip = 1;
2482 		uint_t i;
2483 		uint_t level;
2484 
2485 		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2486 			cp->cp_eax = 0xB;
2487 			cp->cp_ecx = i;
2488 
2489 			(void) __cpuid_insn(cp);
2490 			level = CPI_CPU_LEVEL_TYPE(cp);
2491 
2492 			if (level == 1) {
2493 				x2apic_id = cp->cp_edx;
2494 				coreid_shift = BITX(cp->cp_eax, 4, 0);
2495 				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2496 			} else if (level == 2) {
2497 				x2apic_id = cp->cp_edx;
2498 				chipid_shift = BITX(cp->cp_eax, 4, 0);
2499 				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2500 			}
2501 		}
2502 
2503 		/*
2504 		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2505 		 */
2506 		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2507 		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2508 		    ncpu_per_core;
2509 		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2510 		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2511 		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2512 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2513 		cpi->cpi_procnodeid = cpi->cpi_chipid;
2514 		cpi->cpi_compunitid = cpi->cpi_coreid;
2515 
2516 		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2517 			cpi->cpi_nthread_bits = coreid_shift;
2518 			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2519 		}
2520 
2521 		return (B_TRUE);
2522 	} else {
2523 		return (B_FALSE);
2524 	}
2525 }
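/*
 * Worked example with hypothetical values for the leaf 0xB decomposition
 * above: with coreid_shift = 1, chipid_shift = 4 and an x2APIC ID of 0x13,
 * we get cpi_chipid = 0x13 >> 4 = 1, cpi_clogid = 0x13 & 0xf = 3,
 * cpi_coreid = 0x13 >> 1 = 9 and cpi_pkgcoreid = 3 >> 1 = 1, with
 * cpi_nthread_bits = 1 and cpi_ncore_bits = 3.
 */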
2526 
2527 static void
2528 cpuid_intel_getids(cpu_t *cpu, void *feature)
2529 {
2530 	uint_t i;
2531 	uint_t chipid_shift = 0;
2532 	uint_t coreid_shift = 0;
2533 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2534 
2535 	/*
2536 	 * There are no compute units or processor nodes currently on Intel.
2537 	 * Always set these to one.
2538 	 */
2539 	cpi->cpi_procnodes_per_pkg = 1;
2540 	cpi->cpi_cores_per_compunit = 1;
2541 
2542 	/*
2543 	 * If cpuid Leaf B is present, use that to try and get this information.
2544 	 * It will be the most accurate for Intel CPUs.
2545 	 */
2546 	if (cpuid_leafB_getids(cpu))
2547 		return;
2548 
2549 	/*
2550 	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2551 	 * and ncore_per_chip. These represent the largest power of two values
2552 	 * that we need to cover all of the IDs in the system. Therefore, we use
2553 	 * those values to seed the number of bits needed to cover information
2554 	 * in the case when leaf B is not available. These values will probably
2555 	 * be larger than required, but that's OK.
2556 	 */
2557 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2558 	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2559 
2560 	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2561 		chipid_shift++;
2562 
2563 	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2564 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2565 
2566 	if (is_x86_feature(feature, X86FSET_CMP)) {
2567 		/*
2568 		 * Multi-core (and possibly multi-threaded)
2569 		 * processors.
2570 		 */
2571 		uint_t ncpu_per_core = 0;
2572 
2573 		if (cpi->cpi_ncore_per_chip == 1)
2574 			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2575 		else if (cpi->cpi_ncore_per_chip > 1)
2576 			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2577 			    cpi->cpi_ncore_per_chip;
2578 		/*
2579 		 * 8bit APIC IDs on dual core Pentiums
2580 		 * look like this:
2581 		 *
2582 		 * +-----------------------+------+------+
2583 		 * | Physical Package ID   |  MC  |  HT  |
2584 		 * +-----------------------+------+------+
2585 		 * <------- chipid -------->
2586 		 * <------- coreid --------------->
2587 		 *			   <--- clogid -->
2588 		 *			   <------>
2589 		 *			   pkgcoreid
2590 		 *
2591 		 * Where the number of bits necessary to
2592 		 * represent the MC and HT fields together equals
2593 		 * the minimum number of bits necessary to
2594 		 * store the value of cpi->cpi_ncpu_per_chip.
2595 		 * Of those bits, the MC part uses the number
2596 		 * of bits necessary to store the value of
2597 		 * cpi->cpi_ncore_per_chip.
2598 		 */
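		/*
		 * Worked example with hypothetical values: for a package
		 * with cpi_ncpu_per_chip = 4 and cpi_ncore_per_chip = 2,
		 * ncpu_per_core = 2 and coreid_shift below becomes 1
		 * (chipid_shift was already 2). An APIC ID of 0x5 then
		 * yields chipid = 1, clogid = 1, coreid = 2 and
		 * pkgcoreid = 0.
		 */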
2599 		for (i = 1; i < ncpu_per_core; i <<= 1)
2600 			coreid_shift++;
2601 		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2602 		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2603 	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2604 		/*
2605 		 * Single-core multi-threaded processors.
2606 		 */
2607 		cpi->cpi_coreid = cpi->cpi_chipid;
2608 		cpi->cpi_pkgcoreid = 0;
2609 	} else {
2610 		/*
2611 		 * Single-core single-thread processors.
2612 		 */
2613 		cpi->cpi_coreid = cpu->cpu_id;
2614 		cpi->cpi_pkgcoreid = 0;
2615 	}
2616 	cpi->cpi_procnodeid = cpi->cpi_chipid;
2617 	cpi->cpi_compunitid = cpi->cpi_coreid;
2618 }
2619 
2620 /*
2621  * Historically, AMD has had CMP chips with only a single thread per core.
2622  * However, starting in family 17h (Zen), this has changed and they now have
2623  * multiple threads. Our internal core id needs to be a unique value.
2624  *
2625  * To determine the core id of an AMD system, if we're from a family before 17h,
2626  * then we just use the cpu id, as that gives us a good value that will be
2627  * unique for each core. If instead, we're on family 17h or later, then we need
2628  * to do something more complicated. CPUID leaf 0x8000001e can tell us
2629  * how many threads are in the system. Based on that, we'll shift the APIC ID.
2630  * We can't use the normal core id in that leaf as it's only unique within the
2631  * socket, which is perfect for cpi_pkgcoreid, but not for our purposes.
2632  */
2633 static id_t
2634 cpuid_amd_get_coreid(cpu_t *cpu)
2635 {
2636 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2637 
2638 	if (cpi->cpi_family >= 0x17 &&
2639 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2640 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2641 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2642 		if (nthreads > 1) {
2643 			VERIFY3U(nthreads, ==, 2);
2644 			return (cpi->cpi_apicid >> 1);
2645 		}
2646 	}
2647 
2648 	return (cpu->cpu_id);
2649 }
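/*
 * Worked example with hypothetical values: on a family 17h part reporting two
 * threads per core in leaf 0x8000001e, the SMT siblings with APIC IDs 6 and 7
 * both resolve to core id 3; on an older family the cpu id is used directly.
 */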
2650 
2651 /*
2652  * Deriving IDs on AMD is a more challenging task. This is notable because of the
2653  * following two facts:
2654  *
2655  *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2656  *     also no way to get an actual unique core id from the system. As such, we
2657  *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2658  *     however, guarantee that sibling cores of a chip will have sequential
2659  *     coreids starting at a multiple of the number of cores per chip - that is
2660  *     usually the case, but if the APIC IDs have been set up in a different
2661  *     order then we need to perform a few more gymnastics for the pkgcoreid.
2662  *
2663  *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2664  *     called compute units. These compute units share the L1I cache, L2 cache,
2665  *     and the FPU. To deal with this, a new topology leaf was added in
2666  *     0x8000001e. However, parts of this leaf have different meanings
2667  *     once we get to family 0x17.
2668  */
2669 
2670 static void
2671 cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2672 {
2673 	int i, first_half, coreidsz;
2674 	uint32_t nb_caps_reg;
2675 	uint_t node2_1;
2676 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2677 	struct cpuid_regs *cp;
2678 
2679 	/*
2680 	 * Calculate the core id (this comes from hardware in family 0x17 if it
2681 	 * hasn't been stripped by virtualization). We always set the compute
2682 	 * unit id to the same value. Also, initialize the default number of
2683 	 * cores per compute unit and nodes per package. This will be
2684 	 * overwritten when we know information about a particular family.
2685 	 */
2686 	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2687 	cpi->cpi_compunitid = cpi->cpi_coreid;
2688 	cpi->cpi_cores_per_compunit = 1;
2689 	cpi->cpi_procnodes_per_pkg = 1;
2690 
2691 	/*
2692 	 * To construct the logical ID, we need to determine how many APIC IDs
2693 	 * are dedicated to the cores and threads. This is provided for us in
2694 	 * 0x80000008. However, if it's not present (say due to virtualization),
2695 	 * then we assume it's one. This should be present on all 64-bit AMD
2696 	 * processors.  It was added in family 0xf (Hammer).
2697 	 */
2698 	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2699 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2700 
2701 		/*
2702 		 * In AMD parlance a chip is really a node, while illumos
2703 		 * uses chip as equivalent to socket/package.
2704 		 */
2705 		if (coreidsz == 0) {
2706 			/* Use legacy method */
2707 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2708 				coreidsz++;
2709 			if (coreidsz == 0)
2710 				coreidsz = 1;
2711 		}
2712 	} else {
2713 		/* Assume single-core part */
2714 		coreidsz = 1;
2715 	}
2716 	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
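	/*
	 * Worked example with hypothetical values: an ApicIdCoreIdSize of 4
	 * gives coreidsz = 4, so an APIC ID of 0x1a produces
	 * cpi_clogid = 0x1a & 0xf = 0xa.
	 */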
2717 
2718 	/*
2719 	 * The package core ID varies depending on the family. While it may be
2720 	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2721 	 * this value is the core id in the given node. For non-virtualized
2722 	 * family 17h, we need to take the logical core id and shift off the
2723 	 * threads like we do when getting the core id.  Otherwise, we can use
2724 	 * the clogid as is. When family 17h is virtualized, the clogid should
2725 	 * be sufficient: if we don't have valid data in the leaf, then we
2726 	 * won't think we have SMT, in which case the cpi_clogid is all we
2727 	 * need.
2728 	 */
2729 	if (cpi->cpi_family >= 0x17 &&
2730 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2731 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2732 	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2733 		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2734 		if (nthreads > 1) {
2735 			VERIFY3U(nthreads, ==, 2);
2736 			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2737 		} else {
2738 			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2739 		}
2740 	} else {
2741 		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2742 	}
2743 
2744 	/*
2745 	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2746 	 * (bulldozer) or newer, then we can derive all of this from leaf
2747 	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2748 	 */
2749 	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2750 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2751 		cp = &cpi->cpi_extd[0x1e];
2752 
2753 		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2754 		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2755 
2756 		/*
2757 		 * For Bulldozer-era CPUs, recalculate the compute unit
2758 		 * information.
2759 		 */
2760 		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2761 			cpi->cpi_cores_per_compunit =
2762 			    BITX(cp->cp_ebx, 15, 8) + 1;
2763 			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2764 			    (cpi->cpi_ncore_per_chip /
2765 			    cpi->cpi_cores_per_compunit) *
2766 			    (cpi->cpi_procnodeid /
2767 			    cpi->cpi_procnodes_per_pkg);
2768 		}
2769 	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2770 		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2771 	} else if (cpi->cpi_family == 0x10) {
2772 		/*
2773 		 * See if we are a multi-node processor.
2774 		 * All processors in the system have the same number of nodes
2775 		 */
2776 		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2777 		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2778 			/* Single-node */
2779 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2780 			    coreidsz);
2781 		} else {
2782 
2783 			/*
2784 			 * Multi-node revision D (2 nodes per package
2785 			 * are supported)
2786 			 */
2787 			cpi->cpi_procnodes_per_pkg = 2;
2788 
2789 			first_half = (cpi->cpi_pkgcoreid <=
2790 			    (cpi->cpi_ncore_per_chip/2 - 1));
2791 
2792 			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2793 				/* We are BSP */
2794 				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2795 			} else {
2796 
2797 				/* We are AP */
2798 				/* NodeId[2:1] bits to use for reading F3xe8 */
2799 				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2800 
2801 				nb_caps_reg =
2802 				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2803 
2804 				/*
2805 				 * Check IntNodeNum bit (31:30, but bit 31 is
2806 				 * always 0 on dual-node processors)
2807 				 */
2808 				if (BITX(nb_caps_reg, 30, 30) == 0)
2809 					cpi->cpi_procnodeid = node2_1 +
2810 					    !first_half;
2811 				else
2812 					cpi->cpi_procnodeid = node2_1 +
2813 					    first_half;
2814 			}
2815 		}
2816 	} else {
2817 		cpi->cpi_procnodeid = 0;
2818 	}
2819 
2820 	cpi->cpi_chipid =
2821 	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2822 
2823 	cpi->cpi_ncore_bits = coreidsz;
2824 	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2825 	    cpi->cpi_ncore_per_chip);
2826 }
2827 
2828 static void
2829 spec_uarch_flush_noop(void)
2830 {
2831 }
2832 
2833 /*
2834  * When microcode is present that mitigates MDS, this wrmsr will also flush the
2835  * MDS-related micro-architectural state that would normally happen by calling
2836  * x86_md_clear().
2837  */
2838 static void
2839 spec_uarch_flush_msr(void)
2840 {
2841 	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2842 }
2843 
2844 /*
2845  * This function pointer refers to a function that will flush certain
2846  * micro-architectural state on the processor. This flush is used to mitigate
2847  * three different classes of Intel CPU vulnerabilities: L1TF, MDS, and RFDS.
2848  * This function can point to one of three functions:
2849  *
2850  * - A noop, which is used either because we are vulnerable but do not have
2851  *   microcode available to help mitigate it, or because we are not
2852  *   vulnerable at all.
2853  *
2854  * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2855  *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2856  *   however, it only flushes the MDS related micro-architectural state on the
2857  *   current hyperthread; it does not do anything for the twin.
2858  *
2859  * - x86_md_clear which will flush the MDS related state. This is done when we
2860  *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2861  *   (RDCL_NO is set); or if the CPU is vulnerable to RFDS and indicates VERW
2862  *   can clear it (RFDS_CLEAR is set).
2863  */
2864 void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2865 
2866 static void
2867 cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2868 {
2869 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2870 
2871 	/* Non-Intel doesn't concern us here. */
2872 	if (cpi->cpi_vendor != X86_VENDOR_Intel)
2873 		return;
2874 
2875 	/*
2876 	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2877 	 * has been fixed in hardware, it doesn't cover everything related to
2878 	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2879 	 * need to mitigate this.
2880 	 *
2881 	 * We must ALSO check the case of RFDS_NO and if RFDS_CLEAR is set,
2882 	 * because of the small cases of RFDS.
2883 	 */
2884 
2885 	if ((!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2886 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) ||
2887 	    (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2888 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR))) {
2889 		const uint8_t nop = NOP_INSTR;
2890 		uint8_t *md = (uint8_t *)x86_md_clear;
2891 
2892 		*md = nop;
2893 	}
2894 
2895 	membar_producer();
2896 }
2897 
2898 static void
2899 cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2900 {
2901 	boolean_t need_l1d, need_mds, need_rfds;
2902 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2903 
2904 	/*
2905 	 * If we're not on Intel or we've mitigated all of RDCL, MDS, and RFDS
2906 	 * in hardware, then there's nothing left for us to do for enabling
2907 	 * the flush. We can also go ahead and say that SMT exclusion is
2908 	 * unnecessary.
2909 	 */
2910 	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2911 	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2912 	    is_x86_feature(featureset, X86FSET_MDS_NO) &&
2913 	    is_x86_feature(featureset, X86FSET_RFDS_NO))) {
2914 		extern int smt_exclusion;
2915 		smt_exclusion = 0;
2916 		spec_uarch_flush = spec_uarch_flush_noop;
2917 		membar_producer();
2918 		return;
2919 	}
2920 
2921 	/*
2922 	 * The locations where we need to perform an L1D flush are required both
2923 	 * for mitigating L1TF and MDS. When verw support is present in
2924 	 * microcode, then the L1D flush will take care of doing that as well.
2925 	 * However, if we have a system where RDCL_NO is present, but we don't
2926 	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2927 	 * L1D flush.
2928 	 */
2929 	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2930 	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2931 	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2932 		need_l1d = B_TRUE;
2933 	} else {
2934 		need_l1d = B_FALSE;
2935 	}
2936 
2937 	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2938 	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2939 		need_mds = B_TRUE;
2940 	} else {
2941 		need_mds = B_FALSE;
2942 	}
2943 
2944 	if (!is_x86_feature(featureset, X86FSET_RFDS_NO) &&
2945 	    is_x86_feature(featureset, X86FSET_RFDS_CLEAR)) {
2946 		need_rfds = B_TRUE;
2947 	} else {
2948 		need_rfds = B_FALSE;
2949 	}
2950 
2951 	if (need_l1d) {
2952 		/*
2953 		 * As of Feb, 2024, no CPU needs L1D *and* RFDS mitigation
2954 		 * together. If the following VERIFY trips, we need to add
2955 		 * further fixes here.
2956 		 */
2957 		VERIFY(!need_rfds);
2958 		spec_uarch_flush = spec_uarch_flush_msr;
2959 	} else if (need_mds || need_rfds) {
2960 		spec_uarch_flush = x86_md_clear;
2961 	} else {
2962 		/*
2963 		 * We have no hardware mitigations available to us.
2964 		 */
2965 		spec_uarch_flush = spec_uarch_flush_noop;
2966 	}
2967 	membar_producer();
2968 }
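/*
 * Worked example with hypothetical feature bits: a CPU advertising RDCL_NO
 * and MD_CLEAR but neither MDS_NO nor RFDS_NO takes the x86_md_clear path
 * above, while one that advertises FLUSH_CMD but neither RDCL_NO nor
 * L1D_VM_NO would instead install spec_uarch_flush_msr.
 */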
2969 
2970 /*
2971  * We default to enabling RSB mitigations.
2972  *
2973  * NOTE: We used to skip RSB mitigations with eIBRS, but developments around
2974  * post-barrier RSB guessing suggests we should enable RSB mitigations always
2975  * unless specifically instructed not to.
2976  *
2977  * AMD indicates that when Automatic IBRS is enabled we do not need to implement
2978  * return stack buffer clearing for VMEXIT as it takes care of it. The manual
2979  * also states that as long as SMEP is enabled and we maintain at least one
2980  * page between the kernel and user space (we have much more of a red zone),
2981  * then we do not need to clear the RSB. We constrain this to only when
2982  * Automatic IBRS is present.
2983  */
2984 static void
2985 cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2986 {
2987 	const uint8_t ret = RET_INSTR;
2988 	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2989 
2990 	switch (mit) {
2991 	case X86_SPECTREV2_AUTO_IBRS:
2992 	case X86_SPECTREV2_DISABLED:
2993 		*stuff = ret;
2994 		break;
2995 	default:
2996 		break;
2997 	}
2998 }
2999 
3000 static void
3001 cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
3002 {
3003 	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
3004 	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
3005 	    "_r14", "_r15" };
3006 	const uint_t nthunks = ARRAY_SIZE(thunks);
3007 	const char *type;
3008 	uint_t i;
3009 
3010 	if (mit == x86_spectrev2_mitigation)
3011 		return;
3012 
3013 	switch (mit) {
3014 	case X86_SPECTREV2_RETPOLINE:
3015 		type = "gen";
3016 		break;
3017 	case X86_SPECTREV2_AUTO_IBRS:
3018 	case X86_SPECTREV2_ENHANCED_IBRS:
3019 	case X86_SPECTREV2_DISABLED:
3020 		type = "jmp";
3021 		break;
3022 	default:
3023 		panic("asked to update retpoline state with unknown state!");
3024 	}
3025 
3026 	for (i = 0; i < nthunks; i++) {
3027 		uintptr_t source, dest;
3028 		int ssize, dsize;
3029 		char sourcebuf[64], destbuf[64];
3030 
3031 		(void) snprintf(destbuf, sizeof (destbuf),
3032 		    "__x86_indirect_thunk%s", thunks[i]);
3033 		(void) snprintf(sourcebuf, sizeof (sourcebuf),
3034 		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
3035 
3036 		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
3037 		dest = kobj_getelfsym(destbuf, NULL, &dsize);
3038 		VERIFY3U(source, !=, 0);
3039 		VERIFY3U(dest, !=, 0);
3040 		VERIFY3S(dsize, >=, ssize);
3041 		bcopy((void *)source, (void *)dest, ssize);
3042 	}
3043 }
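/*
 * As an illustration of the patching above: selecting X86_SPECTREV2_RETPOLINE
 * copies, for each register, the bytes of the "gen" thunk (e.g.
 * __x86_indirect_thunk_gen_rax) over the corresponding dispatch symbol
 * (__x86_indirect_thunk_rax), while the IBRS and disabled cases copy the
 * plain "jmp" variant instead.
 */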
3044 
3045 static void
3046 cpuid_enable_enhanced_ibrs(void)
3047 {
3048 	uint64_t val;
3049 
3050 	val = rdmsr(MSR_IA32_SPEC_CTRL);
3051 	val |= IA32_SPEC_CTRL_IBRS;
3052 	wrmsr(MSR_IA32_SPEC_CTRL, val);
3053 }
3054 
3055 static void
3056 cpuid_enable_auto_ibrs(void)
3057 {
3058 	uint64_t val;
3059 
3060 	val = rdmsr(MSR_AMD_EFER);
3061 	val |= AMD_EFER_AIBRSE;
3062 	wrmsr(MSR_AMD_EFER, val);
3063 }
3064 
3065 /*
3066  * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
3067  * we can disable TSX, we do so.
3068  *
3069  * This determination is done only on the boot CPU, potentially after loading
3070  * updated microcode.
3071  */
3072 static void
3073 cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
3074 {
3075 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3076 
3077 	VERIFY(cpu->cpu_id == 0);
3078 
3079 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3080 		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3081 		return;
3082 	}
3083 
3084 	if (x86_disable_taa) {
3085 		x86_taa_mitigation = X86_TAA_DISABLED;
3086 		return;
3087 	}
3088 
3089 	/*
3090 	 * If we do not have the ability to disable TSX, then our only
3091 	 * mitigation options are in hardware (TAA_NO), or by using our existing
3092 	 * MDS mitigation as described above.  The latter relies upon us having
3093 	 * configured MDS mitigations correctly! This includes disabling SMT if
3094  * we want cross-CPU-thread protection.
3095 	 */
3096 	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
3097 		/*
3098 		 * It's not clear whether any parts will enumerate TAA_NO
3099 		 * *without* TSX_CTRL, but let's mark it as such if we see this.
3100 		 */
3101 		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
3102 			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
3103 			return;
3104 		}
3105 
3106 		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
3107 		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
3108 			x86_taa_mitigation = X86_TAA_MD_CLEAR;
3109 		} else {
3110 			x86_taa_mitigation = X86_TAA_NOTHING;
3111 		}
3112 		return;
3113 	}
3114 
3115 	/*
3116 	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
3117 	 * enough in boot.
3118 	 *
3119 	 * Otherwise, we'll fall back to causing transactions to abort as our
3120 	 * mitigation. TSX-using code will always take the fallback path.
3121 	 */
3122 	if (cpi->cpi_pass < 4) {
3123 		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
3124 	} else {
3125 		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
3126 	}
3127 }
3128 
3129 /*
3130  * As mentioned, we should only touch the MSR when we have suitable
3131  * microcode loaded on this CPU.
3132  */
3133 static void
3134 cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
3135 {
3136 	uint64_t val;
3137 
3138 	switch (taa) {
3139 	case X86_TAA_TSX_DISABLE:
3140 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3141 			return;
3142 		val = rdmsr(MSR_IA32_TSX_CTRL);
3143 		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
3144 		wrmsr(MSR_IA32_TSX_CTRL, val);
3145 		break;
3146 	case X86_TAA_TSX_FORCE_ABORT:
3147 		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
3148 			return;
3149 		val = rdmsr(MSR_IA32_TSX_CTRL);
3150 		val |= IA32_TSX_CTRL_RTM_DISABLE;
3151 		wrmsr(MSR_IA32_TSX_CTRL, val);
3152 		break;
3153 	case X86_TAA_HW_MITIGATED:
3154 	case X86_TAA_MD_CLEAR:
3155 	case X86_TAA_DISABLED:
3156 	case X86_TAA_NOTHING:
3157 		break;
3158 	}
3159 }
3160 
3161 static void
3162 cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
3163 {
3164 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3165 	x86_spectrev2_mitigation_t v2mit;
3166 
3167 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
3168 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
3169 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3170 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
3171 			add_x86_feature(featureset, X86FSET_IBPB);
3172 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
3173 			add_x86_feature(featureset, X86FSET_IBRS);
3174 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
3175 			add_x86_feature(featureset, X86FSET_STIBP);
3176 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
3177 			add_x86_feature(featureset, X86FSET_STIBP_ALL);
3178 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
3179 			add_x86_feature(featureset, X86FSET_SSBD);
3180 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
3181 			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
3182 		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
3183 			add_x86_feature(featureset, X86FSET_SSB_NO);
3184 
3185 		/*
3186 		 * Rather than Enhanced IBRS, AMD has a different feature that
3187 		 * is a bit in EFER that can be enabled and will basically do
3188 		 * the right thing while executing in the kernel.
3189 		 */
3190 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3191 		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
3192 		    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21 &&
3193 		    (cpi->cpi_extd[0x21].cp_eax & CPUID_AMD_8X21_EAX_AIBRS)) {
3194 			add_x86_feature(featureset, X86FSET_AUTO_IBRS);
3195 		}
3196 
3197 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3198 	    cpi->cpi_maxeax >= 7) {
3199 		struct cpuid_regs *ecp;
3200 		ecp = &cpi->cpi_std[7];
3201 
3202 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
3203 			add_x86_feature(featureset, X86FSET_MD_CLEAR);
3204 		}
3205 
3206 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
3207 			add_x86_feature(featureset, X86FSET_IBRS);
3208 			add_x86_feature(featureset, X86FSET_IBPB);
3209 		}
3210 
3211 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
3212 			add_x86_feature(featureset, X86FSET_STIBP);
3213 		}
3214 
3215 		/*
3216 		 * Don't read the arch caps MSR on xpv where we lack the
3217 		 * on_trap().
3218 		 */
3219 #ifndef __xpv
3220 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
3221 			on_trap_data_t otd;
3222 
3223 			/*
3224 			 * Be paranoid and assume we'll get a #GP.
3225 			 */
3226 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3227 				uint64_t reg;
3228 
3229 				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
3230 				if (reg & IA32_ARCH_CAP_RDCL_NO) {
3231 					add_x86_feature(featureset,
3232 					    X86FSET_RDCL_NO);
3233 				}
3234 				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
3235 					add_x86_feature(featureset,
3236 					    X86FSET_IBRS_ALL);
3237 				}
3238 				if (reg & IA32_ARCH_CAP_RSBA) {
3239 					add_x86_feature(featureset,
3240 					    X86FSET_RSBA);
3241 				}
3242 				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
3243 					add_x86_feature(featureset,
3244 					    X86FSET_L1D_VM_NO);
3245 				}
3246 				if (reg & IA32_ARCH_CAP_SSB_NO) {
3247 					add_x86_feature(featureset,
3248 					    X86FSET_SSB_NO);
3249 				}
3250 				if (reg & IA32_ARCH_CAP_MDS_NO) {
3251 					add_x86_feature(featureset,
3252 					    X86FSET_MDS_NO);
3253 				}
3254 				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
3255 					add_x86_feature(featureset,
3256 					    X86FSET_TSX_CTRL);
3257 				}
3258 				if (reg & IA32_ARCH_CAP_TAA_NO) {
3259 					add_x86_feature(featureset,
3260 					    X86FSET_TAA_NO);
3261 				}
3262 				if (reg & IA32_ARCH_CAP_RFDS_NO) {
3263 					add_x86_feature(featureset,
3264 					    X86FSET_RFDS_NO);
3265 				}
3266 				if (reg & IA32_ARCH_CAP_RFDS_CLEAR) {
3267 					add_x86_feature(featureset,
3268 					    X86FSET_RFDS_CLEAR);
3269 				}
3270 			}
3271 			no_trap();
3272 		}
3273 #endif	/* !__xpv */
3274 
3275 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
3276 			add_x86_feature(featureset, X86FSET_SSBD);
3277 
3278 		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
3279 			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
3280 	}
3281 
3282 	/*
3283 	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
3284 	 * will have already run this function and determined what we need to
3285 	 * do. This gives us a hook for per-HW thread mitigations such as
3286 	 * enhanced IBRS, or disabling TSX.
3287 	 */
3288 	if (cpu->cpu_id != 0) {
3289 		switch (x86_spectrev2_mitigation) {
3290 		case X86_SPECTREV2_ENHANCED_IBRS:
3291 			cpuid_enable_enhanced_ibrs();
3292 			break;
3293 		case X86_SPECTREV2_AUTO_IBRS:
3294 			cpuid_enable_auto_ibrs();
3295 			break;
3296 		default:
3297 			break;
3298 		}
3299 
3300 		cpuid_apply_tsx(x86_taa_mitigation, featureset);
3301 		return;
3302 	}
3303 
3304 	/*
3305 	 * Go through and initialize various security mechanisms that we should
3306 	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3307 	 * TAA.
3308 	 */
3309 
3310 	/*
3311 	 * By default we've come in with retpolines enabled. Check whether we
3312 	 * should disable them or enable enhanced or automatic IBRS. RSB
3313 	 * stuffing is enabled by default. Note, we do not allow the use of AMD
3314 	 * optimized retpolines as it was disclosed by AMD in March 2022 that
3315 	 * they were still vulnerable. Prior to that point, we used them.
3316 	 */
3317 	if (x86_disable_spectrev2 != 0) {
3318 		v2mit = X86_SPECTREV2_DISABLED;
3319 	} else if (is_x86_feature(featureset, X86FSET_AUTO_IBRS)) {
3320 		cpuid_enable_auto_ibrs();
3321 		v2mit = X86_SPECTREV2_AUTO_IBRS;
3322 	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3323 		cpuid_enable_enhanced_ibrs();
3324 		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3325 	} else {
3326 		v2mit = X86_SPECTREV2_RETPOLINE;
3327 	}
3328 
3329 	cpuid_patch_retpolines(v2mit);
3330 	cpuid_patch_rsb(v2mit);
3331 	x86_spectrev2_mitigation = v2mit;
3332 	membar_producer();
3333 
3334 	/*
3335 	 * We need to determine what changes are required for mitigating L1TF
3336 	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3337 	 * is required.
3338 	 *
3339 	 * If any of these are present, then we need to flush u-arch state at
3340 	 * various points. For MDS, we need to do so whenever we change to a
3341 	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3342 	 * flush the L1D cache at VM entry. When we have microcode that handles
3343 	 * MDS, the L1D flush also clears the other u-arch state that the
3344 	 * md_clear does.
3345 	 */
3346 
3347 	/*
3348 	 * Update whether or not we need to be taking explicit action against
3349 	 * MDS or RFDS.
3350 	 */
3351 	cpuid_update_md_clear(cpu, featureset);
3352 
3353 	/*
3354 	 * Determine whether SMT exclusion is required and whether or not we
3355 	 * need to perform an l1d flush.
3356 	 */
3357 	cpuid_update_l1d_flush(cpu, featureset);
3358 
3359 	/*
3360 	 * Determine what our mitigation strategy should be for TAA and then
3361 	 * also apply TAA mitigations.
3362 	 */
3363 	cpuid_update_tsx(cpu, featureset);
3364 	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3365 }
3366 
3367 /*
3368  * Setup XFeature_Enabled_Mask register. Required by xsave feature.
3369  */
3370 void
3371 setup_xfem(void)
3372 {
3373 	uint64_t flags = XFEATURE_LEGACY_FP;
3374 
3375 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3376 
3377 	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3378 		flags |= XFEATURE_SSE;
3379 
3380 	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3381 		flags |= XFEATURE_AVX;
3382 
3383 	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3384 		flags |= XFEATURE_AVX512;
3385 
3386 	set_xcr(XFEATURE_ENABLED_MASK, flags);
3387 
3388 	xsave_bv_all = flags;
3389 }
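/*
 * Worked example, assuming the architectural XCR0 bit assignments: a CPU with
 * XSAVE, SSE and AVX but no AVX-512 ends up with
 * flags = XFEATURE_LEGACY_FP | XFEATURE_SSE | XFEATURE_AVX, i.e. x87, SSE and
 * AVX state are enabled in XCR0 and xsave_bv_all records the same mask.
 */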
3390 
3391 static void
3392 cpuid_basic_topology(cpu_t *cpu, uchar_t *featureset)
3393 {
3394 	struct cpuid_info *cpi;
3395 
3396 	cpi = cpu->cpu_m.mcpu_cpi;
3397 
3398 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3399 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3400 		cpuid_gather_amd_topology_leaves(cpu);
3401 	}
3402 
3403 	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3404 
3405 	/*
3406 	 * Before we can calculate the IDs that we should assign to this
3407 	 * processor, we need to understand how many cores and threads it has.
3408 	 */
3409 	switch (cpi->cpi_vendor) {
3410 	case X86_VENDOR_Intel:
3411 		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3412 		    &cpi->cpi_ncore_per_chip);
3413 		break;
3414 	case X86_VENDOR_AMD:
3415 	case X86_VENDOR_HYGON:
3416 		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3417 		    &cpi->cpi_ncore_per_chip);
3418 		break;
3419 	default:
3420 		/*
3421 		 * If we have some other x86 compatible chip, it's not clear how
3422 		 * it would behave. The most common case is virtualization
3423 		 * today, though there are also 64-bit VIA chips. Assume that
3424 		 * all we can get is the basic Leaf 1 HTT information.
3425 		 */
3426 		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3427 			cpi->cpi_ncore_per_chip = 1;
3428 			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3429 		}
3430 		break;
3431 	}
3432 
3433 	/*
3434 	 * Based on the calculated number of threads and cores, potentially
3435 	 * assign the HTT and CMP features.
3436 	 */
3437 	if (cpi->cpi_ncore_per_chip > 1) {
3438 		add_x86_feature(featureset, X86FSET_CMP);
3439 	}
3440 
3441 	if (cpi->cpi_ncpu_per_chip > 1 &&
3442 	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3443 		add_x86_feature(featureset, X86FSET_HTT);
3444 	}
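	/*
	 * As an example of the above, a hypothetical 4-core/8-thread package
	 * gets both X86FSET_CMP and X86FSET_HTT, while a 4-core/4-thread
	 * package gets only X86FSET_CMP.
	 */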
3445 
3446 	/*
3447 	 * Now that this has been set up, we need to go through and calculate all of
3448 	 * the rest of the parameters that exist. If we think the CPU doesn't
3449 	 * have either SMT (HTT) or CMP, then we basically go through and fake
3450 	 * up information in some way. The most likely case for this is
3451 	 * virtualization where we have a lot of partial topology information.
3452 	 */
3453 	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3454 	    !is_x86_feature(featureset, X86FSET_CMP)) {
3455 		/*
3456 		 * This is a single core, single-threaded processor.
3457 		 */
3458 		cpi->cpi_procnodes_per_pkg = 1;
3459 		cpi->cpi_cores_per_compunit = 1;
3460 		cpi->cpi_compunitid = 0;
3461 		cpi->cpi_chipid = -1;
3462 		cpi->cpi_clogid = 0;
3463 		cpi->cpi_coreid = cpu->cpu_id;
3464 		cpi->cpi_pkgcoreid = 0;
3465 		if (cpi->cpi_vendor == X86_VENDOR_AMD ||
3466 		    cpi->cpi_vendor == X86_VENDOR_HYGON) {
3467 			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3468 		} else {
3469 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3470 		}
3471 	} else {
3472 		switch (cpi->cpi_vendor) {
3473 		case X86_VENDOR_Intel:
3474 			cpuid_intel_getids(cpu, featureset);
3475 			break;
3476 		case X86_VENDOR_AMD:
3477 		case X86_VENDOR_HYGON:
3478 			cpuid_amd_getids(cpu, featureset);
3479 			break;
3480 		default:
3481 			/*
3482 			 * In this case, it's hard to say what we should do.
3483 			 * We're going to model them to the OS as single core
3484 			 * threads. We don't have a good identifier for them, so
3485 			 * we're just going to use the cpu id all on a single
3486 			 * chip.
3487 			 *
3488 			 * This case has historically been different from the
3489 			 * case above where we don't have HTT or CMP. While they
3490 			 * could be combined, we've opted to keep it separate to
3491 			 * minimize the risk of topology changes in weird cases.
3492 			 */
3493 			cpi->cpi_procnodes_per_pkg = 1;
3494 			cpi->cpi_cores_per_compunit = 1;
3495 			cpi->cpi_chipid = 0;
3496 			cpi->cpi_coreid = cpu->cpu_id;
3497 			cpi->cpi_clogid = cpu->cpu_id;
3498 			cpi->cpi_pkgcoreid = cpu->cpu_id;
3499 			cpi->cpi_procnodeid = cpi->cpi_chipid;
3500 			cpi->cpi_compunitid = cpi->cpi_coreid;
3501 			break;
3502 		}
3503 	}
3504 }
3505 
3506 /*
3507  * Gather relevant CPU features from leaf 6 which covers thermal information. We
3508  * always gather leaf 6 if it's supported; however, we only look for features on
3509  * Intel systems as AMD does not currently define any of the features we look
3510  * for below.
3511  */
3512 static void
3513 cpuid_basic_thermal(cpu_t *cpu, uchar_t *featureset)
3514 {
3515 	struct cpuid_regs *cp;
3516 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3517 
3518 	if (cpi->cpi_maxeax < 6) {
3519 		return;
3520 	}
3521 
3522 	cp = &cpi->cpi_std[6];
3523 	cp->cp_eax = 6;
3524 	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3525 	(void) __cpuid_insn(cp);
3526 	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3527 
3528 	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3529 		return;
3530 	}
3531 
3532 	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3533 		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3534 	}
3535 
3536 	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3537 		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3538 	}
3539 }
3540 
3541 /*
3542  * This is used when we discover that we have AVX support in cpuid. This
3543  * proceeds to scan for the rest of the AVX derived features.
3544  */
3545 static void
3546 cpuid_basic_avx(cpu_t *cpu, uchar_t *featureset)
3547 {
3548 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3549 
3550 	/*
3551 	 * If we don't have AVX, don't bother with most of this.
3552 	 */
3553 	if ((cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_AVX) == 0)
3554 		return;
3555 
3556 	add_x86_feature(featureset, X86FSET_AVX);
3557 
3558 	/*
3559 	 * Intel says we can't check these without also
3560 	 * checking AVX.
3561 	 */
3562 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_F16C)
3563 		add_x86_feature(featureset, X86FSET_F16C);
3564 
3565 	if (cpi->cpi_std[1].cp_ecx & CPUID_INTC_ECX_FMA)
3566 		add_x86_feature(featureset, X86FSET_FMA);
3567 
3568 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI1)
3569 		add_x86_feature(featureset, X86FSET_BMI1);
3570 
3571 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_BMI2)
3572 		add_x86_feature(featureset, X86FSET_BMI2);
3573 
3574 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX2)
3575 		add_x86_feature(featureset, X86FSET_AVX2);
3576 
3577 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VAES)
3578 		add_x86_feature(featureset, X86FSET_VAES);
3579 
3580 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_VPCLMULQDQ)
3581 		add_x86_feature(featureset, X86FSET_VPCLMULQDQ);
3582 
3583 	/*
3584 	 * The rest of the AVX features require AVX512. Do not check them unless
3585 	 * it is present.
3586 	 */
3587 	if ((cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512F) == 0)
3588 		return;
3589 	add_x86_feature(featureset, X86FSET_AVX512F);
3590 
3591 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
3592 		add_x86_feature(featureset, X86FSET_AVX512DQ);
3593 
3594 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
3595 		add_x86_feature(featureset, X86FSET_AVX512FMA);
3596 
3597 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512PF)
3598 		add_x86_feature(featureset, X86FSET_AVX512PF);
3599 
3600 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512ER)
3601 		add_x86_feature(featureset, X86FSET_AVX512ER);
3602 
3603 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512CD)
3604 		add_x86_feature(featureset, X86FSET_AVX512CD);
3605 
3606 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512BW)
3607 		add_x86_feature(featureset, X86FSET_AVX512BW);
3608 
3609 	if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_AVX512VL)
3610 		add_x86_feature(featureset, X86FSET_AVX512VL);
3611 
3612 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI)
3613 		add_x86_feature(featureset, X86FSET_AVX512VBMI);
3614 
3615 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VBMI2)
3616 		add_x86_feature(featureset, X86FSET_AVX512_VBMI2);
3617 
3618 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VNNI)
3619 		add_x86_feature(featureset, X86FSET_AVX512VNNI);
3620 
3621 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512BITALG)
3622 		add_x86_feature(featureset, X86FSET_AVX512_BITALG);
3623 
3624 	if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3625 		add_x86_feature(featureset, X86FSET_AVX512VPOPCDQ);
3626 
3627 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124NNIW)
3628 		add_x86_feature(featureset, X86FSET_AVX512NNIW);
3629 
3630 	if (cpi->cpi_std[7].cp_edx & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3631 		add_x86_feature(featureset, X86FSET_AVX512FMAPS);
3632 
3633 	/*
3634 	 * More features here are in Leaf 7, subleaf 1. Don't bother checking if
3635 	 * we don't need to.
3636 	 */
3637 	if (cpi->cpi_std[7].cp_eax < 1)
3638 		return;
3639 
3640 	if (cpi->cpi_sub7[0].cp_eax & CPUID_INTC_EAX_7_1_AVX512_BF16)
3641 		add_x86_feature(featureset, X86FSET_AVX512_BF16);
3642 }
3643 
3644 /*
3645  * PPIN is the protected processor inventory number. On AMD this is an actual
3646  * feature bit. However, on Intel systems we need to read the platform
3647  * information MSR if we're on a specific model.
3648  */
3649 #if !defined(__xpv)
3650 static void
3651 cpuid_basic_ppin(cpu_t *cpu, uchar_t *featureset)
3652 {
3653 	on_trap_data_t otd;
3654 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3655 
3656 	switch (cpi->cpi_vendor) {
3657 	case X86_VENDOR_AMD:
3658 		/*
3659 		 * This leaf will have already been gathered in the topology
3660 		 * functions.
3661 		 */
3662 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3663 			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3664 				add_x86_feature(featureset, X86FSET_PPIN);
3665 			}
3666 		}
3667 		break;
3668 	case X86_VENDOR_Intel:
3669 		if (cpi->cpi_family != 6)
3670 			break;
3671 		switch (cpi->cpi_model) {
3672 		case INTC_MODEL_IVYBRIDGE_XEON:
3673 		case INTC_MODEL_HASWELL_XEON:
3674 		case INTC_MODEL_BROADWELL_XEON:
3675 		case INTC_MODEL_BROADWELL_XEON_D:
3676 		case INTC_MODEL_SKYLAKE_XEON:
3677 		case INTC_MODEL_ICELAKE_XEON:
3678 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3679 				uint64_t value;
3680 
3681 				value = rdmsr(MSR_PLATFORM_INFO);
3682 				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3683 					add_x86_feature(featureset,
3684 					    X86FSET_PPIN);
3685 				}
3686 			}
3687 			no_trap();
3688 			break;
3689 		default:
3690 			break;
3691 		}
3692 		break;
3693 	default:
3694 		break;
3695 	}
3696 }
3697 #endif	/* ! __xpv */
3698 
3699 static void
3700 cpuid_pass_prelude(cpu_t *cpu, void *arg)
3701 {
3702 	uchar_t *featureset = (uchar_t *)arg;
3703 
3704 	/*
3705 	 * We don't run on any processor that doesn't have cpuid, and could not
3706 	 * possibly have arrived here.
3707 	 */
3708 	add_x86_feature(featureset, X86FSET_CPUID);
3709 }
3710 
3711 static void
3712 cpuid_pass_ident(cpu_t *cpu, void *arg __unused)
3713 {
3714 	struct cpuid_info *cpi;
3715 	struct cpuid_regs *cp;
3716 
3717 	/*
3718 	 * We require that virtual/native detection be complete and that PCI
3719 	 * config space access has been set up; at present there is no reliable
3720 	 * way to determine the latter.
3721 	 */
3722 #if !defined(__xpv)
3723 	ASSERT3S(platform_type, !=, -1);
3724 #endif	/* !__xpv */
3725 
3726 	cpi = cpu->cpu_m.mcpu_cpi;
3727 	ASSERT(cpi != NULL);
3728 
3729 	cp = &cpi->cpi_std[0];
3730 	cp->cp_eax = 0;
3731 	cpi->cpi_maxeax = __cpuid_insn(cp);
3732 	{
3733 		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3734 		*iptr++ = cp->cp_ebx;
3735 		*iptr++ = cp->cp_edx;
3736 		*iptr++ = cp->cp_ecx;
3737 		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3738 	}
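	/*
	 * The 12-byte vendor string is assembled in %ebx, %edx, %ecx order;
	 * for example, a part returning "Genu", "ineI" and "ntel" in those
	 * registers yields the vendor string "GenuineIntel".
	 */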
3739 
3740 	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3741 	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3742 
3743 	/*
3744 	 * Limit the range in case of weird hardware
3745 	 */
3746 	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3747 		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3748 	if (cpi->cpi_maxeax < 1)
3749 		return;
3750 
3751 	cp = &cpi->cpi_std[1];
3752 	cp->cp_eax = 1;
3753 	(void) __cpuid_insn(cp);
3754 
3755 	/*
3756 	 * Extract identifying constants for easy access.
3757 	 */
3758 	cpi->cpi_model = CPI_MODEL(cpi);
3759 	cpi->cpi_family = CPI_FAMILY(cpi);
3760 
3761 	if (cpi->cpi_family == 0xf)
3762 		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3763 
3764 	/*
3765 	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3766 	 * Intel, and presumably everyone else, uses model == 0xf, as
3767 	 * one would expect (max value means possible overflow).  Sigh.
3768 	 */
3769 
3770 	switch (cpi->cpi_vendor) {
3771 	case X86_VENDOR_Intel:
3772 		if (IS_EXTENDED_MODEL_INTEL(cpi))
3773 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3774 		break;
3775 	case X86_VENDOR_AMD:
3776 		if (CPI_FAMILY(cpi) == 0xf)
3777 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3778 		break;
3779 	case X86_VENDOR_HYGON:
3780 		cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3781 		break;
3782 	default:
3783 		if (cpi->cpi_model == 0xf)
3784 			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3785 		break;
3786 	}
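	/*
	 * Worked examples with hypothetical raw values: an Intel part with
	 * base family 6, base model 0xa and extended model 5 ends up with
	 * cpi_model = 0x5a, while an AMD part with base family 0xf and
	 * extended family 8 ends up with cpi_family = 0x17.
	 */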
3787 
3788 	cpi->cpi_step = CPI_STEP(cpi);
3789 	cpi->cpi_brandid = CPI_BRANDID(cpi);
3790 
3791 	/*
3792 	 * Synthesize chip "revision" and socket type
3793 	 */
3794 	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3795 	    cpi->cpi_model, cpi->cpi_step);
3796 	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3797 	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3798 	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3799 	    cpi->cpi_model, cpi->cpi_step);
3800 	cpi->cpi_uarchrev = _cpuid_uarchrev(cpi->cpi_vendor, cpi->cpi_family,
3801 	    cpi->cpi_model, cpi->cpi_step);
3802 }
3803 
3804 static void
3805 cpuid_pass_basic(cpu_t *cpu, void *arg)
3806 {
3807 	uchar_t *featureset = (uchar_t *)arg;
3808 	uint32_t mask_ecx, mask_edx;
3809 	struct cpuid_info *cpi;
3810 	struct cpuid_regs *cp;
3811 	int xcpuid;
3812 #if !defined(__xpv)
3813 	extern int idle_cpu_prefer_mwait;
3814 #endif
3815 
3816 	cpi = cpu->cpu_m.mcpu_cpi;
3817 	ASSERT(cpi != NULL);
3818 
3819 	if (cpi->cpi_maxeax < 1)
3820 		return;
3821 
3822 	/*
3823 	 * This was filled during the identification pass.
3824 	 */
3825 	cp = &cpi->cpi_std[1];
3826 
3827 	/*
3828 	 * *default* assumptions:
3829 	 * - believe %edx feature word
3830 	 * - ignore %ecx feature word
3831 	 * - 32-bit virtual and physical addressing
3832 	 */
3833 	mask_edx = 0xffffffff;
3834 	mask_ecx = 0;
3835 
3836 	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3837 
3838 	switch (cpi->cpi_vendor) {
3839 	case X86_VENDOR_Intel:
3840 		if (cpi->cpi_family == 5)
3841 			x86_type = X86_TYPE_P5;
3842 		else if (IS_LEGACY_P6(cpi)) {
3843 			x86_type = X86_TYPE_P6;
3844 			pentiumpro_bug4046376 = 1;
3845 			/*
3846 			 * Clear the SEP bit when it was set erroneously
3847 			 */
3848 			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3849 				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3850 		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3851 			x86_type = X86_TYPE_P4;
3852 			/*
3853 			 * We don't currently depend on any of the %ecx
3854 			 * features until Prescott, so we'll only check
3855 			 * this from P4 onwards.  We might want to revisit
3856 			 * that idea later.
3857 			 */
3858 			mask_ecx = 0xffffffff;
3859 		} else if (cpi->cpi_family > 0xf)
3860 			mask_ecx = 0xffffffff;
3861 		/*
3862 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3863 		 * to obtain the monitor linesize.
3864 		 */
3865 		if (cpi->cpi_maxeax < 5)
3866 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3867 		break;
3868 	case X86_VENDOR_IntelClone:
3869 	default:
3870 		break;
3871 	case X86_VENDOR_AMD:
3872 #if defined(OPTERON_ERRATUM_108)
3873 		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3874 			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3875 			cpi->cpi_model = 0xc;
3876 		} else
3877 #endif
3878 		if (cpi->cpi_family == 5) {
3879 			/*
3880 			 * AMD K5 and K6
3881 			 *
3882 			 * These CPUs have an incomplete implementation
3883 			 * of MCA/MCE which we mask away.
3884 			 */
3885 			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3886 
3887 			/*
3888 			 * Model 0 uses the wrong (APIC) bit
3889 			 * to indicate PGE.  Fix it here.
3890 			 */
3891 			if (cpi->cpi_model == 0) {
3892 				if (cp->cp_edx & 0x200) {
3893 					cp->cp_edx &= ~0x200;
3894 					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3895 				}
3896 			}
3897 
3898 			/*
3899 			 * Early models had problems w/ MMX; disable.
3900 			 */
3901 			if (cpi->cpi_model < 6)
3902 				mask_edx &= ~CPUID_INTC_EDX_MMX;
3903 		}
3904 
3905 		/*
3906 		 * For newer families, SSE3 and CX16, at least, are valid;
3907 		 * enable all
3908 		 */
3909 		if (cpi->cpi_family >= 0xf)
3910 			mask_ecx = 0xffffffff;
3911 		/*
3912 		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3913 		 * to obtain the monitor linesize.
3914 		 */
3915 		if (cpi->cpi_maxeax < 5)
3916 			mask_ecx &= ~CPUID_INTC_ECX_MON;
3917 
3918 #if !defined(__xpv)
3919 		/*
3920 		 * AMD has not historically used MWAIT in the CPU's idle loop.
3921 		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3922 		 * know for certain that in at least family 17h, per AMD, mwait
3923 		 * is preferred. Families in-between are less certain.
3924 		 */
3925 		if (cpi->cpi_family < 0x17) {
3926 			idle_cpu_prefer_mwait = 0;
3927 		}
3928 #endif
3929 
3930 		break;
3931 	case X86_VENDOR_HYGON:
3932 		/* Enable all for Hygon Dhyana CPU */
3933 		mask_ecx = 0xffffffff;
3934 		break;
3935 	case X86_VENDOR_TM:
3936 		/*
3937 		 * workaround the NT workaround in CMS 4.1
3938 		 */
3939 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3940 		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3941 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3942 		break;
3943 	case X86_VENDOR_Centaur:
3944 		/*
3945 		 * workaround the NT workarounds again
3946 		 */
3947 		if (cpi->cpi_family == 6)
3948 			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3949 		break;
3950 	case X86_VENDOR_Cyrix:
3951 		/*
3952 		 * We rely heavily on the probing in locore
3953 		 * to actually figure out what parts, if any,
3954 		 * of the Cyrix cpuid instruction to believe.
3955 		 */
3956 		switch (x86_type) {
3957 		case X86_TYPE_CYRIX_486:
3958 			mask_edx = 0;
3959 			break;
3960 		case X86_TYPE_CYRIX_6x86:
3961 			mask_edx = 0;
3962 			break;
3963 		case X86_TYPE_CYRIX_6x86L:
3964 			mask_edx =
3965 			    CPUID_INTC_EDX_DE |
3966 			    CPUID_INTC_EDX_CX8;
3967 			break;
3968 		case X86_TYPE_CYRIX_6x86MX:
3969 			mask_edx =
3970 			    CPUID_INTC_EDX_DE |
3971 			    CPUID_INTC_EDX_MSR |
3972 			    CPUID_INTC_EDX_CX8 |
3973 			    CPUID_INTC_EDX_PGE |
3974 			    CPUID_INTC_EDX_CMOV |
3975 			    CPUID_INTC_EDX_MMX;
3976 			break;
3977 		case X86_TYPE_CYRIX_GXm:
3978 			mask_edx =
3979 			    CPUID_INTC_EDX_MSR |
3980 			    CPUID_INTC_EDX_CX8 |
3981 			    CPUID_INTC_EDX_CMOV |
3982 			    CPUID_INTC_EDX_MMX;
3983 			break;
3984 		case X86_TYPE_CYRIX_MediaGX:
3985 			break;
3986 		case X86_TYPE_CYRIX_MII:
3987 		case X86_TYPE_VIA_CYRIX_III:
3988 			mask_edx =
3989 			    CPUID_INTC_EDX_DE |
3990 			    CPUID_INTC_EDX_TSC |
3991 			    CPUID_INTC_EDX_MSR |
3992 			    CPUID_INTC_EDX_CX8 |
3993 			    CPUID_INTC_EDX_PGE |
3994 			    CPUID_INTC_EDX_CMOV |
3995 			    CPUID_INTC_EDX_MMX;
3996 			break;
3997 		default:
3998 			break;
3999 		}
4000 		break;
4001 	}
4002 
4003 #if defined(__xpv)
4004 	/*
4005 	 * Do not support MONITOR/MWAIT under a hypervisor
4006 	 */
4007 	mask_ecx &= ~CPUID_INTC_ECX_MON;
4008 	/*
4009 	 * Do not support XSAVE under a hypervisor for now
4010 	 */
4011 	xsave_force_disable = B_TRUE;
4012 
4013 #endif	/* __xpv */
4014 
4015 	if (xsave_force_disable) {
4016 		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
4017 		mask_ecx &= ~CPUID_INTC_ECX_AVX;
4018 		mask_ecx &= ~CPUID_INTC_ECX_F16C;
4019 		mask_ecx &= ~CPUID_INTC_ECX_FMA;
4020 	}
4021 
4022 	/*
4023 	 * Now we've figured out the masks that determine
4024 	 * which bits we choose to believe, apply the masks
4025 	 * to the feature words, then map the kernel's view
4026 	 * of these feature words into its feature word.
4027 	 */
4028 	cp->cp_edx &= mask_edx;
4029 	cp->cp_ecx &= mask_ecx;
4030 
4031 	/*
4032 	 * apply any platform restrictions (we don't call this
4033 	 * immediately after __cpuid_insn here, because we need the
4034 	 * workarounds applied above first)
4035 	 */
4036 	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
4037 
4038 	/*
4039 	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
4040 	 * instruction set extensions in leaf 7's ebx, ecx, and edx. Note, leaf
4041 	 * 7 has sub-leaves determined by ecx.
4042 	 */
4043 	if (cpi->cpi_maxeax >= 7) {
4044 		struct cpuid_regs *ecp;
4045 		ecp = &cpi->cpi_std[7];
4046 		ecp->cp_eax = 7;
4047 		ecp->cp_ecx = 0;
4048 		(void) __cpuid_insn(ecp);
4049 
4050 		/*
4051 		 * If XSAVE has been disabled, just ignore all of the
4052 		 * extended-save-area dependent flags here. Removing most of the
4053 		 * leaf 7, sub-leaf 0 flags ensures that we don't
4054 		 * end up looking at additional xsave dependent leaves right
4055 		 * now.
4056 		 */
4057 		if (xsave_force_disable) {
4058 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4059 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4060 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4061 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
4062 			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4063 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4064 			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4065 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VAES;
4066 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_VPCLMULQDQ;
4067 			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_GFNI;
4068 		}
4069 
4070 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
4071 			add_x86_feature(featureset, X86FSET_SMEP);
4072 
4073 		/*
4074 		 * We check disable_smap here in addition to in startup_smap()
4075 		 * to ensure CPUs that aren't the boot CPU don't accidentally
4076 		 * include it in the feature set and thus generate a mismatched
4077 		 * x86 feature set across CPUs.
4078 		 */
4079 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
4080 		    disable_smap == 0)
4081 			add_x86_feature(featureset, X86FSET_SMAP);
4082 
4083 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
4084 			add_x86_feature(featureset, X86FSET_RDSEED);
4085 
4086 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
4087 			add_x86_feature(featureset, X86FSET_ADX);
4088 
4089 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4090 			add_x86_feature(featureset, X86FSET_FSGSBASE);
4091 
4092 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4093 			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
4094 
4095 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
4096 			add_x86_feature(featureset, X86FSET_INVPCID);
4097 
4098 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
4099 			add_x86_feature(featureset, X86FSET_UMIP);
4100 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_PKU)
4101 			add_x86_feature(featureset, X86FSET_PKU);
4102 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
4103 			add_x86_feature(featureset, X86FSET_OSPKE);
4104 		if (ecp->cp_ecx & CPUID_INTC_ECX_7_0_GFNI)
4105 			add_x86_feature(featureset, X86FSET_GFNI);
4106 
4107 		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
4108 			add_x86_feature(featureset, X86FSET_CLWB);
4109 
4110 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4111 			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
4112 				add_x86_feature(featureset, X86FSET_MPX);
4113 		}
4114 
4115 		/*
4116 		 * If we have subleaf 1 available, grab and store that. This is
4117 		 * used for more AVX and related features.
4118 		 */
4119 		if (ecp->cp_eax >= 1) {
4120 			struct cpuid_regs *c71;
4121 			c71 = &cpi->cpi_sub7[0];
4122 			c71->cp_eax = 7;
4123 			c71->cp_ecx = 1;
4124 			(void) __cpuid_insn(c71);
4125 		}
4126 	}
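	/*
	 * As a rough sketch of how leaf 7 sub-leaf enumeration works (the
	 * names below, other than __cpuid_insn(), are hypothetical):
	 * sub-leaves are selected via %ecx, and sub-leaf 0's %eax reports the
	 * maximum sub-leaf supported, so a generic walk looks roughly like:
	 *
	 *	struct cpuid_regs r = { 0 };
	 *	r.cp_eax = 7;
	 *	r.cp_ecx = 0;
	 *	(void) __cpuid_insn(&r);
	 *	for (uint32_t sub = 1; sub <= r.cp_eax; sub++) {
	 *		struct cpuid_regs sr = { 0 };
	 *		sr.cp_eax = 7;
	 *		sr.cp_ecx = sub;
	 *		(void) __cpuid_insn(&sr);
	 *		visit_subleaf(sub, &sr);
	 *	}
	 */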
4127 
4128 	/*
4129 	 * fold in overrides from the "eeprom" mechanism
4130 	 */
4131 	cp->cp_edx |= cpuid_feature_edx_include;
4132 	cp->cp_edx &= ~cpuid_feature_edx_exclude;
4133 
4134 	cp->cp_ecx |= cpuid_feature_ecx_include;
4135 	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
4136 
4137 	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
4138 		add_x86_feature(featureset, X86FSET_LARGEPAGE);
4139 	}
4140 	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
4141 		add_x86_feature(featureset, X86FSET_TSC);
4142 	}
4143 	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
4144 		add_x86_feature(featureset, X86FSET_MSR);
4145 	}
4146 	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
4147 		add_x86_feature(featureset, X86FSET_MTRR);
4148 	}
4149 	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
4150 		add_x86_feature(featureset, X86FSET_PGE);
4151 	}
4152 	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
4153 		add_x86_feature(featureset, X86FSET_CMOV);
4154 	}
4155 	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
4156 		add_x86_feature(featureset, X86FSET_MMX);
4157 	}
4158 	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
4159 	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
4160 		add_x86_feature(featureset, X86FSET_MCA);
4161 	}
4162 	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
4163 		add_x86_feature(featureset, X86FSET_PAE);
4164 	}
4165 	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
4166 		add_x86_feature(featureset, X86FSET_CX8);
4167 	}
4168 	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
4169 		add_x86_feature(featureset, X86FSET_CX16);
4170 	}
4171 	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
4172 		add_x86_feature(featureset, X86FSET_PAT);
4173 	}
4174 	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
4175 		add_x86_feature(featureset, X86FSET_SEP);
4176 	}
4177 	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
4178 		/*
4179 		 * In our implementation, fxsave/fxrstor
4180 		 * are prerequisites before we'll even
4181 		 * try and do SSE things.
4182 		 */
4183 		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
4184 			add_x86_feature(featureset, X86FSET_SSE);
4185 		}
4186 		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
4187 			add_x86_feature(featureset, X86FSET_SSE2);
4188 		}
4189 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
4190 			add_x86_feature(featureset, X86FSET_SSE3);
4191 		}
4192 		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
4193 			add_x86_feature(featureset, X86FSET_SSSE3);
4194 		}
4195 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
4196 			add_x86_feature(featureset, X86FSET_SSE4_1);
4197 		}
4198 		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
4199 			add_x86_feature(featureset, X86FSET_SSE4_2);
4200 		}
4201 		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
4202 			add_x86_feature(featureset, X86FSET_AES);
4203 		}
4204 		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
4205 			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
4206 		}
4207 
4208 		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
4209 			add_x86_feature(featureset, X86FSET_SHA);
4210 
4211 		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
4212 			add_x86_feature(featureset, X86FSET_XSAVE);
4213 
4214 			/* We only test AVX & AVX512 when there is XSAVE */
4215 			cpuid_basic_avx(cpu, featureset);
4216 		}
4217 	}
4218 
4219 	if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
4220 		add_x86_feature(featureset, X86FSET_PCID);
4221 	}
4222 
4223 	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
4224 		add_x86_feature(featureset, X86FSET_X2APIC);
4225 	}
4226 	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
4227 		add_x86_feature(featureset, X86FSET_DE);
4228 	}
4229 #if !defined(__xpv)
4230 	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
4231 
4232 		/*
4233 		 * We require the CLFLUSH instruction for an erratum workaround
4234 		 * in order to use MONITOR/MWAIT.
4235 		 */
4236 		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4237 			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
4238 			add_x86_feature(featureset, X86FSET_MWAIT);
4239 		} else {
4240 			extern int idle_cpu_assert_cflush_monitor;
4241 
4242 			/*
4243 			 * All processors we are aware of which have
4244 			 * MONITOR/MWAIT also have CLFLUSH.
4245 			 */
4246 			if (idle_cpu_assert_cflush_monitor) {
4247 				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
4248 				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
4249 			}
4250 		}
4251 	}
4252 #endif	/* __xpv */
4253 
4254 	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
4255 		add_x86_feature(featureset, X86FSET_VMX);
4256 	}
4257 
4258 	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
4259 		add_x86_feature(featureset, X86FSET_RDRAND);
4260 
4261 	/*
4262 	 * We only need this the first time; the rest of the CPUs follow suit,
4263 	 * so we only capture it for the boot CPU.
4264 	 */
4265 	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
4266 		add_x86_feature(featureset, X86FSET_CLFSH);
4267 		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
4268 	}
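	/*
	 * For example, a CLFLUSH size field (%ebx bits 15:8) of 8 means a
	 * line size of 8 * 8 = 64 bytes, the common case on current x86
	 * processors.
	 */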
4269 	if (is_x86_feature(featureset, X86FSET_PAE))
4270 		cpi->cpi_pabits = 36;
4271 
4272 	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
4273 		struct cpuid_regs r, *ecp;
4274 
4275 		ecp = &r;
4276 		ecp->cp_eax = 0xD;
4277 		ecp->cp_ecx = 1;
4278 		ecp->cp_edx = ecp->cp_ebx = 0;
4279 		(void) __cpuid_insn(ecp);
4280 
4281 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
4282 			add_x86_feature(featureset, X86FSET_XSAVEOPT);
4283 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
4284 			add_x86_feature(featureset, X86FSET_XSAVEC);
4285 		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
4286 			add_x86_feature(featureset, X86FSET_XSAVES);
4287 
4288 		/*
4289 		 * Zen 2 family processors suffer from erratum 1386 that causes
4290 		 * xsaves to not function correctly in some circumstances. There
4291 		 * are no supervisor states in Zen 2 and earlier. Practically
4292 		 * speaking this has no impact for us as we currently do not
4293 		 * leverage compressed xsave formats. To safeguard against
4294 		 * issues in the future where we may opt to use it, we remove
4295 		 * it from the feature set now. While Matisse has a microcode
4296 		 * update available with a fix, not all Zen 2 CPUs do, so it's
4297 		 * simpler for the moment to unconditionally remove it.
4298 		 */
4299 		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4300 		    uarchrev_uarch(cpi->cpi_uarchrev) <= X86_UARCH_AMD_ZEN2) {
4301 			remove_x86_feature(featureset, X86FSET_XSAVES);
4302 		}
4303 	}
4304 
4305 	/*
4306 	 * Work on the "extended" feature information, doing
4307 	 * some basic initialization to be used in the extended pass.
4308 	 */
4309 	xcpuid = 0;
4310 	switch (cpi->cpi_vendor) {
4311 	case X86_VENDOR_Intel:
4312 		/*
4313 		 * On KVM we know we will have proper support for extended
4314 		 * cpuid.
4315 		 */
4316 		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
4317 		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
4318 		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
4319 			xcpuid++;
4320 		break;
4321 	case X86_VENDOR_AMD:
4322 		if (cpi->cpi_family > 5 ||
4323 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
4324 			xcpuid++;
4325 		break;
4326 	case X86_VENDOR_Cyrix:
4327 		/*
4328 		 * Only these Cyrix CPUs are -known- to support
4329 		 * extended cpuid operations.
4330 		 */
4331 		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
4332 		    x86_type == X86_TYPE_CYRIX_GXm)
4333 			xcpuid++;
4334 		break;
4335 	case X86_VENDOR_HYGON:
4336 	case X86_VENDOR_Centaur:
4337 	case X86_VENDOR_TM:
4338 	default:
4339 		xcpuid++;
4340 		break;
4341 	}
4342 
4343 	if (xcpuid) {
4344 		cp = &cpi->cpi_extd[0];
4345 		cp->cp_eax = CPUID_LEAF_EXT_0;
4346 		cpi->cpi_xmaxeax = __cpuid_insn(cp);
4347 	}
4348 
4349 	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
4350 
4351 		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
4352 			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
4353 
4354 		switch (cpi->cpi_vendor) {
4355 		case X86_VENDOR_Intel:
4356 		case X86_VENDOR_AMD:
4357 		case X86_VENDOR_HYGON:
4358 			if (cpi->cpi_xmaxeax < 0x80000001)
4359 				break;
4360 			cp = &cpi->cpi_extd[1];
4361 			cp->cp_eax = 0x80000001;
4362 			(void) __cpuid_insn(cp);
4363 
4364 			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4365 			    cpi->cpi_family == 5 &&
4366 			    cpi->cpi_model == 6 &&
4367 			    cpi->cpi_step == 6) {
4368 				/*
4369 				 * K6 model 6 uses bit 10 to indicate SYSC
4370 				 * Later models use bit 11. Fix it here.
4371 				 */
4372 				if (cp->cp_edx & 0x400) {
4373 					cp->cp_edx &= ~0x400;
4374 					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
4375 				}
4376 			}
4377 
4378 			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
4379 
4380 			/*
4381 			 * Compute the additions to the kernel's feature word.
4382 			 */
4383 			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
4384 				add_x86_feature(featureset, X86FSET_NX);
4385 			}
4386 
4387 			/*
4388 			 * Regardless of whether or not we boot 64-bit,
4389 			 * we should have a way to identify whether
4390 			 * the CPU is capable of running 64-bit.
4391 			 */
4392 			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
4393 				add_x86_feature(featureset, X86FSET_64);
4394 			}
4395 
4396 			/* 1 GB large page - enable only for 64 bit kernel */
4397 			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
4398 				add_x86_feature(featureset, X86FSET_1GPG);
4399 			}
4400 
4401 			if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4402 			    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4403 			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
4404 			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
4405 				add_x86_feature(featureset, X86FSET_SSE4A);
4406 			}
4407 
4408 			/*
4409 			 * It's really tricky to support syscall/sysret in
4410 			 * the i386 kernel; we rely on sysenter/sysexit
4411 			 * instead.  In the amd64 kernel, things are -way-
4412 			 * better.
4413 			 */
4414 			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4415 				add_x86_feature(featureset, X86FSET_ASYSC);
4416 			}
4417 
4418 			/*
4419 			 * While we're thinking about system calls, note
4420 			 * that AMD processors don't support sysenter
4421 			 * in long mode at all, so don't try to program them.
4422 			 */
4423 			if (x86_vendor == X86_VENDOR_AMD ||
4424 			    x86_vendor == X86_VENDOR_HYGON) {
4425 				remove_x86_feature(featureset, X86FSET_SEP);
4426 			}
4427 
4428 			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4429 				add_x86_feature(featureset, X86FSET_TSCP);
4430 			}
4431 
4432 			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4433 				add_x86_feature(featureset, X86FSET_SVM);
4434 			}
4435 
4436 			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4437 				add_x86_feature(featureset, X86FSET_TOPOEXT);
4438 			}
4439 
4440 			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4441 				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4442 			}
4443 
4444 			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4445 				add_x86_feature(featureset, X86FSET_XOP);
4446 			}
4447 
4448 			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4449 				add_x86_feature(featureset, X86FSET_FMA4);
4450 			}
4451 
4452 			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4453 				add_x86_feature(featureset, X86FSET_TBM);
4454 			}
4455 
4456 			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4457 				add_x86_feature(featureset, X86FSET_MONITORX);
4458 			}
4459 			break;
4460 		default:
4461 			break;
4462 		}
4463 
4464 		/*
4465 		 * Get CPUID data about processor cores and hyperthreads.
4466 		 */
4467 		switch (cpi->cpi_vendor) {
4468 		case X86_VENDOR_Intel:
4469 			if (cpi->cpi_maxeax >= 4) {
4470 				cp = &cpi->cpi_std[4];
4471 				cp->cp_eax = 4;
4472 				cp->cp_ecx = 0;
4473 				(void) __cpuid_insn(cp);
4474 				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4475 			}
4476 			/*FALLTHROUGH*/
4477 		case X86_VENDOR_AMD:
4478 		case X86_VENDOR_HYGON:
4479 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4480 				break;
4481 			cp = &cpi->cpi_extd[8];
4482 			cp->cp_eax = CPUID_LEAF_EXT_8;
4483 			(void) __cpuid_insn(cp);
4484 			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4485 			    cp);
4486 
4487 			/*
4488 			 * AMD uses ebx for some extended functions.
4489 			 */
4490 			if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4491 			    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4492 				/*
4493 				 * While we're here, check for the AMD "Error
4494 				 * Pointer Zero/Restore" feature. This can be
4495 				 * used to set up the FP save handlers
4496 				 * appropriately.
4497 				 */
4498 				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4499 					cpi->cpi_fp_amd_save = 0;
4500 				} else {
4501 					cpi->cpi_fp_amd_save = 1;
4502 				}
4503 
4504 				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4505 					add_x86_feature(featureset,
4506 					    X86FSET_CLZERO);
4507 				}
4508 			}
4509 
4510 			/*
4511 			 * Virtual and physical address limits from
4512 			 * cpuid override previously guessed values.
4513 			 */
4514 			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4515 			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
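			/*
			 * For example, an %eax of 0x3030 from this leaf
			 * decodes to 48 physical and 48 virtual address bits,
			 * a common configuration on modern 64-bit parts.
			 */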
4516 			break;
4517 		default:
4518 			break;
4519 		}
4520 
4521 		/*
4522 		 * Get CPUID data about TSC Invariance in Deep C-State.
4523 		 */
4524 		switch (cpi->cpi_vendor) {
4525 		case X86_VENDOR_Intel:
4526 		case X86_VENDOR_AMD:
4527 		case X86_VENDOR_HYGON:
4528 			if (cpi->cpi_maxeax >= 7) {
4529 				cp = &cpi->cpi_extd[7];
4530 				cp->cp_eax = 0x80000007;
4531 				cp->cp_ecx = 0;
4532 				(void) __cpuid_insn(cp);
4533 			}
4534 			break;
4535 		default:
4536 			break;
4537 		}
4538 	}
4539 
4540 	/*
4541 	 * cpuid_basic_ppin assumes that cpuid_basic_topology has already been
4542 	 * run and thus gathered some of its dependent leaves.
4543 	 */
4544 	cpuid_basic_topology(cpu, featureset);
4545 	cpuid_basic_thermal(cpu, featureset);
4546 #if !defined(__xpv)
4547 	cpuid_basic_ppin(cpu, featureset);
4548 #endif
4549 
4550 	if (cpi->cpi_vendor == X86_VENDOR_AMD ||
4551 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
4552 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4553 		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4554 			/* Special handling for AMD FP not necessary. */
4555 			cpi->cpi_fp_amd_save = 0;
4556 		} else {
4557 			cpi->cpi_fp_amd_save = 1;
4558 		}
4559 	}
4560 
4561 	/*
4562 	 * Check (and potentially set) if lfence is serializing.
4563 	 * This is useful for accurate rdtsc measurements and AMD retpolines.
4564 	 */
4565 	if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
4566 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
4567 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4568 		/*
4569 		 * The AMD white paper Software Techniques For Managing
4570 		 * Speculation on AMD Processors details circumstances for when
4571 		 * lfence instructions are serializing.
4572 		 *
4573 		 * On family 0xf and 0x11, it is inherently so.  On family 0x10
4574 		 * and later (excluding 0x11), a bit in the DE_CFG MSR
4575 		 * determines the lfence behavior.  Per that whitepaper, AMD has
4576 		 * committed to supporting that MSR on all later CPUs.
4577 		 */
4578 		if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11) {
4579 			add_x86_feature(featureset, X86FSET_LFENCE_SER);
4580 		} else if (cpi->cpi_family >= 0x10) {
4581 #if !defined(__xpv)
4582 			uint64_t val;
4583 
4584 			/*
4585 			 * Be careful when attempting to enable the bit, and
4586 			 * verify that it was actually set in case we are
4587 			 * running in a hypervisor which is less than faithful
4588 			 * about its emulation of this feature.
4589 			 */
4590 			on_trap_data_t otd;
4591 			if (!on_trap(&otd, OT_DATA_ACCESS)) {
4592 				val = rdmsr(MSR_AMD_DE_CFG);
4593 				val |= AMD_DE_CFG_LFENCE_DISPATCH;
4594 				wrmsr(MSR_AMD_DE_CFG, val);
4595 				val = rdmsr(MSR_AMD_DE_CFG);
4596 			} else {
4597 				val = 0;
4598 			}
4599 			no_trap();
4600 
4601 			if ((val & AMD_DE_CFG_LFENCE_DISPATCH) != 0) {
4602 				add_x86_feature(featureset, X86FSET_LFENCE_SER);
4603 			}
4604 #endif
4605 		}
4606 	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
4607 	    is_x86_feature(featureset, X86FSET_SSE2)) {
4608 		/*
4609 		 * Documentation and other OSes indicate that lfence is always
4610 		 * serializing on Intel CPUs.
4611 		 */
4612 		add_x86_feature(featureset, X86FSET_LFENCE_SER);
4613 	}
4614 
4615 
4616 	/*
4617 	 * Check the processor leaves that are used for security features. Grab
4618 	 * any additional processor-specific leaves that we may not have yet.
4619 	 */
4620 	switch (cpi->cpi_vendor) {
4621 	case X86_VENDOR_AMD:
4622 	case X86_VENDOR_HYGON:
4623 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_21) {
4624 			cp = &cpi->cpi_extd[7];
4625 			cp->cp_eax = CPUID_LEAF_EXT_21;
4626 			cp->cp_ecx = 0;
4627 			(void) __cpuid_insn(cp);
4628 		}
4629 		break;
4630 	default:
4631 		break;
4632 	}
4633 
4634 	cpuid_scan_security(cpu, featureset);
4635 }
4636 
4637 /*
4638  * Make copies of the cpuid table entries we depend on, in
4639  * part for ease of parsing now, in part so that we have only
4640  * one place to correct any of it, in part for ease of
4641  * later export to userland, and in part so we can look at
4642  * this stuff in a crash dump.
4643  */
4644 
4645 static void
4646 cpuid_pass_extended(cpu_t *cpu, void *_arg __unused)
4647 {
4648 	uint_t n, nmax;
4649 	int i;
4650 	struct cpuid_regs *cp;
4651 	uint8_t *dp;
4652 	uint32_t *iptr;
4653 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4654 
4655 	if (cpi->cpi_maxeax < 1)
4656 		return;
4657 
4658 	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4659 		nmax = NMAX_CPI_STD;
4660 	/*
4661 	 * (We already handled n == 0 and n == 1 in the basic pass)
4662 	 */
4663 	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4664 		/*
4665 		 * leaves 6 and 7 were handled in the basic pass
4666 		 */
4667 		if (n == 6 || n == 7)
4668 			continue;
4669 
4670 		cp->cp_eax = n;
4671 
4672 		/*
4673 		 * CPUID function 4 expects %ecx to be initialized
4674 		 * with an index which indicates which cache to return
4675 		 * information about. The OS is expected to call function 4
4676 		 * with %ecx set to 0, 1, 2, ... until it returns with
4677 		 * EAX[4:0] set to 0, which indicates there are no more
4678 		 * caches.
4679 		 *
4680 		 * Here, populate cpi_std[4] with the information returned by
4681 		 * function 4 when %ecx == 0, and do the rest in a later pass
4682 		 * when dynamic memory allocation becomes available.
4683 		 *
4684 		 * Note: we need to explicitly initialize %ecx here, since
4685 		 * function 4 may have been previously invoked.
4686 		 */
4687 		if (n == 4)
4688 			cp->cp_ecx = 0;
4689 
4690 		(void) __cpuid_insn(cp);
4691 		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4692 		switch (n) {
4693 		case 2:
4694 			/*
4695 			 * "the lower 8 bits of the %eax register
4696 			 * contain a value that identifies the number
4697 			 * of times the cpuid [instruction] has to be
4698 			 * executed to obtain a complete image of the
4699 			 * processor's caching systems."
4700 			 *
4701 			 * How *do* they make this stuff up?
4702 			 */
4703 			cpi->cpi_ncache = sizeof (*cp) *
4704 			    BITX(cp->cp_eax, 7, 0);
4705 			if (cpi->cpi_ncache == 0)
4706 				break;
4707 			cpi->cpi_ncache--;	/* skip count byte */
4708 
4709 			/*
4710 			 * Well, for now, rather than attempt to implement
4711 			 * this slightly dubious algorithm, we just look
4712 			 * at the first 15 ..
4713 			 */
4714 			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4715 				cpi->cpi_ncache = sizeof (*cp) - 1;
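			/*
			 * For example, %al == 1 (the usual case) means
			 * 1 * 16 = 16 descriptor bytes; after skipping the
			 * count byte itself, at most 15 descriptors remain to
			 * be copied out below.
			 */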
4716 
4717 			dp = cpi->cpi_cacheinfo;
4718 			if (BITX(cp->cp_eax, 31, 31) == 0) {
4719 				uint8_t *p = (void *)&cp->cp_eax;
4720 				for (i = 1; i < 4; i++)
4721 					if (p[i] != 0)
4722 						*dp++ = p[i];
4723 			}
4724 			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4725 				uint8_t *p = (void *)&cp->cp_ebx;
4726 				for (i = 0; i < 4; i++)
4727 					if (p[i] != 0)
4728 						*dp++ = p[i];
4729 			}
4730 			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4731 				uint8_t *p = (void *)&cp->cp_ecx;
4732 				for (i = 0; i < 4; i++)
4733 					if (p[i] != 0)
4734 						*dp++ = p[i];
4735 			}
4736 			if (BITX(cp->cp_edx, 31, 31) == 0) {
4737 				uint8_t *p = (void *)&cp->cp_edx;
4738 				for (i = 0; i < 4; i++)
4739 					if (p[i] != 0)
4740 						*dp++ = p[i];
4741 			}
4742 			break;
4743 
4744 		case 3:	/* Processor serial number, if PSN supported */
4745 			break;
4746 
4747 		case 4:	/* Deterministic cache parameters */
4748 			break;
4749 
4750 		case 5:	/* Monitor/Mwait parameters */
4751 		{
4752 			size_t mwait_size;
4753 
4754 			/*
4755 			 * check cpi_mwait.support which was set in
4756 			 * cpuid_pass_basic()
4757 			 */
4758 			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4759 				break;
4760 
4761 			/*
4762 			 * Protect ourself from insane mwait line size.
4763 			 * Protect ourselves from an insane mwait line size.
4764 			 * This works around incomplete hardware emulator(s).
4765 			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4766 			if (mwait_size < sizeof (uint32_t) ||
4767 			    !ISP2(mwait_size)) {
4768 #if DEBUG
4769 				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4770 				    "size %ld", cpu->cpu_id, (long)mwait_size);
4771 #endif
4772 				break;
4773 			}
4774 
4775 			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4776 			cpi->cpi_mwait.mon_max = mwait_size;
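			/*
			 * On most current x86 CPUs the smallest and largest
			 * monitor line sizes reported by leaf 5 are both 64
			 * bytes, so mon_min == mon_max == 64 is the typical
			 * result here.
			 */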
4777 			if (MWAIT_EXTENSION(cpi)) {
4778 				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4779 				if (MWAIT_INT_ENABLE(cpi))
4780 					cpi->cpi_mwait.support |=
4781 					    MWAIT_ECX_INT_ENABLE;
4782 			}
4783 			break;
4784 		}
4785 		default:
4786 			break;
4787 		}
4788 	}
4789 
4790 	/*
4791 	 * XSAVE enumeration
4792 	 */
4793 	if (cpi->cpi_maxeax >= 0xD) {
4794 		struct cpuid_regs regs;
4795 		boolean_t cpuid_d_valid = B_TRUE;
4796 
4797 		cp = &regs;
4798 		cp->cp_eax = 0xD;
4799 		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4800 
4801 		(void) __cpuid_insn(cp);
4802 
4803 		/*
4804 		 * Sanity checks for debug
4805 		 */
4806 		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4807 		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4808 			cpuid_d_valid = B_FALSE;
4809 		}
4810 
4811 		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4812 		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4813 		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4814 
4815 		/*
4816 		 * If the hw supports AVX, get the size and offset in the save
4817 		 * area for the ymm state.
4818 		 */
4819 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4820 			cp->cp_eax = 0xD;
4821 			cp->cp_ecx = 2;
4822 			cp->cp_edx = cp->cp_ebx = 0;
4823 
4824 			(void) __cpuid_insn(cp);
4825 
4826 			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4827 			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4828 				cpuid_d_valid = B_FALSE;
4829 			}
4830 
4831 			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4832 			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4833 		}
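		/*
		 * Architecturally the YMM component is expected to be 256
		 * bytes at offset 576 (512 bytes of legacy area plus the
		 * 64-byte XSAVE header), which is what the sanity check
		 * against CPUID_LEAFD_2_YMM_{SIZE,OFFSET} above encodes.
		 */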
4834 
4835 		/*
4836 		 * If the hw supports MPX, get the size and offset in the
4837 		 * save area for BNDREGS and BNDCSR.
4838 		 */
4839 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4840 			cp->cp_eax = 0xD;
4841 			cp->cp_ecx = 3;
4842 			cp->cp_edx = cp->cp_ebx = 0;
4843 
4844 			(void) __cpuid_insn(cp);
4845 
4846 			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4847 			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4848 
4849 			cp->cp_eax = 0xD;
4850 			cp->cp_ecx = 4;
4851 			cp->cp_edx = cp->cp_ebx = 0;
4852 
4853 			(void) __cpuid_insn(cp);
4854 
4855 			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4856 			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4857 		}
4858 
4859 		/*
4860 		 * If the hw supports AVX512, get the size and offset in the
4861 		 * save area for the opmask registers and zmm state.
4862 		 */
4863 		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4864 			cp->cp_eax = 0xD;
4865 			cp->cp_ecx = 5;
4866 			cp->cp_edx = cp->cp_ebx = 0;
4867 
4868 			(void) __cpuid_insn(cp);
4869 
4870 			cpi->cpi_xsave.opmask_size = cp->cp_eax;
4871 			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4872 
4873 			cp->cp_eax = 0xD;
4874 			cp->cp_ecx = 6;
4875 			cp->cp_edx = cp->cp_ebx = 0;
4876 
4877 			(void) __cpuid_insn(cp);
4878 
4879 			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4880 			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4881 
4882 			cp->cp_eax = 0xD;
4883 			cp->cp_ecx = 7;
4884 			cp->cp_edx = cp->cp_ebx = 0;
4885 
4886 			(void) __cpuid_insn(cp);
4887 
4888 			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4889 			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4890 		}
4891 
4892 		if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4893 			xsave_state_size = 0;
4894 		} else if (cpuid_d_valid) {
4895 			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4896 		} else {
4897 			/* Broken CPUID 0xD, probably in HVM */
4898 			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4899 			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4900 			    ", ymm_size = %d, ymm_offset = %d\n",
4901 			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4902 			    cpi->cpi_xsave.xsav_hw_features_high,
4903 			    (int)cpi->cpi_xsave.xsav_max_size,
4904 			    (int)cpi->cpi_xsave.ymm_size,
4905 			    (int)cpi->cpi_xsave.ymm_offset);
4906 
4907 			if (xsave_state_size != 0) {
4908 				/*
4909 				 * This must be a non-boot CPU. We cannot
4910 				 * continue, because boot cpu has already
4911 				 * enabled XSAVE.
4912 				 */
4913 				ASSERT(cpu->cpu_id != 0);
4914 				cmn_err(CE_PANIC, "cpu%d: we have already "
4915 				    "enabled XSAVE on boot cpu, cannot "
4916 				    "continue.", cpu->cpu_id);
4917 			} else {
4918 				/*
4919 				 * If we reached here on the boot CPU, it's also
4920 				 * almost certain that we'll reach here on the
4921 				 * non-boot CPUs. When we're here on a boot CPU
4922 				 * we should disable the feature; on a non-boot
4923 				 * CPU we need to confirm that we already have.
4924 				 */
4925 				if (cpu->cpu_id == 0) {
4926 					remove_x86_feature(x86_featureset,
4927 					    X86FSET_XSAVE);
4928 					remove_x86_feature(x86_featureset,
4929 					    X86FSET_AVX);
4930 					remove_x86_feature(x86_featureset,
4931 					    X86FSET_F16C);
4932 					remove_x86_feature(x86_featureset,
4933 					    X86FSET_BMI1);
4934 					remove_x86_feature(x86_featureset,
4935 					    X86FSET_BMI2);
4936 					remove_x86_feature(x86_featureset,
4937 					    X86FSET_FMA);
4938 					remove_x86_feature(x86_featureset,
4939 					    X86FSET_AVX2);
4940 					remove_x86_feature(x86_featureset,
4941 					    X86FSET_MPX);
4942 					remove_x86_feature(x86_featureset,
4943 					    X86FSET_AVX512F);
4944 					remove_x86_feature(x86_featureset,
4945 					    X86FSET_AVX512DQ);
4946 					remove_x86_feature(x86_featureset,
4947 					    X86FSET_AVX512PF);
4948 					remove_x86_feature(x86_featureset,
4949 					    X86FSET_AVX512ER);
4950 					remove_x86_feature(x86_featureset,
4951 					    X86FSET_AVX512CD);
4952 					remove_x86_feature(x86_featureset,
4953 					    X86FSET_AVX512BW);
4954 					remove_x86_feature(x86_featureset,
4955 					    X86FSET_AVX512VL);
4956 					remove_x86_feature(x86_featureset,
4957 					    X86FSET_AVX512FMA);
4958 					remove_x86_feature(x86_featureset,
4959 					    X86FSET_AVX512VBMI);
4960 					remove_x86_feature(x86_featureset,
4961 					    X86FSET_AVX512VNNI);
4962 					remove_x86_feature(x86_featureset,
4963 					    X86FSET_AVX512VPOPCDQ);
4964 					remove_x86_feature(x86_featureset,
4965 					    X86FSET_AVX512NNIW);
4966 					remove_x86_feature(x86_featureset,
4967 					    X86FSET_AVX512FMAPS);
4968 					remove_x86_feature(x86_featureset,
4969 					    X86FSET_VAES);
4970 					remove_x86_feature(x86_featureset,
4971 					    X86FSET_VPCLMULQDQ);
4972 					remove_x86_feature(x86_featureset,
4973 					    X86FSET_GFNI);
4974 					remove_x86_feature(x86_featureset,
4975 					    X86FSET_AVX512_VP2INT);
4976 					remove_x86_feature(x86_featureset,
4977 					    X86FSET_AVX512_BITALG);
4978 					remove_x86_feature(x86_featureset,
4979 					    X86FSET_AVX512_VBMI2);
4980 					remove_x86_feature(x86_featureset,
4981 					    X86FSET_AVX512_BF16);
4982 
4983 					xsave_force_disable = B_TRUE;
4984 				} else {
4985 					VERIFY(is_x86_feature(x86_featureset,
4986 					    X86FSET_XSAVE) == B_FALSE);
4987 				}
4988 			}
4989 		}
4990 	}
4991 
4992 
4993 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4994 		return;
4995 
4996 	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4997 		nmax = NMAX_CPI_EXTD;
4998 	/*
4999 	 * Copy the extended properties, fixing them as we go. We start at 2
5000 	 * because a few leaves were already handled in the basic pass, but we
5001 	 * simply re-fetch the rest of them here (e.g. 0x8, 0x21).
5002 	 */
5003 	iptr = (void *)cpi->cpi_brandstr;
5004 	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
5005 		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
5006 		(void) __cpuid_insn(cp);
5007 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
5008 		    cp);
5009 		switch (n) {
5010 		case 2:
5011 		case 3:
5012 		case 4:
5013 			/*
5014 			 * Extract the brand string
5015 			 */
5016 			*iptr++ = cp->cp_eax;
5017 			*iptr++ = cp->cp_ebx;
5018 			*iptr++ = cp->cp_ecx;
5019 			*iptr++ = cp->cp_edx;
5020 			break;
5021 		case 5:
5022 			switch (cpi->cpi_vendor) {
5023 			case X86_VENDOR_AMD:
5024 				/*
5025 				 * The Athlon and Duron were the first
5026 				 * parts to report the sizes of the
5027 				 * TLB for large pages. Before then,
5028 				 * we don't trust the data.
5029 				 */
5030 				if (cpi->cpi_family < 6 ||
5031 				    (cpi->cpi_family == 6 &&
5032 				    cpi->cpi_model < 1))
5033 					cp->cp_eax = 0;
5034 				break;
5035 			default:
5036 				break;
5037 			}
5038 			break;
5039 		case 6:
5040 			switch (cpi->cpi_vendor) {
5041 			case X86_VENDOR_AMD:
5042 				/*
5043 				 * The Athlon and Duron were the first
5044 				 * AMD parts with L2 TLB's.
5045 				 * Before then, don't trust the data.
5046 				 */
5047 				if (cpi->cpi_family < 6 ||
5048 				    (cpi->cpi_family == 6 &&
5049 				    cpi->cpi_model < 1))
5050 					cp->cp_eax = cp->cp_ebx = 0;
5051 				/*
5052 				 * AMD Duron rev A0 reports L2
5053 				 * cache size incorrectly as 1K
5054 				 * when it is really 64K
5055 				 */
5056 				if (cpi->cpi_family == 6 &&
5057 				    cpi->cpi_model == 3 &&
5058 				    cpi->cpi_step == 0) {
5059 					cp->cp_ecx &= 0xffff;
5060 					cp->cp_ecx |= 0x400000;
5061 				}
5062 				break;
5063 			case X86_VENDOR_Cyrix:	/* VIA C3 */
5064 				/*
5065 				 * VIA C3 processors are a bit messed
5066 				 * up w.r.t. encoding cache sizes in %ecx
5067 				 */
5068 				if (cpi->cpi_family != 6)
5069 					break;
5070 				/*
5071 				 * model 7 and 8 were incorrectly encoded
5072 				 *
5073 				 * xxx is model 8 really broken?
5074 				 */
5075 				if (cpi->cpi_model == 7 ||
5076 				    cpi->cpi_model == 8)
5077 					cp->cp_ecx =
5078 					    BITX(cp->cp_ecx, 31, 24) << 16 |
5079 					    BITX(cp->cp_ecx, 23, 16) << 12 |
5080 					    BITX(cp->cp_ecx, 15, 8) << 8 |
5081 					    BITX(cp->cp_ecx, 7, 0);
5082 				/*
5083 				 * model 9 stepping 1 has wrong associativity
5084 				 */
5085 				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
5086 					cp->cp_ecx |= 8 << 12;
5087 				break;
5088 			case X86_VENDOR_Intel:
5089 				/*
5090 				 * Extended L2 Cache features function.
5091 				 * First appeared on Prescott.
5092 				 */
5093 			default:
5094 				break;
5095 			}
5096 			break;
5097 		default:
5098 			break;
5099 		}
5100 	}
5101 }
5102 
5103 static const char *
5104 intel_cpubrand(const struct cpuid_info *cpi)
5105 {
5106 	int i;
5107 
5108 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5109 
5110 	switch (cpi->cpi_family) {
5111 	case 5:
5112 		return ("Intel Pentium(r)");
5113 	case 6:
5114 		switch (cpi->cpi_model) {
5115 			uint_t celeron, xeon;
5116 			const struct cpuid_regs *cp;
5117 		case 0:
5118 		case 1:
5119 		case 2:
5120 			return ("Intel Pentium(r) Pro");
5121 		case 3:
5122 		case 4:
5123 			return ("Intel Pentium(r) II");
5124 		case 6:
5125 			return ("Intel Celeron(r)");
5126 		case 5:
5127 		case 7:
5128 			celeron = xeon = 0;
5129 			cp = &cpi->cpi_std[2];	/* cache info */
5130 
5131 			for (i = 1; i < 4; i++) {
5132 				uint_t tmp;
5133 
5134 				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
5135 				if (tmp == 0x40)
5136 					celeron++;
5137 				if (tmp >= 0x44 && tmp <= 0x45)
5138 					xeon++;
5139 			}
5140 
5141 			for (i = 0; i < 2; i++) {
5142 				uint_t tmp;
5143 
5144 				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
5145 				if (tmp == 0x40)
5146 					celeron++;
5147 				else if (tmp >= 0x44 && tmp <= 0x45)
5148 					xeon++;
5149 			}
5150 
5151 			for (i = 0; i < 4; i++) {
5152 				uint_t tmp;
5153 
5154 				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
5155 				if (tmp == 0x40)
5156 					celeron++;
5157 				else if (tmp >= 0x44 && tmp <= 0x45)
5158 					xeon++;
5159 			}
5160 
5161 			for (i = 0; i < 4; i++) {
5162 				uint_t tmp;
5163 
5164 				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
5165 				if (tmp == 0x40)
5166 					celeron++;
5167 				else if (tmp >= 0x44 && tmp <= 0x45)
5168 					xeon++;
5169 			}
5170 
5171 			if (celeron)
5172 				return ("Intel Celeron(r)");
5173 			if (xeon)
5174 				return (cpi->cpi_model == 5 ?
5175 				    "Intel Pentium(r) II Xeon(tm)" :
5176 				    "Intel Pentium(r) III Xeon(tm)");
5177 			return (cpi->cpi_model == 5 ?
5178 			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
5179 			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
5180 		default:
5181 			break;
5182 		}
5183 	default:
5184 		break;
5185 	}
5186 
5187 	/* BrandID is present if the field is nonzero */
5188 	if (cpi->cpi_brandid != 0) {
5189 		static const struct {
5190 			uint_t bt_bid;
5191 			const char *bt_str;
5192 		} brand_tbl[] = {
5193 			{ 0x1,	"Intel(r) Celeron(r)" },
5194 			{ 0x2,	"Intel(r) Pentium(r) III" },
5195 			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
5196 			{ 0x4,	"Intel(r) Pentium(r) III" },
5197 			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
5198 			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
5199 			{ 0x8,	"Intel(r) Pentium(r) 4" },
5200 			{ 0x9,	"Intel(r) Pentium(r) 4" },
5201 			{ 0xa,	"Intel(r) Celeron(r)" },
5202 			{ 0xb,	"Intel(r) Xeon(tm)" },
5203 			{ 0xc,	"Intel(r) Xeon(tm) MP" },
5204 			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
5205 			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
5206 			{ 0x11, "Mobile Genuine Intel(r)" },
5207 			{ 0x12, "Intel(r) Celeron(r) M" },
5208 			{ 0x13, "Mobile Intel(r) Celeron(r)" },
5209 			{ 0x14, "Intel(r) Celeron(r)" },
5210 			{ 0x15, "Mobile Genuine Intel(r)" },
5211 			{ 0x16,	"Intel(r) Pentium(r) M" },
5212 			{ 0x17, "Mobile Intel(r) Celeron(r)" }
5213 		};
5214 		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
5215 		uint_t sgn;
5216 
5217 		sgn = (cpi->cpi_family << 8) |
5218 		    (cpi->cpi_model << 4) | cpi->cpi_step;
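		/*
		 * The signature packs family, model and stepping into one
		 * value; for example, family 6, model 0xb, stepping 1 yields
		 * 0x6b1, one of the special cases tested below.
		 */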
5219 
5220 		for (i = 0; i < btblmax; i++)
5221 			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
5222 				break;
5223 		if (i < btblmax) {
5224 			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
5225 				return ("Intel(r) Celeron(r)");
5226 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
5227 				return ("Intel(r) Xeon(tm) MP");
5228 			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
5229 				return ("Intel(r) Xeon(tm)");
5230 			return (brand_tbl[i].bt_str);
5231 		}
5232 	}
5233 
5234 	return (NULL);
5235 }
5236 
5237 static const char *
5238 amd_cpubrand(const struct cpuid_info *cpi)
5239 {
5240 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5241 
5242 	switch (cpi->cpi_family) {
5243 	case 5:
5244 		switch (cpi->cpi_model) {
5245 		case 0:
5246 		case 1:
5247 		case 2:
5248 		case 3:
5249 		case 4:
5250 		case 5:
5251 			return ("AMD-K5(r)");
5252 		case 6:
5253 		case 7:
5254 			return ("AMD-K6(r)");
5255 		case 8:
5256 			return ("AMD-K6(r)-2");
5257 		case 9:
5258 			return ("AMD-K6(r)-III");
5259 		default:
5260 			return ("AMD (family 5)");
5261 		}
5262 	case 6:
5263 		switch (cpi->cpi_model) {
5264 		case 1:
5265 			return ("AMD-K7(tm)");
5266 		case 0:
5267 		case 2:
5268 		case 4:
5269 			return ("AMD Athlon(tm)");
5270 		case 3:
5271 		case 7:
5272 			return ("AMD Duron(tm)");
5273 		case 6:
5274 		case 8:
5275 		case 10:
5276 			/*
5277 			 * Use the L2 cache size to distinguish
5278 			 */
5279 			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
5280 			    "AMD Athlon(tm)" : "AMD Duron(tm)");
5281 		default:
5282 			return ("AMD (family 6)");
5283 		}
5284 	default:
5285 		break;
5286 	}
5287 
5288 	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
5289 	    cpi->cpi_brandid != 0) {
5290 		switch (BITX(cpi->cpi_brandid, 7, 5)) {
5291 		case 3:
5292 			return ("AMD Opteron(tm) UP 1xx");
5293 		case 4:
5294 			return ("AMD Opteron(tm) DP 2xx");
5295 		case 5:
5296 			return ("AMD Opteron(tm) MP 8xx");
5297 		default:
5298 			return ("AMD Opteron(tm)");
5299 		}
5300 	}
5301 
5302 	return (NULL);
5303 }
5304 
5305 static const char *
5306 cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
5307 {
5308 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
5309 
5310 	switch (type) {
5311 	case X86_TYPE_CYRIX_6x86:
5312 		return ("Cyrix 6x86");
5313 	case X86_TYPE_CYRIX_6x86L:
5314 		return ("Cyrix 6x86L");
5315 	case X86_TYPE_CYRIX_6x86MX:
5316 		return ("Cyrix 6x86MX");
5317 	case X86_TYPE_CYRIX_GXm:
5318 		return ("Cyrix GXm");
5319 	case X86_TYPE_CYRIX_MediaGX:
5320 		return ("Cyrix MediaGX");
5321 	case X86_TYPE_CYRIX_MII:
5322 		return ("Cyrix M2");
5323 	case X86_TYPE_VIA_CYRIX_III:
5324 		return ("VIA Cyrix M3");
5325 	default:
5326 		/*
5327 		 * Have another wild guess ..
5328 		 */
5329 		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
5330 			return ("Cyrix 5x86");
5331 		else if (cpi->cpi_family == 5) {
5332 			switch (cpi->cpi_model) {
5333 			case 2:
5334 				return ("Cyrix 6x86");	/* Cyrix M1 */
5335 			case 4:
5336 				return ("Cyrix MediaGX");
5337 			default:
5338 				break;
5339 			}
5340 		} else if (cpi->cpi_family == 6) {
5341 			switch (cpi->cpi_model) {
5342 			case 0:
5343 				return ("Cyrix 6x86MX"); /* Cyrix M2? */
5344 			case 5:
5345 			case 6:
5346 			case 7:
5347 			case 8:
5348 			case 9:
5349 				return ("VIA C3");
5350 			default:
5351 				break;
5352 			}
5353 		}
5354 		break;
5355 	}
5356 	return (NULL);
5357 }
5358 
5359 /*
5360  * This only gets called in the case that the CPU extended
5361  * feature brand string leaves (0x80000002, 0x80000003, 0x80000004)
5362  * aren't available, or contain null bytes for some reason.
5363  */
5364 static void
5365 fabricate_brandstr(struct cpuid_info *cpi)
5366 {
5367 	const char *brand = NULL;
5368 
5369 	switch (cpi->cpi_vendor) {
5370 	case X86_VENDOR_Intel:
5371 		brand = intel_cpubrand(cpi);
5372 		break;
5373 	case X86_VENDOR_AMD:
5374 		brand = amd_cpubrand(cpi);
5375 		break;
5376 	case X86_VENDOR_Cyrix:
5377 		brand = cyrix_cpubrand(cpi, x86_type);
5378 		break;
5379 	case X86_VENDOR_NexGen:
5380 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5381 			brand = "NexGen Nx586";
5382 		break;
5383 	case X86_VENDOR_Centaur:
5384 		if (cpi->cpi_family == 5)
5385 			switch (cpi->cpi_model) {
5386 			case 4:
5387 				brand = "Centaur C6";
5388 				break;
5389 			case 8:
5390 				brand = "Centaur C2";
5391 				break;
5392 			case 9:
5393 				brand = "Centaur C3";
5394 				break;
5395 			default:
5396 				break;
5397 			}
5398 		break;
5399 	case X86_VENDOR_Rise:
5400 		if (cpi->cpi_family == 5 &&
5401 		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
5402 			brand = "Rise mP6";
5403 		break;
5404 	case X86_VENDOR_SiS:
5405 		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
5406 			brand = "SiS 55x";
5407 		break;
5408 	case X86_VENDOR_TM:
5409 		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
5410 			brand = "Transmeta Crusoe TM3x00 or TM5x00";
5411 		break;
5412 	case X86_VENDOR_NSC:
5413 	case X86_VENDOR_UMC:
5414 	default:
5415 		break;
5416 	}
5417 	if (brand) {
5418 		(void) strcpy((char *)cpi->cpi_brandstr, brand);
5419 		return;
5420 	}
5421 
5422 	/*
5423 	 * If all else fails ...
5424 	 */
5425 	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
5426 	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
5427 	    cpi->cpi_model, cpi->cpi_step);
5428 }
5429 
5430 /*
5431  * This routine is called just after kernel memory allocation
5432  * becomes available on cpu0, and as part of mp_startup() on
5433  * the other cpus.
5434  *
5435  * Fixup the brand string, and collect any information from cpuid
5436  * that requires dynamically allocated storage to represent.
5437  */
5438 
5439 static void
5440 cpuid_pass_dynamic(cpu_t *cpu, void *_arg __unused)
5441 {
5442 	int	i, max, shft, level, size;
5443 	struct cpuid_regs regs;
5444 	struct cpuid_regs *cp;
5445 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5446 
5447 	/*
5448 	 * Deterministic cache parameters
5449 	 *
5450 	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5451 	 * values that are present are currently defined to be the same. This
5452 	 * means we can use the same logic to parse it as long as we use the
5453 	 * appropriate leaf to get the data. If you're updating this, make sure
5454 	 * you're careful about which vendor supports which aspect.
5455 	 *
5456 	 * Take this opportunity to detect the number of threads sharing the
5457 	 * last level cache, and construct a corresponding cache id. The
5458 	 * respective cpuid_info members are initialized to the default case of
5459 	 * "no last level cache sharing".
5460 	 */
5461 	cpi->cpi_ncpu_shr_last_cache = 1;
5462 	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5463 
5464 	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5465 	    ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5466 	    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5467 	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5468 	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5469 		uint32_t leaf;
5470 
5471 		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5472 			leaf = 4;
5473 		} else {
5474 			leaf = CPUID_LEAF_EXT_1d;
5475 		}
5476 
5477 		/*
5478 		 * Find the # of elements (size) returned by the leaf and along
5479 		 * the way detect last level cache sharing details.
5480 		 */
5481 		bzero(&regs, sizeof (regs));
5482 		cp = &regs;
5483 		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5484 			cp->cp_eax = leaf;
5485 			cp->cp_ecx = i;
5486 
5487 			(void) __cpuid_insn(cp);
5488 
5489 			if (CPI_CACHE_TYPE(cp) == 0)
5490 				break;
5491 			level = CPI_CACHE_LVL(cp);
5492 			if (level > max) {
5493 				max = level;
5494 				cpi->cpi_ncpu_shr_last_cache =
5495 				    CPI_NTHR_SHR_CACHE(cp) + 1;
5496 			}
5497 		}
5498 		cpi->cpi_cache_leaf_size = size = i;
5499 
5500 		/*
5501 		 * Allocate the cpi_cache_leaves array. The first element
5502 		 * references the regs for the corresponding leaf with %ecx set
5503 		 * to 0. This was gathered in cpuid_pass_extended().
5504 		 */
5505 		if (size > 0) {
5506 			cpi->cpi_cache_leaves =
5507 			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5508 			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5509 				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5510 			} else {
5511 				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5512 			}
5513 
5514 			/*
5515 			 * Allocate storage to hold the additional regs
5516 			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5517 			 *
5518 			 * The regs for the leaf, %ecx == 0 has already
5519 			 * been allocated as indicated above.
5520 			 */
5521 			for (i = 1; i < size; i++) {
5522 				cp = cpi->cpi_cache_leaves[i] =
5523 				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5524 				cp->cp_eax = leaf;
5525 				cp->cp_ecx = i;
5526 
5527 				(void) __cpuid_insn(cp);
5528 			}
5529 		}
5530 		/*
5531 		 * Determine the number of bits needed to represent
5532 		 * the number of CPUs sharing the last level cache.
5533 		 *
5534 		 * Shift off that number of bits from the APIC id to
5535 		 * derive the cache id.
5536 		 */
5537 		shft = 0;
5538 		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5539 			shft++;
5540 		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5541 	}
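	/*
	 * As a worked example of the shift above: with 8 logical CPUs sharing
	 * the last level cache, the loop yields shft == 3 (i takes the values
	 * 1, 2 and 4), so the cache id is the APIC id with its low 3 bits
	 * stripped.
	 */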
5542 
5543 	/*
5544 	 * Now fixup the brand string
5545 	 */
5546 	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5547 		fabricate_brandstr(cpi);
5548 	} else {
5549 
5550 		/*
5551 		 * If we successfully extracted a brand string from the cpuid
5552 		 * instruction, clean it up by removing leading spaces and
5553 		 * similar junk.
5554 		 */
5555 		if (cpi->cpi_brandstr[0]) {
5556 			size_t maxlen = sizeof (cpi->cpi_brandstr);
5557 			char *src, *dst;
5558 
5559 			dst = src = (char *)cpi->cpi_brandstr;
5560 			src[maxlen - 1] = '\0';
5561 			/*
5562 			 * strip leading spaces
5563 			 */
5564 			while (*src == ' ')
5565 				src++;
5566 			/*
5567 			 * Remove any 'Genuine' or "Authentic" prefixes
5568 			 */
5569 			if (strncmp(src, "Genuine ", 8) == 0)
5570 				src += 8;
5571 			if (strncmp(src, "Authentic ", 10) == 0)
5572 				src += 10;
5573 
5574 			/*
5575 			 * Now do an in-place copy.
5576 			 * Map (R) to (r) and (TM) to (tm).
5577 			 * The era of teletypes is long gone, and there's
5578 			 * -really- no need to shout.
5579 			 */
5580 			while (*src != '\0') {
5581 				if (src[0] == '(') {
5582 					if (strncmp(src + 1, "R)", 2) == 0) {
5583 						(void) strncpy(dst, "(r)", 3);
5584 						src += 3;
5585 						dst += 3;
5586 						continue;
5587 					}
5588 					if (strncmp(src + 1, "TM)", 3) == 0) {
5589 						(void) strncpy(dst, "(tm)", 4);
5590 						src += 4;
5591 						dst += 4;
5592 						continue;
5593 					}
5594 				}
5595 				*dst++ = *src++;
5596 			}
5597 			*dst = '\0';
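			/*
			 * For example, a hypothetical raw brand string such as
			 * "  Genuine Intel(R) Xeon(TM) CPU" would read
			 * "Intel(r) Xeon(tm) CPU" at this point.
			 */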
5598 
5599 			/*
5600 			 * Finally, remove any trailing spaces
5601 			 */
5602 			while (--dst > cpi->cpi_brandstr)
5603 				if (*dst == ' ')
5604 					*dst = '\0';
5605 				else
5606 					break;
5607 		} else
5608 			fabricate_brandstr(cpi);
5609 	}
5610 }
5611 
5612 typedef struct {
5613 	uint32_t avm_av;
5614 	uint32_t avm_feat;
5615 } av_feat_map_t;
5616 
5617 /*
5618  * These arrays are used to map features that we should add based on x86
5619  * features that are present. As a large number depend on kernel features,
5620  * rather than rechecking and clearing CPUID everywhere, we simply map these.
5621  * There is an array of these for each hwcap word. Some features aren't tracked
5622  * in the kernel x86 featureset and that's ok. They will not show up in here.
5623  */
5624 static const av_feat_map_t x86fset_to_av1[] = {
5625 	{ AV_386_CX8, X86FSET_CX8 },
5626 	{ AV_386_SEP, X86FSET_SEP },
5627 	{ AV_386_AMD_SYSC, X86FSET_ASYSC },
5628 	{ AV_386_CMOV, X86FSET_CMOV },
5629 	{ AV_386_FXSR, X86FSET_SSE },
5630 	{ AV_386_SSE, X86FSET_SSE },
5631 	{ AV_386_SSE2, X86FSET_SSE2 },
5632 	{ AV_386_SSE3, X86FSET_SSE3 },
5633 	{ AV_386_CX16, X86FSET_CX16 },
5634 	{ AV_386_TSCP, X86FSET_TSCP },
5635 	{ AV_386_AMD_SSE4A, X86FSET_SSE4A },
5636 	{ AV_386_SSSE3, X86FSET_SSSE3 },
5637 	{ AV_386_SSE4_1, X86FSET_SSE4_1 },
5638 	{ AV_386_SSE4_2, X86FSET_SSE4_2 },
5639 	{ AV_386_AES, X86FSET_AES },
5640 	{ AV_386_PCLMULQDQ, X86FSET_PCLMULQDQ },
5641 	{ AV_386_XSAVE, X86FSET_XSAVE },
5642 	{ AV_386_AVX, X86FSET_AVX },
5643 	{ AV_386_VMX, X86FSET_VMX },
5644 	{ AV_386_AMD_SVM, X86FSET_SVM }
5645 };
5646 
5647 static const av_feat_map_t x86fset_to_av2[] = {
5648 	{ AV_386_2_F16C, X86FSET_F16C },
5649 	{ AV_386_2_RDRAND, X86FSET_RDRAND },
5650 	{ AV_386_2_BMI1, X86FSET_BMI1 },
5651 	{ AV_386_2_BMI2, X86FSET_BMI2 },
5652 	{ AV_386_2_FMA, X86FSET_FMA },
5653 	{ AV_386_2_AVX2, X86FSET_AVX2 },
5654 	{ AV_386_2_ADX, X86FSET_ADX },
5655 	{ AV_386_2_RDSEED, X86FSET_RDSEED },
5656 	{ AV_386_2_AVX512F, X86FSET_AVX512F },
5657 	{ AV_386_2_AVX512DQ, X86FSET_AVX512DQ },
5658 	{ AV_386_2_AVX512IFMA, X86FSET_AVX512FMA },
5659 	{ AV_386_2_AVX512PF, X86FSET_AVX512PF },
5660 	{ AV_386_2_AVX512ER, X86FSET_AVX512ER },
5661 	{ AV_386_2_AVX512CD, X86FSET_AVX512CD },
5662 	{ AV_386_2_AVX512BW, X86FSET_AVX512BW },
5663 	{ AV_386_2_AVX512VL, X86FSET_AVX512VL },
5664 	{ AV_386_2_AVX512VBMI, X86FSET_AVX512VBMI },
5665 	{ AV_386_2_AVX512VPOPCDQ, X86FSET_AVX512VPOPCDQ },
5666 	{ AV_386_2_SHA, X86FSET_SHA },
5667 	{ AV_386_2_FSGSBASE, X86FSET_FSGSBASE },
5668 	{ AV_386_2_CLFLUSHOPT, X86FSET_CLFLUSHOPT },
5669 	{ AV_386_2_CLWB, X86FSET_CLWB },
5670 	{ AV_386_2_MONITORX, X86FSET_MONITORX },
5671 	{ AV_386_2_CLZERO, X86FSET_CLZERO },
5672 	{ AV_386_2_AVX512_VNNI, X86FSET_AVX512VNNI },
5673 	{ AV_386_2_VPCLMULQDQ, X86FSET_VPCLMULQDQ },
5674 	{ AV_386_2_VAES, X86FSET_VAES },
5675 	{ AV_386_2_GFNI, X86FSET_GFNI },
5676 	{ AV_386_2_AVX512_VP2INT, X86FSET_AVX512_VP2INT },
5677 	{ AV_386_2_AVX512_BITALG, X86FSET_AVX512_BITALG }
5678 };
5679 
5680 static const av_feat_map_t x86fset_to_av3[] = {
5681 	{ AV_386_3_AVX512_VBMI2, X86FSET_AVX512_VBMI2 },
5682 	{ AV_386_3_AVX512_BF16, X86FSET_AVX512_BF16 }
5683 };
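/*
 * Extending these maps follows the same pattern: a new entry simply pairs an
 * aux vector bit with the corresponding x86 feature, e.g. (with hypothetical
 * identifiers):
 *
 *	{ AV_386_3_NEWCAP, X86FSET_NEWCAP }
 *
 * cpuid_pass_resolve() below will then advertise it without further changes
 * here.
 */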
5684 
5685 /*
5686  * This routine is called out of bind_hwcap() much later in the life
5687  * of the kernel (post_startup()).  The job of this routine is to resolve
5688  * the hardware feature support and kernel support for those features into
5689  * what we're actually going to tell applications via the aux vector.
5690  *
5691  * Most of the aux vector is derived from the x86_featureset array, where a
5692  * given feature indicates that an aux vector bit should be plumbed through.
5693  * This allows the kernel to use one tracking mechanism for these based on
5694  * whether or not it has the required hardware support (most often xsave).
5695  * Most newer features are added there in case we need them in the kernel.
5696  * Otherwise, features are evaluated by looking at the cpuid features that remain. If
5697  * you find yourself wanting to clear out cpuid features for some reason, they
5698  * should instead be driven by the feature set so we have a consistent view.
5699  */
5700 
5701 static void
5702 cpuid_pass_resolve(cpu_t *cpu, void *arg)
5703 {
5704 	uint_t *hwcap_out = (uint_t *)arg;
5705 	struct cpuid_info *cpi;
5706 	uint_t hwcap_flags = 0, hwcap_flags_2 = 0, hwcap_flags_3 = 0;
5707 
5708 	cpi = cpu->cpu_m.mcpu_cpi;
5709 
5710 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av1); i++) {
5711 		if (is_x86_feature(x86_featureset,
5712 		    x86fset_to_av1[i].avm_feat)) {
5713 			hwcap_flags |= x86fset_to_av1[i].avm_av;
5714 		}
5715 	}
5716 
5717 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av2); i++) {
5718 		if (is_x86_feature(x86_featureset,
5719 		    x86fset_to_av2[i].avm_feat)) {
5720 			hwcap_flags_2 |= x86fset_to_av2[i].avm_av;
5721 		}
5722 	}
5723 
5724 	for (uint_t i = 0; i < ARRAY_SIZE(x86fset_to_av3); i++) {
5725 		if (is_x86_feature(x86_featureset,
5726 		    x86fset_to_av3[i].avm_feat)) {
5727 			hwcap_flags_3 |= x86fset_to_av3[i].avm_av;
5728 		}
5729 	}
5730 
5731 	/*
5732 	 * From here on out we're working through features that don't have
5733 	 * corresponding kernel feature flags for various reasons that are
5734 	 * mostly just due to the historical implementation.
5735 	 */
5736 	if (cpi->cpi_maxeax >= 1) {
5737 		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5738 		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5739 
5740 		*edx = CPI_FEATURES_EDX(cpi);
5741 		*ecx = CPI_FEATURES_ECX(cpi);
5742 
5743 		/*
5744 		 * [no explicit support required beyond x87 fp context]
5745 		 */
5746 		if (!fpu_exists)
5747 			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5748 
5749 		/*
5750 		 * Now map the supported feature vector to things that we
5751 		 * think userland will care about.
5752 		 */
5753 		if (*ecx & CPUID_INTC_ECX_MOVBE)
5754 			hwcap_flags |= AV_386_MOVBE;
5755 
5756 		if (*ecx & CPUID_INTC_ECX_POPCNT)
5757 			hwcap_flags |= AV_386_POPCNT;
5758 		if (*edx & CPUID_INTC_EDX_FPU)
5759 			hwcap_flags |= AV_386_FPU;
5760 		if (*edx & CPUID_INTC_EDX_MMX)
5761 			hwcap_flags |= AV_386_MMX;
5762 		if (*edx & CPUID_INTC_EDX_TSC)
5763 			hwcap_flags |= AV_386_TSC;
5764 	}
5765 
5766 	/*
5767 	 * Check a few miscellaneous features.
5768 	 */
5769 	if (cpi->cpi_xmaxeax < 0x80000001)
5770 		goto resolve_done;
5771 
5772 	switch (cpi->cpi_vendor) {
5773 		uint32_t *edx, *ecx;
5774 
5775 	case X86_VENDOR_Intel:
5776 		/*
5777 		 * Seems like Intel duplicated what was necessary
5778 		 * here to make the initial crop of 64-bit OS's work.
5779 		 * Hopefully, those are the only "extended" bits
5780 		 * they'll add.
5781 		 */
5782 		/*FALLTHROUGH*/
5783 
5784 	case X86_VENDOR_AMD:
5785 	case X86_VENDOR_HYGON:
5786 		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5787 		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5788 
5789 		*edx = CPI_FEATURES_XTD_EDX(cpi);
5790 		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5791 
5792 		/*
5793 		 * [no explicit support required beyond
5794 		 * x87 fp context and exception handlers]
5795 		 */
5796 		if (!fpu_exists)
5797 			*edx &= ~(CPUID_AMD_EDX_MMXamd |
5798 			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5799 
5800 		/*
5801 		 * Now map the supported feature vector to
5802 		 * things that we think userland will care about.
5803 		 */
5804 		if (*edx & CPUID_AMD_EDX_MMXamd)
5805 			hwcap_flags |= AV_386_AMD_MMX;
5806 		if (*edx & CPUID_AMD_EDX_3DNow)
5807 			hwcap_flags |= AV_386_AMD_3DNow;
5808 		if (*edx & CPUID_AMD_EDX_3DNowx)
5809 			hwcap_flags |= AV_386_AMD_3DNowx;
5810 
5811 		switch (cpi->cpi_vendor) {
5812 		case X86_VENDOR_AMD:
5813 		case X86_VENDOR_HYGON:
5814 			if (*ecx & CPUID_AMD_ECX_AHF64)
5815 				hwcap_flags |= AV_386_AHF;
5816 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5817 				hwcap_flags |= AV_386_AMD_LZCNT;
5818 			break;
5819 
5820 		case X86_VENDOR_Intel:
5821 			if (*ecx & CPUID_AMD_ECX_LZCNT)
5822 				hwcap_flags |= AV_386_AMD_LZCNT;
5823 			/*
5824 			 * Aarrgh.
5825 			 * Intel uses a different bit in the same word.
5826 			 */
5827 			if (*ecx & CPUID_INTC_ECX_AHF64)
5828 				hwcap_flags |= AV_386_AHF;
5829 			break;
5830 		default:
5831 			break;
5832 		}
5833 		break;
5834 
5835 	default:
5836 		break;
5837 	}
5838 
5839 resolve_done:
5840 	if (hwcap_out != NULL) {
5841 		hwcap_out[0] = hwcap_flags;
5842 		hwcap_out[1] = hwcap_flags_2;
5843 		hwcap_out[2] = hwcap_flags_3;
5844 	}
5845 }
5846 
5847 
5848 /*
5849  * Simulate the cpuid instruction using the data we previously
5850  * captured about this CPU.  We try our best to return the truth
5851  * about the hardware, independently of kernel support.
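 *
 * For example, a caller that wants the cached leaf 1 data for the current
 * CPU might do something like this (illustrative sketch only):
 *
 *	struct cpuid_regs cp;
 *
 *	bzero(&cp, sizeof (cp));
 *	cp.cp_eax = 1;
 *	(void) cpuid_insn(NULL, &cp);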
5852  */
5853 uint32_t
5854 cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5855 {
5856 	struct cpuid_info *cpi;
5857 	struct cpuid_regs *xcp;
5858 
5859 	if (cpu == NULL)
5860 		cpu = CPU;
5861 	cpi = cpu->cpu_m.mcpu_cpi;
5862 
5863 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5864 
5865 	/*
5866 	 * CPUID data is cached in two separate places: cpi_std for standard
5867 	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5868 	 */
5869 	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5870 		xcp = &cpi->cpi_std[cp->cp_eax];
5871 	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5872 	    cp->cp_eax <= cpi->cpi_xmaxeax &&
5873 	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5874 		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5875 	} else {
5876 		/*
5877 		 * The caller is asking for data from an input parameter which
5878 		 * the kernel has not cached.  In this case we go fetch from
5879 		 * the hardware and return the data directly to the user.
5880 		 */
5881 		return (__cpuid_insn(cp));
5882 	}
5883 
5884 	cp->cp_eax = xcp->cp_eax;
5885 	cp->cp_ebx = xcp->cp_ebx;
5886 	cp->cp_ecx = xcp->cp_ecx;
5887 	cp->cp_edx = xcp->cp_edx;
5888 	return (cp->cp_eax);
5889 }
5890 
5891 boolean_t
5892 cpuid_checkpass(const cpu_t *const cpu, const cpuid_pass_t pass)
5893 {
5894 	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5895 	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5896 }
5897 
5898 int
5899 cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5900 {
5901 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
5902 
5903 	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5904 }
5905 
5906 int
5907 cpuid_is_cmt(cpu_t *cpu)
5908 {
5909 	if (cpu == NULL)
5910 		cpu = CPU;
5911 
5912 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5913 
5914 	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5915 }
5916 
5917 /*
5918  * AMD and Intel both implement the 64-bit variant of the syscall
5919  * instruction (syscallq), so if there's -any- support for syscall,
5920  * cpuid currently says "yes, we support this".
5921  *
5922  * However, Intel decided to -not- implement the 32-bit variant of the
5923  * syscall instruction, so we provide a predicate to allow our caller
5924  * to test that subtlety here.
5925  *
5926  * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
5927  *	even in the case where the hardware would in fact support it.
5928  */
5929 /*ARGSUSED*/
5930 int
5931 cpuid_syscall32_insn(cpu_t *cpu)
5932 {
5933 	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), CPUID_PASS_BASIC));
5934 
5935 #if !defined(__xpv)
5936 	if (cpu == NULL)
5937 		cpu = CPU;
5938 
5939 	/*CSTYLED*/
5940 	{
5941 		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5942 
5943 		if ((cpi->cpi_vendor == X86_VENDOR_AMD ||
5944 		    cpi->cpi_vendor == X86_VENDOR_HYGON) &&
5945 		    cpi->cpi_xmaxeax >= 0x80000001 &&
5946 		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5947 			return (1);
5948 	}
5949 #endif
5950 	return (0);
5951 }
5952 
5953 int
5954 cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5955 {
5956 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5957 
5958 	static const char fmt[] =
5959 	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
5960 	static const char fmt_ht[] =
5961 	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5962 
5963 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
5964 
5965 	if (cpuid_is_cmt(cpu))
5966 		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5967 		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5968 		    cpi->cpi_family, cpi->cpi_model,
5969 		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5970 	return (snprintf(s, n, fmt,
5971 	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5972 	    cpi->cpi_family, cpi->cpi_model,
5973 	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5974 }
5975 
5976 const char *
5977 cpuid_getvendorstr(cpu_t *cpu)
5978 {
5979 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5980 	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5981 }
5982 
5983 uint_t
5984 cpuid_getvendor(cpu_t *cpu)
5985 {
5986 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5987 	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5988 }
5989 
5990 uint_t
5991 cpuid_getfamily(cpu_t *cpu)
5992 {
5993 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
5994 	return (cpu->cpu_m.mcpu_cpi->cpi_family);
5995 }
5996 
5997 uint_t
5998 cpuid_getmodel(cpu_t *cpu)
5999 {
6000 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6001 	return (cpu->cpu_m.mcpu_cpi->cpi_model);
6002 }
6003 
6004 uint_t
6005 cpuid_get_ncpu_per_chip(cpu_t *cpu)
6006 {
6007 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6008 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
6009 }
6010 
6011 uint_t
6012 cpuid_get_ncore_per_chip(cpu_t *cpu)
6013 {
6014 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6015 	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
6016 }
6017 
6018 uint_t
6019 cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
6020 {
6021 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6022 	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
6023 }
6024 
6025 id_t
6026 cpuid_get_last_lvl_cacheid(cpu_t *cpu)
6027 {
6028 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_EXTENDED));
6029 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6030 }
6031 
6032 uint_t
6033 cpuid_getstep(cpu_t *cpu)
6034 {
6035 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6036 	return (cpu->cpu_m.mcpu_cpi->cpi_step);
6037 }
6038 
6039 uint_t
6040 cpuid_getsig(struct cpu *cpu)
6041 {
6042 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6043 	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
6044 }
6045 
6046 uint32_t
6047 cpuid_getchiprev(struct cpu *cpu)
6048 {
6049 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6050 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
6051 }
6052 
6053 const char *
6054 cpuid_getchiprevstr(struct cpu *cpu)
6055 {
6056 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6057 	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
6058 }
6059 
6060 uint32_t
6061 cpuid_getsockettype(struct cpu *cpu)
6062 {
6063 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6064 	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
6065 }
6066 
6067 const char *
6068 cpuid_getsocketstr(cpu_t *cpu)
6069 {
6070 	static const char *socketstr = NULL;
6071 	struct cpuid_info *cpi;
6072 
6073 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_IDENT));
6074 	cpi = cpu->cpu_m.mcpu_cpi;
6075 
6076 	/* Assume that socket types are the same across the system */
6077 	if (socketstr == NULL)
6078 		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
6079 		    cpi->cpi_model, cpi->cpi_step);
6080 
6081 
6082 	return (socketstr);
6083 }
6084 
6085 x86_uarchrev_t
6086 cpuid_getuarchrev(cpu_t *cpu)
6087 {
6088 	return (cpu->cpu_m.mcpu_cpi->cpi_uarchrev);
6089 }
6090 
6091 int
6092 cpuid_get_chipid(cpu_t *cpu)
6093 {
6094 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6095 
6096 	if (cpuid_is_cmt(cpu))
6097 		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
6098 	return (cpu->cpu_id);
6099 }
6100 
6101 id_t
6102 cpuid_get_coreid(cpu_t *cpu)
6103 {
6104 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6105 	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
6106 }
6107 
6108 int
6109 cpuid_get_pkgcoreid(cpu_t *cpu)
6110 {
6111 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6112 	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
6113 }
6114 
6115 int
6116 cpuid_get_clogid(cpu_t *cpu)
6117 {
6118 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6119 	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
6120 }
6121 
6122 int
6123 cpuid_get_cacheid(cpu_t *cpu)
6124 {
6125 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6126 	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
6127 }
6128 
6129 uint_t
6130 cpuid_get_procnodeid(cpu_t *cpu)
6131 {
6132 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6133 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
6134 }
6135 
6136 uint_t
6137 cpuid_get_procnodes_per_pkg(cpu_t *cpu)
6138 {
6139 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6140 	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
6141 }
6142 
6143 uint_t
6144 cpuid_get_compunitid(cpu_t *cpu)
6145 {
6146 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6147 	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
6148 }
6149 
6150 uint_t
6151 cpuid_get_cores_per_compunit(cpu_t *cpu)
6152 {
6153 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6154 	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
6155 }
6156 
6157 uint32_t
6158 cpuid_get_apicid(cpu_t *cpu)
6159 {
6160 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6161 	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
6162 		return (UINT32_MAX);
6163 	} else {
6164 		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
6165 	}
6166 }
6167 
6168 void
6169 cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
6170 {
6171 	struct cpuid_info *cpi;
6172 
6173 	if (cpu == NULL)
6174 		cpu = CPU;
6175 	cpi = cpu->cpu_m.mcpu_cpi;
6176 
6177 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6178 
6179 	if (pabits)
6180 		*pabits = cpi->cpi_pabits;
6181 	if (vabits)
6182 		*vabits = cpi->cpi_vabits;
6183 }
6184 
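/*
 * Return the size needed for an XSAVE area: the larger of the maximum save
 * area size reported by the boot CPU's cpuid data and our own
 * struct xsave_state.
 */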
6185 size_t
6186 cpuid_get_xsave_size(void)
6187 {
6188 	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
6189 	    sizeof (struct xsave_state)));
6190 }
6191 
6192 /*
6193  * Export information about known offsets to the kernel. We only care about
6194  * things we have actually enabled support for in %xcr0.
6195  */
6196 void
6197 cpuid_get_xsave_info(uint64_t bit, size_t *sizep, size_t *offp)
6198 {
6199 	size_t size, off;
6200 
6201 	VERIFY3U(bit & xsave_bv_all, !=, 0);
6202 
6203 	if (sizep == NULL)
6204 		sizep = &size;
6205 	if (offp == NULL)
6206 		offp = &off;
6207 
6208 	switch (bit) {
6209 	case XFEATURE_LEGACY_FP:
6210 	case XFEATURE_SSE:
6211 		*sizep = sizeof (struct fxsave_state);
6212 		*offp = 0;
6213 		break;
6214 	case XFEATURE_AVX:
6215 		*sizep = cpuid_info0.cpi_xsave.ymm_size;
6216 		*offp = cpuid_info0.cpi_xsave.ymm_offset;
6217 		break;
6218 	case XFEATURE_AVX512_OPMASK:
6219 		*sizep = cpuid_info0.cpi_xsave.opmask_size;
6220 		*offp = cpuid_info0.cpi_xsave.opmask_offset;
6221 		break;
6222 	case XFEATURE_AVX512_ZMM:
6223 		*sizep = cpuid_info0.cpi_xsave.zmmlo_size;
6224 		*offp = cpuid_info0.cpi_xsave.zmmlo_offset;
6225 		break;
6226 	case XFEATURE_AVX512_HI_ZMM:
6227 		*sizep = cpuid_info0.cpi_xsave.zmmhi_size;
6228 		*offp = cpuid_info0.cpi_xsave.zmmhi_offset;
6229 		break;
6230 	default:
6231 		panic("asked for unsupported xsave feature: 0x%lx", bit);
6232 	}
6233 }
6234 
6235 /*
6236  * Return true if the CPUs on this system require 'pointer clearing' for the
6237  * floating point error pointer exception handling. In the past, this has been
6238  * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
6239  * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
6240  * feature bit and is reflected in the cpi_fp_amd_save member.
6241  */
6242 boolean_t
6243 cpuid_need_fp_excp_handling(void)
6244 {
6245 	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
6246 	    cpuid_info0.cpi_fp_amd_save != 0);
6247 }
6248 
6249 /*
6250  * Returns the number of data TLB entries for a corresponding
6251  * pagesize.  If it can't be computed, or isn't known, the
6252  * routine returns zero.  If you ask about an architecturally
6253  * impossible pagesize, the routine will panic (so that the
6254  * hat implementor knows that things are inconsistent.)
6255  */
6256 uint_t
6257 cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
6258 {
6259 	struct cpuid_info *cpi;
6260 	uint_t dtlb_nent = 0;
6261 
6262 	if (cpu == NULL)
6263 		cpu = CPU;
6264 	cpi = cpu->cpu_m.mcpu_cpi;
6265 
6266 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
6267 
6268 	/*
6269 	 * Check the L2 TLB info
6270 	 */
6271 	if (cpi->cpi_xmaxeax >= 0x80000006) {
6272 		struct cpuid_regs *cp = &cpi->cpi_extd[6];
6273 
6274 		switch (pagesize) {
6275 
6276 		case 4 * 1024:
6277 			/*
6278 			 * All zero in the top 16 bits of the register
6279 			 * indicates a unified TLB. Size is in low 16 bits.
6280 			 */
6281 			if ((cp->cp_ebx & 0xffff0000) == 0)
6282 				dtlb_nent = cp->cp_ebx & 0x0000ffff;
6283 			else
6284 				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
6285 			break;
6286 
6287 		case 2 * 1024 * 1024:
6288 			if ((cp->cp_eax & 0xffff0000) == 0)
6289 				dtlb_nent = cp->cp_eax & 0x0000ffff;
6290 			else
6291 				dtlb_nent = BITX(cp->cp_eax, 27, 16);
6292 			break;
6293 
6294 		default:
6295 			panic("unknown L2 pagesize");
6296 			/*NOTREACHED*/
6297 		}
6298 	}
6299 
6300 	if (dtlb_nent != 0)
6301 		return (dtlb_nent);
6302 
6303 	/*
6304 	 * No L2 TLB support for this size, try L1.
6305 	 */
6306 	if (cpi->cpi_xmaxeax >= 0x80000005) {
6307 		struct cpuid_regs *cp = &cpi->cpi_extd[5];
6308 
6309 		switch (pagesize) {
6310 		case 4 * 1024:
6311 			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
6312 			break;
6313 		case 2 * 1024 * 1024:
6314 			dtlb_nent = BITX(cp->cp_eax, 23, 16);
6315 			break;
6316 		default:
6317 			panic("unknown L1 d-TLB pagesize");
6318 			/*NOTREACHED*/
6319 		}
6320 	}
6321 
6322 	return (dtlb_nent);
6323 }
6324 
6325 /*
6326  * Return 0 if the erratum is not present or not applicable, positive
6327  * if it is, and negative if the status of the erratum is unknown.
6328  *
6329  * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
6330  * Processors" #25759, Rev 3.57, August 2005
6331  */
6332 int
6333 cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
6334 {
6335 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6336 	uint_t eax;
6337 
6338 	/*
6339 	 * Bail out if this CPU isn't an AMD CPU, or if it's
6340 	 * a legacy (32-bit) AMD CPU.
6341 	 */
6342 	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
6343 	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
6344 	    cpi->cpi_family == 6) {
6345 		return (0);
6346 	}
6347 
6348 	eax = cpi->cpi_std[1].cp_eax;
6349 
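/*
 * The macros below match the processor signature from leaf 1 %eax
 * (family/model/stepping) against the specific silicon revisions named in the
 * revision guide cited above.
 */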
6350 #define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
6351 #define	SH_B3(eax)	(eax == 0xf51)
6352 #define	B(eax)		(SH_B0(eax) || SH_B3(eax))
6353 
6354 #define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
6355 
6356 #define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
6357 #define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
6358 #define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
6359 #define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
6360 
6361 #define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
6362 #define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
6363 #define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
6364 #define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
6365 
6366 #define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
6367 #define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
6368 #define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
6369 #define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
6370 #define	BH_E4(eax)	(eax == 0x20fb1)
6371 #define	SH_E5(eax)	(eax == 0x20f42)
6372 #define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
6373 #define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
6374 #define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
6375 			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
6376 			    DH_E6(eax) || JH_E6(eax))
6377 
6378 #define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
6379 #define	DR_B0(eax)	(eax == 0x100f20)
6380 #define	DR_B1(eax)	(eax == 0x100f21)
6381 #define	DR_BA(eax)	(eax == 0x100f2a)
6382 #define	DR_B2(eax)	(eax == 0x100f22)
6383 #define	DR_B3(eax)	(eax == 0x100f23)
6384 #define	RB_C0(eax)	(eax == 0x100f40)
6385 
6386 	switch (erratum) {
6387 	case 1:
6388 		return (cpi->cpi_family < 0x10);
6389 	case 51:	/* what does the asterisk mean? */
6390 		return (B(eax) || SH_C0(eax) || CG(eax));
6391 	case 52:
6392 		return (B(eax));
6393 	case 57:
6394 		return (cpi->cpi_family <= 0x11);
6395 	case 58:
6396 		return (B(eax));
6397 	case 60:
6398 		return (cpi->cpi_family <= 0x11);
6399 	case 61:
6400 	case 62:
6401 	case 63:
6402 	case 64:
6403 	case 65:
6404 	case 66:
6405 	case 68:
6406 	case 69:
6407 	case 70:
6408 	case 71:
6409 		return (B(eax));
6410 	case 72:
6411 		return (SH_B0(eax));
6412 	case 74:
6413 		return (B(eax));
6414 	case 75:
6415 		return (cpi->cpi_family < 0x10);
6416 	case 76:
6417 		return (B(eax));
6418 	case 77:
6419 		return (cpi->cpi_family <= 0x11);
6420 	case 78:
6421 		return (B(eax) || SH_C0(eax));
6422 	case 79:
6423 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6424 	case 80:
6425 	case 81:
6426 	case 82:
6427 		return (B(eax));
6428 	case 83:
6429 		return (B(eax) || SH_C0(eax) || CG(eax));
6430 	case 85:
6431 		return (cpi->cpi_family < 0x10);
6432 	case 86:
6433 		return (SH_C0(eax) || CG(eax));
6434 	case 88:
6435 		return (B(eax) || SH_C0(eax));
6436 	case 89:
6437 		return (cpi->cpi_family < 0x10);
6438 	case 90:
6439 		return (B(eax) || SH_C0(eax) || CG(eax));
6440 	case 91:
6441 	case 92:
6442 		return (B(eax) || SH_C0(eax));
6443 	case 93:
6444 		return (SH_C0(eax));
6445 	case 94:
6446 		return (B(eax) || SH_C0(eax) || CG(eax));
6447 	case 95:
6448 		return (B(eax) || SH_C0(eax));
6449 	case 96:
6450 		return (B(eax) || SH_C0(eax) || CG(eax));
6451 	case 97:
6452 	case 98:
6453 		return (SH_C0(eax) || CG(eax));
6454 	case 99:
6455 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6456 	case 100:
6457 		return (B(eax) || SH_C0(eax));
6458 	case 101:
6459 	case 103:
6460 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6461 	case 104:
6462 		return (SH_C0(eax) || CG(eax) || D0(eax));
6463 	case 105:
6464 	case 106:
6465 	case 107:
6466 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6467 	case 108:
6468 		return (DH_CG(eax));
6469 	case 109:
6470 		return (SH_C0(eax) || CG(eax) || D0(eax));
6471 	case 110:
6472 		return (D0(eax) || EX(eax));
6473 	case 111:
6474 		return (CG(eax));
6475 	case 112:
6476 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6477 	case 113:
6478 		return (eax == 0x20fc0);
6479 	case 114:
6480 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6481 	case 115:
6482 		return (SH_E0(eax) || JH_E1(eax));
6483 	case 116:
6484 		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6485 	case 117:
6486 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6487 	case 118:
6488 		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6489 		    JH_E6(eax));
6490 	case 121:
6491 		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6492 	case 122:
6493 		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6494 	case 123:
6495 		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6496 	case 131:
6497 		return (cpi->cpi_family < 0x10);
6498 	case 6336786:
6499 
6500 		/*
6501 		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6502 		 * if this is a K8 family or newer processor. We're testing for
6503 		 * this 'erratum' to determine whether or not we have a constant
6504 		 * TSC.
6505 		 *
6506 		 * Our current fix for this is to disable the C1-Clock ramping.
6507 		 * However, this doesn't work on newer processor families nor
6508 		 * does it work when virtualized as those devices don't exist.
6509 		 */
6510 		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6511 			return (0);
6512 		}
6513 
6514 		if (CPI_FAMILY(cpi) == 0xf) {
6515 			struct cpuid_regs regs;
6516 			regs.cp_eax = 0x80000007;
6517 			(void) __cpuid_insn(&regs);
6518 			return (!(regs.cp_edx & 0x100));
6519 		}
6520 		return (0);
6521 	case 147:
6522 		/*
6523 		 * This erratum (K8 #147) is not present on family 10 and newer.
6524 		 */
6525 		if (cpi->cpi_family >= 0x10) {
6526 			return (0);
6527 		}
6528 		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6529 		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6530 
6531 	case 6671130:
6532 		/*
6533 		 * check for processors (pre-Shanghai) that do not provide
6534 		 * optimal management of 1gb ptes in its tlb.
6535 		 */
6536 		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6537 
6538 	case 298:
6539 		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6540 		    DR_B2(eax) || RB_C0(eax));
6541 
6542 	case 721:
6543 		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6544 
6545 	default:
6546 		return (-1);
6547 
6548 	}
6549 }
6550 
6551 /*
6552  * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6553  * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6554  */
6555 int
6556 osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6557 {
6558 	struct cpuid_info	*cpi;
6559 	uint_t			osvwid;
6560 	static int		osvwfeature = -1;
6561 	uint64_t		osvwlength;
6562 
6563 
6564 	cpi = cpu->cpu_m.mcpu_cpi;
6565 
6566 	/* confirm OSVW supported */
6567 	if (osvwfeature == -1) {
6568 		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6569 	} else {
6570 		/* assert that osvw feature setting is consistent on all cpus */
6571 		ASSERT(osvwfeature ==
6572 		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6573 	}
6574 	if (!osvwfeature)
6575 		return (-1);
6576 
6577 	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6578 
6579 	switch (erratum) {
6580 	case 298:	/* osvwid is 0 */
6581 		osvwid = 0;
6582 		if (osvwlength <= (uint64_t)osvwid) {
6583 			/* osvwid 0 is unknown */
6584 			return (-1);
6585 		}
6586 
6587 		/*
6588 		 * Check the OSVW STATUS MSR to determine the state
6589 		 * of the erratum where:
6590 		 *   0 - fixed by HW
6591 		 *   1 - BIOS has applied the workaround when BIOS
6592 		 *   workaround is available. (Or for other errata,
6593 		 *   OS workaround is required.)
6594 		 * For a value of 1, caller will confirm that the
6595 		 * erratum 298 workaround has indeed been applied by BIOS.
6596 		 *
6597 		 * A 1 may be set in cpus that have a HW fix
6598 		 * in a mixed cpu system. Regarding erratum 298:
6599 		 *   In a multiprocessor platform, the workaround above
6600 		 *   should be applied to all processors regardless of
6601 		 *   silicon revision when an affected processor is
6602 		 *   present.
6603 		 */
6604 
6605 		return (rdmsr(MSR_AMD_OSVW_STATUS +
6606 		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6607 		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6608 
6609 	default:
6610 		return (-1);
6611 	}
6612 }
6613 
6614 static const char assoc_str[] = "associativity";
6615 static const char line_str[] = "line-size";
6616 static const char size_str[] = "size";
6617 
6618 static void
6619 add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6620     uint32_t val)
6621 {
6622 	char buf[128];
6623 
6624 	/*
6625 	 * ndi_prop_update_int() is used because it is desirable for
6626 	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
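	 *
	 * The property name is formed as "<label>-<type>", e.g. "l2-cache-size"
	 * or "itlb-4K-associativity".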
6627 	 */
6628 	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6629 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6630 }
6631 
6632 /*
6633  * Intel-style cache/tlb description
6634  *
6635  * Standard cpuid level 2 gives a randomly ordered
6636  * selection of tags that index into a table that describes
6637  * cache and tlb properties.
6638  */
6639 
6640 static const char l1_icache_str[] = "l1-icache";
6641 static const char l1_dcache_str[] = "l1-dcache";
6642 static const char l2_cache_str[] = "l2-cache";
6643 static const char l3_cache_str[] = "l3-cache";
6644 static const char itlb4k_str[] = "itlb-4K";
6645 static const char dtlb4k_str[] = "dtlb-4K";
6646 static const char itlb2M_str[] = "itlb-2M";
6647 static const char itlb4M_str[] = "itlb-4M";
6648 static const char dtlb4M_str[] = "dtlb-4M";
6649 static const char dtlb24_str[] = "dtlb0-2M-4M";
6650 static const char itlb424_str[] = "itlb-4K-2M-4M";
6651 static const char itlb24_str[] = "itlb-2M-4M";
6652 static const char dtlb44_str[] = "dtlb-4K-4M";
6653 static const char sl1_dcache_str[] = "sectored-l1-dcache";
6654 static const char sl2_cache_str[] = "sectored-l2-cache";
6655 static const char itrace_str[] = "itrace-cache";
6656 static const char sl3_cache_str[] = "sectored-l3-cache";
6657 static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6658 
6659 static const struct cachetab {
6660 	uint8_t		ct_code;
6661 	uint8_t		ct_assoc;
6662 	uint16_t	ct_line_size;
6663 	size_t		ct_size;
6664 	const char	*ct_label;
6665 } intel_ctab[] = {
6666 	/*
6667 	 * maintain descending order!
6668 	 *
6669 	 * Codes ignored - Reason
6670 	 * ----------------------
6671 	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6672 	 * f0H/f1H - Currently we do not interpret prefetch size by design
6673 	 */
6674 	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6675 	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6676 	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6677 	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6678 	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6679 	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6680 	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6681 	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6682 	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6683 	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6684 	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6685 	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6686 	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6687 	{ 0xc0, 4, 0, 8, dtlb44_str },
6688 	{ 0xba, 4, 0, 64, dtlb4k_str },
6689 	{ 0xb4, 4, 0, 256, dtlb4k_str },
6690 	{ 0xb3, 4, 0, 128, dtlb4k_str },
6691 	{ 0xb2, 4, 0, 64, itlb4k_str },
6692 	{ 0xb0, 4, 0, 128, itlb4k_str },
6693 	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6694 	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6695 	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6696 	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6697 	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6698 	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6699 	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6700 	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6701 	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6702 	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6703 	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6704 	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6705 	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6706 	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6707 	{ 0x73, 8, 0, 64*1024, itrace_str},
6708 	{ 0x72, 8, 0, 32*1024, itrace_str},
6709 	{ 0x71, 8, 0, 16*1024, itrace_str},
6710 	{ 0x70, 8, 0, 12*1024, itrace_str},
6711 	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6712 	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6713 	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6714 	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6715 	{ 0x5d, 0, 0, 256, dtlb44_str},
6716 	{ 0x5c, 0, 0, 128, dtlb44_str},
6717 	{ 0x5b, 0, 0, 64, dtlb44_str},
6718 	{ 0x5a, 4, 0, 32, dtlb24_str},
6719 	{ 0x59, 0, 0, 16, dtlb4k_str},
6720 	{ 0x57, 4, 0, 16, dtlb4k_str},
6721 	{ 0x56, 4, 0, 16, dtlb4M_str},
6722 	{ 0x55, 0, 0, 7, itlb24_str},
6723 	{ 0x52, 0, 0, 256, itlb424_str},
6724 	{ 0x51, 0, 0, 128, itlb424_str},
6725 	{ 0x50, 0, 0, 64, itlb424_str},
6726 	{ 0x4f, 0, 0, 32, itlb4k_str},
6727 	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6728 	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6729 	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6730 	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6731 	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6732 	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6733 	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6734 	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6735 	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6736 	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6737 	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
6738 	{ 0x43, 4, 32, 512*1024, l2_cache_str},
6739 	{ 0x42, 4, 32, 256*1024, l2_cache_str},
6740 	{ 0x41, 4, 32, 128*1024, l2_cache_str},
6741 	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
6742 	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
6743 	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
6744 	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
6745 	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
6746 	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
6747 	{ 0x30, 8, 64, 32*1024, l1_icache_str},
6748 	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
6749 	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
6750 	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
6751 	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
6752 	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
6753 	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
6754 	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
6755 	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
6756 	{ 0x0b, 4, 0, 4, itlb4M_str},
6757 	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
6758 	{ 0x08, 4, 32, 16*1024, l1_icache_str},
6759 	{ 0x06, 4, 32, 8*1024, l1_icache_str},
6760 	{ 0x05, 4, 0, 32, dtlb4M_str},
6761 	{ 0x04, 4, 0, 8, dtlb4M_str},
6762 	{ 0x03, 4, 0, 64, dtlb4k_str},
6763 	{ 0x02, 4, 0, 2, itlb4M_str},
6764 	{ 0x01, 4, 0, 32, itlb4k_str},
6765 	{ 0 }
6766 };
6767 
6768 static const struct cachetab cyrix_ctab[] = {
6769 	{ 0x70, 4, 0, 32, "tlb-4K" },
6770 	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6771 	{ 0 }
6772 };
6773 
6774 /*
6775  * Search a cache table for a matching entry
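 * The table must be sorted in descending ct_code order (as intel_ctab is);
 * the scan stops at the first entry whose code is <= the one requested.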
6776  */
6777 static const struct cachetab *
6778 find_cacheent(const struct cachetab *ct, uint_t code)
6779 {
6780 	if (code != 0) {
6781 		for (; ct->ct_code != 0; ct++)
6782 			if (ct->ct_code <= code)
6783 				break;
6784 		if (ct->ct_code == code)
6785 			return (ct);
6786 	}
6787 	return (NULL);
6788 }
6789 
6790 /*
6791  * Populate cachetab entry with L2 or L3 cache-information using
6792  * cpuid function 4. This function is called from intel_walk_cacheinfo()
6793  * when descriptor 0x49 is encountered. It returns 0 if no such cache
6794  * information is found.
6795  */
6796 static int
6797 intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6798 {
6799 	uint32_t level, i;
6800 	int ret = 0;
6801 
6802 	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6803 		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6804 
6805 		if (level == 2 || level == 3) {
6806 			ct->ct_assoc =
6807 			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6808 			ct->ct_line_size =
6809 			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6810 			ct->ct_size = ct->ct_assoc *
6811 			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6812 			    ct->ct_line_size *
6813 			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6814 
6815 			if (level == 2) {
6816 				ct->ct_label = l2_cache_str;
6817 			} else if (level == 3) {
6818 				ct->ct_label = l3_cache_str;
6819 			}
6820 			ret = 1;
6821 		}
6822 	}
6823 
6824 	return (ret);
6825 }
6826 
6827 /*
6828  * Walk the cacheinfo descriptor, applying 'func' to every valid element
6829  * The walk is terminated if the walker returns non-zero.
6830  */
6831 static void
6832 intel_walk_cacheinfo(struct cpuid_info *cpi,
6833     void *arg, int (*func)(void *, const struct cachetab *))
6834 {
6835 	const struct cachetab *ct;
6836 	struct cachetab des_49_ct, des_b1_ct;
6837 	uint8_t *dp;
6838 	int i;
6839 
6840 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6841 		return;
6842 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6843 		/*
6844 		 * For overloaded descriptor 0x49 we use cpuid function 4
6845 		 * if supported by the current processor, to create
6846 		 * cache information.
6847 		 * For overloaded descriptor 0xb1 we use X86_PAE flag
6848 		 * to disambiguate the cache information.
6849 		 */
6850 		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6851 		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6852 				ct = &des_49_ct;
6853 		} else if (*dp == 0xb1) {
6854 			des_b1_ct.ct_code = 0xb1;
6855 			des_b1_ct.ct_assoc = 4;
6856 			des_b1_ct.ct_line_size = 0;
6857 			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6858 				des_b1_ct.ct_size = 8;
6859 				des_b1_ct.ct_label = itlb2M_str;
6860 			} else {
6861 				des_b1_ct.ct_size = 4;
6862 				des_b1_ct.ct_label = itlb4M_str;
6863 			}
6864 			ct = &des_b1_ct;
6865 		} else {
6866 			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6867 				continue;
6868 			}
6869 		}
6870 
6871 		if (func(arg, ct) != 0) {
6872 			break;
6873 		}
6874 	}
6875 }
6876 
6877 /*
6878  * (Like the Intel one, except for Cyrix CPUs)
6879  */
6880 static void
6881 cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6882     void *arg, int (*func)(void *, const struct cachetab *))
6883 {
6884 	const struct cachetab *ct;
6885 	uint8_t *dp;
6886 	int i;
6887 
6888 	if ((dp = cpi->cpi_cacheinfo) == NULL)
6889 		return;
6890 	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6891 		/*
6892 		 * Search Cyrix-specific descriptor table first ..
6893 		 */
6894 		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6895 			if (func(arg, ct) != 0)
6896 				break;
6897 			continue;
6898 		}
6899 		/*
6900 		 * .. else fall back to the Intel one
6901 		 */
6902 		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6903 			if (func(arg, ct) != 0)
6904 				break;
6905 			continue;
6906 		}
6907 	}
6908 }
6909 
6910 /*
6911  * A cacheinfo walker that adds associativity, line-size, and size properties
6912  * to the devinfo node it is passed as an argument.
6913  */
6914 static int
6915 add_cacheent_props(void *arg, const struct cachetab *ct)
6916 {
6917 	dev_info_t *devi = arg;
6918 
6919 	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6920 	if (ct->ct_line_size != 0)
6921 		add_cache_prop(devi, ct->ct_label, line_str,
6922 		    ct->ct_line_size);
6923 	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6924 	return (0);
6925 }
6926 
6927 
6928 static const char fully_assoc[] = "fully-associative?";
6929 
6930 /*
6931  * AMD style cache/tlb description
6932  *
6933  * Extended functions 5 and 6 directly describe properties of
6934  * tlbs and various cache levels.
6935  */
6936 static void
6937 add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6938 {
6939 	switch (assoc) {
6940 	case 0:	/* reserved; ignore */
6941 		break;
6942 	default:
6943 		add_cache_prop(devi, label, assoc_str, assoc);
6944 		break;
6945 	case 0xff:
6946 		add_cache_prop(devi, label, fully_assoc, 1);
6947 		break;
6948 	}
6949 }
6950 
6951 static void
6952 add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6953 {
6954 	if (size == 0)
6955 		return;
6956 	add_cache_prop(devi, label, size_str, size);
6957 	add_amd_assoc(devi, label, assoc);
6958 }
6959 
6960 static void
6961 add_amd_cache(dev_info_t *devi, const char *label,
6962     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6963 {
6964 	if (size == 0 || line_size == 0)
6965 		return;
6966 	add_amd_assoc(devi, label, assoc);
6967 	/*
6968 	 * Most AMD parts have a sectored cache. Multiple cache lines are
6969 	 * associated with each tag. A sector consists of all cache lines
6970 	 * associated with a tag. For example, the AMD K6-III has a sector
6971 	 * size of 2 cache lines per tag.
6972 	 */
6973 	if (lines_per_tag != 0)
6974 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6975 	add_cache_prop(devi, label, line_str, line_size);
6976 	add_cache_prop(devi, label, size_str, size * 1024);
6977 }
6978 
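/*
 * The L2/L3 associativity fields are encoded rather than literal (see the
 * amd_afd[] discussion further below); translate the encodings handled here
 * into devinfo properties.
 */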
6979 static void
6980 add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6981 {
6982 	switch (assoc) {
6983 	case 0:	/* off */
6984 		break;
6985 	case 1:
6986 	case 2:
6987 	case 4:
6988 		add_cache_prop(devi, label, assoc_str, assoc);
6989 		break;
6990 	case 6:
6991 		add_cache_prop(devi, label, assoc_str, 8);
6992 		break;
6993 	case 8:
6994 		add_cache_prop(devi, label, assoc_str, 16);
6995 		break;
6996 	case 0xf:
6997 		add_cache_prop(devi, label, fully_assoc, 1);
6998 		break;
6999 	default: /* reserved; ignore */
7000 		break;
7001 	}
7002 }
7003 
7004 static void
7005 add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
7006 {
7007 	if (size == 0 || assoc == 0)
7008 		return;
7009 	add_amd_l2_assoc(devi, label, assoc);
7010 	add_cache_prop(devi, label, size_str, size);
7011 }
7012 
7013 static void
7014 add_amd_l2_cache(dev_info_t *devi, const char *label,
7015     uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
7016 {
7017 	if (size == 0 || assoc == 0 || line_size == 0)
7018 		return;
7019 	add_amd_l2_assoc(devi, label, assoc);
7020 	if (lines_per_tag != 0)
7021 		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
7022 	add_cache_prop(devi, label, line_str, line_size);
7023 	add_cache_prop(devi, label, size_str, size * 1024);
7024 }
7025 
7026 static void
7027 amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
7028 {
7029 	struct cpuid_regs *cp;
7030 
7031 	if (cpi->cpi_xmaxeax < 0x80000005)
7032 		return;
7033 	cp = &cpi->cpi_extd[5];
7034 
7035 	/*
7036 	 * 4M/2M L1 TLB configuration
7037 	 *
7038 	 * We report the size for 2M pages because AMD uses two
7039 	 * TLB entries for one 4M page.
7040 	 */
7041 	add_amd_tlb(devi, "dtlb-2M",
7042 	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
7043 	add_amd_tlb(devi, "itlb-2M",
7044 	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
7045 
7046 	/*
7047 	 * 4K L1 TLB configuration
7048 	 */
7049 
7050 	switch (cpi->cpi_vendor) {
7051 		uint_t nentries;
7052 	case X86_VENDOR_TM:
7053 		if (cpi->cpi_family >= 5) {
7054 			/*
7055 			 * Crusoe processors have 256 TLB entries, but
7056 			 * cpuid data format constrains them to only
7057 			 * reporting 255 of them.
7058 			 */
7059 			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
7060 				nentries = 256;
7061 			/*
7062 			 * Crusoe processors also have a unified TLB
7063 			 */
7064 			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
7065 			    nentries);
7066 			break;
7067 		}
7068 		/*FALLTHROUGH*/
7069 	default:
7070 		add_amd_tlb(devi, itlb4k_str,
7071 		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
7072 		add_amd_tlb(devi, dtlb4k_str,
7073 		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
7074 		break;
7075 	}
7076 
7077 	/*
7078 	 * data L1 cache configuration
7079 	 */
7080 
7081 	add_amd_cache(devi, l1_dcache_str,
7082 	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
7083 	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
7084 
7085 	/*
7086 	 * code L1 cache configuration
7087 	 */
7088 
7089 	add_amd_cache(devi, l1_icache_str,
7090 	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
7091 	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
7092 
7093 	if (cpi->cpi_xmaxeax < 0x80000006)
7094 		return;
7095 	cp = &cpi->cpi_extd[6];
7096 
7097 	/* Check for a unified L2 TLB for large pages */
7098 
7099 	if (BITX(cp->cp_eax, 31, 16) == 0)
7100 		add_amd_l2_tlb(devi, "l2-tlb-2M",
7101 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7102 	else {
7103 		add_amd_l2_tlb(devi, "l2-dtlb-2M",
7104 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7105 		add_amd_l2_tlb(devi, "l2-itlb-2M",
7106 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7107 	}
7108 
7109 	/* Check for a unified L2 TLB for 4K pages */
7110 
7111 	if (BITX(cp->cp_ebx, 31, 16) == 0) {
7112 		add_amd_l2_tlb(devi, "l2-tlb-4K",
7113 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7114 	} else {
7115 		add_amd_l2_tlb(devi, "l2-dtlb-4K",
7116 		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
7117 		add_amd_l2_tlb(devi, "l2-itlb-4K",
7118 		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
7119 	}
7120 
7121 	add_amd_l2_cache(devi, l2_cache_str,
7122 	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
7123 	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
7124 }
7125 
7126 /*
7127  * There are two basic ways that the x86 world describes its cache
7128  * and tlb architecture - Intel's way and AMD's way.
7129  *
7130  * Return which flavor of cache architecture we should use
7131  */
7132 static int
7133 x86_which_cacheinfo(struct cpuid_info *cpi)
7134 {
7135 	switch (cpi->cpi_vendor) {
7136 	case X86_VENDOR_Intel:
7137 		if (cpi->cpi_maxeax >= 2)
7138 			return (X86_VENDOR_Intel);
7139 		break;
7140 	case X86_VENDOR_AMD:
7141 		/*
7142 		 * The K5 model 1 was the first part from AMD that reported
7143 		 * cache sizes via extended cpuid functions.
7144 		 */
7145 		if (cpi->cpi_family > 5 ||
7146 		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
7147 			return (X86_VENDOR_AMD);
7148 		break;
7149 	case X86_VENDOR_HYGON:
7150 		return (X86_VENDOR_AMD);
7151 	case X86_VENDOR_TM:
7152 		if (cpi->cpi_family >= 5)
7153 			return (X86_VENDOR_AMD);
7154 		/*FALLTHROUGH*/
7155 	default:
7156 		/*
7157 		 * If they have extended CPU data for 0x80000005
7158 		 * then we assume they have AMD-format cache
7159 		 * information.
7160 		 *
7161 		 * If not, and the vendor happens to be Cyrix,
7162 		 * then try our Cyrix-specific handler.
7163 		 *
7164 		 * If we're not Cyrix, then assume we're using Intel's
7165 		 * table-driven format instead.
7166 		 */
7167 		if (cpi->cpi_xmaxeax >= 0x80000005)
7168 			return (X86_VENDOR_AMD);
7169 		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
7170 			return (X86_VENDOR_Cyrix);
7171 		else if (cpi->cpi_maxeax >= 2)
7172 			return (X86_VENDOR_Intel);
7173 		break;
7174 	}
7175 	return (-1);
7176 }
7177 
7178 void
7179 cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
7180     struct cpuid_info *cpi)
7181 {
7182 	dev_info_t *cpu_devi;
7183 	int create;
7184 
7185 	cpu_devi = (dev_info_t *)dip;
7186 
7187 	/* device_type */
7188 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7189 	    "device_type", "cpu");
7190 
7191 	/* reg */
7192 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7193 	    "reg", cpu_id);
7194 
7195 	/* cpu-mhz, and clock-frequency */
7196 	if (cpu_freq > 0) {
7197 		long long mul;
7198 
7199 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7200 		    "cpu-mhz", cpu_freq);
7201 		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
7202 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7203 			    "clock-frequency", (int)mul);
7204 	}
7205 
7206 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7207 
7208 	/* vendor-id */
7209 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7210 	    "vendor-id", cpi->cpi_vendorstr);
7211 
7212 	if (cpi->cpi_maxeax == 0) {
7213 		return;
7214 	}
7215 
7216 	/*
7217 	 * family, model, and step
7218 	 */
7219 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7220 	    "family", CPI_FAMILY(cpi));
7221 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7222 	    "cpu-model", CPI_MODEL(cpi));
7223 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7224 	    "stepping-id", CPI_STEP(cpi));
7225 
7226 	/* type */
7227 	switch (cpi->cpi_vendor) {
7228 	case X86_VENDOR_Intel:
7229 		create = 1;
7230 		break;
7231 	default:
7232 		create = 0;
7233 		break;
7234 	}
7235 	if (create)
7236 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7237 		    "type", CPI_TYPE(cpi));
7238 
7239 	/* ext-family */
7240 	switch (cpi->cpi_vendor) {
7241 	case X86_VENDOR_Intel:
7242 	case X86_VENDOR_AMD:
7243 		create = cpi->cpi_family >= 0xf;
7244 		break;
7245 	case X86_VENDOR_HYGON:
7246 		create = 1;
7247 		break;
7248 	default:
7249 		create = 0;
7250 		break;
7251 	}
7252 	if (create)
7253 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7254 		    "ext-family", CPI_FAMILY_XTD(cpi));
7255 
7256 	/* ext-model */
7257 	switch (cpi->cpi_vendor) {
7258 	case X86_VENDOR_Intel:
7259 		create = IS_EXTENDED_MODEL_INTEL(cpi);
7260 		break;
7261 	case X86_VENDOR_AMD:
7262 		create = CPI_FAMILY(cpi) == 0xf;
7263 		break;
7264 	case X86_VENDOR_HYGON:
7265 		create = 1;
7266 		break;
7267 	default:
7268 		create = 0;
7269 		break;
7270 	}
7271 	if (create)
7272 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7273 		    "ext-model", CPI_MODEL_XTD(cpi));
7274 
7275 	/* generation */
7276 	switch (cpi->cpi_vendor) {
7277 	case X86_VENDOR_AMD:
7278 	case X86_VENDOR_HYGON:
7279 		/*
7280 		 * AMD K5 model 1 was the first part to support this
7281 		 */
7282 		create = cpi->cpi_xmaxeax >= 0x80000001;
7283 		break;
7284 	default:
7285 		create = 0;
7286 		break;
7287 	}
7288 	if (create)
7289 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7290 		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
7291 
7292 	/* brand-id */
7293 	switch (cpi->cpi_vendor) {
7294 	case X86_VENDOR_Intel:
7295 		/*
7296 		 * brand id first appeared on Pentium III Xeon model 8,
7297 		 * and Celeron model 8 processors and Opteron
7298 		 */
7299 		create = cpi->cpi_family > 6 ||
7300 		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
7301 		break;
7302 	case X86_VENDOR_AMD:
7303 		create = cpi->cpi_family >= 0xf;
7304 		break;
7305 	case X86_VENDOR_HYGON:
7306 		create = 1;
7307 		break;
7308 	default:
7309 		create = 0;
7310 		break;
7311 	}
7312 	if (create && cpi->cpi_brandid != 0) {
7313 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7314 		    "brand-id", cpi->cpi_brandid);
7315 	}
7316 
7317 	/* chunks, and apic-id */
7318 	switch (cpi->cpi_vendor) {
7319 		/*
7320 		 * first available on Pentium IV and Opteron (K8)
7321 		 */
7322 	case X86_VENDOR_Intel:
7323 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7324 		break;
7325 	case X86_VENDOR_AMD:
7326 		create = cpi->cpi_family >= 0xf;
7327 		break;
7328 	case X86_VENDOR_HYGON:
7329 		create = 1;
7330 		break;
7331 	default:
7332 		create = 0;
7333 		break;
7334 	}
7335 	if (create) {
7336 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7337 		    "chunks", CPI_CHUNKS(cpi));
7338 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7339 		    "apic-id", cpi->cpi_apicid);
7340 		if (cpi->cpi_chipid >= 0) {
7341 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7342 			    "chip#", cpi->cpi_chipid);
7343 			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7344 			    "clog#", cpi->cpi_clogid);
7345 		}
7346 	}
7347 
7348 	/* cpuid-features */
7349 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7350 	    "cpuid-features", CPI_FEATURES_EDX(cpi));
7351 
7352 
7353 	/* cpuid-features-ecx */
7354 	switch (cpi->cpi_vendor) {
7355 	case X86_VENDOR_Intel:
7356 		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
7357 		break;
7358 	case X86_VENDOR_AMD:
7359 		create = cpi->cpi_family >= 0xf;
7360 		break;
7361 	case X86_VENDOR_HYGON:
7362 		create = 1;
7363 		break;
7364 	default:
7365 		create = 0;
7366 		break;
7367 	}
7368 	if (create)
7369 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7370 		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
7371 
7372 	/* ext-cpuid-features */
7373 	switch (cpi->cpi_vendor) {
7374 	case X86_VENDOR_Intel:
7375 	case X86_VENDOR_AMD:
7376 	case X86_VENDOR_HYGON:
7377 	case X86_VENDOR_Cyrix:
7378 	case X86_VENDOR_TM:
7379 	case X86_VENDOR_Centaur:
7380 		create = cpi->cpi_xmaxeax >= 0x80000001;
7381 		break;
7382 	default:
7383 		create = 0;
7384 		break;
7385 	}
7386 	if (create) {
7387 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7388 		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
7389 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
7390 		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
7391 	}
7392 
7393 	/*
7394 	 * Brand String first appeared in Intel Pentium IV, AMD K5
7395 	 * model 1, and Cyrix GXm.  On earlier models we try to
7396 	 * simulate something similar, so this string should always
7397 	 * say -something- about the processor, however lame.
7398 	 */
7399 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
7400 	    "brand-string", cpi->cpi_brandstr);
7401 
7402 	/*
7403 	 * Finally, cache and tlb information
7404 	 */
7405 	switch (x86_which_cacheinfo(cpi)) {
7406 	case X86_VENDOR_Intel:
7407 		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7408 		break;
7409 	case X86_VENDOR_Cyrix:
7410 		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7411 		break;
7412 	case X86_VENDOR_AMD:
7413 		amd_cache_info(cpi, cpu_devi);
7414 		break;
7415 	default:
7416 		break;
7417 	}
7418 }
7419 
7420 struct l2info {
7421 	int *l2i_csz;
7422 	int *l2i_lsz;
7423 	int *l2i_assoc;
7424 	int l2i_ret;
7425 };
7426 
7427 /*
7428  * A cacheinfo walker that fetches the size, line-size and associativity
7429  * of the L2 cache
7430  */
7431 static int
7432 intel_l2cinfo(void *arg, const struct cachetab *ct)
7433 {
7434 	struct l2info *l2i = arg;
7435 	int *ip;
7436 
7437 	if (ct->ct_label != l2_cache_str &&
7438 	    ct->ct_label != sl2_cache_str)
7439 		return (0);	/* not an L2 -- keep walking */
7440 
7441 	if ((ip = l2i->l2i_csz) != NULL)
7442 		*ip = ct->ct_size;
7443 	if ((ip = l2i->l2i_lsz) != NULL)
7444 		*ip = ct->ct_line_size;
7445 	if ((ip = l2i->l2i_assoc) != NULL)
7446 		*ip = ct->ct_assoc;
7447 	l2i->l2i_ret = ct->ct_size;
7448 	return (1);		/* was an L2 -- terminate walk */
7449 }
7450 
7451 /*
7452  * AMD L2/L3 Cache and TLB Associativity Field Definition:
7453  *
7454  *	Unlike the associativity for the L1 cache and tlb where the 8 bit
7455  *	value is the associativity, the associativity for the L2 cache and
7456  *	tlb is encoded in the following table. The 4 bit L2 value serves as
7457  *	an index into the amd_afd[] array to determine the associativity.
7458  *	-1 is undefined. 0 is fully associative.
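 *	For example, an encoded value of 6 indexes amd_afd[6] == 8, i.e. an
 *	8-way set-associative cache or tlb.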
7459  */
7460 
7461 static int amd_afd[] =
7462 	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
7463 
7464 static void
7465 amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7466 {
7467 	struct cpuid_regs *cp;
7468 	uint_t size, assoc;
7469 	int i;
7470 	int *ip;
7471 
7472 	if (cpi->cpi_xmaxeax < 0x80000006)
7473 		return;
7474 	cp = &cpi->cpi_extd[6];
7475 
7476 	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7477 	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7478 		uint_t cachesz = size * 1024;
7479 		assoc = amd_afd[i];
7480 
7481 		ASSERT(assoc != -1);
7482 
7483 		if ((ip = l2i->l2i_csz) != NULL)
7484 			*ip = cachesz;
7485 		if ((ip = l2i->l2i_lsz) != NULL)
7486 			*ip = BITX(cp->cp_ecx, 7, 0);
7487 		if ((ip = l2i->l2i_assoc) != NULL)
7488 			*ip = assoc;
7489 		l2i->l2i_ret = cachesz;
7490 	}
7491 }
7492 
7493 int
7494 getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7495 {
7496 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7497 	struct l2info __l2info, *l2i = &__l2info;
7498 
7499 	l2i->l2i_csz = csz;
7500 	l2i->l2i_lsz = lsz;
7501 	l2i->l2i_assoc = assoc;
7502 	l2i->l2i_ret = -1;
7503 
7504 	switch (x86_which_cacheinfo(cpi)) {
7505 	case X86_VENDOR_Intel:
7506 		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7507 		break;
7508 	case X86_VENDOR_Cyrix:
7509 		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7510 		break;
7511 	case X86_VENDOR_AMD:
7512 		amd_l2cacheinfo(cpi, l2i);
7513 		break;
7514 	default:
7515 		break;
7516 	}
7517 	return (l2i->l2i_ret);
7518 }
7519 
7520 #if !defined(__xpv)
7521 
7522 uint32_t *
7523 cpuid_mwait_alloc(cpu_t *cpu)
7524 {
7525 	uint32_t	*ret;
7526 	size_t		mwait_size;
7527 
7528 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_EXTENDED));
7529 
7530 	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7531 	if (mwait_size == 0)
7532 		return (NULL);
7533 
7534 	/*
7535 	 * kmem_alloc() returns cache line size aligned data for mwait_size
7536 	 * allocations.  mwait_size is currently cache line sized.  Neither
7537 	 * of these implementation details is guaranteed to be true in the
7538 	 * future.
7539 	 *
7540 	 * First try allocating mwait_size as kmem_alloc() currently returns
7541 	 * correctly aligned memory.  If kmem_alloc() does not return
7542 	 * mwait_size aligned memory, then use mwait_size ROUNDUP.
7543 	 *
7544 	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7545 	 * decide to free this memory.
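	 *
	 * For example (illustrative values only), if mwait_size were 64 and
	 * the buffer returned by kmem_zalloc() happened to be 64-byte aligned,
	 * it is used directly; otherwise a buffer of twice the size is
	 * allocated and the pointer is rounded up to the next 64-byte boundary.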
7546 	 */
7547 	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7548 	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7549 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7550 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7551 		*ret = MWAIT_RUNNING;
7552 		return (ret);
7553 	} else {
7554 		kmem_free(ret, mwait_size);
7555 		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7556 		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7557 		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7558 		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7559 		*ret = MWAIT_RUNNING;
7560 		return (ret);
7561 	}
7562 }
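/*
 * Illustrative sketch, not part of the original source, of the alignment
 * fallback used above: if the buffer kmem_zalloc() hands back is not already
 * mwait_size aligned, a buffer of twice that size always contains an aligned
 * run of mwait_size bytes, which P2ROUNDUP locates.  The guard macro and
 * helper name are hypothetical.
 */
#ifdef CPUID_EXAMPLES
static uint32_t *
example_mwait_align(uintptr_t buf, size_t mwait_size)
{
	/*
	 * e.g. buf = 0x1040, mwait_size = 0x40 -> already aligned, unchanged;
	 *	buf = 0x1058, mwait_size = 0x40 -> rounded up to 0x1080.
	 */
	return ((uint32_t *)P2ROUNDUP(buf, mwait_size));
}
#endif	/* CPUID_EXAMPLES */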
7563 
7564 void
7565 cpuid_mwait_free(cpu_t *cpu)
7566 {
7567 	if (cpu->cpu_m.mcpu_cpi == NULL) {
7568 		return;
7569 	}
7570 
7571 	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7572 	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7573 		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7574 		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7575 	}
7576 
7577 	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7578 	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7579 }
7580 
7581 void
7582 patch_tsc_read(int flag)
7583 {
7584 	size_t cnt;
7585 
7586 	switch (flag) {
7587 	case TSC_NONE:
7588 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7589 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7590 		break;
7591 	case TSC_RDTSC_LFENCE:
7592 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7593 		(void) memcpy((void *)tsc_read,
7594 		    (void *)&_tsc_lfence_start, cnt);
7595 		break;
7596 	case TSC_TSCP:
7597 		cnt = &_tscp_end - &_tscp_start;
7598 		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7599 		break;
7600 	default:
7601 		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
7602 		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7603 		break;
7604 	}
7605 	tsc_type = flag;
7606 }
7607 
7608 int
7609 cpuid_deep_cstates_supported(void)
7610 {
7611 	struct cpuid_info *cpi;
7612 	struct cpuid_regs regs;
7613 
7614 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7615 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7616 
7617 	cpi = CPU->cpu_m.mcpu_cpi;
7618 
7619 	switch (cpi->cpi_vendor) {
7620 	case X86_VENDOR_Intel:
7621 		if (cpi->cpi_xmaxeax < 0x80000007)
7622 			return (0);
7623 
7624 		/*
7625 		 * Does TSC run at a constant rate in all C-states?
7626 		 */
7627 		regs.cp_eax = 0x80000007;
7628 		(void) __cpuid_insn(&regs);
7629 		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7630 
7631 	default:
7632 		return (0);
7633 	}
7634 }
7635 
7636 #endif	/* !__xpv */
7637 
7638 void
7639 post_startup_cpu_fixups(void)
7640 {
7641 #ifndef __xpv
7642 	/*
7643 	 * Some AMD processors support C1E state. Entering this state will
7644 	 * cause the local APIC timer to stop, which we can't deal with at
7645 	 * this time.
7646 	 */
7647 	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7648 		on_trap_data_t otd;
7649 		uint64_t reg;
7650 
7651 		if (!on_trap(&otd, OT_DATA_ACCESS)) {
7652 			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7653 			/* Disable C1E state if it is enabled by BIOS */
7654 			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7655 			    AMD_ACTONCMPHALT_MASK) {
7656 				reg &= ~(AMD_ACTONCMPHALT_MASK <<
7657 				    AMD_ACTONCMPHALT_SHIFT);
7658 				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7659 			}
7660 		}
7661 		no_trap();
7662 	}
7663 #endif	/* !__xpv */
7664 }
7665 
7666 void
7667 enable_pcid(void)
7668 {
7669 	if (x86_use_pcid == -1)
7670 		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7671 
7672 	if (x86_use_invpcid == -1) {
7673 		x86_use_invpcid = is_x86_feature(x86_featureset,
7674 		    X86FSET_INVPCID);
7675 	}
7676 
7677 	if (!x86_use_pcid)
7678 		return;
7679 
7680 	/*
7681 	 * Intel says that on setting PCIDE, the CPU immediately starts using the PCID
7682 	 * bits; better make sure there's nothing there.
7683 	 */
7684 	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7685 
7686 	setcr4(getcr4() | CR4_PCIDE);
7687 }
7688 
7689 /*
7690  * Setup necessary registers to enable XSAVE feature on this processor.
7691  * This function needs to be called early enough, so that no xsave/xrstor
7692  * ops will execute on the processor before the MSRs are properly set up.
7693  *
7694  * Current implementation has the following assumption:
7695  * - cpuid_pass_basic() is done, so that X86 features are known.
7696  * - fpu_probe() is done, so that fp_save_mech is chosen.
7697  */
7698 void
7699 xsave_setup_msr(cpu_t *cpu)
7700 {
7701 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_BASIC));
7702 	ASSERT(fp_save_mech == FP_XSAVE);
7703 	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7704 
7705 	/* Enable OSXSAVE in CR4. */
7706 	setcr4(getcr4() | CR4_OSXSAVE);
7707 	/*
7708 	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7709 	 * correct value.
7710 	 */
7711 	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7712 	setup_xfem();
7713 }
7714 
7715 /*
7716  * Starting with the Westmere processor the local
7717  * APIC timer will continue running in all C-states,
7718  * including the deepest C-states.
7719  */
7720 int
7721 cpuid_arat_supported(void)
7722 {
7723 	struct cpuid_info *cpi;
7724 	struct cpuid_regs regs;
7725 
7726 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7727 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7728 
7729 	cpi = CPU->cpu_m.mcpu_cpi;
7730 
7731 	switch (cpi->cpi_vendor) {
7732 	case X86_VENDOR_Intel:
7733 		/*
7734 		 * Always-running Local APIC Timer is
7735 		 * indicated by CPUID.6.EAX[2].
7736 		 */
7737 		if (cpi->cpi_maxeax >= 6) {
7738 			regs.cp_eax = 6;
7739 			(void) cpuid_insn(NULL, &regs);
7740 			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7741 		} else {
7742 			return (0);
7743 		}
7744 	default:
7745 		return (0);
7746 	}
7747 }
7748 
7749 /*
7750  * Check support for Intel ENERGY_PERF_BIAS feature
7751  */
7752 int
7753 cpuid_iepb_supported(struct cpu *cp)
7754 {
7755 	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7756 	struct cpuid_regs regs;
7757 
7758 	ASSERT(cpuid_checkpass(cp, CPUID_PASS_BASIC));
7759 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7760 
7761 	if (!(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7762 		return (0);
7763 	}
7764 
7765 	/*
7766 	 * Intel ENERGY_PERF_BIAS MSR is indicated by
7767 	 * capability bit CPUID.6.ECX.3
7768 	 */
7769 	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7770 		return (0);
7771 
7772 	regs.cp_eax = 0x6;
7773 	(void) cpuid_insn(NULL, &regs);
7774 	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7775 }
7776 
7777 /*
7778  * Check support for TSC deadline timer
7779  *
7780  * TSC deadline timer provides a superior software programming
7781  * model over local APIC timer that eliminates "time drifts".
7782  * Instead of specifying a relative time, software specifies an
7783  * absolute time as the target at which the processor should
7784  * generate a timer event.
7785  */
7786 int
7787 cpuid_deadline_tsc_supported(void)
7788 {
7789 	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7790 	struct cpuid_regs regs;
7791 
7792 	ASSERT(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7793 	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7794 
7795 	switch (cpi->cpi_vendor) {
7796 	case X86_VENDOR_Intel:
7797 		if (cpi->cpi_maxeax >= 1) {
7798 			regs.cp_eax = 1;
7799 			(void) cpuid_insn(NULL, &regs);
7800 			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7801 		} else {
7802 			return (0);
7803 		}
7804 	default:
7805 		return (0);
7806 	}
7807 }
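/*
 * Illustrative sketch, not part of the original source: arming an absolute
 * deadline rather than a relative count.  The MSR address (0x6e0,
 * IA32_TSC_DEADLINE) and the helper shown here are assumptions for
 * illustration only; the system's APIC timer code is the authoritative
 * consumer.
 */
#ifdef CPUID_EXAMPLES
#define	MSR_EXAMPLE_TSC_DEADLINE	0x6e0
static void
example_arm_tsc_deadline(uint64_t delta_ticks)
{
	/* The target is an absolute TSC value, eliminating drift. */
	wrmsr(MSR_EXAMPLE_TSC_DEADLINE, (uint64_t)tsc_read() + delta_ticks);
}
#endif	/* CPUID_EXAMPLES */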
7808 
7809 #if !defined(__xpv)
7810 /*
7811  * Patch in versions of bcopy for high performance Intel Nhm processors
7812  * and later...
7813  */
7814 void
7815 patch_memops(uint_t vendor)
7816 {
7817 	size_t cnt, i;
7818 	caddr_t to, from;
7819 
7820 	if ((vendor == X86_VENDOR_Intel) &&
7821 	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7822 		cnt = &bcopy_patch_end - &bcopy_patch_start;
7823 		to = &bcopy_ck_size;
7824 		from = &bcopy_patch_start;
7825 		for (i = 0; i < cnt; i++) {
7826 			*to++ = *from++;
7827 		}
7828 	}
7829 }
7830 #endif  /*  !__xpv */
7831 
7832 /*
7833  * We're being asked to tell the system how many bits are required to represent
7834  * the various core and strand IDs. While it's tempting to derive this based
7835  * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7836  * correct. Instead, this needs to be based on the number of bits that the APIC
7837  * allows for these different configurations. We only update these to a larger
7838  * value if we find one.
7839  */
7840 void
7841 cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7842 {
7843 	struct cpuid_info *cpi;
7844 
7845 	VERIFY(cpuid_checkpass(CPU, CPUID_PASS_BASIC));
7846 	cpi = cpu->cpu_m.mcpu_cpi;
7847 
7848 	if (cpi->cpi_ncore_bits > *core_nbits) {
7849 		*core_nbits = cpi->cpi_ncore_bits;
7850 	}
7851 
7852 	if (cpi->cpi_nthread_bits > *strand_nbits) {
7853 		*strand_nbits = cpi->cpi_nthread_bits;
7854 	}
7855 }
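/*
 * Illustrative sketch, not part of the original source: the widths reported
 * above describe how many low-order APIC ID bits identify the strand and,
 * above those, the core.  The helper and its exact field layout are
 * assumptions for illustration only.
 */
#ifdef CPUID_EXAMPLES
static void
example_decompose_apicid(uint32_t apicid, uint_t core_nbits,
    uint_t strand_nbits, uint_t *corep, uint_t *strandp)
{
	*strandp = apicid & ((1U << strand_nbits) - 1);
	*corep = (apicid >> strand_nbits) & ((1U << core_nbits) - 1);
}
#endif	/* CPUID_EXAMPLES */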
7856 
7857 void
7858 cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7859 {
7860 	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7861 	struct cpuid_regs cp;
7862 
7863 	/*
7864 	 * Reread the CPUID portions that we need for various security
7865 	 * information.
7866 	 */
7867 	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7868 		/*
7869 		 * Check if we now have leaf 7 available to us.
7870 		 */
7871 		if (cpi->cpi_maxeax < 7) {
7872 			bzero(&cp, sizeof (cp));
7873 			cp.cp_eax = 0;
7874 			cpi->cpi_maxeax = __cpuid_insn(&cp);
7875 			if (cpi->cpi_maxeax < 7)
7876 				return;
7877 		}
7878 
7879 		bzero(&cp, sizeof (cp));
7880 		cp.cp_eax = 7;
7881 		cp.cp_ecx = 0;
7882 		(void) __cpuid_insn(&cp);
7883 		cpi->cpi_std[7] = cp;
7884 	} else if (cpi->cpi_vendor == X86_VENDOR_AMD ||
7885 	    cpi->cpi_vendor == X86_VENDOR_HYGON) {
7886 		/* No xcpuid support */
7887 		if (cpi->cpi_family < 5 ||
7888 		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
7889 			return;
7890 
7891 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7892 			bzero(&cp, sizeof (cp));
7893 			cp.cp_eax = CPUID_LEAF_EXT_0;
7894 			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
7895 			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
7896 				return;
7897 			}
7898 		}
7899 
7900 		/*
7901 		 * Most AMD features are in leaf 8. Automatic IBRS was added in
7902 		 * leaf 0x21. So we also check that.
7903 		 */
7904 		bzero(&cp, sizeof (cp));
7905 		cp.cp_eax = CPUID_LEAF_EXT_8;
7906 		(void) __cpuid_insn(&cp);
7907 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
7908 		cpi->cpi_extd[8] = cp;
7909 
7910 		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21) {
7911 			return;
7912 		}
7913 
7914 		bzero(&cp, sizeof (cp));
7915 		cp.cp_eax = CPUID_LEAF_EXT_21;
7916 		(void) __cpuid_insn(&cp);
7917 		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_21, &cp);
7918 		cpi->cpi_extd[0x21] = cp;
7919 	} else {
7920 		/*
7921 		 * Nothing to do here. Return an empty set which has already
7922 		 * been zeroed for us.
7923 		 */
7924 		return;
7925 	}
7926 	cpuid_scan_security(cpu, fset);
7927 }
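/*
 * Illustrative sketch, not part of the original source: once cpi_extd[0x21]
 * has been refreshed above, individual bits from that leaf can be tested.
 * The bit position used here for Automatic IBRS is an assumption for
 * illustration; cpuid_scan_security() is the authoritative consumer.
 */
#ifdef CPUID_EXAMPLES
static boolean_t
example_has_auto_ibrs(const struct cpuid_info *cpi)
{
	if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_21)
		return (B_FALSE);
	/* Assumed: bit 8 of leaf 0x80000021 %eax indicates Automatic IBRS. */
	return ((cpi->cpi_extd[0x21].cp_eax & (1U << 8)) != 0);
}
#endif	/* CPUID_EXAMPLES */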
7928 
7929 /* ARGSUSED */
7930 static int
7931 cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
7932 {
7933 	uchar_t *fset;
7934 	boolean_t first_pass = (boolean_t)arg1;
7935 
7936 	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
7937 	if (first_pass && CPU->cpu_id != 0)
7938 		return (0);
7939 	if (!first_pass && CPU->cpu_id == 0)
7940 		return (0);
7941 	cpuid_pass_ucode(CPU, fset);
7942 
7943 	return (0);
7944 }
7945 
7946 /*
7947  * After a microcode update where the version has changed, we need to
7948  * rescan CPUID. To do this we check every CPU to make sure that they have the
7949  * same microcode. Then we perform a cross call to all such CPUs. It's the
7950  * caller's job to make sure that no one else can end up doing an update while
7951  * this is going on.
7952  *
7953  * We assume that the system is microcode capable if we're called.
7954  */
7955 void
7956 cpuid_post_ucodeadm(void)
7957 {
7958 	uint32_t rev;
7959 	int i;
7960 	struct cpu *cpu;
7961 	cpuset_t cpuset;
7962 	void *argdata;
7963 	uchar_t *f0;
7964 
7965 	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
7966 
7967 	mutex_enter(&cpu_lock);
7968 	cpu = cpu_get(0);
7969 	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
7970 	CPUSET_ONLY(cpuset, 0);
7971 	for (i = 1; i < max_ncpus; i++) {
7972 		if ((cpu = cpu_get(i)) == NULL)
7973 			continue;
7974 
7975 		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
7976 			panic("post microcode update CPU %d has differing "
7977 			    "microcode revision (%u) from CPU 0 (%u)",
7978 			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
7979 		}
7980 		CPUSET_ADD(cpuset, i);
7981 	}
7982 
7983 	/*
7984 	 * We do the cross calls in two passes. The first pass is only for the
7985 	 * boot CPU. The second pass is for all of the other CPUs. This allows
7986 	 * the boot CPU to go through and change behavior related to patching or
7987 	 * whether or not Enhanced IBRS needs to be enabled and then allow all
7988 	 * other CPUs to follow suit.
7989 	 */
7990 	kpreempt_disable();
7991 	xc_sync((xc_arg_t)argdata, B_TRUE, 0, CPUSET2BV(cpuset),
7992 	    cpuid_post_ucodeadm_xc);
7993 	xc_sync((xc_arg_t)argdata, B_FALSE, 0, CPUSET2BV(cpuset),
7994 	    cpuid_post_ucodeadm_xc);
7995 	kpreempt_enable();
7996 
7997 	/*
7998 	 * OK, now look at each CPU and see if their feature sets are equal.
7999 	 */
8000 	f0 = argdata;
8001 	for (i = 1; i < max_ncpus; i++) {
8002 		uchar_t *fset;
8003 		if (!CPU_IN_SET(cpuset, i))
8004 			continue;
8005 
8006 		fset = (uchar_t *)((uintptr_t)argdata +
8007 		    sizeof (x86_featureset) * i);
8008 
8009 		if (!compare_x86_featureset(f0, fset)) {
8010 			panic("Post microcode update CPU %d has "
8011 			    "differing security feature (%p) set from CPU 0 "
8012 			    "(%p), not appending to feature set", i,
8013 			    (void *)fset, (void *)f0);
8014 		}
8015 	}
8016 
8017 	mutex_exit(&cpu_lock);
8018 
8019 	for (i = 0; i < NUM_X86_FEATURES; i++) {
8020 		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
8021 		    x86_feature_names[i]);
8022 		if (is_x86_feature(f0, i)) {
8023 			add_x86_feature(x86_featureset, i);
8024 		}
8025 	}
8026 	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
8027 }
8028 
8029 typedef void (*cpuid_pass_f)(cpu_t *, void *);
8030 
8031 typedef struct cpuid_pass_def {
8032 	cpuid_pass_t cpd_pass;
8033 	cpuid_pass_f cpd_func;
8034 } cpuid_pass_def_t;
8035 
8036 /*
8037  * See block comment at the top; note that cpuid_pass_ucode is not a pass in the
8038  * normal sense and should not appear here.
8039  */
8040 static const cpuid_pass_def_t cpuid_pass_defs[] = {
8041 	{ CPUID_PASS_PRELUDE, cpuid_pass_prelude },
8042 	{ CPUID_PASS_IDENT, cpuid_pass_ident },
8043 	{ CPUID_PASS_BASIC, cpuid_pass_basic },
8044 	{ CPUID_PASS_EXTENDED, cpuid_pass_extended },
8045 	{ CPUID_PASS_DYNAMIC, cpuid_pass_dynamic },
8046 	{ CPUID_PASS_RESOLVE, cpuid_pass_resolve },
8047 };
8048 
8049 void
8050 cpuid_execpass(cpu_t *cp, cpuid_pass_t pass, void *arg)
8051 {
8052 	VERIFY3S(pass, !=, CPUID_PASS_NONE);
8053 
8054 	if (cp == NULL)
8055 		cp = CPU;
8056 
8057 	/*
8058 	 * Space statically allocated for BSP, ensure pointer is set
8059 	 */
8060 	if (cp->cpu_id == 0 && cp->cpu_m.mcpu_cpi == NULL)
8061 		cp->cpu_m.mcpu_cpi = &cpuid_info0;
8062 
8063 	ASSERT(cpuid_checkpass(cp, pass - 1));
8064 
8065 	for (uint_t i = 0; i < ARRAY_SIZE(cpuid_pass_defs); i++) {
8066 		if (cpuid_pass_defs[i].cpd_pass == pass) {
8067 			cpuid_pass_defs[i].cpd_func(cp, arg);
8068 			cp->cpu_m.mcpu_cpi->cpi_pass = pass;
8069 			return;
8070 		}
8071 	}
8072 
8073 	panic("unable to execute invalid cpuid pass %d on cpu%d\n",
8074 	    pass, cp->cpu_id);
8075 }
8076 
8077 /*
8078  * Extract the processor family from a chiprev.  Processor families are not the
8079  * same as cpuid families; see comments above and in x86_archext.h.
8080  */
8081 x86_processor_family_t
8082 chiprev_family(const x86_chiprev_t cr)
8083 {
8084 	return ((x86_processor_family_t)_X86_CHIPREV_FAMILY(cr));
8085 }
8086 
8087 /*
8088  * A chiprev matches its template if the vendor and family are identical and the
8089  * revision of the chiprev matches one of the bits set in the template.  Callers
8090  * may bitwise-OR together chiprevs of the same vendor and family to form the
8091  * template, or use the _ANY variant.  It is not possible to match chiprevs of
8092  * multiple vendors or processor families with a single call.  Note that this
8093  * function operates on processor families, not cpuid families.
8094  */
8095 boolean_t
8096 chiprev_matches(const x86_chiprev_t cr, const x86_chiprev_t template)
8097 {
8098 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(template) &&
8099 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(template) &&
8100 	    (_X86_CHIPREV_REV(cr) & _X86_CHIPREV_REV(template)) != 0);
8101 }
8102 
8103 /*
8104  * A chiprev is at least min if the vendor and family are identical and the
8105  * revision of the chiprev is at least as recent as that of min.  Processor
8106  * families are considered unordered and cannot be compared using this function.
8107  * Note that this function operates on processor families, not cpuid families.
8108  * Use of the _ANY chiprev variant with this function is not useful; it will
8109  * always return B_FALSE if the _ANY variant is supplied as the minimum
8110  * revision.  To determine only whether a chiprev is of a given processor
8111  * family, test the return value of chiprev_family() instead.
8112  */
8113 boolean_t
8114 chiprev_at_least(const x86_chiprev_t cr, const x86_chiprev_t min)
8115 {
8116 	return (_X86_CHIPREV_VENDOR(cr) == _X86_CHIPREV_VENDOR(min) &&
8117 	    _X86_CHIPREV_FAMILY(cr) == _X86_CHIPREV_FAMILY(min) &&
8118 	    _X86_CHIPREV_REV(cr) >= _X86_CHIPREV_REV(min));
8119 }
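/*
 * Illustrative sketch, not part of the original source: combining the two
 * predicates above.  The rev_* parameters stand in for real X86_CHIPREV_*
 * constants of a single vendor and processor family; the helper and guard
 * macro are hypothetical.
 */
#ifdef CPUID_EXAMPLES
static boolean_t
example_needs_workaround(x86_chiprev_t cr, x86_chiprev_t rev_a0,
    x86_chiprev_t rev_b0, x86_chiprev_t rev_c0)
{
	/* Match either of two revisions of the same vendor and family. */
	if (chiprev_matches(cr, rev_a0 | rev_b0))
		return (B_TRUE);

	/* Or require a revision at least as recent as a given one. */
	return (chiprev_at_least(cr, rev_c0));
}
#endif	/* CPUID_EXAMPLES */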
8120 
8121 /*
8122  * The uarch functions operate in a manner similar to the chiprev functions
8123  * above.  While it is tempting to allow these to operate on microarchitectures
8124  * produced by a specific vendor in an ordered fashion (e.g., ZEN3 is "newer"
8125  * than ZEN2), we elect not to do so because a manufacturer may supply
8126  * processors of multiple different microarchitecture families each of which may
8127  * be internally ordered but unordered with respect to those of other families.
8128  */
8129 x86_uarch_t
8130 uarchrev_uarch(const x86_uarchrev_t ur)
8131 {
8132 	return ((x86_uarch_t)_X86_UARCHREV_UARCH(ur));
8133 }
8134 
8135 boolean_t
8136 uarchrev_matches(const x86_uarchrev_t ur, const x86_uarchrev_t template)
8137 {
8138 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(template) &&
8139 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(template) &&
8140 	    (_X86_UARCHREV_REV(ur) & _X86_UARCHREV_REV(template)) != 0);
8141 }
8142 
8143 boolean_t
8144 uarchrev_at_least(const x86_uarchrev_t ur, const x86_uarchrev_t min)
8145 {
8146 	return (_X86_UARCHREV_VENDOR(ur) == _X86_UARCHREV_VENDOR(min) &&
8147 	    _X86_UARCHREV_UARCH(ur) == _X86_UARCHREV_UARCH(min) &&
8148 	    _X86_UARCHREV_REV(ur) >= _X86_UARCHREV_REV(min));
8149 }
8150 
8151 /*
8152  * Topology cache related information. This is yet another cache interface
8153  * that we expose, intended for use when we have either Intel leaf 4 or
8154  * AMD extended leaf 0x1D (0x8000001D, introduced with Zen 1).
8155  */
8156 static boolean_t
8157 cpuid_cache_topo_sup(const struct cpuid_info *cpi)
8158 {
8159 	switch (cpi->cpi_vendor) {
8160 	case X86_VENDOR_Intel:
8161 		if (cpi->cpi_maxeax >= 4) {
8162 			return (B_TRUE);
8163 		}
8164 		break;
8165 	case X86_VENDOR_AMD:
8166 	case X86_VENDOR_HYGON:
8167 		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
8168 		    is_x86_feature(x86_featureset, X86FSET_TOPOEXT)) {
8169 			return (B_TRUE);
8170 		}
8171 		break;
8172 	default:
8173 		break;
8174 	}
8175 
8176 	return (B_FALSE);
8177 }
8178 
8179 int
8180 cpuid_getncaches(struct cpu *cpu, uint32_t *ncache)
8181 {
8182 	const struct cpuid_info *cpi;
8183 
8184 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8185 	cpi = cpu->cpu_m.mcpu_cpi;
8186 
8187 	if (!cpuid_cache_topo_sup(cpi)) {
8188 		return (ENOTSUP);
8189 	}
8190 
8191 	*ncache = cpi->cpi_cache_leaf_size;
8192 	return (0);
8193 }
8194 
8195 int
8196 cpuid_getcache(struct cpu *cpu, uint32_t cno, x86_cache_t *cache)
8197 {
8198 	const struct cpuid_info *cpi;
8199 	const struct cpuid_regs *cp;
8200 
8201 	ASSERT(cpuid_checkpass(cpu, CPUID_PASS_DYNAMIC));
8202 	cpi = cpu->cpu_m.mcpu_cpi;
8203 
8204 	if (!cpuid_cache_topo_sup(cpi)) {
8205 		return (ENOTSUP);
8206 	}
8207 
8208 	if (cno >= cpi->cpi_cache_leaf_size) {
8209 		return (EINVAL);
8210 	}
8211 
8212 	bzero(cache, sizeof (*cache));
8213 	cp = cpi->cpi_cache_leaves[cno];
8214 	switch (CPI_CACHE_TYPE(cp)) {
8215 	case CPI_CACHE_TYPE_DATA:
8216 		cache->xc_type = X86_CACHE_TYPE_DATA;
8217 		break;
8218 	case CPI_CACHE_TYPE_INSTR:
8219 		cache->xc_type = X86_CACHE_TYPE_INST;
8220 		break;
8221 	case CPI_CACHE_TYPE_UNIFIED:
8222 		cache->xc_type = X86_CACHE_TYPE_UNIFIED;
8223 		break;
8224 	case CPI_CACHE_TYPE_DONE:
8225 	default:
8226 		return (EINVAL);
8227 	}
8228 	cache->xc_level = CPI_CACHE_LVL(cp);
8229 	if (CPI_FULL_ASSOC_CACHE(cp) != 0) {
8230 		cache->xc_flags |= X86_CACHE_F_FULL_ASSOC;
8231 	}
8232 	cache->xc_nparts = CPI_CACHE_PARTS(cp) + 1;
8233 	/*
8234 	 * The number of sets is reserved on AMD if the CPU is tagged as fully
8235 	 * associative, where as it is considered valid on Intel.
8236 	 */
8237 	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
8238 	    CPI_FULL_ASSOC_CACHE(cp) != 0) {
8239 		cache->xc_nsets = 1;
8240 	} else {
8241 		cache->xc_nsets = CPI_CACHE_SETS(cp) + 1;
8242 	}
8243 	cache->xc_nways = CPI_CACHE_WAYS(cp) + 1;
8244 	cache->xc_line_size = CPI_CACHE_COH_LN_SZ(cp) + 1;
8245 	cache->xc_size = cache->xc_nparts * cache->xc_nsets * cache->xc_nways *
8246 	    cache->xc_line_size;
8247 	/*
8248 	 * We're looking for the number of bits needed to cover the number of
8249 	 * CPUs sharing this cache. Normally that would be the count minus one,
8250 	 * but the CPUID field is already encoded as the actual count minus
8251 	 * one, so we use it unmodified.
8252 	 */
8253 	cache->xc_apic_shift = highbit(CPI_NTHR_SHR_CACHE(cp));
8254 
8255 	/*
8256 	 * To construct a unique ID we construct a uint64_t that looks as
8257 	 * follows:
8258 	 *
8259 	 * [47:40] cache level
8260 	 * [39:32] CPUID cache type
8261 	 * [31:00] shifted APIC ID
8262 	 *
8263 	 * The shifted APIC ID gives us a guarantee that a given cache entry is
8264 	 * unique within its peers. The other two numbers give us something that
8265 	 * ensures that something is unique within the CPU. If we just had the
8266 	 * APIC ID shifted over by the indicated number of bits we'd end up with
8267 	 * an ID of zero for the L1I, L1D, L2, and L3.
8268 	 *
8269 	 * The format of this ID is private to the system and can change across
8270 	 * a reboot for the time being.
8271 	 */
8272 	cache->xc_id = (uint64_t)cache->xc_level << 40;
8273 	cache->xc_id |= (uint64_t)cache->xc_type << 32;
8274 	cache->xc_id |= (uint64_t)cpi->cpi_apicid >> cache->xc_apic_shift;
8275 
8276 	return (0);
8277 }
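/*
 * Illustrative sketch, not part of the original source: walking every cache
 * exposed by the topology interface above for a CPU.  Error handling is
 * minimal and the helper is an assumption for illustration only.
 */
#ifdef CPUID_EXAMPLES
static void
example_walk_caches(struct cpu *cpu)
{
	uint32_t ncache, i;
	x86_cache_t cache;

	if (cpuid_getncaches(cpu, &ncache) != 0)
		return;

	for (i = 0; i < ncache; i++) {
		if (cpuid_getcache(cpu, i, &cache) != 0)
			continue;
		/*
		 * e.g. 1 partition x 64 sets x 8 ways x 64-byte lines gives
		 * xc_size = 1 * 64 * 8 * 64 = 32 KiB.
		 */
		cmn_err(CE_CONT, "?L%u cache: %llu bytes, id %llx\n",
		    (uint_t)cache.xc_level, (u_longlong_t)cache.xc_size,
		    (u_longlong_t)cache.xc_id);
	}
}
#endif	/* CPUID_EXAMPLES */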
8278