1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 * Copyright 2020 Joyent, Inc.
27 */
28/*
29 * Copyright (c) 2010, Intel Corporation.
30 * All rights reserved.
31 */
32/*
33 * Portions Copyright 2009 Advanced Micro Devices, Inc.
34 */
35
36/*
37 * CPU Identification logic
38 *
39 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
40 * with the identification of CPUs, their features, and their topologies. More
41 * specifically, this file helps drive the following:
42 *
43 * 1. Enumeration of features of the processor which are used by the kernel to
44 *    determine what features to enable or disable. These may be instruction set
45 *    enhancements or features that we use.
46 *
47 * 2. Enumeration of instruction set architecture (ISA) additions that userland
48 *    will be told about through the auxiliary vector.
49 *
50 * 3. Understanding the physical topology of the CPU such as the number of
51 *    caches, how many cores it has, whether or not it supports symmetric
52 *    multi-processing (SMT), etc.
53 *
54 * ------------------------
55 * CPUID History and Basics
56 * ------------------------
57 *
58 * The cpuid instruction was added by Intel roughly around the time that the
59 * original Pentium was introduced. The purpose of cpuid was to tell in a
60 * programmatic fashion information about the CPU that previously was guessed
61 * at. For example, an important part of cpuid is that we can know what
62 * extensions to the ISA exist. If you use an invalid opcode you would get a
63 * #UD, so this method allows a program (whether a user program or the kernel)
64 * to determine what exists without crashing or getting a SIGILL. Of course,
65 * this was also during the era of the clones and the AMD Am5x86. The vendor
66 * name shows up first in cpuid for a reason.
67 *
68 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
69 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
70 * its own meaning. The different leaves are broken down into different regions:
71 *
72 *	[ 0, 7fffffff ]			This region is called the 'basic'
73 *					region. This region is generally defined
74 *					by Intel, though some of the original
75 *					portions have different meanings based
76 *					on the manufacturer. These days, Intel
77 *					adds most new features to this region.
78 *					AMD adds non-Intel compatible
79 *					information in the third, extended
80 *					region. Intel uses this for everything
81 *					including ISA extensions, CPU
82 *					features, cache information, topology,
83 *					and more.
84 *
85 *					There is a hole carved out of this
86 *					region which is reserved for
87 *					hypervisors.
88 *
89 *	[ 40000000, 4fffffff ]		This region, which is found in the
90 *					middle of the previous region, is
91 *					explicitly promised to never be used by
92 *					CPUs. Instead, it is used by hypervisors
93 *					to communicate information about
94 *					themselves to the operating system. The
95 *					values and details are unique for each
96 *					hypervisor.
97 *
98 *	[ 80000000, ffffffff ]		This region is called the 'extended'
99 *					region. Some of the low leaves mirror
100 *					parts of the basic leaves. This region
101 *					has generally been used by AMD for
102 *					various extensions. For example, AMD-
103 *					specific information about caches,
104 *					features, and topology are found in this
105 *					region.
106 *
107 * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
108 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
109 * the ranges, one of the primary things returned is the maximum valid leaf in
110 * that range. This allows for discovery of what range of CPUID is valid.
111 *
112 * The CPUs have potentially surprising behavior when using an invalid leaf or
113 * unimplemented leaf. If the requested leaf is within the valid basic or
114 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
115 * set to zero. However, if you specify a leaf that is outside of a valid range,
116 * then instead it will be filled with the last valid _basic_ leaf. For example,
117 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
118 * an invalid extended leaf will return the information for leaf 3.
119 *
120 * Some leaves are broken down into sub-leaves. This means that the value
121 * depends on both the leaf asked for in %eax and a secondary register. For
122 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
123 * additional information. Or when getting topology information in leaf 0xb, the
124 * initial value in %ecx changes which level of the topology that you are
125 * getting information about.
126 *
127 * cpuid values are always kept to 32 bits regardless of whether or not the
128 * program is in 64-bit mode. When executing in 64-bit mode, the upper
129 * 32 bits of the register are always set to zero so that way the values are the
130 * same regardless of execution mode.
131 *
132 * ----------------------
133 * Identifying Processors
134 * ----------------------
135 *
136 * We can identify a processor in two steps. The first step looks at cpuid leaf
137 * 0. Leaf 0 contains the processor's vendor information. This is done by
138 * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
139 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
140 *
141 * From there, a processor is identified by a combination of three different
142 * values:
143 *
144 *  1. Family
145 *  2. Model
146 *  3. Stepping
147 *
148 * Each vendor uses the family and model to uniquely identify a processor. The
149 * way that family and model are changed depends on the vendor. For example,
150 * Intel has been using family 0x6 for almost all of their processor since the
151 * Pentium Pro/Pentium II era, often called the P6. The model is used to
152 * identify the exact processor. Different models are often used for the client
153 * (consumer) and server parts. Even though each processor often has major
154 * architectural differences, they still are considered the same family by
155 * Intel.
156 *
157 * On the other hand, each major AMD architecture generally has its own family.
158 * For example, the K8 is family 0x10, Bulldozer 0x15, and Zen 0x17. Within it
159 * the model number is used to help identify specific processors.
160 *
161 * The stepping is used to refer to a revision of a specific microprocessor. The
162 * term comes from equipment used to produce masks that are used to create
163 * integrated circuits.
164 *
165 * The information is present in leaf 1, %eax. In technical documentation you
166 * will see the terms extended model and extended family. The original family,
167 * model, and stepping fields were each 4 bits wide. If the values in either
168 * are 0xf, then one is to consult the extended model and extended family, which
169 * take previously reserved bits and allow for a larger number of models and add
170 * 0xf to them.
171 *
172 * When we process this information, we store the full family, model, and
173 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
174 * cpi_step, respectively. Whenever you are performing comparisons with the
175 * family, model, and stepping, you should use these members and not the raw
176 * values from cpuid. If you must use the raw values from cpuid directly, you
177 * must make sure that you add the extended model and family to the base model
178 * and family.
179 *
180 * In general, we do not use information about the family, model, and stepping
181 * to determine whether or not a feature is present; that is generally driven by
182 * specific leaves. However, when something we care about on the processor is
183 * not considered 'architectural' meaning that it is specific to a set of
184 * processors and not promised in the architecture model to be consistent from
185 * generation to generation, then we will fall back on this information. The
186 * most common cases where this comes up is when we have to workaround errata in
187 * the processor, are dealing with processor-specific features such as CPU
188 * performance counters, or we want to provide additional information for things
189 * such as fault management.
190 *
191 * While processors also do have a brand string, which is the name that people
192 * are familiar with when buying the processor, they are not meant for
193 * programmatic consumption. That is what the family, model, and stepping are
194 * for.
195 *
196 * ------------
197 * CPUID Passes
198 * ------------
199 *
200 * As part of performing feature detection, we break this into several different
201 * passes. The passes are as follows:
202 *
203 *	Pass 0		This is a primordial pass done in locore.s to deal with
204 *			Cyrix CPUs that don't support cpuid. The reality is that
205 *			we likely don't run on them any more, but there is still
206 *			logic for handling them.
207 *
208 *	Pass 1		This is the primary pass and is responsible for doing a
209 *			large number of different things:
210 *
211 *			1. Determine which vendor manufactured the CPU and
212 *			determining the family, model, and stepping information.
213 *
214 *			2. Gathering a large number of feature flags to
215 *			determine which features the CPU support and which
216 *			indicate things that we need to do other work in the OS
217 *			to enable. Features detected this way are added to the
218 *			x86_featureset which can be queried to
219 *			determine what we should do. This includes processing
220 *			all of the basic and extended CPU features that we care
221 *			about.
222 *
223 *			3. Determining the CPU's topology. This includes
224 *			information about how many cores and threads are present
225 *			in the package. It also is responsible for figuring out
226 *			which logical CPUs are potentially part of the same core
227 *			and what other resources they might share. For more
228 *			information see the 'Topology' section.
229 *
230 *			4. Determining the set of CPU security-specific features
231 *			that we need to worry about and determine the
232 *			appropriate set of workarounds.
233 *
234 *			Pass 1 on the boot CPU occurs before KMDB is started.
235 *
236 *	Pass 2		The second pass is done after startup(). Here, we check
237 *			other miscellaneous features. Most of this is gathering
238 *			additional basic and extended features that we'll use in
239 *			later passes or for debugging support.
240 *
241 *	Pass 3		The third pass occurs after the kernel memory allocator
242 *			has been fully initialized. This gathers information
243 *			where we might need dynamic memory available for our
244 *			uses. This includes several varying width leaves that
245 *			have cache information and the processor's brand string.
246 *
247 *	Pass 4		The fourth and final normal pass is performed after the
248 *			kernel has brought most everything online. This is
249 *			invoked from post_startup(). In this pass, we go through
250 *			the set of features that we have enabled and turn that
251 *			into the hardware auxiliary vector features that
252 *			userland receives. This is used by userland, primarily
253 *			by the run-time link-editor (RTLD), though userland
254 *			software could also refer to it directly.
255 *
256 *	Microcode	After a microcode update, we do a selective rescan of
257 *			the cpuid leaves to determine what features have
258 *			changed. Microcode updates can provide more details
259 *			about security related features to deal with issues like
260 *			Spectre and L1TF. On occasion, vendors have violated
261 *			their contract and removed bits. However, we don't try
262 *			to detect that because that puts us in a situation that
263 *			we really can't deal with. As such, the only thing we
264 *			rescan are security related features today. See
265 *			cpuid_pass_ucode().
266 *
267 * All of the passes (except pass 0) are run on all CPUs. However, for the most
268 * part we only care about what the boot CPU says about this information and use
269 * the other CPUs as a rough guide to sanity check that we have the same feature
270 * set.
271 *
272 * We do not support running multiple logical CPUs with disjoint, let alone
273 * different, feature sets.
274 *
275 * ------------------
276 * Processor Topology
277 * ------------------
278 *
279 * One of the important things that we need to do is to understand the topology
280 * of the underlying processor. When we say topology in this case, we're trying
281 * to understand the relationship between the logical CPUs that the operating
282 * system sees and the underlying physical layout. Different logical CPUs may
283 * share different resources which can have important consequences for the
284 * performance of the system. For example, they may share caches, execution
285 * units, and more.
286 *
287 * The topology of the processor changes from generation to generation and
288 * vendor to vendor.  Along with that, different vendors use different
289 * terminology, and the operating system itself uses occasionally overlapping
290 * terminology. It's important to understand what this topology looks like so
291 * one can understand the different things that we try to calculate and
292 * determine.
293 *
294 * To get started, let's talk about a little bit of terminology that we've used
295 * so far, is used throughout this file, and is fairly generic across multiple
296 * vendors:
297 *
298 * CPU
299 *	A central processing unit (CPU) refers to a logical and/or virtual
300 *	entity that the operating system can execute instructions on. The
301 *	underlying resources for this CPU may be shared between multiple
302 *	entities; however, to the operating system it is a discrete unit.
303 *
304 * PROCESSOR and PACKAGE
305 *
306 *	Generally, when we use the term 'processor' on its own, we are referring
307 *	to the physical entity that one buys and plugs into a board. However,
308 *	because processor has been overloaded and one might see it used to mean
309 *	multiple different levels, we will instead use the term 'package' for
310 *	the rest of this file. The term package comes from the electrical
311 *	engineering side and refers to the physical entity that encloses the
312 *	electronics inside. Strictly speaking the package can contain more than
313 *	just the CPU, for example, on many processors it may also have what's
314 *	called an 'integrated graphical processing unit (GPU)'. Because the
315 *	package can encapsulate multiple units, it is the largest physical unit
316 *	that we refer to.
317 *
318 * SOCKET
319 *
320 *	A socket refers to unit on a system board (generally the motherboard)
321 *	that can receive a package. A single package, or processor, is plugged
322 *	into a single socket. A system may have multiple sockets. Often times,
323 *	the term socket is used interchangeably with package and refers to the
324 *	electrical component that has plugged in, and not the receptacle itself.
325 *
326 * CORE
327 *
328 *	A core refers to the physical instantiation of a CPU, generally, with a
329 *	full set of hardware resources available to it. A package may contain
330 *	multiple cores inside of it or it may just have a single one. A
331 *	processor with more than one core is often referred to as 'multi-core'.
332 *	In illumos, we will use the feature X86FSET_CMP to refer to a system
333 *	that has 'multi-core' processors.
334 *
335 *	A core may expose a single logical CPU to the operating system, or it
336 *	may expose multiple CPUs, which we call threads, defined below.
337 *
338 *	Some resources may still be shared by cores in the same package. For
339 *	example, many processors will share the level 3 cache between cores.
340 *	Some AMD generations share hardware resources between cores. For more
341 *	information on that see the section 'AMD Topology'.
342 *
343 * THREAD and STRAND
344 *
345 *	In this file, generally a thread refers to a hardware resources and not
346 *	the operating system's logical abstraction. A thread is always exposed
347 *	as an independent logical CPU to the operating system. A thread belongs
348 *	to a specific core. A core may have more than one thread. When that is
349 *	the case, the threads that are part of the same core are often referred
350 *	to as 'siblings'.
351 *
352 *	When multiple threads exist, this is generally referred to as
353 *	simultaneous multi-threading (SMT). When Intel introduced this in their
354 *	processors they called it hyper-threading (HT). When multiple threads
355 *	are active in a core, they split the resources of the core. For example,
356 *	two threads may share the same set of hardware execution units.
357 *
358 *	The operating system often uses the term 'strand' to refer to a thread.
359 *	This helps disambiguate it from the software concept.
360 *
361 * CHIP
362 *
363 *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
364 *	base meaning, it is used to refer to a single integrated circuit, which
365 *	may or may not be the only thing in the package. In illumos, when you
366 *	see the term 'chip' it is almost always referring to the same thing as
367 *	the 'package'. However, many vendors may use chip to refer to one of
368 *	many integrated circuits that have been placed in the package. As an
369 *	example, see the subsequent definition.
370 *
371 *	To try and keep things consistent, we will only use chip when referring
372 *	to the entire integrated circuit package, with the exception of the
373 *	definition of multi-chip module (because it is in the name) and use the
374 *	term 'die' when we want the more general, potential sub-component
375 *	definition.
376 *
377 * DIE
378 *
379 *	A die refers to an integrated circuit. Inside of the package there may
380 *	be a single die or multiple dies. This is sometimes called a 'chip' in
381 *	vendor's parlance, but in this file, we use the term die to refer to a
382 *	subcomponent.
383 *
384 * MULTI-CHIP MODULE
385 *
386 *	A multi-chip module (MCM) refers to putting multiple distinct chips that
387 *	are connected together in the same package. When a multi-chip design is
388 *	used, generally each chip is manufactured independently and then joined
389 *	together in the package. For example, on AMD's Zen microarchitecture
390 *	(family 0x17), the package contains several dies (the second meaning of
391 *	chip from above) that are connected together.
392 *
393 * CACHE
394 *
395 *	A cache is a part of the processor that maintains copies of recently
396 *	accessed memory. Caches are split into levels and then into types.
397 *	Commonly there are one to three levels, called level one, two, and
398 *	three. The lower the level, the smaller it is, the closer it is to the
399 *	execution units of the CPU, and the faster it is to access. The layout
400 *	and design of the cache come in many different flavors, consult other
401 *	resources for a discussion of those.
402 *
403 *	Caches are generally split into two types, the instruction and data
404 *	cache. The caches contain what their names suggest, the instruction
405 *	cache has executable program text, while the data cache has all other
406 *	memory that the processor accesses. As of this writing, data is kept
407 *	coherent between all of the caches on x86, so if one modifies program
408 *	text before it is executed, that will be in the data cache, and the
409 *	instruction cache will be synchronized with that change when the
410 *	processor actually executes those instructions. This coherency also
411 *	covers the fact that data could show up in multiple caches.
412 *
413 *	Generally, the lowest level caches are specific to a core. However, the
414 *	last layer cache is shared between some number of cores. The number of
415 *	CPUs sharing this last level cache is important. This has implications
416 *	for the choices that the scheduler makes, as accessing memory that might
417 *	be in a remote cache after thread migration can be quite expensive.
418 *
419 *	Sometimes, the word cache is abbreviated with a '$', because in US
420 *	English the word cache is pronounced the same as cash. So L1D$ refers to
421 *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
422 *	in the rest of this theory statement for clarity.
423 *
424 * MEMORY CONTROLLER
425 *
426 *	The memory controller is a component that provides access to DRAM. Each
427 *	memory controller can access a set number of DRAM channels. Each channel
428 *	can have a number of DIMMs (sticks of memory) associated with it. A
429 *	given package may have more than one memory controller. The association
430 *	of the memory controller to a group of cores is important as it is
431 *	cheaper to access memory on the controller that you are associated with.
432 *
433 * NUMA
434 *
435 *	NUMA or non-uniform memory access, describes a way that systems are
436 *	built. On x86, any processor core can address all of the memory in the
437 *	system. However, When using multiple sockets or possibly within a
438 *	multi-chip module, some of that memory is physically closer and some of
439 *	it is further. Memory that is further away is more expensive to access.
440 *	Consider the following image of multiple sockets with memory:
441 *
442 *	+--------+                                                +--------+
443 *	| DIMM A |         +----------+      +----------+         | DIMM D |
444 *	+--------+-+       |          |      |          |       +-+------+-+
445 *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
446 *	  +--------+-+     |          |      |          |     +-+------+-+
447 *	    | DIMM C |     +----------+      +----------+     | DIMM F |
448 *	    +--------+                                        +--------+
449 *
450 *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
451 *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
452 *	access DIMMs A-C and more expensive to access D-F as it has to go
453 *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
454 *	D-F are cheaper than A-C. While the socket form is the most common, when
455 *	using multi-chip modules, this can also sometimes occur. For another
456 *	example of this that's more involved, see the AMD topology section.
457 *
458 *
459 * Intel Topology
460 * --------------
461 *
462 * Most Intel processors since Nehalem, (as of this writing the current gen
463 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
464 * the package is a single monolithic die. MCMs currently aren't used. Most
465 * parts have three levels of caches, with the L3 cache being shared between
466 * all of the cores on the package. The L1/L2 cache is generally specific to
467 * an individual core. The following image shows at a simplified level what
468 * this looks like. The memory controller is commonly part of something called
469 * the 'Uncore', that used to be separate physical chips that were not a part of
470 * the package, but are now part of the same chip.
471 *
472 *  +-----------------------------------------------------------------------+
473 *  | Package                                                               |
474 *  |  +-------------------+  +-------------------+  +-------------------+  |
475 *  |  | Core              |  | Core              |  | Core              |  |
476 *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
477 *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
478 *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
479 *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
480 *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
481 *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
482 *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
483 *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
484 *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
485 *  |  +-------------------+  +-------------------+  +-------------------+  |
486 *  | +-------------------------------------------------------------------+ |
487 *  | |                         Shared L3 Cache                           | |
488 *  | +-------------------------------------------------------------------+ |
489 *  | +-------------------------------------------------------------------+ |
490 *  | |                        Memory Controller                          | |
491 *  | +-------------------------------------------------------------------+ |
492 *  +-----------------------------------------------------------------------+
493 *
494 * A side effect of this current architecture is that what we care about from a
495 * scheduling and topology perspective, is simplified. In general we care about
496 * understanding which logical CPUs are part of the same core and socket.
497 *
498 * To determine the relationship between threads and cores, Intel initially used
499 * the identifier in the advanced programmable interrupt controller (APIC). They
500 * also added cpuid leaf 4 to give additional information about the number of
501 * threads and CPUs in the processor. With the addition of x2apic (which
502 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
503 * additional cpuid topology leaf 0xB was added.
504 *
505 * AMD Topology
506 * ------------
507 *
508 * When discussing AMD topology, we want to break this into three distinct
509 * generations of topology. There's the basic topology that has been used in
510 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
511 * with family 0x15 (Bulldozer), and there's the topology that was introduced
512 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
513 * talking about.
514 *
515 * Until the introduction of family 0x17 (Zen), AMD did not implement something
516 * that they considered SMT. Whether or not the AMD processors have SMT
517 * influences many things including scheduling and reliability, availability,
518 * and serviceability (RAS) features.
519 *
520 * NODE
521 *
522 *	AMD uses the term node to refer to a die that contains a number of cores
523 *	and I/O resources. Depending on the processor family and model, more
524 *	than one node can be present in the package. When there is more than one
525 *	node this indicates a multi-chip module. Usually each node has its own
526 *	access to memory and I/O devices. This is important and generally
527 *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
528 *	result, we track this relationship in the operating system.
529 *
530 *	In processors with an L3 cache, the L3 cache is generally shared across
531 *	the entire node, though the way this is carved up varies from generation
532 *	to generation.
533 *
534 * BULLDOZER
535 *
536 *	Starting with the Bulldozer family (0x15) and continuing until the
537 *	introduction of the Zen microarchitecture, AMD introduced the idea of a
538 *	compute unit. In a compute unit, two traditional cores share a number of
539 *	hardware resources. Critically, they share the FPU, L1 instruction
540 *	cache, and the L2 cache. Several compute units were then combined inside
541 *	of a single node.  Because the integer execution units, L1 data cache,
542 *	and some other resources were not shared between the cores, AMD never
543 *	considered this to be SMT.
544 *
545 * ZEN
546 *
547 *	The Zen family (0x17) uses a multi-chip module (MCM) design, the module
548 *	is called Zeppelin. These modules are similar to the idea of nodes used
549 *	previously. Each of these nodes has two DRAM channels which all of the
550 *	cores in the node can access uniformly. These nodes are linked together
551 *	in the package, creating a NUMA environment.
552 *
553 *	The Zeppelin die itself contains two different 'core complexes'. Each
554 *	core complex consists of four cores which each have two threads, for a
555 *	total of 8 logical CPUs per complex. Unlike other generations,
556 *	where all the logical CPUs in a given node share the L3 cache, here each
557 *	core complex has its own shared L3 cache.
558 *
559 *	A further thing that we need to consider is that in some configurations,
560 *	particularly with the Threadripper line of processors, not every die
561 *	actually has its memory controllers wired up to actual memory channels.
562 *	This means that some cores have memory attached to them and others
563 *	don't.
564 *
565 *	To put Zen in perspective, consider the following images:
566 *
567 *      +--------------------------------------------------------+
568 *      | Core Complex                                           |
569 *      | +-------------------+    +-------------------+  +---+  |
570 *      | | Core       +----+ |    | Core       +----+ |  |   |  |
571 *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
572 *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
573 *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
574 *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
575 *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
576 *      | +-------------------+    +-------------------+  | C |  |
577 *      | +-------------------+    +-------------------+  | a |  |
578 *      | | Core       +----+ |    | Core       +----+ |  | c |  |
579 *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
580 *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
581 *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
582 *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
583 *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
584 *      | +-------------------+    +-------------------+  +---+  |
585 *      |                                                        |
586 *	+--------------------------------------------------------+
587 *
588 *  This first image represents a single Zen core complex that consists of four
589 *  cores.
590 *
591 *
592 *	+--------------------------------------------------------+
593 *	| Zeppelin Die                                           |
594 *	|  +--------------------------------------------------+  |
595 *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
596 *	|  +--------------------------------------------------+  |
597 *      |                           HH                           |
598 *	|          +-----------+    HH    +-----------+          |
599 *	|          |           |    HH    |           |          |
600 *	|          |    Core   |==========|    Core   |          |
601 *	|          |  Complex  |==========|  Complex  |          |
602 *	|          |           |    HH    |           |          |
603 *	|          +-----------+    HH    +-----------+          |
604 *      |                           HH                           |
605 *	|  +--------------------------------------------------+  |
606 *	|  |                Memory Controller                 |  |
607 *	|  +--------------------------------------------------+  |
608 *      |                                                        |
609 *	+--------------------------------------------------------+
610 *
611 *  This image represents a single Zeppelin Die. Note how both cores are
612 *  connected to the same memory controller and I/O units. While each core
613 *  complex has its own L3 cache as seen in the first image, they both have
614 *  uniform access to memory.
615 *
616 *
617 *                      PP                     PP
618 *                      PP                     PP
619 *           +----------PP---------------------PP---------+
620 *           |          PP                     PP         |
621 *           |    +-----------+          +-----------+    |
622 *           |    |           |          |           |    |
623 *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
624 *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
625 *           |    |           |          |           |    |
626 *           |    +-----------+ooo    ...+-----------+    |
627 *           |          HH      ooo  ...       HH         |
628 *           |          HH        oo..         HH         |
629 *           |          HH        ..oo         HH         |
630 *           |          HH      ...  ooo       HH         |
631 *           |    +-----------+...    ooo+-----------+    |
632 *           |    |           |          |           |    |
633 *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
634 *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
635 *           |    |           |          |           |    |
636 *           |    +-----------+          +-----------+    |
637 *           |          PP                     PP         |
638 *           +----------PP---------------------PP---------+
639 *                      PP                     PP
640 *                      PP                     PP
641 *
642 *  This image represents a single Zen package. In this example, it has four
643 *  Zeppelin dies, though some configurations only have a single one. In this
644 *  example, each die is directly connected to the next. Also, each die is
645 *  represented as being connected to memory by the 'M' character and connected
646 *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
647 *  die is made up of two core complexes, we have multiple different NUMA
648 *  domains that we care about for these systems.
649 *
650 * CPUID LEAVES
651 *
652 * There are a few different CPUID leaves that we can use to try and understand
653 * the actual state of the world. As part of the introduction of family 0xf, AMD
654 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
655 * processors that are in the system. Because families before Zen didn't have
656 * SMT, this was always the number of cores that were in the system. However, it
657 * should always be thought of as the number of logical threads to be consistent
658 * between generations. In addition we also get the size of the APIC ID that is
659 * used to represent the number of logical processors. This is important for
660 * deriving topology information.
661 *
662 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
663 * bit between Bulldozer and later families, but it is quite useful in
664 * determining the topology information. Because this information has changed
665 * across family generations, it's worth calling out what these mean
666 * explicitly. The registers have the following meanings:
667 *
668 *	%eax	The APIC ID. The entire register is defined to have a 32-bit
669 *		APIC ID, even though on systems without x2apic support, it will
670 *		be limited to 8 bits.
671 *
672 *	%ebx	On Bulldozer-era systems this contains information about the
673 *		number of cores that are in a compute unit (cores that share
674 *		resources). It also contains a per-package compute unit ID that
675 *		identifies which compute unit the logical CPU is a part of.
676 *
677 *		On Zen-era systems this instead contains the number of threads
678 *		per core and the ID of the core that the logical CPU is a part
679 *		of. Note, this ID is unique only to the package, it is not
680 *		globally unique across the entire system.
681 *
682 *	%ecx	This contains the number of nodes that exist in the package. It
683 *		also contains an ID that identifies which node the logical CPU
684 *		is a part of.
685 *
686 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
687 * cache layout to determine which logical CPUs are sharing which caches.
688 *
689 * illumos Topology
690 * ----------------
691 *
692 * Based on the above we synthesize the information into several different
693 * variables that we store in the 'struct cpuid_info'. We'll go into the details
694 * of what each member is supposed to represent and their uniqueness. In
695 * general, there are two levels of uniqueness that we care about. We care about
696 * an ID that is globally unique. That means that it will be unique across all
697 * entities in the system. For example, the default logical CPU ID is globally
698 * unique. On the other hand, there is some information that we only care about
699 * being unique within the context of a single package / socket. Here are the
700 * variables that we keep track of and their meaning.
701 *
702 * Several of the values that are asking for an identifier, with the exception
703 * of cpi_apicid, are allowed to be synthetic.
704 *
705 *
706 * cpi_apicid
707 *
708 *	This is the value of the CPU's APIC id. This should be the full 32-bit
709 *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
710 *	APIC ID. This value is globally unique between all logical CPUs across
711 *	all packages. This is usually required by the APIC.
712 *
713 * cpi_chipid
714 *
715 *	This value indicates the ID of the package that the logical CPU is a
716 *	part of. This value is allowed to be synthetic. It is usually derived by
717 *	taking the CPU's APIC ID and determining how many bits are used to
718 *	represent CPU cores in the package. All logical CPUs that are part of
719 *	the same package must have the same value.
720 *
721 * cpi_coreid
722 *
723 *	This represents the ID of a CPU core. Two logical CPUs should only have
724 *	the same cpi_coreid value if they are part of the same core. These
725 *	values may be synthetic. On systems that support SMT, this value is
726 *	usually derived from the APIC ID, otherwise it is often synthetic and
727 *	just set to the value of the cpu_id in the cpu_t.
728 *
729 * cpi_pkgcoreid
730 *
731 *	This is similar to the cpi_coreid in that logical CPUs that are part of
732 *	the same core should have the same ID. The main difference is that these
733 *	values are only required to be unique to a given socket.
734 *
735 * cpi_clogid
736 *
737 *	This represents the logical ID of a logical CPU. This value should be
738 *	unique within a given socket for each logical CPU. This is allowed to be
739 *	synthetic, though it is usually based off of the CPU's apic ID. The
740 *	broader system expects that logical CPUs that have are part of the same
741 *	core have contiguous numbers. For example, if there were two threads per
742 *	core, then the core IDs divided by two should be the same and the first
743 *	modulus two should be zero and the second one. For example, IDs 4 and 5
744 *	indicate two logical CPUs that are part of the same core. But IDs 5 and
745 *	6 represent two logical CPUs that are part of different cores.
746 *
747 *	While it is common for the cpi_coreid and the cpi_clogid to be derived
748 *	from the same source, strictly speaking, they don't have to be and the
749 *	two values should be considered logically independent. One should not
750 *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
751 *	some kind of relationship. While this is tempting, we've seen cases on
752 *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
753 *
754 * cpi_ncpu_per_chip
755 *
756 *	This value indicates the total number of logical CPUs that exist in the
757 *	physical package. Critically, this is not the number of logical CPUs
758 *	that exist for just the single core.
759 *
760 *	This value should be the same for all logical CPUs in the same package.
761 *
762 * cpi_ncore_per_chip
763 *
764 *	This value indicates the total number of physical CPU cores that exist
765 *	in the package. The system compares this value with cpi_ncpu_per_chip to
766 *	determine if simultaneous multi-threading (SMT) is enabled. When
767 *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
768 *	the X86FSET_HTT feature is not set. If this value is greater than one,
769 *	than we consider the processor to have the feature X86FSET_CMP, to
770 *	indicate that there is support for more than one core.
771 *
772 *	This value should be the same for all logical CPUs in the same package.
773 *
774 * cpi_procnodes_per_pkg
775 *
776 *	This value indicates the number of 'nodes' that exist in the package.
777 *	When processors are actually a multi-chip module, this represents the
778 *	number of such modules that exist in the package. Currently, on Intel
779 *	based systems this member is always set to 1.
780 *
781 *	This value should be the same for all logical CPUs in the same package.
782 *
783 * cpi_procnodeid
784 *
785 *	This value indicates the ID of the node that the logical CPU is a part
786 *	of. All logical CPUs that are in the same node must have the same value
787 *	here. This value must be unique across all of the packages in the
788 *	system.  On Intel based systems, this is currently set to the value in
789 *	cpi_chipid because there is only one node.
790 *
791 * cpi_cores_per_compunit
792 *
793 *	This value indicates the number of cores that are part of a compute
794 *	unit. See the AMD topology section for this. This member only has real
795 *	meaning currently for AMD Bulldozer family processors. For all other
796 *	processors, this should currently be set to 1.
797 *
798 * cpi_compunitid
799 *
800 *	This indicates the compute unit that the logical CPU belongs to. For
801 *	processors without AMD Bulldozer-style compute units this should be set
802 *	to the value of cpi_coreid.
803 *
804 * cpi_ncpu_shr_last_cache
805 *
806 *	This indicates the number of logical CPUs that are sharing the same last
807 *	level cache. This value should be the same for all CPUs that are sharing
808 *	that cache. The last cache refers to the cache that is closest to memory
809 *	and furthest away from the CPU.
810 *
811 * cpi_last_lvl_cacheid
812 *
813 *	This indicates the ID of the last cache that the logical CPU uses. This
814 *	cache is often shared between multiple logical CPUs and is the cache
815 *	that is closest to memory and furthest away from the CPU. This value
816 *	should be the same for a group of logical CPUs only if they actually
817 *	share the same last level cache. IDs should not overlap between
818 *	packages.
819 *
820 * cpi_ncore_bits
821 *
822 *	This indicates the number of bits that are required to represent all of
823 *	the cores in the system. As cores are derived based on their APIC IDs,
824 *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
825 *	this value to be larger than the actual number of IDs that are present
826 *	in the system. This is used to size tables by the CMI framework. It is
827 *	only filled in for Intel and AMD CPUs.
828 *
829 * cpi_nthread_bits
830 *
831 *	This indicates the number of bits required to represent all of the IDs
832 *	that cover the logical CPUs that exist on a given core. It's OK for this
833 *	value to be larger than the actual number of IDs that are present in the
834 *	system.  This is used to size tables by the CMI framework. It is
835 *	only filled in for Intel and AMD CPUs.
836 *
837 * -----------
838 * Hypervisors
839 * -----------
840 *
841 * If trying to manage the differences between vendors wasn't bad enough, it can
842 * get worse thanks to our friend hardware virtualization. Hypervisors are given
843 * the ability to interpose on all cpuid instructions and change them to suit
844 * their purposes. In general, this is necessary as the hypervisor wants to be
845 * able to present a more uniform set of features or not necessarily give the
846 * guest operating system kernel knowledge of all features so it can be
847 * more easily migrated between systems.
848 *
849 * When it comes to trying to determine topology information, this can be a
850 * double edged sword. When a hypervisor doesn't actually implement a cpuid
851 * leaf, it'll often return all zeros. Because of that, you'll often see various
852 * checks scattered about fields being non-zero before we assume we can use
853 * them.
854 *
855 * When it comes to topology information, the hypervisor is often incentivized
856 * to lie to you about topology. This is because it doesn't always actually
857 * guarantee that topology at all. The topology path we take in the system
858 * depends on how the CPU advertises itself. If it advertises itself as an Intel
859 * or AMD CPU, then we basically do our normal path. However, when they don't
860 * use an actual vendor, then that usually turns into multiple one-core CPUs
861 * that we enumerate that are often on different sockets. The actual behavior
862 * depends greatly on what the hypervisor actually exposes to us.
863 *
864 * --------------------
865 * Exposing Information
866 * --------------------
867 *
868 * We expose CPUID information in three different forms in the system.
869 *
870 * The first is through the x86_featureset variable. This is used in conjunction
871 * with the is_x86_feature() function. This is queried by x86-specific functions
872 * to determine which features are or aren't present in the system and to make
873 * decisions based upon them. For example, users of this include everything from
874 * parts of the system dedicated to reliability, availability, and
875 * serviceability (RAS), to making decisions about how to handle security
876 * mitigations, to various x86-specific drivers. General purpose or
877 * architecture independent drivers should never be calling this function.
878 *
879 * The second means is through the auxiliary vector. The auxiliary vector is a
880 * series of tagged data that the kernel passes down to a user program when it
881 * begins executing. This information is used to indicate to programs what
882 * instruction set extensions are present. For example, information about the
883 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
884 * since user programs cannot make use of it. However, things like the AVX
885 * instruction sets are. Programs use this information to make run-time
886 * decisions about what features they should use. As an example, the run-time
887 * link-editor (rtld) can relocate different functions depending on the hardware
888 * support available.
889 *
890 * The final form is through a series of accessor functions that all have the
891 * form cpuid_get*. This is used by a number of different subsystems in the
892 * kernel to determine more detailed information about what we're running on,
893 * topology information, etc. Some of these subsystems include processor groups
894 * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
895 * microcode, and performance monitoring. These functions all ASSERT that the
896 * CPU they're being called on has reached a certain cpuid pass. If the passes
897 * are rearranged, then this needs to be adjusted.
898 *
899 * -----------------------------------------------
900 * Speculative Execution CPU Side Channel Security
901 * -----------------------------------------------
902 *
903 * With the advent of the Spectre and Meltdown attacks which exploit speculative
904 * execution in the CPU to create side channels there have been a number of
905 * different attacks and corresponding issues that the operating system needs to
906 * mitigate against. The following list is some of the common, but not
907 * exhaustive, set of issues that we know about and have done some or need to do
908 * more work in the system to mitigate against:
909 *
910 *   - Spectre v1
911 *   - swapgs (Spectre v1 variant)
912 *   - Spectre v2
913 *   - Meltdown (Spectre v3)
914 *   - Rogue Register Read (Spectre v3a)
915 *   - Speculative Store Bypass (Spectre v4)
916 *   - ret2spec, SpectreRSB
917 *   - L1 Terminal Fault (L1TF)
918 *   - Microarchitectural Data Sampling (MDS)
919 *
920 * Each of these requires different sets of mitigations and has different attack
921 * surfaces. For the most part, this discussion is about protecting the kernel
922 * from non-kernel executing environments such as user processes and hardware
923 * virtual machines. Unfortunately, there are a number of user vs. user
924 * scenarios that exist with these. The rest of this section will describe the
925 * overall approach that the system has taken to address these as well as their
926 * shortcomings. Unfortunately, not all of the above have been handled today.
927 *
928 * SPECTRE v2, ret2spec, SpectreRSB
929 *
930 * The second variant of the spectre attack focuses on performing branch target
931 * injection. This generally impacts indirect call instructions in the system.
932 * There are three different ways to mitigate this issue that are commonly
933 * described today:
934 *
935 *  1. Using Indirect Branch Restricted Speculation (IBRS).
936 *  2. Using Retpolines and RSB Stuffing
937 *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
938 *
939 * IBRS uses a feature added to microcode to restrict speculation, among other
940 * things. This form of mitigation has not been used as it has been generally
941 * seen as too expensive and requires reactivation upon various transitions in
942 * the system.
943 *
944 * As a less impactful alternative to IBRS, retpolines were developed by
945 * Google. These basically require one to replace indirect calls with a specific
946 * trampoline that will cause speculation to fail and break the attack.
947 * Retpolines require compiler support. We always build with retpolines in the
948 * external thunk mode. This means that a traditional indirect call is replaced
949 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
950 * of this is that all indirect function calls are performed through a register.
951 *
952 * We have to use a common external location of the thunk and not inline it into
953 * the callsite so that way we can have a single place to patch these functions.
954 * As it turns out, we actually have three different forms of retpolines that
955 * exist in the system:
956 *
957 *  1. A full retpoline
958 *  2. An AMD-specific optimized retpoline
959 *  3. A no-op version
960 *
961 * The first one is used in the general case. The second one is used if we can
962 * determine that we're on an AMD system and we can successfully toggle the
963 * lfence serializing MSR that exists on the platform. Basically with this
964 * present, an lfence is sufficient and we don't need to do anywhere near as
965 * complicated a dance to successfully use retpolines.
966 *
967 * The third form described above is the most curious. It turns out that the way
968 * that retpolines are implemented is that they rely on how speculation is
969 * performed on a 'ret' instruction. Intel has continued to optimize this
970 * process (which is partly why we need to have return stack buffer stuffing,
971 * but more on that in a bit) and in processors starting with Cascade Lake
972 * on the server side, it's dangerous to rely on retpolines. Instead, a new
973 * mechanism has been introduced called Enhanced IBRS (EIBRS).
974 *
975 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
976 * physical core. However, if this is the case, we don't want to use retpolines
977 * any more. Therefore if EIBRS is present, we end up turning each retpoline
978 * function (called a thunk) into a jmp instruction. This means that we're still
979 * paying the cost of an extra jump to the external thunk, but it gives us
980 * flexibility and the ability to have a single kernel image that works across a
981 * wide variety of systems and hardware features.
982 *
983 * Unfortunately, this alone is insufficient. First, Skylake systems have
984 * additional speculation for the Return Stack Buffer (RSB) which is used to
985 * return from call instructions which retpolines take advantage of. However,
986 * this problem is not just limited to Skylake and is actually more pernicious.
987 * The SpectreRSB paper introduces several more problems that can arise with
988 * dealing with this. The RSB can be poisoned just like the indirect branch
989 * predictor. This means that one needs to clear the RSB when transitioning
990 * between two different privilege domains. Some examples include:
991 *
992 *  - Switching between two different user processes
993 *  - Going between user land and the kernel
994 *  - Returning to the kernel from a hardware virtual machine
995 *
996 * Mitigating this involves combining a couple of different things. The first is
997 * SMEP (supervisor mode execution protection) which was introduced in Ivy
998 * Bridge. When an RSB entry refers to a user address and we're executing in the
999 * kernel, speculation through it will be stopped when SMEP is enabled. This
1000 * protects against a number of the different cases that we would normally be
1001 * worried about such as when we enter the kernel from user land.
1002 *
1003 * To prevent against additional manipulation of the RSB from other contexts
1004 * such as a non-root VMX context attacking the kernel we first look to enhanced
1005 * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1006 * need to do to protect the kernel at this time.
1007 *
1008 * On CPUs without EIBRS we need to manually overwrite the contents of the
1009 * return stack buffer. We do this through the x86_rsb_stuff() function.
1010 * Currently this is employed on context switch. The x86_rsb_stuff() function is
1011 * disabled when enhanced IBRS is present because Intel claims on such systems
1012 * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1013 * to user attacks via the RSB.
1014 *
1015 * If SMEP is not present, then we would have to stuff the RSB every time we
1016 * transitioned from user mode to the kernel, which isn't very practical right
1017 * now.
1018 *
1019 * To fully protect user to user and vmx to vmx attacks from these classes of
1020 * issues, we would also need to allow them to opt into performing an Indirect
1021 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1022 *
1023 * By default, the system will enable RSB stuffing and the required variant of
1024 * retpolines and store that information in the x86_spectrev2_mitigation value.
1025 * This will be evaluated after a microcode update as well, though it is
1026 * expected that microcode updates will not take away features. This may mean
1027 * that a late loaded microcode may not end up in the optimal configuration
1028 * (though this should be rare).
1029 *
1030 * Currently we do not build kmdb with retpolines or perform any additional side
1031 * channel security mitigations for it. One complication with kmdb is that it
1032 * requires its own retpoline thunks and it would need to adjust itself based on
1033 * what the kernel does. The threat model of kmdb is more limited and therefore
1034 * it may make more sense to investigate using prediction barriers as the whole
1035 * system is only executing a single instruction at a time while in kmdb.
1036 *
1037 * SPECTRE v1, v4
1038 *
1039 * The v1 and v4 variants of spectre are not currently mitigated in the
1040 * system and require other classes of changes to occur in the code.
1041 *
1042 * SPECTRE v1 (SWAPGS VARIANT)
1043 *
1044 * The class of Spectre v1 vulnerabilities aren't all about bounds checks, but
1045 * can generally affect any branch-dependent code. The swapgs issue is one
1046 * variant of this. If we are coming in from userspace, we can have code like
1047 * this:
1048 *
1049 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1050 *	je	1f
1051 *	movq	$0, REGOFF_SAVFP(%rsp)
1052 *	swapgs
1053 *	1:
1054 *	movq	%gs:CPU_THREAD, %rax
1055 *
1056 * If an attacker can cause a mis-speculation of the branch here, we could skip
1057 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1058 * load. If subsequent code can act as the usual Spectre cache gadget, this
1059 * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1060 * any use of the %gs override.
1061 *
1062 * The other case is also an issue: if we're coming into a trap from kernel
1063 * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1064 * using it. AMD systems are not vulnerable to this version, as a swapgs is
1065 * serializing with respect to subsequent uses. But as AMD /does/ need the other
1066 * case, and the fix is the same in both cases (an lfence at the branch target
1067 * 1: in this example), we'll just do it unconditionally.
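 *
 * In terms of the example above, the mitigated sequence looks like this
 * (illustrative; the actual entry handlers differ in detail):
 *
 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
 *	je	1f
 *	movq	$0, REGOFF_SAVFP(%rsp)
 *	swapgs
 *	1:
 *	lfence
 *	movq	%gs:CPU_THREAD, %rax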
1068 *
 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
 * harder for user-space to actually set a useful %gsbase value: although it's
 * not entirely clear, it might still be feasible via lwp_setprivate(), so we
 * mitigate anyway.
1073 *
1074 * MELTDOWN
1075 *
 * Meltdown, or spectre v3, allowed a user process to read any data mapped in
 * its address space regardless of whether or not the page tables in question
 * allowed the user to read it. The solution to meltdown
1079 * is kernel page table isolation. In this world, there are two page tables that
1080 * are used for a process, one in user land and one in the kernel. To implement
1081 * this we use per-CPU page tables and switch between the user and kernel
1082 * variants when entering and exiting the kernel.  For more information about
1083 * this process and how the trampolines work, please see the big theory
1084 * statements and additional comments in:
1085 *
1086 *  - uts/i86pc/ml/kpti_trampolines.s
1087 *  - uts/i86pc/vm/hat_i86.c
1088 *
 * While Meltdown only impacted Intel systems, and newer Intel systems have
 * Meltdown (which Intel terms Rogue Data Cache Load) fixed in hardware, we
 * always have kernel page table isolation enabled. While this may at first
 * seem weird, an
1092 * important thing to remember is that you can't speculatively read an address
1093 * if it's never in your page table at all. Having user processes without kernel
1094 * pages present provides us with an important layer of defense in the kernel
1095 * against any other side channel attacks that exist and have yet to be
1096 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1097 * default, no matter the x86 system.
1098 *
1099 * L1 TERMINAL FAULT
1100 *
1101 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
 * execution uses page table entries. Effectively, it is two different problems.
 * The first is that the processor ignores the not-present bit in page table
 * entries when performing speculative execution. This means that something can
1105 * speculatively read the listed physical address if it's present in the L1
1106 * cache under certain conditions (see Intel's documentation for the full set of
1107 * conditions). Secondly, this can be used to bypass hardware virtualization
1108 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1109 * instructions.
1110 *
1111 * For the non-hardware virtualized case, this is relatively easy to deal with.
1112 * We must make sure that all unmapped pages have an address of zero. This means
1113 * that they could read the first 4k of physical memory; however, we never use
1114 * that first page in the operating system and always skip putting it in our
 * memory map, even if firmware tells us we can use it. While other systems try
 * to put extra metadata in the address and reserved bits of not-present PTEs,
 * which made this problematic for them, we do not.
1118 *
1119 * For hardware virtual machines things are more complicated. Because they can
1120 * construct their own page tables, it isn't hard for them to perform this
1121 * attack against any physical address. The one wrinkle is that this physical
1122 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1123 * to flush the L1 data cache. We wrap this up in the function
1124 * spec_uarch_flush(). This function is also used in the mitigation of
1125 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1126 * hypervisors such as KVM or bhyve are responsible for performing this before
1127 * entering the guest.
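 *
 * The flush itself is a single MSR write, as done by spec_uarch_flush_msr()
 * later in this file:
 *
 *	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);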
1128 *
1129 * Because this attack takes place in the L1 cache, there's another wrinkle
1130 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1131 * designs. This means that when a thread enters a hardware virtualized context
1132 * and flushes the L1 data cache, the other thread on the processor may then go
1133 * ahead and put new data in it that can be potentially attacked. While one
1134 * solution is to disable SMT on the system, another option that is available is
1135 * to use a feature for hardware virtualization called 'SMT exclusion'. This
 * goes through and makes sure that if an HVM is being scheduled on one thread,
 * then whatever is running on the other thread is from the same hardware
 * virtual machine.
1138 * If an interrupt comes in or the guest exits to the broader system, then the
1139 * other SMT thread will be kicked out.
1140 *
1141 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1142 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1143 * perform L1TF related mitigations.
1144 *
1145 * MICROARCHITECTURAL DATA SAMPLING
1146 *
1147 * Microarchitectural data sampling (MDS) is a combination of four discrete
1148 * vulnerabilities that are similar issues affecting various parts of the CPU's
1149 * microarchitectural implementation around load, store, and fill buffers.
1150 * Specifically it is made up of the following subcomponents:
1151 *
1152 *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1153 *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1154 *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1155 *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1156 *
 * To begin addressing these, Intel has introduced another feature in microcode
 * called MD_CLEAR. This repurposes the verw instruction so that executing it
 * also flushes the state of the affected parts. The L1TF L1D flush mechanism
 * is likewise updated when this microcode is present to flush this state.
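 *
 * Conceptually, the MD_CLEAR flush is just a verw of a valid, writable data
 * segment selector; an illustrative sketch (not the exact body of
 * x86_md_clear()):
 *
 *	subq	$8, %rsp
 *	movw	%ds, (%rsp)
 *	verw	(%rsp)
 *	addq	$8, %rsp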
1162 *
1163 * Primarily we need to flush this state whenever we transition from the kernel
1164 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1165 * little bit different. Here the structures are statically sized when a logical
1166 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1168 * mwait, or another ACPI method. To perform these flushes, we call
1169 * x86_md_clear() at all of these transition points.
1170 *
1171 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1172 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1173 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1174 * a no-op.
1175 *
1176 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1177 * particular, everything we've discussed above is only valid for a single
1178 * thread executing on a core. In the case where you have hyper-threading
1179 * present, this attack can be performed between threads. The theoretical fix
1180 * for this is to ensure that both threads are always in the same security
1181 * domain. This means that they are executing in the same ring and mutually
1182 * trust each other. Practically speaking, this would mean that a system call
1183 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1184 * Rather than implement this, we recommend that one disables hyper-threading
1185 * through the use of psradm -aS.
1186 *
1187 * TSX ASYNCHRONOUS ABORT
1188 *
1189 * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1190 * behaves like MDS, but leverages Intel's transactional instructions as another
1191 * vector. Effectively, when a transaction hits one of these cases (unmapped
1192 * page, various cache snoop activity, etc.) then the same data can be exposed
1193 * as in the case of MDS. This means that you can attack your twin.
1194 *
1195 * Intel has described that there are two different ways that we can mitigate
1196 * this problem on affected processors:
1197 *
1198 *   1) We can use the same techniques used to deal with MDS. Flushing the
1199 *      microarchitectural buffers and disabling hyperthreading will mitigate
1200 *      this in the same way.
1201 *
1202 *   2) Using microcode to disable TSX.
1203 *
1204 * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1205 * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1206 * That's OK as we're already doing all such mitigations. On the other hand,
1207 * processors with MDS_NO are all supposed to receive microcode updates that
1208 * enumerate support for disabling TSX. In general, we'd rather use this method
1209 * when available as it doesn't require disabling hyperthreading to be
 * effective. Currently we basically rely on microcode for processors that
 * enumerate MDS_NO.
1212 *
1213 * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1214 * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1215 * different powers. The first allows us to cause all transactions to
1216 * immediately abort. The second gives us a means of disabling TSX completely,
1217 * which includes removing it from cpuid. If we have support for this in
1218 * microcode during the first cpuid pass, then we'll disable TSX completely such
1219 * that user land never has a chance to observe the bit. However, if we are late
1220 * loading the microcode, then we must use the functionality to cause
1221 * transactions to automatically abort. This is necessary for user land's sake.
1222 * Once a program sees a cpuid bit, it must not be taken away.
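 *
 * As an illustrative sketch (the constant names here are assumptions; see
 * cpuid_apply_tsx() and x86_archext.h for the authoritative versions), the
 * late-load case only forces aborts, while the early case also hides the
 * cpuid bits:
 *
 *	val = rdmsr(MSR_IA32_TSX_CTRL);
 *	val |= IA32_TSX_CTRL_RTM_DISABLE;
 *	if (we are still in the first cpuid pass)
 *		val |= IA32_TSX_CTRL_CPUID_CLEAR;
 *	wrmsr(MSR_IA32_TSX_CTRL, val);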
1223 *
1224 * We track whether or not we should do this based on what cpuid pass we're in.
1225 * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1226 * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1227 * should happen twice. Once in the normal cpuid_pass1() code and then a second
1228 * time after we do the initial microcode update.  As a result we need to be
1229 * careful in cpuid_apply_tsx() to only use the MSR if we've loaded a suitable
1230 * microcode on the current CPU (which happens prior to cpuid_pass_ucode()).
1231 *
1232 * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1233 * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1234 * unfortunate feature in a number of ways, and taking the opportunity to
1235 * finally be able to turn it off is likely to be of benefit in the future.
1236 *
1237 * SUMMARY
1238 *
1239 * The following table attempts to summarize the mitigations for various issues
1240 * and what's done in various places:
1241 *
1242 *  - Spectre v1: Not currently mitigated
1243 *  - swapgs: lfences after swapgs paths
1244 *  - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1245 *  - Meltdown: Kernel Page Table Isolation
1246 *  - Spectre v3a: Updated CPU microcode
1247 *  - Spectre v4: Not currently mitigated
1248 *  - SpectreRSB: SMEP and RSB Stuffing
1249 *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1250 *  - MDS: x86_md_clear, requires microcode, disabling SMT
1251 *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1252 *
 * The following table lists the x86 feature set bits that indicate that a
 * given problem has been solved or that a notable feature is present:
1255 *
1256 *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1257 *  - MDS_NO: All forms of MDS
1258 *  - TAA_NO: TAA
1259 */
1260
1261#include <sys/types.h>
1262#include <sys/archsystm.h>
1263#include <sys/x86_archext.h>
1264#include <sys/kmem.h>
1265#include <sys/systm.h>
1266#include <sys/cmn_err.h>
1267#include <sys/sunddi.h>
1268#include <sys/sunndi.h>
1269#include <sys/cpuvar.h>
1270#include <sys/processor.h>
1271#include <sys/sysmacros.h>
1272#include <sys/pg.h>
1273#include <sys/fp.h>
1274#include <sys/controlregs.h>
1275#include <sys/bitmap.h>
1276#include <sys/auxv_386.h>
1277#include <sys/memnode.h>
1278#include <sys/pci_cfgspace.h>
1279#include <sys/comm_page.h>
1280#include <sys/mach_mmu.h>
1281#include <sys/ucode.h>
1282#include <sys/tsc.h>
1283#include <sys/kobj.h>
1284#include <sys/asm_misc.h>
1285
1286#ifdef __xpv
1287#include <sys/hypervisor.h>
1288#else
1289#include <sys/ontrap.h>
1290#endif
1291
1292uint_t x86_vendor = X86_VENDOR_IntelClone;
1293uint_t x86_type = X86_TYPE_OTHER;
1294uint_t x86_clflush_size = 0;
1295
1296#if defined(__xpv)
1297int x86_use_pcid = 0;
1298int x86_use_invpcid = 0;
1299#else
1300int x86_use_pcid = -1;
1301int x86_use_invpcid = -1;
1302#endif
1303
1304typedef enum {
1305	X86_SPECTREV2_RETPOLINE,
1306	X86_SPECTREV2_RETPOLINE_AMD,
1307	X86_SPECTREV2_ENHANCED_IBRS,
1308	X86_SPECTREV2_DISABLED
1309} x86_spectrev2_mitigation_t;
1310
1311uint_t x86_disable_spectrev2 = 0;
1312static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1313    X86_SPECTREV2_RETPOLINE;
1314
1315/*
1316 * The mitigation status for TAA:
1317 * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1318 * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1319 * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1320 * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1321 * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1322 * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1323 */
1324typedef enum {
1325	X86_TAA_NOTHING,
1326	X86_TAA_DISABLED,
1327	X86_TAA_MD_CLEAR,
1328	X86_TAA_TSX_FORCE_ABORT,
1329	X86_TAA_TSX_DISABLE,
1330	X86_TAA_HW_MITIGATED
1331} x86_taa_mitigation_t;
1332
1333uint_t x86_disable_taa = 0;
1334static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1335
1336uint_t pentiumpro_bug4046376;
1337
1338uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1339
1340static char *x86_feature_names[NUM_X86_FEATURES] = {
1341	"lgpg",
1342	"tsc",
1343	"msr",
1344	"mtrr",
1345	"pge",
1346	"de",
1347	"cmov",
1348	"mmx",
1349	"mca",
1350	"pae",
1351	"cv8",
1352	"pat",
1353	"sep",
1354	"sse",
1355	"sse2",
1356	"htt",
1357	"asysc",
1358	"nx",
1359	"sse3",
1360	"cx16",
1361	"cmp",
1362	"tscp",
1363	"mwait",
1364	"sse4a",
1365	"cpuid",
1366	"ssse3",
1367	"sse4_1",
1368	"sse4_2",
1369	"1gpg",
1370	"clfsh",
1371	"64",
1372	"aes",
1373	"pclmulqdq",
1374	"xsave",
1375	"avx",
1376	"vmx",
1377	"svm",
1378	"topoext",
1379	"f16c",
1380	"rdrand",
1381	"x2apic",
1382	"avx2",
1383	"bmi1",
1384	"bmi2",
1385	"fma",
1386	"smep",
1387	"smap",
1388	"adx",
1389	"rdseed",
1390	"mpx",
1391	"avx512f",
1392	"avx512dq",
1393	"avx512pf",
1394	"avx512er",
1395	"avx512cd",
1396	"avx512bw",
1397	"avx512vl",
1398	"avx512fma",
1399	"avx512vbmi",
1400	"avx512_vpopcntdq",
1401	"avx512_4vnniw",
1402	"avx512_4fmaps",
1403	"xsaveopt",
1404	"xsavec",
1405	"xsaves",
1406	"sha",
1407	"umip",
1408	"pku",
1409	"ospke",
1410	"pcid",
1411	"invpcid",
1412	"ibrs",
1413	"ibpb",
1414	"stibp",
1415	"ssbd",
1416	"ssbd_virt",
1417	"rdcl_no",
1418	"ibrs_all",
1419	"rsba",
1420	"ssb_no",
1421	"stibp_all",
1422	"flush_cmd",
1423	"l1d_vmentry_no",
1424	"fsgsbase",
1425	"clflushopt",
1426	"clwb",
1427	"monitorx",
1428	"clzero",
1429	"xop",
1430	"fma4",
1431	"tbm",
1432	"avx512_vnni",
1433	"amd_pcec",
1434	"mb_clear",
1435	"mds_no",
1436	"core_thermal",
1437	"pkg_thermal",
1438	"tsx_ctrl",
1439	"taa_no"
1440};
1441
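/*
 * Simple accessors for testing, setting, and clearing individual feature bits
 * in an x86_featureset bitmap (treated as an array of ulong_t words; see the
 * BT_* macros in sys/bitmap.h).
 */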
1442boolean_t
1443is_x86_feature(void *featureset, uint_t feature)
1444{
1445	ASSERT(feature < NUM_X86_FEATURES);
1446	return (BT_TEST((ulong_t *)featureset, feature));
1447}
1448
1449void
1450add_x86_feature(void *featureset, uint_t feature)
1451{
1452	ASSERT(feature < NUM_X86_FEATURES);
1453	BT_SET((ulong_t *)featureset, feature);
1454}
1455
1456void
1457remove_x86_feature(void *featureset, uint_t feature)
1458{
1459	ASSERT(feature < NUM_X86_FEATURES);
1460	BT_CLEAR((ulong_t *)featureset, feature);
1461}
1462
1463boolean_t
1464compare_x86_featureset(void *setA, void *setB)
1465{
1466	/*
1467	 * We assume that the unused bits of the bitmap are always zero.
1468	 */
1469	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1470		return (B_TRUE);
1471	} else {
1472		return (B_FALSE);
1473	}
1474}
1475
1476void
1477print_x86_featureset(void *featureset)
1478{
1479	uint_t i;
1480
1481	for (i = 0; i < NUM_X86_FEATURES; i++) {
1482		if (is_x86_feature(featureset, i)) {
1483			cmn_err(CE_CONT, "?x86_feature: %s\n",
1484			    x86_feature_names[i]);
1485		}
1486	}
1487}
1488
1489/* Note: This is the maximum size for the CPU, not the size of the structure. */
1490static size_t xsave_state_size = 0;
1491uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1492boolean_t xsave_force_disable = B_FALSE;
1493extern int disable_smap;
1494
1495/*
 * This is set to the platform type we are running on.
1497 */
1498static int platform_type = -1;
1499
1500#if !defined(__xpv)
1501/*
1502 * Variable to patch if hypervisor platform detection needs to be
1503 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1504 */
1505int enable_platform_detection = 1;
1506#endif
1507
1508/*
1509 * monitor/mwait info.
1510 *
1511 * size_actual and buf_actual are the real address and size allocated to get
 * proper mwait_buf alignment.  buf_actual and size_actual should be passed
 * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
 * processor cache-line alignment, but this is not guaranteed in the future.
1515 */
1516struct mwait_info {
1517	size_t		mon_min;	/* min size to avoid missed wakeups */
1518	size_t		mon_max;	/* size to avoid false wakeups */
1519	size_t		size_actual;	/* size actually allocated */
1520	void		*buf_actual;	/* memory actually allocated */
1521	uint32_t	support;	/* processor support of monitor/mwait */
1522};
1523
1524/*
1525 * xsave/xrestor info.
1526 *
1527 * This structure contains HW feature bits and the size of the xsave save area.
1528 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1529 * (xsave_state) to describe the xsave layout. However, at runtime the
1530 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1531 * xsave_state structure simply represents the legacy layout of the beginning
1532 * of the xsave area.
1533 */
1534struct xsave_info {
1535	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1536	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1537	size_t		xsav_max_size;  /* max size save area for HW features */
1538	size_t		ymm_size;	/* AVX: size of ymm save area */
1539	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1540	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1541	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1542	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1543	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1544	size_t		opmask_size;	/* AVX512: size of opmask save */
1545	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1546	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1547	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1548	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1549	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1550};
1551
1552
1553/*
1554 * These constants determine how many of the elements of the
1555 * cpuid we cache in the cpuid_info data structure; the
1556 * remaining elements are accessible via the cpuid instruction.
1557 */
1558
1559#define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1560#define	NMAX_CPI_EXTD	0x1f		/* eax = 0x80000000 .. 0x8000001e */
1561
1562/*
1563 * See the big theory statement for a more detailed explanation of what some of
1564 * these members mean.
1565 */
1566struct cpuid_info {
1567	uint_t cpi_pass;		/* last pass completed */
1568	/*
1569	 * standard function information
1570	 */
1571	uint_t cpi_maxeax;		/* fn 0: %eax */
1572	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1573	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1574
1575	uint_t cpi_family;		/* fn 1: extended family */
1576	uint_t cpi_model;		/* fn 1: extended model */
1577	uint_t cpi_step;		/* fn 1: stepping */
1578	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1579					/*		AMD: package/socket # */
1580	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1581	int cpi_clogid;			/* fn 1: %ebx: thread # */
1582	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1583	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1584	uint_t cpi_ncache;		/* fn 2: number of elements */
1585	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1586	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1587	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1588					/* Intel fn: 4, AMD fn: 8000001d */
	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1590	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1591	/*
1592	 * extended function information
1593	 */
1594	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1595	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1596	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1597	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1598	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1599	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1600
1601	id_t cpi_coreid;		/* same coreid => strands share core */
1602	int cpi_pkgcoreid;		/* core number within single package */
1603	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1604					/* Intel: fn 4: %eax[31-26] */
1605
1606	/*
1607	 * These values represent the number of bits that are required to store
1608	 * information about the number of cores and threads.
1609	 */
1610	uint_t cpi_ncore_bits;
1611	uint_t cpi_nthread_bits;
1612	/*
1613	 * supported feature information
1614	 */
1615	uint32_t cpi_support[6];
1616#define	STD_EDX_FEATURES	0
1617#define	AMD_EDX_FEATURES	1
1618#define	TM_EDX_FEATURES		2
1619#define	STD_ECX_FEATURES	3
1620#define	AMD_ECX_FEATURES	4
1621#define	STD_EBX_FEATURES	5
1622	/*
1623	 * Synthesized information, where known.
1624	 */
1625	uint32_t cpi_chiprev;		/* See X86_CHIPREV_* in x86_archext.h */
1626	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1627	uint32_t cpi_socket;		/* Chip package/socket type */
1628
1629	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1630	uint32_t cpi_apicid;
1631	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1632	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1633					/* Intel: 1 */
1634	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1635	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1636
1637	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1638};
1639
1640
1641static struct cpuid_info cpuid_info0;
1642
1643/*
1644 * These bit fields are defined by the Intel Application Note AP-485
1645 * "Intel Processor Identification and the CPUID Instruction"
1646 */
1647#define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1648#define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1649#define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1650#define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1651#define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1652#define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1653
1654#define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1655#define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1656#define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1657#define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1658#define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1659#define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1660#define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1661
1662#define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1663#define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1664#define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1665#define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1666
1667#define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1668#define	CPI_XMAXEAX_MAX		0x80000100
1669#define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1670#define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1671
1672/*
1673 * Function 4 (Deterministic Cache Parameters) macros
1674 * Defined by Intel Application Note AP-485
1675 */
1676#define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1677#define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1678#define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1679#define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1680#define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1681#define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1682#define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1683
1684#define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1685#define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1686#define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1687
1688#define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1689
1690#define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
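
/*
 * For reference, given a struct cpuid_regs *cp holding a leaf 4 subleaf, the
 * size of the described cache is derived from these fields as:
 *
 *	size = (CPI_CACHE_WAYS(cp) + 1) * (CPI_CACHE_PARTS(cp) + 1) *
 *	    (CPI_CACHE_COH_LN_SZ(cp) + 1) * (CPI_CACHE_SETS(cp) + 1);
 */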
1691
1692
1693/*
1694 * A couple of shorthand macros to identify "later" P6-family chips
1695 * like the Pentium M and Core.  First, the "older" P6-based stuff
1696 * (loosely defined as "pre-Pentium-4"):
1697 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1698 */
1699#define	IS_LEGACY_P6(cpi) (			\
1700	cpi->cpi_family == 6 &&			\
1701		(cpi->cpi_model == 1 ||		\
1702		cpi->cpi_model == 3 ||		\
1703		cpi->cpi_model == 5 ||		\
1704		cpi->cpi_model == 6 ||		\
1705		cpi->cpi_model == 7 ||		\
1706		cpi->cpi_model == 8 ||		\
1707		cpi->cpi_model == 0xA ||	\
1708		cpi->cpi_model == 0xB)		\
1709)
1710
1711/* A "new F6" is everything with family 6 that's not the above */
1712#define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1713
1714/* Extended family/model support */
1715#define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1716	cpi->cpi_family >= 0xf)
1717
1718/*
1719 * Info for monitor/mwait idle loop.
1720 *
1721 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1722 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1723 * 2006.
1724 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1725 * Documentation Updates" #33633, Rev 2.05, December 2006.
1726 */
1727#define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
#define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
1729#define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1730#define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1731#define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1732#define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1733#define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1734#define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1735/*
1736 * Number of sub-cstates for a given c-state.
1737 */
1738#define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1739	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1740
1741/*
1742 * XSAVE leaf 0xD enumeration
1743 */
1744#define	CPUID_LEAFD_2_YMM_OFFSET	576
1745#define	CPUID_LEAFD_2_YMM_SIZE		256
1746
1747/*
1748 * Common extended leaf names to cut down on typos.
1749 */
1750#define	CPUID_LEAF_EXT_0		0x80000000
1751#define	CPUID_LEAF_EXT_8		0x80000008
1752#define	CPUID_LEAF_EXT_1d		0x8000001d
1753#define	CPUID_LEAF_EXT_1e		0x8000001e
1754
1755/*
 * Functions we consume from cpuid_subr.c; don't publish these in a header
1757 * file to try and keep people using the expected cpuid_* interfaces.
1758 */
1759extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1760extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1761extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1762extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1763extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1764
1765/*
 * Apply various platform-dependent restrictions where the
1767 * underlying platform restrictions mean the CPU can be marked
1768 * as less capable than its cpuid instruction would imply.
1769 */
1770#if defined(__xpv)
1771static void
1772platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1773{
1774	switch (eax) {
1775	case 1: {
1776		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1777		    0 : CPUID_INTC_EDX_MCA;
1778		cp->cp_edx &=
1779		    ~(mcamask |
1780		    CPUID_INTC_EDX_PSE |
1781		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1782		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1783		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1784		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1785		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1786		break;
1787	}
1788
1789	case 0x80000001:
1790		cp->cp_edx &=
1791		    ~(CPUID_AMD_EDX_PSE |
1792		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1793		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1794		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1795		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1796		    CPUID_AMD_EDX_TSCP);
1797		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1798		break;
1799	default:
1800		break;
1801	}
1802
1803	switch (vendor) {
1804	case X86_VENDOR_Intel:
1805		switch (eax) {
1806		case 4:
1807			/*
1808			 * Zero out the (ncores-per-chip - 1) field
1809			 */
1810			cp->cp_eax &= 0x03fffffff;
1811			break;
1812		default:
1813			break;
1814		}
1815		break;
1816	case X86_VENDOR_AMD:
1817		switch (eax) {
1818
1819		case 0x80000001:
1820			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1821			break;
1822
1823		case CPUID_LEAF_EXT_8:
1824			/*
1825			 * Zero out the (ncores-per-chip - 1) field
1826			 */
1827			cp->cp_ecx &= 0xffffff00;
1828			break;
1829		default:
1830			break;
1831		}
1832		break;
1833	default:
1834		break;
1835	}
1836}
1837#else
1838#define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
1839#endif
1840
1841/*
1842 *  Some undocumented ways of patching the results of the cpuid
1843 *  instruction to permit running Solaris 10 on future cpus that
1844 *  we don't currently support.  Could be set to non-zero values
1845 *  via settings in eeprom.
1846 */
1847
1848uint32_t cpuid_feature_ecx_include;
1849uint32_t cpuid_feature_ecx_exclude;
1850uint32_t cpuid_feature_edx_include;
1851uint32_t cpuid_feature_edx_exclude;
1852
1853/*
1854 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1855 */
1856void
1857cpuid_alloc_space(cpu_t *cpu)
1858{
1859	/*
1860	 * By convention, cpu0 is the boot cpu, which is set up
1861	 * before memory allocation is available.  All other cpus get
1862	 * their cpuid_info struct allocated here.
1863	 */
1864	ASSERT(cpu->cpu_id != 0);
1865	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1866	cpu->cpu_m.mcpu_cpi =
1867	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1868}
1869
1870void
1871cpuid_free_space(cpu_t *cpu)
1872{
1873	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1874	int i;
1875
1876	ASSERT(cpi != NULL);
1877	ASSERT(cpi != &cpuid_info0);
1878
1879	/*
1880	 * Free up any cache leaf related dynamic storage. The first entry was
1881	 * cached from the standard cpuid storage, so we should not free it.
1882	 */
1883	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1884		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1885	if (cpi->cpi_cache_leaf_size > 0)
1886		kmem_free(cpi->cpi_cache_leaves,
1887		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1888
1889	kmem_free(cpi, sizeof (*cpi));
1890	cpu->cpu_m.mcpu_cpi = NULL;
1891}
1892
1893#if !defined(__xpv)
1894/*
1895 * Determine the type of the underlying platform. This is used to customize
1896 * initialization of various subsystems (e.g. TSC). determine_platform() must
1897 * only ever be called once to prevent two processors from seeing different
1898 * values of platform_type. Must be called before cpuid_pass1(), the earliest
1899 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1900 */
1901void
1902determine_platform(void)
1903{
1904	struct cpuid_regs cp;
1905	uint32_t base;
1906	uint32_t regs[4];
1907	char *hvstr = (char *)regs;
1908
1909	ASSERT(platform_type == -1);
1910
1911	platform_type = HW_NATIVE;
1912
1913	if (!enable_platform_detection)
1914		return;
1915
1916	/*
1917	 * If Hypervisor CPUID bit is set, try to determine hypervisor
1918	 * vendor signature, and set platform type accordingly.
1919	 *
1920	 * References:
1921	 * http://lkml.org/lkml/2008/10/1/246
1922	 * http://kb.vmware.com/kb/1009458
1923	 */
1924	cp.cp_eax = 0x1;
1925	(void) __cpuid_insn(&cp);
1926	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1927		cp.cp_eax = 0x40000000;
1928		(void) __cpuid_insn(&cp);
1929		regs[0] = cp.cp_ebx;
1930		regs[1] = cp.cp_ecx;
1931		regs[2] = cp.cp_edx;
1932		regs[3] = 0;
1933		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1934			platform_type = HW_XEN_HVM;
1935			return;
1936		}
1937		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1938			platform_type = HW_VMWARE;
1939			return;
1940		}
1941		if (strcmp(hvstr, HVSIG_KVM) == 0) {
1942			platform_type = HW_KVM;
1943			return;
1944		}
1945		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1946			platform_type = HW_BHYVE;
1947			return;
1948		}
1949		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1950			platform_type = HW_MICROSOFT;
1951	} else {
1952		/*
		 * Check older VMware hardware versions. The VMware hypervisor
		 * is detected by performing an IN operation on the VMware
		 * hypervisor port and checking that the value returned in
		 * %ebx is the VMware hypervisor magic value.
1957		 *
1958		 * References: http://kb.vmware.com/kb/1009458
1959		 */
1960		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1961		if (regs[1] == VMWARE_HVMAGIC) {
1962			platform_type = HW_VMWARE;
1963			return;
1964		}
1965	}
1966
1967	/*
1968	 * Check Xen hypervisor. In a fully virtualized domain,
1969	 * Xen's pseudo-cpuid function returns a string representing the
1970	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1971	 * supported cpuid function. We need at least a (base + 2) leaf value
1972	 * to do what we want to do. Try different base values, since the
1973	 * hypervisor might use a different one depending on whether Hyper-V
1974	 * emulation is switched on by default or not.
1975	 */
1976	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1977		cp.cp_eax = base;
1978		(void) __cpuid_insn(&cp);
1979		regs[0] = cp.cp_ebx;
1980		regs[1] = cp.cp_ecx;
1981		regs[2] = cp.cp_edx;
1982		regs[3] = 0;
1983		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1984		    cp.cp_eax >= (base + 2)) {
1985			platform_type &= ~HW_NATIVE;
1986			platform_type |= HW_XEN_HVM;
1987			return;
1988		}
1989	}
1990}
1991
1992int
1993get_hwenv(void)
1994{
1995	ASSERT(platform_type != -1);
1996	return (platform_type);
1997}
1998
1999int
2000is_controldom(void)
2001{
2002	return (0);
2003}
2004
2005#else
2006
2007int
2008get_hwenv(void)
2009{
2010	return (HW_XEN_PV);
2011}
2012
2013int
2014is_controldom(void)
2015{
2016	return (DOMAIN_IS_INITDOMAIN(xen_info));
2017}
2018
2019#endif	/* __xpv */
2020
2021/*
2022 * Make sure that we have gathered all of the CPUID leaves that we might need to
2023 * determine topology. We assume that the standard leaf 1 has already been done
2024 * and that xmaxeax has already been calculated.
2025 */
2026static void
2027cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2028{
2029	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2030
2031	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2032		struct cpuid_regs *cp;
2033
2034		cp = &cpi->cpi_extd[8];
2035		cp->cp_eax = CPUID_LEAF_EXT_8;
2036		(void) __cpuid_insn(cp);
2037		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2038	}
2039
2040	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2041	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2042		struct cpuid_regs *cp;
2043
2044		cp = &cpi->cpi_extd[0x1e];
2045		cp->cp_eax = CPUID_LEAF_EXT_1e;
2046		(void) __cpuid_insn(cp);
2047	}
2048}
2049
2050/*
2051 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2052 * it to everything else. If not, and we're on an AMD system where 8000001e is
 * valid, then we use that. Otherwise, we fall back to the default value for the
2054 * APIC ID in leaf 1.
2055 */
2056static uint32_t
2057cpuid_gather_apicid(struct cpuid_info *cpi)
2058{
2059	/*
	 * Leaf B changes based on the arguments to it. Because we don't cache
2061	 * it, we need to gather it again.
2062	 */
2063	if (cpi->cpi_maxeax >= 0xB) {
2064		struct cpuid_regs regs;
2065		struct cpuid_regs *cp;
2066
2067		cp = &regs;
2068		cp->cp_eax = 0xB;
2069		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2070		(void) __cpuid_insn(cp);
2071
2072		if (cp->cp_ebx != 0) {
2073			return (cp->cp_edx);
2074		}
2075	}
2076
2077	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2078	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2079	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2080		return (cpi->cpi_extd[0x1e].cp_eax);
2081	}
2082
2083	return (CPI_APIC_ID(cpi));
2084}
2085
2086/*
2087 * For AMD processors, attempt to calculate the number of chips and cores that
2088 * exist. The way that we do this varies based on the generation, because the
2089 * generations themselves have changed dramatically.
2090 *
2091 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2092 * However, with the advent of family 17h (Zen) it actually tells us the number
2093 * of threads, so we need to look at leaf 0x8000001e if available to determine
2094 * its value. Otherwise, for all prior families, the number of enabled cores is
2095 * the same as threads.
2096 *
2097 * If we do not have leaf 0x80000008, then we assume that this processor does
2098 * not have anything. AMD's older CPUID specification says there's no reason to
2099 * fall back to leaf 1.
2100 *
2101 * In some virtualization cases we will not have leaf 8000001e or it will be
2102 * zero. When that happens we assume the number of threads is one.
2103 */
2104static void
2105cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2106{
2107	uint_t nthreads, nthread_per_core;
2108
2109	nthreads = nthread_per_core = 1;
2110
2111	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2112		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2113	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2114		nthreads = CPI_CPU_COUNT(cpi);
2115	}
2116
2117	/*
2118	 * For us to have threads, and know about it, we have to be at least at
2119	 * family 17h and have the cpuid bit that says we have extended
2120	 * topology.
2121	 */
2122	if (cpi->cpi_family >= 0x17 &&
2123	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2124	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2125		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2126	}
2127
2128	*ncpus = nthreads;
2129	*ncores = nthreads / nthread_per_core;
2130}
2131
2132/*
2133 * Seed the initial values for the cores and threads for an Intel based
2134 * processor. These values will be overwritten if we detect that the processor
2135 * supports CPUID leaf 0xb.
2136 */
2137static void
2138cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2139{
2140	/*
2141	 * Only seed the number of physical cores from the first level leaf 4
	 * information. The number of threads there indicates how many share the
2143	 * L1 cache, which may or may not have anything to do with the number of
2144	 * logical CPUs per core.
2145	 */
2146	if (cpi->cpi_maxeax >= 4) {
2147		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2148	} else {
2149		*ncores = 1;
2150	}
2151
2152	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2153		*ncpus = CPI_CPU_COUNT(cpi);
2154	} else {
2155		*ncpus = *ncores;
2156	}
2157}
2158
2159static boolean_t
2160cpuid_leafB_getids(cpu_t *cpu)
2161{
2162	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2163	struct cpuid_regs regs;
2164	struct cpuid_regs *cp;
2165
2166	if (cpi->cpi_maxeax < 0xB)
2167		return (B_FALSE);
2168
2169	cp = &regs;
2170	cp->cp_eax = 0xB;
2171	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2172
2173	(void) __cpuid_insn(cp);
2174
2175	/*
2176	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2177	 * indicates that the extended topology enumeration leaf is
2178	 * available.
2179	 */
2180	if (cp->cp_ebx != 0) {
2181		uint32_t x2apic_id = 0;
2182		uint_t coreid_shift = 0;
2183		uint_t ncpu_per_core = 1;
2184		uint_t chipid_shift = 0;
2185		uint_t ncpu_per_chip = 1;
2186		uint_t i;
2187		uint_t level;
2188
2189		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2190			cp->cp_eax = 0xB;
2191			cp->cp_ecx = i;
2192
2193			(void) __cpuid_insn(cp);
2194			level = CPI_CPU_LEVEL_TYPE(cp);
2195
2196			if (level == 1) {
2197				x2apic_id = cp->cp_edx;
2198				coreid_shift = BITX(cp->cp_eax, 4, 0);
2199				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2200			} else if (level == 2) {
2201				x2apic_id = cp->cp_edx;
2202				chipid_shift = BITX(cp->cp_eax, 4, 0);
2203				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2204			}
2205		}
2206
2207		/*
2208		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2209		 */
2210		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2211		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2212		    ncpu_per_core;
2213		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2214		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2215		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2216		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2217		cpi->cpi_procnodeid = cpi->cpi_chipid;
2218		cpi->cpi_compunitid = cpi->cpi_coreid;
2219
2220		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2221			cpi->cpi_nthread_bits = coreid_shift;
2222			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2223		}
2224
2225		return (B_TRUE);
2226	} else {
2227		return (B_FALSE);
2228	}
2229}
2230
2231static void
2232cpuid_intel_getids(cpu_t *cpu, void *feature)
2233{
2234	uint_t i;
2235	uint_t chipid_shift = 0;
2236	uint_t coreid_shift = 0;
2237	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2238
2239	/*
2240	 * There are no compute units or processor nodes currently on Intel.
2241	 * Always set these to one.
2242	 */
2243	cpi->cpi_procnodes_per_pkg = 1;
2244	cpi->cpi_cores_per_compunit = 1;
2245
2246	/*
2247	 * If cpuid Leaf B is present, use that to try and get this information.
2248	 * It will be the most accurate for Intel CPUs.
2249	 */
2250	if (cpuid_leafB_getids(cpu))
2251		return;
2252
2253	/*
2254	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2255	 * and ncore_per_chip. These represent the largest power of two values
2256	 * that we need to cover all of the IDs in the system. Therefore, we use
2257	 * those values to seed the number of bits needed to cover information
2258	 * in the case when leaf B is not available. These values will probably
2259	 * be larger than required, but that's OK.
2260	 */
2261	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2262	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2263
2264	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2265		chipid_shift++;
2266
2267	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2268	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2269
2270	if (is_x86_feature(feature, X86FSET_CMP)) {
2271		/*
2272		 * Multi-core (and possibly multi-threaded)
2273		 * processors.
2274		 */
2275		uint_t ncpu_per_core = 0;
2276
2277		if (cpi->cpi_ncore_per_chip == 1)
2278			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2279		else if (cpi->cpi_ncore_per_chip > 1)
2280			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2281			    cpi->cpi_ncore_per_chip;
2282		/*
		 * 8-bit APIC IDs on dual-core Pentiums
2284		 * look like this:
2285		 *
2286		 * +-----------------------+------+------+
2287		 * | Physical Package ID   |  MC  |  HT  |
2288		 * +-----------------------+------+------+
2289		 * <------- chipid -------->
2290		 * <------- coreid --------------->
2291		 *			   <--- clogid -->
2292		 *			   <------>
2293		 *			   pkgcoreid
2294		 *
2295		 * Where the number of bits necessary to
		 * represent MC and HT fields together equals
		 * the minimum number of bits necessary to
2298		 * store the value of cpi->cpi_ncpu_per_chip.
2299		 * Of those bits, the MC part uses the number
2300		 * of bits necessary to store the value of
2301		 * cpi->cpi_ncore_per_chip.
2302		 */
2303		for (i = 1; i < ncpu_per_core; i <<= 1)
2304			coreid_shift++;
2305		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2306		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2307	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2308		/*
2309		 * Single-core multi-threaded processors.
2310		 */
2311		cpi->cpi_coreid = cpi->cpi_chipid;
2312		cpi->cpi_pkgcoreid = 0;
2313	} else {
2314		/*
2315		 * Single-core single-thread processors.
2316		 */
2317		cpi->cpi_coreid = cpu->cpu_id;
2318		cpi->cpi_pkgcoreid = 0;
2319	}
2320	cpi->cpi_procnodeid = cpi->cpi_chipid;
2321	cpi->cpi_compunitid = cpi->cpi_coreid;
2322}
2323
2324/*
2325 * Historically, AMD has had CMP chips with only a single thread per core.
2326 * However, starting in family 17h (Zen), this has changed and they now have
2327 * multiple threads. Our internal core id needs to be a unique value.
2328 *
2329 * To determine the core id of an AMD system, if we're from a family before 17h,
2330 * then we just use the cpu id, as that gives us a good value that will be
2331 * unique for each core. If instead, we're on family 17h or later, then we need
2332 * to do something more complicated. CPUID leaf 0x8000001e can tell us
2333 * how many threads are in the system. Based on that, we'll shift the APIC ID.
2334 * We can't use the normal core id in that leaf as it's only unique within the
2335 * socket, which is perfect for cpi_pkgcoreid, but not us.
2336 */
2337static id_t
2338cpuid_amd_get_coreid(cpu_t *cpu)
2339{
2340	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2341
2342	if (cpi->cpi_family >= 0x17 &&
2343	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2344	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2345		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2346		if (nthreads > 1) {
2347			VERIFY3U(nthreads, ==, 2);
2348			return (cpi->cpi_apicid >> 1);
2349		}
2350	}
2351
2352	return (cpu->cpu_id);
2353}
2354
2355/*
 * Constructing IDs on AMD is a more challenging task. This is notable because
 * of the following two facts:
2358 *
2359 *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2360 *     also no way to get an actual unique core id from the system. As such, we
2361 *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2362 *     however, guarantee that sibling cores of a chip will have sequential
2363 *     coreids starting at a multiple of the number of cores per chip - that is
2364 *     usually the case, but if the ACPI MADT table is presented in a different
2365 *     order then we need to perform a few more gymnastics for the pkgcoreid.
2366 *
 *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2368 *     called compute units. These compute units share the L1I cache, L2 cache,
2369 *     and the FPU. To deal with this, a new topology leaf was added in
2370 *     0x8000001e. However, parts of this leaf have different meanings
2371 *     once we get to family 0x17.
2372 */
2373
2374static void
2375cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2376{
2377	int i, first_half, coreidsz;
2378	uint32_t nb_caps_reg;
2379	uint_t node2_1;
2380	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2381	struct cpuid_regs *cp;
2382
2383	/*
2384	 * Calculate the core id (this comes from hardware in family 0x17 if it
2385	 * hasn't been stripped by virtualization). We always set the compute
2386	 * unit id to the same value. Also, initialize the default number of
2387	 * cores per compute unit and nodes per package. This will be
2388	 * overwritten when we know information about a particular family.
2389	 */
2390	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2391	cpi->cpi_compunitid = cpi->cpi_coreid;
2392	cpi->cpi_cores_per_compunit = 1;
2393	cpi->cpi_procnodes_per_pkg = 1;
2394
2395	/*
2396	 * To construct the logical ID, we need to determine how many APIC IDs
2397	 * are dedicated to the cores and threads. This is provided for us in
2398	 * 0x80000008. However, if it's not present (say due to virtualization),
2399	 * then we assume it's one. This should be present on all 64-bit AMD
2400	 * processors.  It was added in family 0xf (Hammer).
2401	 */
2402	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2403		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2404
2405		/*
		 * In AMD parlance a chip is really a node, while illumos
		 * uses chip as equivalent to socket/package.
2408		 */
2409		if (coreidsz == 0) {
2410			/* Use legacy method */
2411			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2412				coreidsz++;
2413			if (coreidsz == 0)
2414				coreidsz = 1;
2415		}
2416	} else {
2417		/* Assume single-core part */
2418		coreidsz = 1;
2419	}
2420	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2421
2422	/*
2423	 * The package core ID varies depending on the family. While it may be
2424	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2425	 * this value is the core id in the given node. For non-virtualized
2426	 * family 17h, we need to take the logical core id and shift off the
2427	 * threads like we do when getting the core id.  Otherwise, we can use
	 * the clogid as is. When family 17h is virtualized, the clogid should
	 * suffice: if we don't have valid data in the leaf, then we won't
	 * think we have SMT, in which case cpi_clogid is all we need.
2432	 */
2433	if (cpi->cpi_family >= 0x17 &&
2434	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2435	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2436	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2437		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2438		if (nthreads > 1) {
2439			VERIFY3U(nthreads, ==, 2);
2440			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2441		} else {
2442			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2443		}
2444	} else {
2445		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2446	}
2447
2448	/*
2449	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2450	 * (bulldozer) or newer, then we can derive all of this from leaf
2451	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2452	 */
2453	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2454	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2455		cp = &cpi->cpi_extd[0x1e];
2456
2457		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2458		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2459
2460		/*
2461		 * For Bulldozer-era CPUs, recalculate the compute unit
2462		 * information.
2463		 */
2464		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2465			cpi->cpi_cores_per_compunit =
2466			    BITX(cp->cp_ebx, 15, 8) + 1;
2467			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2468			    (cpi->cpi_ncore_per_chip /
2469			    cpi->cpi_cores_per_compunit) *
2470			    (cpi->cpi_procnodeid /
2471			    cpi->cpi_procnodes_per_pkg);
2472		}
2473	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2474		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2475	} else if (cpi->cpi_family == 0x10) {
2476		/*
2477		 * See if we are a multi-node processor.
2478		 * All processors in the system have the same number of nodes
2479		 */
2480		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2481		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2482			/* Single-node */
2483			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2484			    coreidsz);
2485		} else {
2486
2487			/*
2488			 * Multi-node revision D (2 nodes per package
2489			 * are supported)
2490			 */
2491			cpi->cpi_procnodes_per_pkg = 2;
2492
2493			first_half = (cpi->cpi_pkgcoreid <=
2494			    (cpi->cpi_ncore_per_chip/2 - 1));
2495
2496			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2497				/* We are BSP */
2498				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2499			} else {
2500
2501				/* We are AP */
2502				/* NodeId[2:1] bits to use for reading F3xe8 */
2503				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2504
2505				nb_caps_reg =
2506				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2507
2508				/*
2509				 * Check IntNodeNum bit (31:30, but bit 31 is
2510				 * always 0 on dual-node processors)
2511				 */
2512				if (BITX(nb_caps_reg, 30, 30) == 0)
2513					cpi->cpi_procnodeid = node2_1 +
2514					    !first_half;
2515				else
2516					cpi->cpi_procnodeid = node2_1 +
2517					    first_half;
2518			}
2519		}
2520	} else {
2521		cpi->cpi_procnodeid = 0;
2522	}
2523
2524	cpi->cpi_chipid =
2525	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2526
2527	cpi->cpi_ncore_bits = coreidsz;
2528	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2529	    cpi->cpi_ncore_per_chip);
2530}
2531
2532static void
2533spec_uarch_flush_noop(void)
2534{
2535}
2536
2537/*
 * When microcode is present that mitigates MDS, this wrmsr also performs the
 * flush of MDS-related micro-architectural state that would normally happen by
 * calling x86_md_clear().
2541 */
2542static void
2543spec_uarch_flush_msr(void)
2544{
2545	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2546}
2547
2548/*
2549 * This function points to a function that will flush certain
2550 * micro-architectural state on the processor. This flush is used to mitigate
2551 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2552 * function can point to one of three functions:
2553 *
 * - A noop, which we use either because we are vulnerable but do not have
 *   microcode available to help address the issue, or because we aren't
 *   vulnerable at all.
2557 *
2558 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2559 *   mitigate MDS is present, also perform the equivalent of the MDS flush;
2560 *   however, it only flushes the MDS related micro-architectural state on the
2561 *   current hyperthread, it does not do anything for the twin.
2562 *
2563 * - x86_md_clear which will flush the MDS related state. This is done when we
2564 *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2565 *   (RDCL_NO is set).
2566 */
2567void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2568
2569static void
2570cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2571{
2572	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2573
2574	/*
2575	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2576	 * has been fixed in hardware, it doesn't cover everything related to
2577	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2578	 * need to mitigate this.
2579	 */
2580	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2581	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2582		return;
2583	}
2584
2585	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2586		const uint8_t nop = NOP_INSTR;
2587		uint8_t *md = (uint8_t *)x86_md_clear;
2588
2589		*md = nop;
2590	}
2591
2592	membar_producer();
2593}
2594
2595static void
2596cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2597{
2598	boolean_t need_l1d, need_mds;
2599	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2600
2601	/*
2602	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2603	 * hardware, then there's nothing left for us to do for enabling the
2604	 * flush. We can also go ahead and say that SMT exclusion is
2605	 * unnecessary.
2606	 */
2607	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2608	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2609	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2610		extern int smt_exclusion;
2611		smt_exclusion = 0;
2612		spec_uarch_flush = spec_uarch_flush_noop;
2613		membar_producer();
2614		return;
2615	}
2616
2617	/*
	 * The locations where we need to perform an L1D flush are required for
	 * mitigating both L1TF and MDS. When verw support is present in
	 * microcode, the L1D flush will take care of doing that as well.
	 * However, if we have a system where RDCL_NO is present but we don't
	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2623	 * L1D flush.
2624	 */
2625	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2626	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2627	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2628		need_l1d = B_TRUE;
2629	} else {
2630		need_l1d = B_FALSE;
2631	}
2632
2633	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2634	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2635		need_mds = B_TRUE;
2636	} else {
2637		need_mds = B_FALSE;
2638	}
2639
2640	if (need_l1d) {
2641		spec_uarch_flush = spec_uarch_flush_msr;
2642	} else if (need_mds) {
2643		spec_uarch_flush = x86_md_clear;
2644	} else {
2645		/*
2646		 * We have no hardware mitigations available to us.
2647		 */
2648		spec_uarch_flush = spec_uarch_flush_noop;
2649	}
2650	membar_producer();
2651}
2652
2653/*
2654 * We default to enabling RSB mitigations.
2655 */
2656static void
2657cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2658{
2659	const uint8_t ret = RET_INSTR;
2660	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2661
2662	switch (mit) {
2663	case X86_SPECTREV2_ENHANCED_IBRS:
2664	case X86_SPECTREV2_DISABLED:
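		/*
		 * Writing a return over the first byte of x86_rsb_stuff is
		 * assumed to turn the RSB stuffing sequence into an immediate
		 * return, effectively disabling it for these two modes.
		 */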
2665		*stuff = ret;
2666		break;
2667	default:
2668		break;
2669	}
2670}
2671
2672static void
2673cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2674{
2675	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2676	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2677	    "_r14", "_r15" };
2678	const uint_t nthunks = ARRAY_SIZE(thunks);
2679	const char *type;
2680	uint_t i;
2681
2682	if (mit == x86_spectrev2_mitigation)
2683		return;
2684
2685	switch (mit) {
2686	case X86_SPECTREV2_RETPOLINE:
2687		type = "gen";
2688		break;
2689	case X86_SPECTREV2_RETPOLINE_AMD:
2690		type = "amd";
2691		break;
2692	case X86_SPECTREV2_ENHANCED_IBRS:
2693	case X86_SPECTREV2_DISABLED:
2694		type = "jmp";
2695		break;
2696	default:
		panic("asked to update retpoline state with unknown state!");
2698	}
2699
2700	for (i = 0; i < nthunks; i++) {
2701		uintptr_t source, dest;
2702		int ssize, dsize;
2703		char sourcebuf[64], destbuf[64];
2704		size_t len;
2705
2706		(void) snprintf(destbuf, sizeof (destbuf),
2707		    "__x86_indirect_thunk%s", thunks[i]);
2708		(void) snprintf(sourcebuf, sizeof (sourcebuf),
2709		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
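		/*
		 * For example, with type "amd" and the "_rax" thunk this
		 * copies the text of __x86_indirect_thunk_amd_rax over
		 * __x86_indirect_thunk_rax.
		 */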
2710
2711		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2712		dest = kobj_getelfsym(destbuf, NULL, &dsize);
2713		VERIFY3U(source, !=, 0);
2714		VERIFY3U(dest, !=, 0);
2715		VERIFY3S(dsize, >=, ssize);
2716		bcopy((void *)source, (void *)dest, ssize);
2717	}
2718}
2719
2720static void
2721cpuid_enable_enhanced_ibrs(void)
2722{
2723	uint64_t val;
2724
2725	val = rdmsr(MSR_IA32_SPEC_CTRL);
2726	val |= IA32_SPEC_CTRL_IBRS;
2727	wrmsr(MSR_IA32_SPEC_CTRL, val);
2728}
2729
2730#ifndef __xpv
2731/*
2732 * Determine whether or not we can use the AMD optimized retpoline
2733 * functionality. We use this when we know we're on an AMD system and we can
2734 * successfully verify that lfence is dispatch serializing.
2735 */
2736static boolean_t
2737cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2738{
2739	uint64_t val;
2740	on_trap_data_t otd;
2741
2742	if (cpi->cpi_vendor != X86_VENDOR_AMD)
2743		return (B_FALSE);
2744
2745	/*
2746	 * We need to determine whether or not lfence is serializing. It always
2747	 * is on families 0xf and 0x11. On others, it's controlled by
2748	 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2749	 * crazy old family, don't try and do anything.
2750	 */
2751	if (cpi->cpi_family < 0xf)
2752		return (B_FALSE);
2753	if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2754		return (B_TRUE);
2755
2756	/*
2757	 * While it may be tempting to use get_hwenv(), there are no promises
	 * that a hypervisor will actually declare itself to be so in a
2759	 * friendly way. As such, try to read and set the MSR. If we can then
2760	 * read back the value we set (it wasn't just set to zero), then we go
2761	 * for it.
2762	 */
2763	if (!on_trap(&otd, OT_DATA_ACCESS)) {
2764		val = rdmsr(MSR_AMD_DECODE_CONFIG);
2765		val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2766		wrmsr(MSR_AMD_DECODE_CONFIG, val);
2767		val = rdmsr(MSR_AMD_DECODE_CONFIG);
2768	} else {
2769		val = 0;
2770	}
2771	no_trap();
2772
2773	if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2774		return (B_TRUE);
2775	return (B_FALSE);
2776}
2777#endif	/* !__xpv */
2778
2779/*
2780 * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2781 * we can disable TSX, we do so.
2782 *
2783 * This determination is done only on the boot CPU, potentially after loading
2784 * updated microcode.
2785 */
2786static void
2787cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2788{
2789	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2790
2791	VERIFY(cpu->cpu_id == 0);
2792
2793	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2794		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2795		return;
2796	}
2797
2798	if (x86_disable_taa) {
2799		x86_taa_mitigation = X86_TAA_DISABLED;
2800		return;
2801	}
2802
2803	/*
2804	 * If we do not have the ability to disable TSX, then our only
2805	 * mitigation options are in hardware (TAA_NO), or by using our existing
2806	 * MDS mitigation as described above.  The latter relies upon us having
2807	 * configured MDS mitigations correctly! This includes disabling SMT if
	 * we want cross-CPU-thread protection.
2809	 */
2810	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2811		/*
2812		 * It's not clear whether any parts will enumerate TAA_NO
2813		 * *without* TSX_CTRL, but let's mark it as such if we see this.
2814		 */
2815		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2816			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2817			return;
2818		}
2819
2820		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2821		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2822			x86_taa_mitigation = X86_TAA_MD_CLEAR;
2823		} else {
2824			x86_taa_mitigation = X86_TAA_NOTHING;
2825		}
2826		return;
2827	}
2828
2829	/*
2830	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
2831	 * enough in boot.
2832	 *
2833	 * Otherwise, we'll fall back to causing transactions to abort as our
2834	 * mitigation. TSX-using code will always take the fallback path.
2835	 */
2836	if (cpi->cpi_pass < 4) {
2837		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2838	} else {
2839		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2840	}
2841}
2842
2843/*
 * As mentioned, we should only touch the MSR when we've got suitable
 * microcode loaded on this CPU.
2846 */
2847static void
2848cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
2849{
2850	uint64_t val;
2851
2852	switch (taa) {
2853	case X86_TAA_TSX_DISABLE:
2854		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2855			return;
2856		val = rdmsr(MSR_IA32_TSX_CTRL);
2857		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
2858		wrmsr(MSR_IA32_TSX_CTRL, val);
2859		break;
2860	case X86_TAA_TSX_FORCE_ABORT:
2861		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2862			return;
2863		val = rdmsr(MSR_IA32_TSX_CTRL);
2864		val |= IA32_TSX_CTRL_RTM_DISABLE;
2865		wrmsr(MSR_IA32_TSX_CTRL, val);
2866		break;
2867	case X86_TAA_HW_MITIGATED:
2868	case X86_TAA_MD_CLEAR:
2869	case X86_TAA_DISABLED:
2870	case X86_TAA_NOTHING:
2871		break;
2872	}
2873}
2874
2875static void
2876cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2877{
2878	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2879	x86_spectrev2_mitigation_t v2mit;
2880
2881	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2882	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2883		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2884			add_x86_feature(featureset, X86FSET_IBPB);
2885		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2886			add_x86_feature(featureset, X86FSET_IBRS);
2887		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2888			add_x86_feature(featureset, X86FSET_STIBP);
2889		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2890			add_x86_feature(featureset, X86FSET_STIBP_ALL);
2891		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2892			add_x86_feature(featureset, X86FSET_SSBD);
2893		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2894			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2895		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2896			add_x86_feature(featureset, X86FSET_SSB_NO);
2897		/*
2898		 * Don't enable enhanced IBRS unless we're told that we should
2899		 * prefer it and it has the same semantics as Intel. This is
2900		 * split into two bits rather than a single one.
2901		 */
2902		if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2903		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2904			add_x86_feature(featureset, X86FSET_IBRS_ALL);
2905		}
2906
2907	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2908	    cpi->cpi_maxeax >= 7) {
2909		struct cpuid_regs *ecp;
2910		ecp = &cpi->cpi_std[7];
2911
2912		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2913			add_x86_feature(featureset, X86FSET_MD_CLEAR);
2914		}
2915
2916		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2917			add_x86_feature(featureset, X86FSET_IBRS);
2918			add_x86_feature(featureset, X86FSET_IBPB);
2919		}
2920
2921		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2922			add_x86_feature(featureset, X86FSET_STIBP);
2923		}
2924
2925		/*
2926		 * Don't read the arch caps MSR on xpv where we lack the
2927		 * on_trap().
2928		 */
2929#ifndef __xpv
2930		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2931			on_trap_data_t otd;
2932
2933			/*
2934			 * Be paranoid and assume we'll get a #GP.
2935			 */
2936			if (!on_trap(&otd, OT_DATA_ACCESS)) {
2937				uint64_t reg;
2938
2939				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2940				if (reg & IA32_ARCH_CAP_RDCL_NO) {
2941					add_x86_feature(featureset,
2942					    X86FSET_RDCL_NO);
2943				}
2944				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2945					add_x86_feature(featureset,
2946					    X86FSET_IBRS_ALL);
2947				}
2948				if (reg & IA32_ARCH_CAP_RSBA) {
2949					add_x86_feature(featureset,
2950					    X86FSET_RSBA);
2951				}
2952				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2953					add_x86_feature(featureset,
2954					    X86FSET_L1D_VM_NO);
2955				}
2956				if (reg & IA32_ARCH_CAP_SSB_NO) {
2957					add_x86_feature(featureset,
2958					    X86FSET_SSB_NO);
2959				}
2960				if (reg & IA32_ARCH_CAP_MDS_NO) {
2961					add_x86_feature(featureset,
2962					    X86FSET_MDS_NO);
2963				}
2964				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
2965					add_x86_feature(featureset,
2966					    X86FSET_TSX_CTRL);
2967				}
2968				if (reg & IA32_ARCH_CAP_TAA_NO) {
2969					add_x86_feature(featureset,
2970					    X86FSET_TAA_NO);
2971				}
2972			}
2973			no_trap();
2974		}
2975#endif	/* !__xpv */
2976
2977		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2978			add_x86_feature(featureset, X86FSET_SSBD);
2979
2980		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2981			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2982	}
2983
2984	/*
2985	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
2986	 * will have already run this function and determined what we need to
2987	 * do. This gives us a hook for per-HW thread mitigations such as
2988	 * enhanced IBRS, or disabling TSX.
2989	 */
2990	if (cpu->cpu_id != 0) {
2991		if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2992			cpuid_enable_enhanced_ibrs();
2993		}
2994
2995		cpuid_apply_tsx(x86_taa_mitigation, featureset);
2996		return;
2997	}
2998
2999	/*
3000	 * Go through and initialize various security mechanisms that we should
3001	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3002	 * TAA.
3003	 */
3004
3005	/*
3006	 * By default we've come in with retpolines enabled. Check whether we
3007	 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
3008	 * by default, but disabled if we are using enhanced IBRS.
3009	 */
3010	if (x86_disable_spectrev2 != 0) {
3011		v2mit = X86_SPECTREV2_DISABLED;
3012	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3013		cpuid_enable_enhanced_ibrs();
3014		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3015#ifndef __xpv
3016	} else if (cpuid_use_amd_retpoline(cpi)) {
3017		v2mit = X86_SPECTREV2_RETPOLINE_AMD;
3018#endif	/* !__xpv */
3019	} else {
3020		v2mit = X86_SPECTREV2_RETPOLINE;
3021	}
3022
3023	cpuid_patch_retpolines(v2mit);
3024	cpuid_patch_rsb(v2mit);
3025	x86_spectrev2_mitigation = v2mit;
3026	membar_producer();
3027
3028	/*
3029	 * We need to determine what changes are required for mitigating L1TF
3030	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3031	 * is required.
3032	 *
3033	 * If any of these are present, then we need to flush u-arch state at
3034	 * various points. For MDS, we need to do so whenever we change to a
3035	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3036	 * flush the L1D cache at VM entry. When we have microcode that handles
3037	 * MDS, the L1D flush also clears the other u-arch state that the
3038	 * md_clear does.
3039	 */
3040
3041	/*
3042	 * Update whether or not we need to be taking explicit action against
3043	 * MDS.
3044	 */
3045	cpuid_update_md_clear(cpu, featureset);
3046
3047	/*
3048	 * Determine whether SMT exclusion is required and whether or not we
3049	 * need to perform an l1d flush.
3050	 */
3051	cpuid_update_l1d_flush(cpu, featureset);
3052
3053	/*
3054	 * Determine what our mitigation strategy should be for TAA and then
3055	 * also apply TAA mitigations.
3056	 */
3057	cpuid_update_tsx(cpu, featureset);
3058	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3059}
3060
3061/*
 * Set up the XFeature_Enabled_Mask register. Required by the xsave feature.
3063 */
3064void
3065setup_xfem(void)
3066{
3067	uint64_t flags = XFEATURE_LEGACY_FP;
3068
3069	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3070
3071	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3072		flags |= XFEATURE_SSE;
3073
3074	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3075		flags |= XFEATURE_AVX;
3076
3077	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3078		flags |= XFEATURE_AVX512;
3079
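	/*
	 * For example, on a CPU with SSE and AVX but not AVX-512, flags ends
	 * up as XFEATURE_LEGACY_FP | XFEATURE_SSE | XFEATURE_AVX, i.e. the
	 * x87, XMM, and YMM state components enabled in XCR0.
	 */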
3080	set_xcr(XFEATURE_ENABLED_MASK, flags);
3081
3082	xsave_bv_all = flags;
3083}
3084
3085static void
3086cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
3087{
3088	struct cpuid_info *cpi;
3089
3090	cpi = cpu->cpu_m.mcpu_cpi;
3091
3092	if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3093		cpuid_gather_amd_topology_leaves(cpu);
3094	}
3095
3096	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3097
3098	/*
3099	 * Before we can calculate the IDs that we should assign to this
3100	 * processor, we need to understand how many cores and threads it has.
3101	 */
3102	switch (cpi->cpi_vendor) {
3103	case X86_VENDOR_Intel:
3104		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3105		    &cpi->cpi_ncore_per_chip);
3106		break;
3107	case X86_VENDOR_AMD:
3108		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3109		    &cpi->cpi_ncore_per_chip);
3110		break;
3111	default:
3112		/*
		 * If we have some other x86 compatible chip, it's not clear
		 * how it would behave. The most common case is virtualization
3115		 * today, though there are also 64-bit VIA chips. Assume that
3116		 * all we can get is the basic Leaf 1 HTT information.
3117		 */
3118		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3119			cpi->cpi_ncore_per_chip = 1;
3120			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3121		}
3122		break;
3123	}
3124
3125	/*
3126	 * Based on the calculated number of threads and cores, potentially
3127	 * assign the HTT and CMT features.
3128	 */
3129	if (cpi->cpi_ncore_per_chip > 1) {
3130		add_x86_feature(featureset, X86FSET_CMP);
3131	}
3132
3133	if (cpi->cpi_ncpu_per_chip > 1 &&
3134	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3135		add_x86_feature(featureset, X86FSET_HTT);
3136	}
3137
3138	/*
	 * Now that the core and thread counts have been set up, we need to go
	 * through and calculate the rest of the topology parameters. If we
	 * think the CPU doesn't have either SMT (HTT) or CMP, then we
	 * basically go through and fake
3142	 * up information in some way. The most likely case for this is
3143	 * virtualization where we have a lot of partial topology information.
3144	 */
3145	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3146	    !is_x86_feature(featureset, X86FSET_CMP)) {
3147		/*
3148		 * This is a single core, single-threaded processor.
3149		 */
3150		cpi->cpi_procnodes_per_pkg = 1;
3151		cpi->cpi_cores_per_compunit = 1;
3152		cpi->cpi_compunitid = 0;
3153		cpi->cpi_chipid = -1;
3154		cpi->cpi_clogid = 0;
3155		cpi->cpi_coreid = cpu->cpu_id;
3156		cpi->cpi_pkgcoreid = 0;
3157		if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3158			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3159		} else {
3160			cpi->cpi_procnodeid = cpi->cpi_chipid;
3161		}
3162	} else {
3163		switch (cpi->cpi_vendor) {
3164		case X86_VENDOR_Intel:
3165			cpuid_intel_getids(cpu, featureset);
3166			break;
3167		case X86_VENDOR_AMD:
3168			cpuid_amd_getids(cpu, featureset);
3169			break;
3170		default:
3171			/*
3172			 * In this case, it's hard to say what we should do.
3173			 * We're going to model them to the OS as single core
3174			 * threads. We don't have a good identifier for them, so
3175			 * we're just going to use the cpu id all on a single
3176			 * chip.
3177			 *
3178			 * This case has historically been different from the
3179			 * case above where we don't have HTT or CMP. While they
3180			 * could be combined, we've opted to keep it separate to
3181			 * minimize the risk of topology changes in weird cases.
3182			 */
3183			cpi->cpi_procnodes_per_pkg = 1;
3184			cpi->cpi_cores_per_compunit = 1;
3185			cpi->cpi_chipid = 0;
3186			cpi->cpi_coreid = cpu->cpu_id;
3187			cpi->cpi_clogid = cpu->cpu_id;
3188			cpi->cpi_pkgcoreid = cpu->cpu_id;
3189			cpi->cpi_procnodeid = cpi->cpi_chipid;
3190			cpi->cpi_compunitid = cpi->cpi_coreid;
3191			break;
3192		}
3193	}
3194}
3195
3196/*
 * Gather relevant CPU features from leaf 6, which covers thermal information.
 * We always gather leaf 6 if it's supported; however, we only look for
 * features on Intel systems, as AMD does not currently define any of the
 * features we look for below.
3201 */
3202static void
3203cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3204{
3205	struct cpuid_regs *cp;
3206	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3207
3208	if (cpi->cpi_maxeax < 6) {
3209		return;
3210	}
3211
3212	cp = &cpi->cpi_std[6];
3213	cp->cp_eax = 6;
3214	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3215	(void) __cpuid_insn(cp);
3216	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3217
3218	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3219		return;
3220	}
3221
3222	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3223		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3224	}
3225
3226	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3227		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3228	}
3229}
3230
3231void
3232cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3233{
3234	uint32_t mask_ecx, mask_edx;
3235	struct cpuid_info *cpi;
3236	struct cpuid_regs *cp;
3237	int xcpuid;
3238#if !defined(__xpv)
3239	extern int idle_cpu_prefer_mwait;
3240#endif
3241
3242	/*
3243	 * Space statically allocated for BSP, ensure pointer is set
3244	 */
3245	if (cpu->cpu_id == 0) {
3246		if (cpu->cpu_m.mcpu_cpi == NULL)
3247			cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3248	}
3249
3250	add_x86_feature(featureset, X86FSET_CPUID);
3251
3252	cpi = cpu->cpu_m.mcpu_cpi;
3253	ASSERT(cpi != NULL);
3254	cp = &cpi->cpi_std[0];
3255	cp->cp_eax = 0;
3256	cpi->cpi_maxeax = __cpuid_insn(cp);
3257	{
3258		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3259		*iptr++ = cp->cp_ebx;
3260		*iptr++ = cp->cp_edx;
3261		*iptr++ = cp->cp_ecx;
3262		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3263	}
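	/*
	 * Note that leaf 0 returns the vendor string in %ebx, %edx, %ecx
	 * order; e.g. on Intel parts the three registers hold "Genu", "ineI",
	 * and "ntel", yielding "GenuineIntel" above.
	 */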
3264
3265	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3266	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3267
3268	/*
3269	 * Limit the range in case of weird hardware
3270	 */
3271	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3272		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3273	if (cpi->cpi_maxeax < 1)
3274		goto pass1_done;
3275
3276	cp = &cpi->cpi_std[1];
3277	cp->cp_eax = 1;
3278	(void) __cpuid_insn(cp);
3279
3280	/*
3281	 * Extract identifying constants for easy access.
3282	 */
3283	cpi->cpi_model = CPI_MODEL(cpi);
3284	cpi->cpi_family = CPI_FAMILY(cpi);
3285
3286	if (cpi->cpi_family == 0xf)
3287		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3288
3289	/*
3290	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3291	 * Intel, and presumably everyone else, uses model == 0xf, as
3292	 * one would expect (max value means possible overflow).  Sigh.
3293	 */
3294
3295	switch (cpi->cpi_vendor) {
3296	case X86_VENDOR_Intel:
3297		if (IS_EXTENDED_MODEL_INTEL(cpi))
3298			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3299		break;
3300	case X86_VENDOR_AMD:
3301		if (CPI_FAMILY(cpi) == 0xf)
3302			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3303		break;
3304	default:
3305		if (cpi->cpi_model == 0xf)
3306			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3307		break;
3308	}
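	/*
	 * As a worked example (values hypothetical): a base family of 0xf
	 * with an extended family of 0x6 yields cpi_family 0x15, and a base
	 * model of 0x1 with an extended model of 0x6 yields cpi_model 0x61.
	 */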
3309
3310	cpi->cpi_step = CPI_STEP(cpi);
3311	cpi->cpi_brandid = CPI_BRANDID(cpi);
3312
3313	/*
3314	 * *default* assumptions:
3315	 * - believe %edx feature word
3316	 * - ignore %ecx feature word
3317	 * - 32-bit virtual and physical addressing
3318	 */
3319	mask_edx = 0xffffffff;
3320	mask_ecx = 0;
3321
3322	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3323
3324	switch (cpi->cpi_vendor) {
3325	case X86_VENDOR_Intel:
3326		if (cpi->cpi_family == 5)
3327			x86_type = X86_TYPE_P5;
3328		else if (IS_LEGACY_P6(cpi)) {
3329			x86_type = X86_TYPE_P6;
3330			pentiumpro_bug4046376 = 1;
3331			/*
3332			 * Clear the SEP bit when it was set erroneously
3333			 */
3334			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3335				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3336		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3337			x86_type = X86_TYPE_P4;
3338			/*
3339			 * We don't currently depend on any of the %ecx
3340			 * features until Prescott, so we'll only check
3341			 * this from P4 onwards.  We might want to revisit
3342			 * that idea later.
3343			 */
3344			mask_ecx = 0xffffffff;
3345		} else if (cpi->cpi_family > 0xf)
3346			mask_ecx = 0xffffffff;
3347		/*
3348		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3349		 * to obtain the monitor linesize.
3350		 */
3351		if (cpi->cpi_maxeax < 5)
3352			mask_ecx &= ~CPUID_INTC_ECX_MON;
3353		break;
3354	case X86_VENDOR_IntelClone:
3355	default:
3356		break;
3357	case X86_VENDOR_AMD:
3358#if defined(OPTERON_ERRATUM_108)
3359		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3360			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3361			cpi->cpi_model = 0xc;
3362		} else
3363#endif
3364		if (cpi->cpi_family == 5) {
3365			/*
3366			 * AMD K5 and K6
3367			 *
3368			 * These CPUs have an incomplete implementation
3369			 * of MCA/MCE which we mask away.
3370			 */
3371			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3372
3373			/*
3374			 * Model 0 uses the wrong (APIC) bit
3375			 * to indicate PGE.  Fix it here.
3376			 */
3377			if (cpi->cpi_model == 0) {
3378				if (cp->cp_edx & 0x200) {
3379					cp->cp_edx &= ~0x200;
3380					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3381				}
3382			}
3383
3384			/*
3385			 * Early models had problems w/ MMX; disable.
3386			 */
3387			if (cpi->cpi_model < 6)
3388				mask_edx &= ~CPUID_INTC_EDX_MMX;
3389		}
3390
3391		/*
3392		 * For newer families, SSE3 and CX16, at least, are valid;
3393		 * enable all
3394		 */
3395		if (cpi->cpi_family >= 0xf)
3396			mask_ecx = 0xffffffff;
3397		/*
3398		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3399		 * to obtain the monitor linesize.
3400		 */
3401		if (cpi->cpi_maxeax < 5)
3402			mask_ecx &= ~CPUID_INTC_ECX_MON;
3403
3404#if !defined(__xpv)
3405		/*
3406		 * AMD has not historically used MWAIT in the CPU's idle loop.
3407		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3408		 * know for certain that in at least family 17h, per AMD, mwait
3409		 * is preferred. Families in-between are less certain.
3410		 */
3411		if (cpi->cpi_family < 0x17) {
3412			idle_cpu_prefer_mwait = 0;
3413		}
3414#endif
3415
3416		break;
3417	case X86_VENDOR_TM:
3418		/*
3419		 * workaround the NT workaround in CMS 4.1
3420		 */
3421		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3422		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3423			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3424		break;
3425	case X86_VENDOR_Centaur:
3426		/*
3427		 * workaround the NT workarounds again
3428		 */
3429		if (cpi->cpi_family == 6)
3430			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3431		break;
3432	case X86_VENDOR_Cyrix:
3433		/*
3434		 * We rely heavily on the probing in locore
3435		 * to actually figure out what parts, if any,
3436		 * of the Cyrix cpuid instruction to believe.
3437		 */
3438		switch (x86_type) {
3439		case X86_TYPE_CYRIX_486:
3440			mask_edx = 0;
3441			break;
3442		case X86_TYPE_CYRIX_6x86:
3443			mask_edx = 0;
3444			break;
3445		case X86_TYPE_CYRIX_6x86L:
3446			mask_edx =
3447			    CPUID_INTC_EDX_DE |
3448			    CPUID_INTC_EDX_CX8;
3449			break;
3450		case X86_TYPE_CYRIX_6x86MX:
3451			mask_edx =
3452			    CPUID_INTC_EDX_DE |
3453			    CPUID_INTC_EDX_MSR |
3454			    CPUID_INTC_EDX_CX8 |
3455			    CPUID_INTC_EDX_PGE |
3456			    CPUID_INTC_EDX_CMOV |
3457			    CPUID_INTC_EDX_MMX;
3458			break;
3459		case X86_TYPE_CYRIX_GXm:
3460			mask_edx =
3461			    CPUID_INTC_EDX_MSR |
3462			    CPUID_INTC_EDX_CX8 |
3463			    CPUID_INTC_EDX_CMOV |
3464			    CPUID_INTC_EDX_MMX;
3465			break;
3466		case X86_TYPE_CYRIX_MediaGX:
3467			break;
3468		case X86_TYPE_CYRIX_MII:
3469		case X86_TYPE_VIA_CYRIX_III:
3470			mask_edx =
3471			    CPUID_INTC_EDX_DE |
3472			    CPUID_INTC_EDX_TSC |
3473			    CPUID_INTC_EDX_MSR |
3474			    CPUID_INTC_EDX_CX8 |
3475			    CPUID_INTC_EDX_PGE |
3476			    CPUID_INTC_EDX_CMOV |
3477			    CPUID_INTC_EDX_MMX;
3478			break;
3479		default:
3480			break;
3481		}
3482		break;
3483	}
3484
3485#if defined(__xpv)
3486	/*
3487	 * Do not support MONITOR/MWAIT under a hypervisor
3488	 */
3489	mask_ecx &= ~CPUID_INTC_ECX_MON;
3490	/*
3491	 * Do not support XSAVE under a hypervisor for now
3492	 */
3493	xsave_force_disable = B_TRUE;
3494
3495#endif	/* __xpv */
3496
3497	if (xsave_force_disable) {
3498		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3499		mask_ecx &= ~CPUID_INTC_ECX_AVX;
3500		mask_ecx &= ~CPUID_INTC_ECX_F16C;
3501		mask_ecx &= ~CPUID_INTC_ECX_FMA;
3502	}
3503
3504	/*
3505	 * Now we've figured out the masks that determine
3506	 * which bits we choose to believe, apply the masks
3507	 * to the feature words, then map the kernel's view
3508	 * of these feature words into its feature word.
3509	 */
3510	cp->cp_edx &= mask_edx;
3511	cp->cp_ecx &= mask_ecx;
3512
3513	/*
3514	 * apply any platform restrictions (we don't call this
3515	 * immediately after __cpuid_insn here, because we need the
3516	 * workarounds applied above first)
3517	 */
3518	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3519
3520	/*
3521	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3522	 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3523	 */
3524	if (cpi->cpi_maxeax >= 7) {
3525		struct cpuid_regs *ecp;
3526		ecp = &cpi->cpi_std[7];
3527		ecp->cp_eax = 7;
3528		ecp->cp_ecx = 0;
3529		(void) __cpuid_insn(ecp);
3530
3531		/*
3532		 * If XSAVE has been disabled, just ignore all of the
3533		 * extended-save-area dependent flags here.
3534		 */
3535		if (xsave_force_disable) {
3536			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3537			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3538			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3539			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3540			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3541			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3542			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3543		}
3544
3545		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3546			add_x86_feature(featureset, X86FSET_SMEP);
3547
3548		/*
3549		 * We check disable_smap here in addition to in startup_smap()
3550		 * to ensure CPUs that aren't the boot CPU don't accidentally
3551		 * include it in the feature set and thus generate a mismatched
3552		 * x86 feature set across CPUs.
3553		 */
3554		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3555		    disable_smap == 0)
3556			add_x86_feature(featureset, X86FSET_SMAP);
3557
3558		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3559			add_x86_feature(featureset, X86FSET_RDSEED);
3560
3561		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3562			add_x86_feature(featureset, X86FSET_ADX);
3563
3564		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3565			add_x86_feature(featureset, X86FSET_FSGSBASE);
3566
3567		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3568			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3569
3570		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3571			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3572				add_x86_feature(featureset, X86FSET_INVPCID);
3573
3574			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3575				add_x86_feature(featureset, X86FSET_MPX);
3576
3577			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3578				add_x86_feature(featureset, X86FSET_CLWB);
3579		}
3580	}
3581
3582	/*
3583	 * fold in overrides from the "eeprom" mechanism
3584	 */
3585	cp->cp_edx |= cpuid_feature_edx_include;
3586	cp->cp_edx &= ~cpuid_feature_edx_exclude;
3587
3588	cp->cp_ecx |= cpuid_feature_ecx_include;
3589	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3590
3591	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3592		add_x86_feature(featureset, X86FSET_LARGEPAGE);
3593	}
3594	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3595		add_x86_feature(featureset, X86FSET_TSC);
3596	}
3597	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3598		add_x86_feature(featureset, X86FSET_MSR);
3599	}
3600	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3601		add_x86_feature(featureset, X86FSET_MTRR);
3602	}
3603	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3604		add_x86_feature(featureset, X86FSET_PGE);
3605	}
3606	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3607		add_x86_feature(featureset, X86FSET_CMOV);
3608	}
3609	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3610		add_x86_feature(featureset, X86FSET_MMX);
3611	}
3612	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3613	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3614		add_x86_feature(featureset, X86FSET_MCA);
3615	}
3616	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3617		add_x86_feature(featureset, X86FSET_PAE);
3618	}
3619	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3620		add_x86_feature(featureset, X86FSET_CX8);
3621	}
3622	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3623		add_x86_feature(featureset, X86FSET_CX16);
3624	}
3625	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3626		add_x86_feature(featureset, X86FSET_PAT);
3627	}
3628	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3629		add_x86_feature(featureset, X86FSET_SEP);
3630	}
3631	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3632		/*
3633		 * In our implementation, fxsave/fxrstor
3634		 * are prerequisites before we'll even
3635		 * try and do SSE things.
3636		 */
3637		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3638			add_x86_feature(featureset, X86FSET_SSE);
3639		}
3640		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3641			add_x86_feature(featureset, X86FSET_SSE2);
3642		}
3643		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3644			add_x86_feature(featureset, X86FSET_SSE3);
3645		}
3646		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3647			add_x86_feature(featureset, X86FSET_SSSE3);
3648		}
3649		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3650			add_x86_feature(featureset, X86FSET_SSE4_1);
3651		}
3652		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3653			add_x86_feature(featureset, X86FSET_SSE4_2);
3654		}
3655		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3656			add_x86_feature(featureset, X86FSET_AES);
3657		}
3658		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3659			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3660		}
3661
3662		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3663			add_x86_feature(featureset, X86FSET_SHA);
3664
3665		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3666			add_x86_feature(featureset, X86FSET_UMIP);
3667		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3668			add_x86_feature(featureset, X86FSET_PKU);
3669		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3670			add_x86_feature(featureset, X86FSET_OSPKE);
3671
3672		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3673			add_x86_feature(featureset, X86FSET_XSAVE);
3674
3675			/* We only test AVX & AVX512 when there is XSAVE */
3676
3677			if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3678				add_x86_feature(featureset,
3679				    X86FSET_AVX);
3680
3681				/*
3682				 * Intel says we can't check these without also
3683				 * checking AVX.
3684				 */
3685				if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3686					add_x86_feature(featureset,
3687					    X86FSET_F16C);
3688
3689				if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3690					add_x86_feature(featureset,
3691					    X86FSET_FMA);
3692
3693				if (cpi->cpi_std[7].cp_ebx &
3694				    CPUID_INTC_EBX_7_0_BMI1)
3695					add_x86_feature(featureset,
3696					    X86FSET_BMI1);
3697
3698				if (cpi->cpi_std[7].cp_ebx &
3699				    CPUID_INTC_EBX_7_0_BMI2)
3700					add_x86_feature(featureset,
3701					    X86FSET_BMI2);
3702
3703				if (cpi->cpi_std[7].cp_ebx &
3704				    CPUID_INTC_EBX_7_0_AVX2)
3705					add_x86_feature(featureset,
3706					    X86FSET_AVX2);
3707			}
3708
3709			if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3710			    (cpi->cpi_std[7].cp_ebx &
3711			    CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3712				add_x86_feature(featureset, X86FSET_AVX512F);
3713
3714				if (cpi->cpi_std[7].cp_ebx &
3715				    CPUID_INTC_EBX_7_0_AVX512DQ)
3716					add_x86_feature(featureset,
3717					    X86FSET_AVX512DQ);
3718				if (cpi->cpi_std[7].cp_ebx &
3719				    CPUID_INTC_EBX_7_0_AVX512IFMA)
3720					add_x86_feature(featureset,
3721					    X86FSET_AVX512FMA);
3722				if (cpi->cpi_std[7].cp_ebx &
3723				    CPUID_INTC_EBX_7_0_AVX512PF)
3724					add_x86_feature(featureset,
3725					    X86FSET_AVX512PF);
3726				if (cpi->cpi_std[7].cp_ebx &
3727				    CPUID_INTC_EBX_7_0_AVX512ER)
3728					add_x86_feature(featureset,
3729					    X86FSET_AVX512ER);
3730				if (cpi->cpi_std[7].cp_ebx &
3731				    CPUID_INTC_EBX_7_0_AVX512CD)
3732					add_x86_feature(featureset,
3733					    X86FSET_AVX512CD);
3734				if (cpi->cpi_std[7].cp_ebx &
3735				    CPUID_INTC_EBX_7_0_AVX512BW)
3736					add_x86_feature(featureset,
3737					    X86FSET_AVX512BW);
3738				if (cpi->cpi_std[7].cp_ebx &
3739				    CPUID_INTC_EBX_7_0_AVX512VL)
3740					add_x86_feature(featureset,
3741					    X86FSET_AVX512VL);
3742
3743				if (cpi->cpi_std[7].cp_ecx &
3744				    CPUID_INTC_ECX_7_0_AVX512VBMI)
3745					add_x86_feature(featureset,
3746					    X86FSET_AVX512VBMI);
3747				if (cpi->cpi_std[7].cp_ecx &
3748				    CPUID_INTC_ECX_7_0_AVX512VNNI)
3749					add_x86_feature(featureset,
3750					    X86FSET_AVX512VNNI);
3751				if (cpi->cpi_std[7].cp_ecx &
3752				    CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3753					add_x86_feature(featureset,
3754					    X86FSET_AVX512VPOPCDQ);
3755
3756				if (cpi->cpi_std[7].cp_edx &
3757				    CPUID_INTC_EDX_7_0_AVX5124NNIW)
3758					add_x86_feature(featureset,
3759					    X86FSET_AVX512NNIW);
3760				if (cpi->cpi_std[7].cp_edx &
3761				    CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3762					add_x86_feature(featureset,
3763					    X86FSET_AVX512FMAPS);
3764			}
3765		}
3766	}
3767
3768	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3769		if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3770			add_x86_feature(featureset, X86FSET_PCID);
3771		}
3772	}
3773
3774	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3775		add_x86_feature(featureset, X86FSET_X2APIC);
3776	}
3777	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3778		add_x86_feature(featureset, X86FSET_DE);
3779	}
3780#if !defined(__xpv)
3781	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3782
3783		/*
3784		 * We require the CLFLUSH instruction for erratum workaround
3785		 * to use MONITOR/MWAIT.
3786		 */
3787		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3788			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3789			add_x86_feature(featureset, X86FSET_MWAIT);
3790		} else {
3791			extern int idle_cpu_assert_cflush_monitor;
3792
3793			/*
3794			 * All processors we are aware of which have
3795			 * MONITOR/MWAIT also have CLFLUSH.
3796			 */
3797			if (idle_cpu_assert_cflush_monitor) {
3798				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3799				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3800			}
3801		}
3802	}
3803#endif	/* __xpv */
3804
3805	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3806		add_x86_feature(featureset, X86FSET_VMX);
3807	}
3808
3809	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3810		add_x86_feature(featureset, X86FSET_RDRAND);
3811
3812	/*
	 * We only need this the first time; the rest of the CPUs will follow
	 * suit. We only capture this for the boot CPU.
3815	 */
3816	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3817		add_x86_feature(featureset, X86FSET_CLFSH);
3818		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
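		/*
		 * For example, a CLFLUSH line size field of 8 in %ebx[15:8]
		 * corresponds to a 64-byte flush granularity.
		 */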
3819	}
3820	if (is_x86_feature(featureset, X86FSET_PAE))
3821		cpi->cpi_pabits = 36;
3822
3823	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3824		struct cpuid_regs r, *ecp;
3825
3826		ecp = &r;
3827		ecp->cp_eax = 0xD;
3828		ecp->cp_ecx = 1;
3829		ecp->cp_edx = ecp->cp_ebx = 0;
3830		(void) __cpuid_insn(ecp);
3831
3832		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3833			add_x86_feature(featureset, X86FSET_XSAVEOPT);
3834		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3835			add_x86_feature(featureset, X86FSET_XSAVEC);
3836		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3837			add_x86_feature(featureset, X86FSET_XSAVES);
3838	}
3839
3840	/*
3841	 * Work on the "extended" feature information, doing
3842	 * some basic initialization for cpuid_pass2()
3843	 */
3844	xcpuid = 0;
3845	switch (cpi->cpi_vendor) {
3846	case X86_VENDOR_Intel:
3847		/*
3848		 * On KVM we know we will have proper support for extended
3849		 * cpuid.
3850		 */
3851		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3852		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3853		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3854			xcpuid++;
3855		break;
3856	case X86_VENDOR_AMD:
3857		if (cpi->cpi_family > 5 ||
3858		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3859			xcpuid++;
3860		break;
3861	case X86_VENDOR_Cyrix:
3862		/*
3863		 * Only these Cyrix CPUs are -known- to support
3864		 * extended cpuid operations.
3865		 */
3866		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3867		    x86_type == X86_TYPE_CYRIX_GXm)
3868			xcpuid++;
3869		break;
3870	case X86_VENDOR_Centaur:
3871	case X86_VENDOR_TM:
3872	default:
3873		xcpuid++;
3874		break;
3875	}
3876
3877	if (xcpuid) {
3878		cp = &cpi->cpi_extd[0];
3879		cp->cp_eax = CPUID_LEAF_EXT_0;
3880		cpi->cpi_xmaxeax = __cpuid_insn(cp);
3881	}
3882
3883	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3884
3885		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3886			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3887
3888		switch (cpi->cpi_vendor) {
3889		case X86_VENDOR_Intel:
3890		case X86_VENDOR_AMD:
3891			if (cpi->cpi_xmaxeax < 0x80000001)
3892				break;
3893			cp = &cpi->cpi_extd[1];
3894			cp->cp_eax = 0x80000001;
3895			(void) __cpuid_insn(cp);
3896
3897			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3898			    cpi->cpi_family == 5 &&
3899			    cpi->cpi_model == 6 &&
3900			    cpi->cpi_step == 6) {
3901				/*
				 * K6 model 6 uses bit 10 to indicate SYSC.
3903				 * Later models use bit 11. Fix it here.
3904				 */
3905				if (cp->cp_edx & 0x400) {
3906					cp->cp_edx &= ~0x400;
3907					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3908				}
3909			}
3910
3911			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3912
3913			/*
3914			 * Compute the additions to the kernel's feature word.
3915			 */
3916			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3917				add_x86_feature(featureset, X86FSET_NX);
3918			}
3919
3920			/*
			 * Regardless of whether or not we boot 64-bit,
3922			 * we should have a way to identify whether
3923			 * the CPU is capable of running 64-bit.
3924			 */
3925			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3926				add_x86_feature(featureset, X86FSET_64);
3927			}
3928
3929			/* 1 GB large page - enable only for 64 bit kernel */
3930			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3931				add_x86_feature(featureset, X86FSET_1GPG);
3932			}
3933
3934			if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3935			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3936			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3937				add_x86_feature(featureset, X86FSET_SSE4A);
3938			}
3939
3940			/*
3941			 * It's really tricky to support syscall/sysret in
3942			 * the i386 kernel; we rely on sysenter/sysexit
3943			 * instead.  In the amd64 kernel, things are -way-
3944			 * better.
3945			 */
3946			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3947				add_x86_feature(featureset, X86FSET_ASYSC);
3948			}
3949
3950			/*
3951			 * While we're thinking about system calls, note
3952			 * that AMD processors don't support sysenter
3953			 * in long mode at all, so don't try to program them.
3954			 */
3955			if (x86_vendor == X86_VENDOR_AMD) {
3956				remove_x86_feature(featureset, X86FSET_SEP);
3957			}
3958
3959			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3960				add_x86_feature(featureset, X86FSET_TSCP);
3961			}
3962
3963			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3964				add_x86_feature(featureset, X86FSET_SVM);
3965			}
3966
3967			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3968				add_x86_feature(featureset, X86FSET_TOPOEXT);
3969			}
3970
3971			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3972				add_x86_feature(featureset, X86FSET_AMD_PCEC);
3973			}
3974
3975			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3976				add_x86_feature(featureset, X86FSET_XOP);
3977			}
3978
3979			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3980				add_x86_feature(featureset, X86FSET_FMA4);
3981			}
3982
3983			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3984				add_x86_feature(featureset, X86FSET_TBM);
3985			}
3986
3987			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3988				add_x86_feature(featureset, X86FSET_MONITORX);
3989			}
3990			break;
3991		default:
3992			break;
3993		}
3994
3995		/*
3996		 * Get CPUID data about processor cores and hyperthreads.
3997		 */
3998		switch (cpi->cpi_vendor) {
3999		case X86_VENDOR_Intel:
4000			if (cpi->cpi_maxeax >= 4) {
4001				cp = &cpi->cpi_std[4];
4002				cp->cp_eax = 4;
4003				cp->cp_ecx = 0;
4004				(void) __cpuid_insn(cp);
4005				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4006			}
4007			/*FALLTHROUGH*/
4008		case X86_VENDOR_AMD:
4009			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4010				break;
4011			cp = &cpi->cpi_extd[8];
4012			cp->cp_eax = CPUID_LEAF_EXT_8;
4013			(void) __cpuid_insn(cp);
4014			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4015			    cp);
4016
4017			/*
4018			 * AMD uses ebx for some extended functions.
4019			 */
4020			if (cpi->cpi_vendor == X86_VENDOR_AMD) {
4021				/*
4022				 * While we're here, check for the AMD "Error
4023				 * Pointer Zero/Restore" feature. This can be
				 * used to set up the FP save handlers
4025				 * appropriately.
4026				 */
4027				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4028					cpi->cpi_fp_amd_save = 0;
4029				} else {
4030					cpi->cpi_fp_amd_save = 1;
4031				}
4032
4033				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4034					add_x86_feature(featureset,
4035					    X86FSET_CLZERO);
4036				}
4037			}
4038
4039			/*
4040			 * Virtual and physical address limits from
4041			 * cpuid override previously guessed values.
4042			 */
4043			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4044			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
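			/*
			 * For example, a leaf 0x80000008 %eax value of 0x3030
			 * would indicate 48 bits of physical and 48 bits of
			 * virtual address space.
			 */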
4045			break;
4046		default:
4047			break;
4048		}
4049
4050		/*
4051		 * Get CPUID data about TSC Invariance in Deep C-State.
4052		 */
4053		switch (cpi->cpi_vendor) {
4054		case X86_VENDOR_Intel:
4055		case X86_VENDOR_AMD:
			if (cpi->cpi_xmaxeax >= 0x80000007) {
4057				cp = &cpi->cpi_extd[7];
4058				cp->cp_eax = 0x80000007;
4059				cp->cp_ecx = 0;
4060				(void) __cpuid_insn(cp);
4061			}
4062			break;
4063		default:
4064			break;
4065		}
4066	}
4067
4068	cpuid_pass1_topology(cpu, featureset);
4069	cpuid_pass1_thermal(cpu, featureset);
4070
4071	/*
4072	 * Synthesize chip "revision" and socket type
4073	 */
4074	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4075	    cpi->cpi_model, cpi->cpi_step);
4076	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4077	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4078	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4079	    cpi->cpi_model, cpi->cpi_step);
4080
4081	if (cpi->cpi_vendor == X86_VENDOR_AMD) {
4082		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4083		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4084			/* Special handling for AMD FP not necessary. */
4085			cpi->cpi_fp_amd_save = 0;
4086		} else {
4087			cpi->cpi_fp_amd_save = 1;
4088		}
4089	}
4090
4091	/*
4092	 * Check the processor leaves that are used for security features.
4093	 */
4094	cpuid_scan_security(cpu, featureset);
4095
4096pass1_done:
4097	cpi->cpi_pass = 1;
4098}
4099
4100/*
4101 * Make copies of the cpuid table entries we depend on, in
4102 * part for ease of parsing now, in part so that we have only
4103 * one place to correct any of it, in part for ease of
4104 * later export to userland, and in part so we can look at
4105 * this stuff in a crash dump.
4106 */
4107
4108/*ARGSUSED*/
4109void
4110cpuid_pass2(cpu_t *cpu)
4111{
4112	uint_t n, nmax;
4113	int i;
4114	struct cpuid_regs *cp;
4115	uint8_t *dp;
4116	uint32_t *iptr;
4117	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4118
4119	ASSERT(cpi->cpi_pass == 1);
4120
4121	if (cpi->cpi_maxeax < 1)
4122		goto pass2_done;
4123
4124	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4125		nmax = NMAX_CPI_STD;
4126	/*
4127	 * (We already handled n == 0 and n == 1 in pass 1)
4128	 */
4129	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4130		/*
4131		 * leaves 6 and 7 were handled in pass 1
4132		 */
4133		if (n == 6 || n == 7)
4134			continue;
4135
4136		cp->cp_eax = n;
4137
4138		/*
4139		 * CPUID function 4 expects %ecx to be initialized
4140		 * with an index which indicates which cache to return
4141		 * information about. The OS is expected to call function 4
4142		 * with %ecx set to 0, 1, 2, ... until it returns with
4143		 * EAX[4:0] set to 0, which indicates there are no more
4144		 * caches.
4145		 *
4146		 * Here, populate cpi_std[4] with the information returned by
4147		 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
4148		 * when dynamic memory allocation becomes available.
4149		 *
4150		 * Note: we need to explicitly initialize %ecx here, since
4151		 * function 4 may have been previously invoked.
4152		 */
4153		if (n == 4)
4154			cp->cp_ecx = 0;
4155
4156		(void) __cpuid_insn(cp);
4157		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4158		switch (n) {
4159		case 2:
4160			/*
4161			 * "the lower 8 bits of the %eax register
4162			 * contain a value that identifies the number
4163			 * of times the cpuid [instruction] has to be
4164			 * executed to obtain a complete image of the
4165			 * processor's caching systems."
4166			 *
4167			 * How *do* they make this stuff up?
4168			 */
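			/*
			 * As an example, a count byte of 1 in %eax means a
			 * single cpuid invocation suffices; the remaining
			 * bytes of %eax, %ebx, %ecx, and %edx (when each
			 * register's bit 31 is clear) are one-byte cache and
			 * TLB descriptors, which are copied into
			 * cpi_cacheinfo below.
			 */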
4169			cpi->cpi_ncache = sizeof (*cp) *
4170			    BITX(cp->cp_eax, 7, 0);
4171			if (cpi->cpi_ncache == 0)
4172				break;
4173			cpi->cpi_ncache--;	/* skip count byte */
4174
4175			/*
4176			 * Well, for now, rather than attempt to implement
4177			 * this slightly dubious algorithm, we just look
			 * at the first 15 descriptor bytes.
4179			 */
4180			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4181				cpi->cpi_ncache = sizeof (*cp) - 1;
4182
4183			dp = cpi->cpi_cacheinfo;
4184			if (BITX(cp->cp_eax, 31, 31) == 0) {
4185				uint8_t *p = (void *)&cp->cp_eax;
4186				for (i = 1; i < 4; i++)
4187					if (p[i] != 0)
4188						*dp++ = p[i];
4189			}
4190			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4191				uint8_t *p = (void *)&cp->cp_ebx;
4192				for (i = 0; i < 4; i++)
4193					if (p[i] != 0)
4194						*dp++ = p[i];
4195			}
4196			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4197				uint8_t *p = (void *)&cp->cp_ecx;
4198				for (i = 0; i < 4; i++)
4199					if (p[i] != 0)
4200						*dp++ = p[i];
4201			}
4202			if (BITX(cp->cp_edx, 31, 31) == 0) {
4203				uint8_t *p = (void *)&cp->cp_edx;
4204				for (i = 0; i < 4; i++)
4205					if (p[i] != 0)
4206						*dp++ = p[i];
4207			}
4208			break;
4209
4210		case 3:	/* Processor serial number, if PSN supported */
4211			break;
4212
4213		case 4:	/* Deterministic cache parameters */
4214			break;
4215
4216		case 5:	/* Monitor/Mwait parameters */
4217		{
4218			size_t mwait_size;
4219
4220			/*
4221			 * check cpi_mwait.support which was set in cpuid_pass1
4222			 */
4223			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4224				break;
4225
4226			/*
			 * Protect ourselves from an insane mwait line size.
4228			 * Workaround for incomplete hardware emulator(s).
4229			 */
4230			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4231			if (mwait_size < sizeof (uint32_t) ||
4232			    !ISP2(mwait_size)) {
4233#if DEBUG
4234				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4235				    "size %ld", cpu->cpu_id, (long)mwait_size);
4236#endif
4237				break;
4238			}
4239
4240			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4241			cpi->cpi_mwait.mon_max = mwait_size;
4242			if (MWAIT_EXTENSION(cpi)) {
4243				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4244				if (MWAIT_INT_ENABLE(cpi))
4245					cpi->cpi_mwait.support |=
4246					    MWAIT_ECX_INT_ENABLE;
4247			}
4248			break;
4249		}
4250		default:
4251			break;
4252		}
4253	}
4254
4255	/*
4256	 * XSAVE enumeration
4257	 */
4258	if (cpi->cpi_maxeax >= 0xD) {
4259		struct cpuid_regs regs;
4260		boolean_t cpuid_d_valid = B_TRUE;
4261
4262		cp = &regs;
4263		cp->cp_eax = 0xD;
4264		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4265
4266		(void) __cpuid_insn(cp);
4267
4268		/*
4269		 * Sanity checks for debug
4270		 */
4271		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4272		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4273			cpuid_d_valid = B_FALSE;
4274		}
4275
4276		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4277		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4278		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4279
4280		/*
4281		 * If the hw supports AVX, get the size and offset in the save
4282		 * area for the ymm state.
4283		 */
4284		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4285			cp->cp_eax = 0xD;
4286			cp->cp_ecx = 2;
4287			cp->cp_edx = cp->cp_ebx = 0;
4288
4289			(void) __cpuid_insn(cp);
4290
4291			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4292			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4293				cpuid_d_valid = B_FALSE;
4294			}
4295
4296			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4297			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4298		}
4299
4300		/*
4301		 * If the hw supports MPX, get the size and offset in the
4302		 * save area for BNDREGS and BNDCSR.
4303		 */
4304		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4305			cp->cp_eax = 0xD;
4306			cp->cp_ecx = 3;
4307			cp->cp_edx = cp->cp_ebx = 0;
4308
4309			(void) __cpuid_insn(cp);
4310
4311			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4312			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4313
4314			cp->cp_eax = 0xD;
4315			cp->cp_ecx = 4;
4316			cp->cp_edx = cp->cp_ebx = 0;
4317
4318			(void) __cpuid_insn(cp);
4319
4320			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4321			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4322		}
4323
4324		/*
4325		 * If the hw supports AVX512, get the size and offset in the
4326		 * save area for the opmask registers and zmm state.
4327		 */
4328		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4329			cp->cp_eax = 0xD;
4330			cp->cp_ecx = 5;
4331			cp->cp_edx = cp->cp_ebx = 0;
4332
4333			(void) __cpuid_insn(cp);
4334
4335			cpi->cpi_xsave.opmask_size = cp->cp_eax;
4336			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4337
4338			cp->cp_eax = 0xD;
4339			cp->cp_ecx = 6;
4340			cp->cp_edx = cp->cp_ebx = 0;
4341
4342			(void) __cpuid_insn(cp);
4343
4344			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4345			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4346
4347			cp->cp_eax = 0xD;
4348			cp->cp_ecx = 7;
4349			cp->cp_edx = cp->cp_ebx = 0;
4350
4351			(void) __cpuid_insn(cp);
4352
4353			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4354			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4355		}
4356
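		/*
		 * If XSAVE isn't enabled at all, there is no xsave state to
		 * size.
		 */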
		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4358			xsave_state_size = 0;
4359		} else if (cpuid_d_valid) {
4360			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4361		} else {
4362			/* Broken CPUID 0xD, probably in HVM */
4363			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4364			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4365			    ", ymm_size = %d, ymm_offset = %d\n",
4366			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4367			    cpi->cpi_xsave.xsav_hw_features_high,
4368			    (int)cpi->cpi_xsave.xsav_max_size,
4369			    (int)cpi->cpi_xsave.ymm_size,
4370			    (int)cpi->cpi_xsave.ymm_offset);
4371
4372			if (xsave_state_size != 0) {
4373				/*
4374				 * This must be a non-boot CPU. We cannot
				 * continue, because the boot CPU has already
				 * enabled XSAVE.
4377				 */
4378				ASSERT(cpu->cpu_id != 0);
4379				cmn_err(CE_PANIC, "cpu%d: we have already "
4380				    "enabled XSAVE on boot cpu, cannot "
4381				    "continue.", cpu->cpu_id);
4382			} else {
4383				/*
4384				 * If we reached here on the boot CPU, it's also
4385				 * almost certain that we'll reach here on the
4386				 * non-boot CPUs. When we're here on a boot CPU
				 * we should disable the feature; on a non-boot
				 * CPU we need to confirm that we already have.
4389				 */
4390				if (cpu->cpu_id == 0) {
4391					remove_x86_feature(x86_featureset,
4392					    X86FSET_XSAVE);
4393					remove_x86_feature(x86_featureset,
4394					    X86FSET_AVX);
4395					remove_x86_feature(x86_featureset,
4396					    X86FSET_F16C);
4397					remove_x86_feature(x86_featureset,
4398					    X86FSET_BMI1);
4399					remove_x86_feature(x86_featureset,
4400					    X86FSET_BMI2);
4401					remove_x86_feature(x86_featureset,
4402					    X86FSET_FMA);
4403					remove_x86_feature(x86_featureset,
4404					    X86FSET_AVX2);
4405					remove_x86_feature(x86_featureset,
4406					    X86FSET_MPX);
4407					remove_x86_feature(x86_featureset,
4408					    X86FSET_AVX512F);
4409					remove_x86_feature(x86_featureset,
4410					    X86FSET_AVX512DQ);
4411					remove_x86_feature(x86_featureset,
4412					    X86FSET_AVX512PF);
4413					remove_x86_feature(x86_featureset,
4414					    X86FSET_AVX512ER);
4415					remove_x86_feature(x86_featureset,
4416					    X86FSET_AVX512CD);
4417					remove_x86_feature(x86_featureset,
4418					    X86FSET_AVX512BW);
4419					remove_x86_feature(x86_featureset,
4420					    X86FSET_AVX512VL);
4421					remove_x86_feature(x86_featureset,
4422					    X86FSET_AVX512FMA);
4423					remove_x86_feature(x86_featureset,
4424					    X86FSET_AVX512VBMI);
4425					remove_x86_feature(x86_featureset,
4426					    X86FSET_AVX512VNNI);
4427					remove_x86_feature(x86_featureset,
4428					    X86FSET_AVX512VPOPCDQ);
4429					remove_x86_feature(x86_featureset,
4430					    X86FSET_AVX512NNIW);
4431					remove_x86_feature(x86_featureset,
4432					    X86FSET_AVX512FMAPS);
4433
4434					CPI_FEATURES_ECX(cpi) &=
4435					    ~CPUID_INTC_ECX_XSAVE;
4436					CPI_FEATURES_ECX(cpi) &=
4437					    ~CPUID_INTC_ECX_AVX;
4438					CPI_FEATURES_ECX(cpi) &=
4439					    ~CPUID_INTC_ECX_F16C;
4440					CPI_FEATURES_ECX(cpi) &=
4441					    ~CPUID_INTC_ECX_FMA;
4442					CPI_FEATURES_7_0_EBX(cpi) &=
4443					    ~CPUID_INTC_EBX_7_0_BMI1;
4444					CPI_FEATURES_7_0_EBX(cpi) &=
4445					    ~CPUID_INTC_EBX_7_0_BMI2;
4446					CPI_FEATURES_7_0_EBX(cpi) &=
4447					    ~CPUID_INTC_EBX_7_0_AVX2;
4448					CPI_FEATURES_7_0_EBX(cpi) &=
4449					    ~CPUID_INTC_EBX_7_0_MPX;
4450					CPI_FEATURES_7_0_EBX(cpi) &=
4451					    ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4452
4453					CPI_FEATURES_7_0_ECX(cpi) &=
4454					    ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4455
4456					CPI_FEATURES_7_0_EDX(cpi) &=
4457					    ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4458
4459					xsave_force_disable = B_TRUE;
4460				} else {
4461					VERIFY(is_x86_feature(x86_featureset,
4462					    X86FSET_XSAVE) == B_FALSE);
4463				}
4464			}
4465		}
4466	}
4467
4468
4469	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4470		goto pass2_done;
4471
4472	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4473		nmax = NMAX_CPI_EXTD;
4474	/*
4475	 * Copy the extended properties, fixing them as we go.
4476	 * (We already handled n == 0 and n == 1 in pass 1)
4477	 */
4478	iptr = (void *)cpi->cpi_brandstr;
4479	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4480		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4481		(void) __cpuid_insn(cp);
4482		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4483		    cp);
4484		switch (n) {
4485		case 2:
4486		case 3:
4487		case 4:
4488			/*
4489			 * Extract the brand string
4490			 */
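			/*
			 * Leaves 0x80000002-0x80000004 each supply 16 bytes
			 * of the 48-byte brand string in %eax..%edx.
			 */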
4491			*iptr++ = cp->cp_eax;
4492			*iptr++ = cp->cp_ebx;
4493			*iptr++ = cp->cp_ecx;
4494			*iptr++ = cp->cp_edx;
4495			break;
4496		case 5:
4497			switch (cpi->cpi_vendor) {
4498			case X86_VENDOR_AMD:
4499				/*
4500				 * The Athlon and Duron were the first
4501				 * parts to report the sizes of the
4502				 * TLB for large pages. Before then,
4503				 * we don't trust the data.
4504				 */
4505				if (cpi->cpi_family < 6 ||
4506				    (cpi->cpi_family == 6 &&
4507				    cpi->cpi_model < 1))
4508					cp->cp_eax = 0;
4509				break;
4510			default:
4511				break;
4512			}
4513			break;
4514		case 6:
4515			switch (cpi->cpi_vendor) {
4516			case X86_VENDOR_AMD:
4517				/*
4518				 * The Athlon and Duron were the first
4519				 * AMD parts with L2 TLB's.
4520				 * Before then, don't trust the data.
4521				 */
4522				if (cpi->cpi_family < 6 ||
4523				    (cpi->cpi_family == 6 &&
4524				    cpi->cpi_model < 1))
4525					cp->cp_eax = cp->cp_ebx = 0;
4526				/*
4527				 * AMD Duron rev A0 reports L2
4528				 * cache size incorrectly as 1K
4529				 * when it is really 64K
4530				 */
4531				if (cpi->cpi_family == 6 &&
4532				    cpi->cpi_model == 3 &&
4533				    cpi->cpi_step == 0) {
4534					cp->cp_ecx &= 0xffff;
4535					cp->cp_ecx |= 0x400000;
4536				}
4537				break;
4538			case X86_VENDOR_Cyrix:	/* VIA C3 */
4539				/*
4540				 * VIA C3 processors are a bit messed
4541				 * up w.r.t. encoding cache sizes in %ecx
4542				 */
4543				if (cpi->cpi_family != 6)
4544					break;
4545				/*
4546				 * model 7 and 8 were incorrectly encoded
4547				 *
4548				 * xxx is model 8 really broken?
4549				 */
4550				if (cpi->cpi_model == 7 ||
4551				    cpi->cpi_model == 8)
4552					cp->cp_ecx =
4553					    BITX(cp->cp_ecx, 31, 24) << 16 |
4554					    BITX(cp->cp_ecx, 23, 16) << 12 |
4555					    BITX(cp->cp_ecx, 15, 8) << 8 |
4556					    BITX(cp->cp_ecx, 7, 0);
4557				/*
4558				 * model 9 stepping 1 has wrong associativity
4559				 */
4560				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4561					cp->cp_ecx |= 8 << 12;
4562				break;
4563			case X86_VENDOR_Intel:
4564				/*
4565				 * Extended L2 Cache features function.
4566				 * First appeared on Prescott.
4567				 */
4568			default:
4569				break;
4570			}
4571			break;
4572		default:
4573			break;
4574		}
4575	}
4576
4577pass2_done:
4578	cpi->cpi_pass = 2;
4579}
4580
4581static const char *
4582intel_cpubrand(const struct cpuid_info *cpi)
4583{
4584	int i;
4585
4586	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4587	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4588		return ("i486");
4589
4590	switch (cpi->cpi_family) {
4591	case 5:
4592		return ("Intel Pentium(r)");
4593	case 6:
4594		switch (cpi->cpi_model) {
4595			uint_t celeron, xeon;
4596			const struct cpuid_regs *cp;
4597		case 0:
4598		case 1:
4599		case 2:
4600			return ("Intel Pentium(r) Pro");
4601		case 3:
4602		case 4:
4603			return ("Intel Pentium(r) II");
4604		case 6:
4605			return ("Intel Celeron(r)");
4606		case 5:
4607		case 7:
4608			celeron = xeon = 0;
4609			cp = &cpi->cpi_std[2];	/* cache info */
4610
4611			for (i = 1; i < 4; i++) {
4612				uint_t tmp;
4613
4614				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4615				if (tmp == 0x40)
4616					celeron++;
4617				if (tmp >= 0x44 && tmp <= 0x45)
4618					xeon++;
4619			}
4620
4621			for (i = 0; i < 2; i++) {
4622				uint_t tmp;
4623
4624				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4625				if (tmp == 0x40)
4626					celeron++;
4627				else if (tmp >= 0x44 && tmp <= 0x45)
4628					xeon++;
4629			}
4630
4631			for (i = 0; i < 4; i++) {
4632				uint_t tmp;
4633
4634				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4635				if (tmp == 0x40)
4636					celeron++;
4637				else if (tmp >= 0x44 && tmp <= 0x45)
4638					xeon++;
4639			}
4640
4641			for (i = 0; i < 4; i++) {
4642				uint_t tmp;
4643
4644				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4645				if (tmp == 0x40)
4646					celeron++;
4647				else if (tmp >= 0x44 && tmp <= 0x45)
4648					xeon++;
4649			}
4650
4651			if (celeron)
4652				return ("Intel Celeron(r)");
4653			if (xeon)
4654				return (cpi->cpi_model == 5 ?
4655				    "Intel Pentium(r) II Xeon(tm)" :
4656				    "Intel Pentium(r) III Xeon(tm)");
4657			return (cpi->cpi_model == 5 ?
4658			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4659			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4660		default:
4661			break;
4662		}
4663	default:
4664		break;
4665	}
4666
4667	/* BrandID is present if the field is nonzero */
4668	if (cpi->cpi_brandid != 0) {
4669		static const struct {
4670			uint_t bt_bid;
4671			const char *bt_str;
4672		} brand_tbl[] = {
4673			{ 0x1,	"Intel(r) Celeron(r)" },
4674			{ 0x2,	"Intel(r) Pentium(r) III" },
4675			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
4676			{ 0x4,	"Intel(r) Pentium(r) III" },
4677			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
4678			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
4679			{ 0x8,	"Intel(r) Pentium(r) 4" },
4680			{ 0x9,	"Intel(r) Pentium(r) 4" },
4681			{ 0xa,	"Intel(r) Celeron(r)" },
4682			{ 0xb,	"Intel(r) Xeon(tm)" },
4683			{ 0xc,	"Intel(r) Xeon(tm) MP" },
4684			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
4685			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
4686			{ 0x11, "Mobile Genuine Intel(r)" },
4687			{ 0x12, "Intel(r) Celeron(r) M" },
4688			{ 0x13, "Mobile Intel(r) Celeron(r)" },
4689			{ 0x14, "Intel(r) Celeron(r)" },
4690			{ 0x15, "Mobile Genuine Intel(r)" },
4691			{ 0x16,	"Intel(r) Pentium(r) M" },
4692			{ 0x17, "Mobile Intel(r) Celeron(r)" }
4693		};
4694		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4695		uint_t sgn;
4696
4697		sgn = (cpi->cpi_family << 8) |
4698		    (cpi->cpi_model << 4) | cpi->cpi_step;
4699
4700		for (i = 0; i < btblmax; i++)
4701			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4702				break;
4703		if (i < btblmax) {
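			/*
			 * A few processor signatures override the table
			 * entry; these special cases mirror the exceptions
			 * Intel documents alongside its brand id table
			 * (e.g. signature 0x6b1 with brand id 3 identifies
			 * a Celeron rather than a Pentium III Xeon).
			 */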
4704			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4705				return ("Intel(r) Celeron(r)");
4706			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4707				return ("Intel(r) Xeon(tm) MP");
4708			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4709				return ("Intel(r) Xeon(tm)");
4710			return (brand_tbl[i].bt_str);
4711		}
4712	}
4713
4714	return (NULL);
4715}
4716
4717static const char *
4718amd_cpubrand(const struct cpuid_info *cpi)
4719{
4720	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4721	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4722		return ("i486 compatible");
4723
4724	switch (cpi->cpi_family) {
4725	case 5:
4726		switch (cpi->cpi_model) {
4727		case 0:
4728		case 1:
4729		case 2:
4730		case 3:
4731		case 4:
4732		case 5:
4733			return ("AMD-K5(r)");
4734		case 6:
4735		case 7:
4736			return ("AMD-K6(r)");
4737		case 8:
4738			return ("AMD-K6(r)-2");
4739		case 9:
4740			return ("AMD-K6(r)-III");
4741		default:
4742			return ("AMD (family 5)");
4743		}
4744	case 6:
4745		switch (cpi->cpi_model) {
4746		case 1:
4747			return ("AMD-K7(tm)");
4748		case 0:
4749		case 2:
4750		case 4:
4751			return ("AMD Athlon(tm)");
4752		case 3:
4753		case 7:
4754			return ("AMD Duron(tm)");
4755		case 6:
4756		case 8:
4757		case 10:
4758			/*
4759			 * Use the L2 cache size to distinguish
4760			 */
4761			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4762			    "AMD Athlon(tm)" : "AMD Duron(tm)");
4763		default:
4764			return ("AMD (family 6)");
4765		}
4766	default:
4767		break;
4768	}
4769
4770	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4771	    cpi->cpi_brandid != 0) {
4772		switch (BITX(cpi->cpi_brandid, 7, 5)) {
4773		case 3:
4774			return ("AMD Opteron(tm) UP 1xx");
4775		case 4:
4776			return ("AMD Opteron(tm) DP 2xx");
4777		case 5:
4778			return ("AMD Opteron(tm) MP 8xx");
4779		default:
4780			return ("AMD Opteron(tm)");
4781		}
4782	}
4783
4784	return (NULL);
4785}
4786
4787static const char *
4788cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4789{
4790	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4791	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4792	    type == X86_TYPE_CYRIX_486)
4793		return ("i486 compatible");
4794
4795	switch (type) {
4796	case X86_TYPE_CYRIX_6x86:
4797		return ("Cyrix 6x86");
4798	case X86_TYPE_CYRIX_6x86L:
4799		return ("Cyrix 6x86L");
4800	case X86_TYPE_CYRIX_6x86MX:
4801		return ("Cyrix 6x86MX");
4802	case X86_TYPE_CYRIX_GXm:
4803		return ("Cyrix GXm");
4804	case X86_TYPE_CYRIX_MediaGX:
4805		return ("Cyrix MediaGX");
4806	case X86_TYPE_CYRIX_MII:
4807		return ("Cyrix M2");
4808	case X86_TYPE_VIA_CYRIX_III:
4809		return ("VIA Cyrix M3");
4810	default:
4811		/*
4812		 * Have another wild guess ..
4813		 */
4814		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4815			return ("Cyrix 5x86");
4816		else if (cpi->cpi_family == 5) {
4817			switch (cpi->cpi_model) {
4818			case 2:
4819				return ("Cyrix 6x86");	/* Cyrix M1 */
4820			case 4:
4821				return ("Cyrix MediaGX");
4822			default:
4823				break;
4824			}
4825		} else if (cpi->cpi_family == 6) {
4826			switch (cpi->cpi_model) {
4827			case 0:
4828				return ("Cyrix 6x86MX"); /* Cyrix M2? */
4829			case 5:
4830			case 6:
4831			case 7:
4832			case 8:
4833			case 9:
4834				return ("VIA C3");
4835			default:
4836				break;
4837			}
4838		}
4839		break;
4840	}
4841	return (NULL);
4842}
4843
4844/*
4845 * This only gets called in the case that the CPU extended
4846 * feature brand strings (0x80000002, 0x80000003, 0x80000004)
4847 * aren't available, or contain null bytes for some reason.
4848 */
4849static void
4850fabricate_brandstr(struct cpuid_info *cpi)
4851{
4852	const char *brand = NULL;
4853
4854	switch (cpi->cpi_vendor) {
4855	case X86_VENDOR_Intel:
4856		brand = intel_cpubrand(cpi);
4857		break;
4858	case X86_VENDOR_AMD:
4859		brand = amd_cpubrand(cpi);
4860		break;
4861	case X86_VENDOR_Cyrix:
4862		brand = cyrix_cpubrand(cpi, x86_type);
4863		break;
4864	case X86_VENDOR_NexGen:
4865		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4866			brand = "NexGen Nx586";
4867		break;
4868	case X86_VENDOR_Centaur:
4869		if (cpi->cpi_family == 5)
4870			switch (cpi->cpi_model) {
4871			case 4:
4872				brand = "Centaur C6";
4873				break;
4874			case 8:
4875				brand = "Centaur C2";
4876				break;
4877			case 9:
4878				brand = "Centaur C3";
4879				break;
4880			default:
4881				break;
4882			}
4883		break;
4884	case X86_VENDOR_Rise:
4885		if (cpi->cpi_family == 5 &&
4886		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4887			brand = "Rise mP6";
4888		break;
4889	case X86_VENDOR_SiS:
4890		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4891			brand = "SiS 55x";
4892		break;
4893	case X86_VENDOR_TM:
4894		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4895			brand = "Transmeta Crusoe TM3x00 or TM5x00";
4896		break;
4897	case X86_VENDOR_NSC:
4898	case X86_VENDOR_UMC:
4899	default:
4900		break;
4901	}
4902	if (brand) {
4903		(void) strcpy((char *)cpi->cpi_brandstr, brand);
4904		return;
4905	}
4906
4907	/*
4908	 * If all else fails ...
4909	 */
4910	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4911	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4912	    cpi->cpi_model, cpi->cpi_step);
4913}
4914
4915/*
4916 * This routine is called just after kernel memory allocation
4917 * becomes available on cpu0, and as part of mp_startup() on
4918 * the other cpus.
4919 *
4920 * Fixup the brand string, and collect any information from cpuid
4921 * that requires dynamically allocated storage to represent.
4922 */
4923/*ARGSUSED*/
4924void
4925cpuid_pass3(cpu_t *cpu)
4926{
4927	int	i, max, shft, level, size;
4928	struct cpuid_regs regs;
4929	struct cpuid_regs *cp;
4930	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4931
4932	ASSERT(cpi->cpi_pass == 2);
4933
4934	/*
4935	 * Deterministic cache parameters
4936	 *
4937	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4938	 * values that are present are currently defined to be the same. This
4939	 * means we can use the same logic to parse it as long as we use the
4940	 * appropriate leaf to get the data. If you're updating this, make sure
4941	 * you're careful about which vendor supports which aspect.
4942	 *
4943	 * Take this opportunity to detect the number of threads sharing the
4944	 * last level cache, and construct a corresponding cache id. The
4945	 * respective cpuid_info members are initialized to the default case of
4946	 * "no last level cache sharing".
4947	 */
4948	cpi->cpi_ncpu_shr_last_cache = 1;
4949	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4950
4951	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4952	    (cpi->cpi_vendor == X86_VENDOR_AMD &&
4953	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4954	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4955		uint32_t leaf;
4956
4957		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4958			leaf = 4;
4959		} else {
4960			leaf = CPUID_LEAF_EXT_1d;
4961		}
4962
4963		/*
4964		 * Find the # of elements (size) returned by the leaf and along
4965		 * the way detect last level cache sharing details.
4966		 */
4967		bzero(&regs, sizeof (regs));
4968		cp = &regs;
4969		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4970			cp->cp_eax = leaf;
4971			cp->cp_ecx = i;
4972
4973			(void) __cpuid_insn(cp);
4974
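			/*
			 * A cache type of zero is a null entry, indicating
			 * that there are no further caches to enumerate.
			 */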
4975			if (CPI_CACHE_TYPE(cp) == 0)
4976				break;
4977			level = CPI_CACHE_LVL(cp);
4978			if (level > max) {
4979				max = level;
4980				cpi->cpi_ncpu_shr_last_cache =
4981				    CPI_NTHR_SHR_CACHE(cp) + 1;
4982			}
4983		}
4984		cpi->cpi_cache_leaf_size = size = i;
4985
4986		/*
4987		 * Allocate the cpi_cache_leaves array. The first element
4988		 * references the regs for the corresponding leaf with %ecx set
4989		 * to 0. This was gathered in cpuid_pass2().
4990		 */
4991		if (size > 0) {
4992			cpi->cpi_cache_leaves =
4993			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
4994			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4995				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4996			} else {
4997				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4998			}
4999
5000			/*
5001			 * Allocate storage to hold the additional regs
5002			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5003			 *
5004			 * The regs for the leaf with %ecx == 0 have
5005			 * already been handled as indicated above.
5006			 */
5007			for (i = 1; i < size; i++) {
5008				cp = cpi->cpi_cache_leaves[i] =
5009				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5010				cp->cp_eax = leaf;
5011				cp->cp_ecx = i;
5012
5013				(void) __cpuid_insn(cp);
5014			}
5015		}
5016		/*
5017		 * Determine the number of bits needed to represent
5018		 * the number of CPUs sharing the last level cache.
5019		 *
5020		 * Shift off that number of bits from the APIC id to
5021		 * derive the cache id.
5022		 */
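		/*
		 * For example, if 8 CPUs share the last level cache, the
		 * loop below computes shft = 3 and the cache id is the
		 * APIC id with its low 3 bits shifted off.
		 */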
5023		shft = 0;
5024		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5025			shft++;
5026		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5027	}
5028
5029	/*
5030	 * Now fixup the brand string
5031	 */
5032	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5033		fabricate_brandstr(cpi);
5034	} else {
5035
5036		/*
5037		 * If we successfully extracted a brand string from the cpuid
5038		 * instruction, clean it up by removing leading spaces and
5039		 * similar junk.
5040		 */
5041		if (cpi->cpi_brandstr[0]) {
5042			size_t maxlen = sizeof (cpi->cpi_brandstr);
5043			char *src, *dst;
5044
5045			dst = src = (char *)cpi->cpi_brandstr;
5046			src[maxlen - 1] = '\0';
5047			/*
5048			 * strip leading spaces
5049			 */
5050			while (*src == ' ')
5051				src++;
5052			/*
5053			 * Remove any 'Genuine' or "Authentic" prefixes
5054			 */
5055			if (strncmp(src, "Genuine ", 8) == 0)
5056				src += 8;
5057			if (strncmp(src, "Authentic ", 10) == 0)
5058				src += 10;
5059
5060			/*
5061			 * Now do an in-place copy.
5062			 * Map (R) to (r) and (TM) to (tm).
5063			 * The era of teletypes is long gone, and there's
5064			 * -really- no need to shout.
5065			 */
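			/*
			 * For example, a raw string such as
			 * "  Genuine Intel(R) Xeon(TM)" ends up as
			 * "Intel(r) Xeon(tm)" after this cleanup.
			 */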
5066			while (*src != '\0') {
5067				if (src[0] == '(') {
5068					if (strncmp(src + 1, "R)", 2) == 0) {
5069						(void) strncpy(dst, "(r)", 3);
5070						src += 3;
5071						dst += 3;
5072						continue;
5073					}
5074					if (strncmp(src + 1, "TM)", 3) == 0) {
5075						(void) strncpy(dst, "(tm)", 4);
5076						src += 4;
5077						dst += 4;
5078						continue;
5079					}
5080				}
5081				*dst++ = *src++;
5082			}
5083			*dst = '\0';
5084
5085			/*
5086			 * Finally, remove any trailing spaces
5087			 */
5088			while (--dst > cpi->cpi_brandstr)
5089				if (*dst == ' ')
5090					*dst = '\0';
5091				else
5092					break;
5093		} else
5094			fabricate_brandstr(cpi);
5095	}
5096	cpi->cpi_pass = 3;
5097}
5098
5099/*
5100 * This routine is called out of bind_hwcap() much later in the life
5101 * of the kernel (post_startup()).  The job of this routine is to resolve
5102 * the hardware feature support and kernel support for those features into
5103 * what we're actually going to tell applications via the aux vector.
5104 */
5105void
5106cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
5107{
5108	struct cpuid_info *cpi;
5109	uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
5110
5111	if (cpu == NULL)
5112		cpu = CPU;
5113	cpi = cpu->cpu_m.mcpu_cpi;
5114
5115	ASSERT(cpi->cpi_pass == 3);
5116
5117	if (cpi->cpi_maxeax >= 1) {
5118		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5119		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5120		uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
5121
5122		*edx = CPI_FEATURES_EDX(cpi);
5123		*ecx = CPI_FEATURES_ECX(cpi);
5124		*ebx = CPI_FEATURES_7_0_EBX(cpi);
5125
5126		/*
5127		 * [these require explicit kernel support]
5128		 */
5129		if (!is_x86_feature(x86_featureset, X86FSET_SEP))
5130			*edx &= ~CPUID_INTC_EDX_SEP;
5131
5132		if (!is_x86_feature(x86_featureset, X86FSET_SSE))
5133			*edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
5134		if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
5135			*edx &= ~CPUID_INTC_EDX_SSE2;
5136
5137		if (!is_x86_feature(x86_featureset, X86FSET_HTT))
5138			*edx &= ~CPUID_INTC_EDX_HTT;
5139
5140		if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
5141			*ecx &= ~CPUID_INTC_ECX_SSE3;
5142
5143		if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
5144			*ecx &= ~CPUID_INTC_ECX_SSSE3;
5145		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
5146			*ecx &= ~CPUID_INTC_ECX_SSE4_1;
5147		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
5148			*ecx &= ~CPUID_INTC_ECX_SSE4_2;
5149		if (!is_x86_feature(x86_featureset, X86FSET_AES))
5150			*ecx &= ~CPUID_INTC_ECX_AES;
5151		if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
5152			*ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
5153		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
5154			*ecx &= ~(CPUID_INTC_ECX_XSAVE |
5155			    CPUID_INTC_ECX_OSXSAVE);
5156		if (!is_x86_feature(x86_featureset, X86FSET_AVX))
5157			*ecx &= ~CPUID_INTC_ECX_AVX;
5158		if (!is_x86_feature(x86_featureset, X86FSET_F16C))
5159			*ecx &= ~CPUID_INTC_ECX_F16C;
5160		if (!is_x86_feature(x86_featureset, X86FSET_FMA))
5161			*ecx &= ~CPUID_INTC_ECX_FMA;
5162		if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
5163			*ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
5164		if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
5165			*ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
5166		if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
5167			*ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
5168		if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
5169			*ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
5170		if (!is_x86_feature(x86_featureset, X86FSET_ADX))
5171			*ebx &= ~CPUID_INTC_EBX_7_0_ADX;
5172
5173		/*
5174		 * [no explicit support required beyond x87 fp context]
5175		 */
5176		if (!fpu_exists)
5177			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5178
5179		/*
5180		 * Now map the supported feature vector to things that we
5181		 * think userland will care about.
5182		 */
5183		if (*edx & CPUID_INTC_EDX_SEP)
5184			hwcap_flags |= AV_386_SEP;
5185		if (*edx & CPUID_INTC_EDX_SSE)
5186			hwcap_flags |= AV_386_FXSR | AV_386_SSE;
5187		if (*edx & CPUID_INTC_EDX_SSE2)
5188			hwcap_flags |= AV_386_SSE2;
5189		if (*ecx & CPUID_INTC_ECX_SSE3)
5190			hwcap_flags |= AV_386_SSE3;
5191		if (*ecx & CPUID_INTC_ECX_SSSE3)
5192			hwcap_flags |= AV_386_SSSE3;
5193		if (*ecx & CPUID_INTC_ECX_SSE4_1)
5194			hwcap_flags |= AV_386_SSE4_1;
5195		if (*ecx & CPUID_INTC_ECX_SSE4_2)
5196			hwcap_flags |= AV_386_SSE4_2;
5197		if (*ecx & CPUID_INTC_ECX_MOVBE)
5198			hwcap_flags |= AV_386_MOVBE;
5199		if (*ecx & CPUID_INTC_ECX_AES)
5200			hwcap_flags |= AV_386_AES;
5201		if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5202			hwcap_flags |= AV_386_PCLMULQDQ;
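		/*
		 * AVX and the features that build on it are only usable
		 * when both XSAVE and OS support for it (OSXSAVE) are
		 * present, so gate those hwcaps on both bits.
		 */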
5203		if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5204		    (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5205			hwcap_flags |= AV_386_XSAVE;
5206
5207			if (*ecx & CPUID_INTC_ECX_AVX) {
5208				uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5209				uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5210
5211				hwcap_flags |= AV_386_AVX;
5212				if (*ecx & CPUID_INTC_ECX_F16C)
5213					hwcap_flags_2 |= AV_386_2_F16C;
5214				if (*ecx & CPUID_INTC_ECX_FMA)
5215					hwcap_flags_2 |= AV_386_2_FMA;
5216
5217				if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5218					hwcap_flags_2 |= AV_386_2_BMI1;
5219				if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5220					hwcap_flags_2 |= AV_386_2_BMI2;
5221				if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5222					hwcap_flags_2 |= AV_386_2_AVX2;
5223				if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5224					hwcap_flags_2 |= AV_386_2_AVX512F;
5225				if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5226					hwcap_flags_2 |= AV_386_2_AVX512DQ;
5227				if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5228					hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5229				if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5230					hwcap_flags_2 |= AV_386_2_AVX512PF;
5231				if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5232					hwcap_flags_2 |= AV_386_2_AVX512ER;
5233				if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5234					hwcap_flags_2 |= AV_386_2_AVX512CD;
5235				if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5236					hwcap_flags_2 |= AV_386_2_AVX512BW;
5237				if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5238					hwcap_flags_2 |= AV_386_2_AVX512VL;
5239
5240				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5241					hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5242				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5243					hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5244				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5245					hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5246
5247				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5248					hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5249				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5250					hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5251			}
5252		}
5253		if (*ecx & CPUID_INTC_ECX_VMX)
5254			hwcap_flags |= AV_386_VMX;
5255		if (*ecx & CPUID_INTC_ECX_POPCNT)
5256			hwcap_flags |= AV_386_POPCNT;
5257		if (*edx & CPUID_INTC_EDX_FPU)
5258			hwcap_flags |= AV_386_FPU;
5259		if (*edx & CPUID_INTC_EDX_MMX)
5260			hwcap_flags |= AV_386_MMX;
5261
5262		if (*edx & CPUID_INTC_EDX_TSC)
5263			hwcap_flags |= AV_386_TSC;
5264		if (*edx & CPUID_INTC_EDX_CX8)
5265			hwcap_flags |= AV_386_CX8;
5266		if (*edx & CPUID_INTC_EDX_CMOV)
5267			hwcap_flags |= AV_386_CMOV;
5268		if (*ecx & CPUID_INTC_ECX_CX16)
5269			hwcap_flags |= AV_386_CX16;
5270
5271		if (*ecx & CPUID_INTC_ECX_RDRAND)
5272			hwcap_flags_2 |= AV_386_2_RDRAND;
5273		if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5274			hwcap_flags_2 |= AV_386_2_ADX;
5275		if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5276			hwcap_flags_2 |= AV_386_2_RDSEED;
5277		if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5278			hwcap_flags_2 |= AV_386_2_SHA;
5279		if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5280			hwcap_flags_2 |= AV_386_2_FSGSBASE;
5281		if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5282			hwcap_flags_2 |= AV_386_2_CLWB;
5283		if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5284			hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5285
5286	}
5287	/*
5288	 * Check a few miscellaneous features.
5289	 */
5290	if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5291		hwcap_flags_2 |= AV_386_2_CLZERO;
5292
5293	if (cpi->cpi_xmaxeax < 0x80000001)
5294		goto pass4_done;
5295
5296	switch (cpi->cpi_vendor) {
5297		struct cpuid_regs cp;
5298		uint32_t *edx, *ecx;
5299
5300	case X86_VENDOR_Intel:
5301		/*
5302		 * Seems like Intel duplicated what was necessary
5303		 * here to make the initial crop of 64-bit OSes work.
5304		 * Hopefully, those are the only "extended" bits
5305		 * they'll add.
5306		 */
5307		/*FALLTHROUGH*/
5308
5309	case X86_VENDOR_AMD:
5310		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5311		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5312
5313		*edx = CPI_FEATURES_XTD_EDX(cpi);
5314		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5315
5316		/*
5317		 * [these features require explicit kernel support]
5318		 */
5319		switch (cpi->cpi_vendor) {
5320		case X86_VENDOR_Intel:
5321			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5322				*edx &= ~CPUID_AMD_EDX_TSCP;
5323			break;
5324
5325		case X86_VENDOR_AMD:
5326			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5327				*edx &= ~CPUID_AMD_EDX_TSCP;
5328			if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5329				*ecx &= ~CPUID_AMD_ECX_SSE4A;
5330			break;
5331
5332		default:
5333			break;
5334		}
5335
5336		/*
5337		 * [no explicit support required beyond
5338		 * x87 fp context and exception handlers]
5339		 */
5340		if (!fpu_exists)
5341			*edx &= ~(CPUID_AMD_EDX_MMXamd |
5342			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5343
5344		if (!is_x86_feature(x86_featureset, X86FSET_NX))
5345			*edx &= ~CPUID_AMD_EDX_NX;
5346#if !defined(__amd64)
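		/* A 32-bit kernel cannot use long mode; don't advertise it. */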
5347		*edx &= ~CPUID_AMD_EDX_LM;
5348#endif
5349		/*
5350		 * Now map the supported feature vector to
5351		 * things that we think userland will care about.
5352		 */
5353#if defined(__amd64)
5354		if (*edx & CPUID_AMD_EDX_SYSC)
5355			hwcap_flags |= AV_386_AMD_SYSC;
5356#endif
5357		if (*edx & CPUID_AMD_EDX_MMXamd)
5358			hwcap_flags |= AV_386_AMD_MMX;
5359		if (*edx & CPUID_AMD_EDX_3DNow)
5360			hwcap_flags |= AV_386_AMD_3DNow;
5361		if (*edx & CPUID_AMD_EDX_3DNowx)
5362			hwcap_flags |= AV_386_AMD_3DNowx;
5363		if (*ecx & CPUID_AMD_ECX_SVM)
5364			hwcap_flags |= AV_386_AMD_SVM;
5365
5366		switch (cpi->cpi_vendor) {
5367		case X86_VENDOR_AMD:
5368			if (*edx & CPUID_AMD_EDX_TSCP)
5369				hwcap_flags |= AV_386_TSCP;
5370			if (*ecx & CPUID_AMD_ECX_AHF64)
5371				hwcap_flags |= AV_386_AHF;
5372			if (*ecx & CPUID_AMD_ECX_SSE4A)
5373				hwcap_flags |= AV_386_AMD_SSE4A;
5374			if (*ecx & CPUID_AMD_ECX_LZCNT)
5375				hwcap_flags |= AV_386_AMD_LZCNT;
5376			if (*ecx & CPUID_AMD_ECX_MONITORX)
5377				hwcap_flags_2 |= AV_386_2_MONITORX;
5378			break;
5379
5380		case X86_VENDOR_Intel:
5381			if (*edx & CPUID_AMD_EDX_TSCP)
5382				hwcap_flags |= AV_386_TSCP;
5383			if (*ecx & CPUID_AMD_ECX_LZCNT)
5384				hwcap_flags |= AV_386_AMD_LZCNT;
5385			/*
5386			 * Aarrgh.
5387			 * Intel uses a different bit in the same word.
5388			 */
5389			if (*ecx & CPUID_INTC_ECX_AHF64)
5390				hwcap_flags |= AV_386_AHF;
5391			break;
5392
5393		default:
5394			break;
5395		}
5396		break;
5397
5398	case X86_VENDOR_TM:
5399		cp.cp_eax = 0x80860001;
5400		(void) __cpuid_insn(&cp);
5401		cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5402		break;
5403
5404	default:
5405		break;
5406	}
5407
5408pass4_done:
5409	cpi->cpi_pass = 4;
5410	if (hwcap_out != NULL) {
5411		hwcap_out[0] = hwcap_flags;
5412		hwcap_out[1] = hwcap_flags_2;
5413	}
5414}
5415
5416
5417/*
5418 * Simulate the cpuid instruction using the data we previously
5419 * captured about this CPU.  We try our best to return the truth
5420 * about the hardware, independently of kernel support.
5421 */
5422uint32_t
5423cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5424{
5425	struct cpuid_info *cpi;
5426	struct cpuid_regs *xcp;
5427
5428	if (cpu == NULL)
5429		cpu = CPU;
5430	cpi = cpu->cpu_m.mcpu_cpi;
5431
5432	ASSERT(cpuid_checkpass(cpu, 3));
5433
5434	/*
5435	 * CPUID data is cached in two separate places: cpi_std for standard
5436 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5437	 */
5438	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5439		xcp = &cpi->cpi_std[cp->cp_eax];
5440	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5441	    cp->cp_eax <= cpi->cpi_xmaxeax &&
5442	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5443		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5444	} else {
5445		/*
5446		 * The caller is asking for data from an input parameter which
5447		 * the kernel has not cached.  In this case we go fetch from
5448		 * the hardware and return the data directly to the user.
5449		 */
5450		return (__cpuid_insn(cp));
5451	}
5452
5453	cp->cp_eax = xcp->cp_eax;
5454	cp->cp_ebx = xcp->cp_ebx;
5455	cp->cp_ecx = xcp->cp_ecx;
5456	cp->cp_edx = xcp->cp_edx;
5457	return (cp->cp_eax);
5458}
5459
5460int
5461cpuid_checkpass(cpu_t *cpu, int pass)
5462{
5463	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5464	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5465}
5466
5467int
5468cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5469{
5470	ASSERT(cpuid_checkpass(cpu, 3));
5471
5472	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5473}
5474
5475int
5476cpuid_is_cmt(cpu_t *cpu)
5477{
5478	if (cpu == NULL)
5479		cpu = CPU;
5480
5481	ASSERT(cpuid_checkpass(cpu, 1));
5482
5483	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5484}
5485
5486/*
5487 * AMD and Intel both implement the 64-bit variant of the syscall
5488 * instruction (syscallq), so if there's -any- support for syscall,
5489 * cpuid currently says "yes, we support this".
5490 *
5491 * However, Intel decided to -not- implement the 32-bit variant of the
5492 * syscall instruction, so we provide a predicate to allow our caller
5493 * to test that subtlety here.
5494 *
5495 * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
5496 *	even in the case where the hardware would in fact support it.
5497 */
5498/*ARGSUSED*/
5499int
5500cpuid_syscall32_insn(cpu_t *cpu)
5501{
5502	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5503
5504#if !defined(__xpv)
5505	if (cpu == NULL)
5506		cpu = CPU;
5507
5508	/*CSTYLED*/
5509	{
5510		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5511
5512		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5513		    cpi->cpi_xmaxeax >= 0x80000001 &&
5514		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5515			return (1);
5516	}
5517#endif
5518	return (0);
5519}
5520
5521int
5522cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5523{
5524	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5525
5526	static const char fmt[] =
5527	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
5528	static const char fmt_ht[] =
5529	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5530
5531	ASSERT(cpuid_checkpass(cpu, 1));
5532
5533	if (cpuid_is_cmt(cpu))
5534		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5535		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5536		    cpi->cpi_family, cpi->cpi_model,
5537		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5538	return (snprintf(s, n, fmt,
5539	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5540	    cpi->cpi_family, cpi->cpi_model,
5541	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5542}
5543
5544const char *
5545cpuid_getvendorstr(cpu_t *cpu)
5546{
5547	ASSERT(cpuid_checkpass(cpu, 1));
5548	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5549}
5550
5551uint_t
5552cpuid_getvendor(cpu_t *cpu)
5553{
5554	ASSERT(cpuid_checkpass(cpu, 1));
5555	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5556}
5557
5558uint_t
5559cpuid_getfamily(cpu_t *cpu)
5560{
5561	ASSERT(cpuid_checkpass(cpu, 1));
5562	return (cpu->cpu_m.mcpu_cpi->cpi_family);
5563}
5564
5565uint_t
5566cpuid_getmodel(cpu_t *cpu)
5567{
5568	ASSERT(cpuid_checkpass(cpu, 1));
5569	return (cpu->cpu_m.mcpu_cpi->cpi_model);
5570}
5571
5572uint_t
5573cpuid_get_ncpu_per_chip(cpu_t *cpu)
5574{
5575	ASSERT(cpuid_checkpass(cpu, 1));
5576	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5577}
5578
5579uint_t
5580cpuid_get_ncore_per_chip(cpu_t *cpu)
5581{
5582	ASSERT(cpuid_checkpass(cpu, 1));
5583	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5584}
5585
5586uint_t
5587cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5588{
5589	ASSERT(cpuid_checkpass(cpu, 2));
5590	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5591}
5592
5593id_t
5594cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5595{
5596	ASSERT(cpuid_checkpass(cpu, 2));
5597	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5598}
5599
5600uint_t
5601cpuid_getstep(cpu_t *cpu)
5602{
5603	ASSERT(cpuid_checkpass(cpu, 1));
5604	return (cpu->cpu_m.mcpu_cpi->cpi_step);
5605}
5606
5607uint_t
5608cpuid_getsig(struct cpu *cpu)
5609{
5610	ASSERT(cpuid_checkpass(cpu, 1));
5611	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5612}
5613
5614uint32_t
5615cpuid_getchiprev(struct cpu *cpu)
5616{
5617	ASSERT(cpuid_checkpass(cpu, 1));
5618	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5619}
5620
5621const char *
5622cpuid_getchiprevstr(struct cpu *cpu)
5623{
5624	ASSERT(cpuid_checkpass(cpu, 1));
5625	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5626}
5627
5628uint32_t
5629cpuid_getsockettype(struct cpu *cpu)
5630{
5631	ASSERT(cpuid_checkpass(cpu, 1));
5632	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5633}
5634
5635const char *
5636cpuid_getsocketstr(cpu_t *cpu)
5637{
5638	static const char *socketstr = NULL;
5639	struct cpuid_info *cpi;
5640
5641	ASSERT(cpuid_checkpass(cpu, 1));
5642	cpi = cpu->cpu_m.mcpu_cpi;
5643
5644	/* Assume that socket types are the same across the system */
5645	if (socketstr == NULL)
5646		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5647		    cpi->cpi_model, cpi->cpi_step);
5648
5649
5650	return (socketstr);
5651}
5652
5653int
5654cpuid_get_chipid(cpu_t *cpu)
5655{
5656	ASSERT(cpuid_checkpass(cpu, 1));
5657
5658	if (cpuid_is_cmt(cpu))
5659		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5660	return (cpu->cpu_id);
5661}
5662
5663id_t
5664cpuid_get_coreid(cpu_t *cpu)
5665{
5666	ASSERT(cpuid_checkpass(cpu, 1));
5667	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5668}
5669
5670int
5671cpuid_get_pkgcoreid(cpu_t *cpu)
5672{
5673	ASSERT(cpuid_checkpass(cpu, 1));
5674	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5675}
5676
5677int
5678cpuid_get_clogid(cpu_t *cpu)
5679{
5680	ASSERT(cpuid_checkpass(cpu, 1));
5681	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5682}
5683
5684int
5685cpuid_get_cacheid(cpu_t *cpu)
5686{
5687	ASSERT(cpuid_checkpass(cpu, 1));
5688	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5689}
5690
5691uint_t
5692cpuid_get_procnodeid(cpu_t *cpu)
5693{
5694	ASSERT(cpuid_checkpass(cpu, 1));
5695	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5696}
5697
5698uint_t
5699cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5700{
5701	ASSERT(cpuid_checkpass(cpu, 1));
5702	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5703}
5704
5705uint_t
5706cpuid_get_compunitid(cpu_t *cpu)
5707{
5708	ASSERT(cpuid_checkpass(cpu, 1));
5709	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5710}
5711
5712uint_t
5713cpuid_get_cores_per_compunit(cpu_t *cpu)
5714{
5715	ASSERT(cpuid_checkpass(cpu, 1));
5716	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5717}
5718
5719/*ARGSUSED*/
5720int
5721cpuid_have_cr8access(cpu_t *cpu)
5722{
5723#if defined(__amd64)
5724	return (1);
5725#else
5726	struct cpuid_info *cpi;
5727
5728	ASSERT(cpu != NULL);
5729	cpi = cpu->cpu_m.mcpu_cpi;
5730	if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5731	    (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5732		return (1);
5733	return (0);
5734#endif
5735}
5736
5737uint32_t
5738cpuid_get_apicid(cpu_t *cpu)
5739{
5740	ASSERT(cpuid_checkpass(cpu, 1));
5741	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5742		return (UINT32_MAX);
5743	} else {
5744		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5745	}
5746}
5747
5748void
5749cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5750{
5751	struct cpuid_info *cpi;
5752
5753	if (cpu == NULL)
5754		cpu = CPU;
5755	cpi = cpu->cpu_m.mcpu_cpi;
5756
5757	ASSERT(cpuid_checkpass(cpu, 1));
5758
5759	if (pabits)
5760		*pabits = cpi->cpi_pabits;
5761	if (vabits)
5762		*vabits = cpi->cpi_vabits;
5763}
5764
5765size_t
5766cpuid_get_xsave_size()
5767{
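	/*
	 * Return at least the size of the kernel's own xsave_state
	 * structure, even if the hardware reports a smaller save area.
	 */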
5768	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5769	    sizeof (struct xsave_state)));
5770}
5771
5772/*
5773 * Return true if the CPUs on this system require 'pointer clearing' for the
5774 * floating point error pointer exception handling. In the past, this has been
5775 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5776 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5777 * feature bit and is reflected in the cpi_fp_amd_save member.
5778 */
5779boolean_t
5780cpuid_need_fp_excp_handling()
5781{
5782	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5783	    cpuid_info0.cpi_fp_amd_save != 0);
5784}
5785
5786/*
5787 * Returns the number of data TLB entries for a corresponding
5788 * pagesize.  If it can't be computed, or isn't known, the
5789 * routine returns zero.  If you ask about an architecturally
5790 * impossible pagesize, the routine will panic (so that the
5791 * hat implementor knows that things are inconsistent.)
5792 */
5793uint_t
5794cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5795{
5796	struct cpuid_info *cpi;
5797	uint_t dtlb_nent = 0;
5798
5799	if (cpu == NULL)
5800		cpu = CPU;
5801	cpi = cpu->cpu_m.mcpu_cpi;
5802
5803	ASSERT(cpuid_checkpass(cpu, 1));
5804
5805	/*
5806	 * Check the L2 TLB info
5807	 */
5808	if (cpi->cpi_xmaxeax >= 0x80000006) {
5809		struct cpuid_regs *cp = &cpi->cpi_extd[6];
5810
5811		switch (pagesize) {
5812
5813		case 4 * 1024:
5814			/*
5815			 * All zero in the top 16 bits of the register
5816			 * indicates a unified TLB. Size is in low 16 bits.
5817			 */
5818			if ((cp->cp_ebx & 0xffff0000) == 0)
5819				dtlb_nent = cp->cp_ebx & 0x0000ffff;
5820			else
5821				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5822			break;
5823
5824		case 2 * 1024 * 1024:
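			/*
			 * As above: a zero upper half indicates a unified
			 * TLB; otherwise the 2M/4M d-TLB entry count is in
			 * bits 27:16 of %eax.
			 */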
5825			if ((cp->cp_eax & 0xffff0000) == 0)
5826				dtlb_nent = cp->cp_eax & 0x0000ffff;
5827			else
5828				dtlb_nent = BITX(cp->cp_eax, 27, 16);
5829			break;
5830
5831		default:
5832			panic("unknown L2 pagesize");
5833			/*NOTREACHED*/
5834		}
5835	}
5836
5837	if (dtlb_nent != 0)
5838		return (dtlb_nent);
5839
5840	/*
5841	 * No L2 TLB support for this size, try L1.
5842	 */
5843	if (cpi->cpi_xmaxeax >= 0x80000005) {
5844		struct cpuid_regs *cp = &cpi->cpi_extd[5];
5845
5846		switch (pagesize) {
5847		case 4 * 1024:
5848			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5849			break;
5850		case 2 * 1024 * 1024:
5851			dtlb_nent = BITX(cp->cp_eax, 23, 16);
5852			break;
5853		default:
5854			panic("unknown L1 d-TLB pagesize");
5855			/*NOTREACHED*/
5856		}
5857	}
5858
5859	return (dtlb_nent);
5860}
5861
5862/*
5863 * Return 0 if the erratum is not present or not applicable, positive
5864 * if it is, and negative if the status of the erratum is unknown.
5865 *
5866 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5867 * Processors" #25759, Rev 3.57, August 2005
5868 */
5869int
5870cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5871{
5872	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5873	uint_t eax;
5874
5875	/*
5876	 * Bail out if this CPU isn't an AMD CPU, or if it's
5877	 * a legacy (32-bit) AMD CPU.
5878	 */
5879	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5880	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5881	    cpi->cpi_family == 6) {
5882		return (0);
5883	}
5884
5885	eax = cpi->cpi_std[1].cp_eax;
5886
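/*
 * The macros below match the processor signature (family, model and
 * stepping, as returned in cpuid function 1 %eax) of the specific
 * silicon revisions named in the revision guide.
 */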
5887#define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
5888#define	SH_B3(eax)	(eax == 0xf51)
5889#define	B(eax)		(SH_B0(eax) || SH_B3(eax))
5890
5891#define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
5892
5893#define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5894#define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5895#define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
5896#define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5897
5898#define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5899#define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
5900#define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
5901#define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5902
5903#define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5904#define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
5905#define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
5906#define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
5907#define	BH_E4(eax)	(eax == 0x20fb1)
5908#define	SH_E5(eax)	(eax == 0x20f42)
5909#define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
5910#define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
5911#define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5912			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5913			    DH_E6(eax) || JH_E6(eax))
5914
5915#define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5916#define	DR_B0(eax)	(eax == 0x100f20)
5917#define	DR_B1(eax)	(eax == 0x100f21)
5918#define	DR_BA(eax)	(eax == 0x100f2a)
5919#define	DR_B2(eax)	(eax == 0x100f22)
5920#define	DR_B3(eax)	(eax == 0x100f23)
5921#define	RB_C0(eax)	(eax == 0x100f40)
5922
5923	switch (erratum) {
5924	case 1:
5925		return (cpi->cpi_family < 0x10);
5926	case 51:	/* what does the asterisk mean? */
5927		return (B(eax) || SH_C0(eax) || CG(eax));
5928	case 52:
5929		return (B(eax));
5930	case 57:
5931		return (cpi->cpi_family <= 0x11);
5932	case 58:
5933		return (B(eax));
5934	case 60:
5935		return (cpi->cpi_family <= 0x11);
5936	case 61:
5937	case 62:
5938	case 63:
5939	case 64:
5940	case 65:
5941	case 66:
5942	case 68:
5943	case 69:
5944	case 70:
5945	case 71:
5946		return (B(eax));
5947	case 72:
5948		return (SH_B0(eax));
5949	case 74:
5950		return (B(eax));
5951	case 75:
5952		return (cpi->cpi_family < 0x10);
5953	case 76:
5954		return (B(eax));
5955	case 77:
5956		return (cpi->cpi_family <= 0x11);
5957	case 78:
5958		return (B(eax) || SH_C0(eax));
5959	case 79:
5960		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5961	case 80:
5962	case 81:
5963	case 82:
5964		return (B(eax));
5965	case 83:
5966		return (B(eax) || SH_C0(eax) || CG(eax));
5967	case 85:
5968		return (cpi->cpi_family < 0x10);
5969	case 86:
5970		return (SH_C0(eax) || CG(eax));
5971	case 88:
5972#if !defined(__amd64)
5973		return (0);
5974#else
5975		return (B(eax) || SH_C0(eax));
5976#endif
5977	case 89:
5978		return (cpi->cpi_family < 0x10);
5979	case 90:
5980		return (B(eax) || SH_C0(eax) || CG(eax));
5981	case 91:
5982	case 92:
5983		return (B(eax) || SH_C0(eax));
5984	case 93:
5985		return (SH_C0(eax));
5986	case 94:
5987		return (B(eax) || SH_C0(eax) || CG(eax));
5988	case 95:
5989#if !defined(__amd64)
5990		return (0);
5991#else
5992		return (B(eax) || SH_C0(eax));
5993#endif
5994	case 96:
5995		return (B(eax) || SH_C0(eax) || CG(eax));
5996	case 97:
5997	case 98:
5998		return (SH_C0(eax) || CG(eax));
5999	case 99:
6000		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6001	case 100:
6002		return (B(eax) || SH_C0(eax));
6003	case 101:
6004	case 103:
6005		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6006	case 104:
6007		return (SH_C0(eax) || CG(eax) || D0(eax));
6008	case 105:
6009	case 106:
6010	case 107:
6011		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6012	case 108:
6013		return (DH_CG(eax));
6014	case 109:
6015		return (SH_C0(eax) || CG(eax) || D0(eax));
6016	case 110:
6017		return (D0(eax) || EX(eax));
6018	case 111:
6019		return (CG(eax));
6020	case 112:
6021		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6022	case 113:
6023		return (eax == 0x20fc0);
6024	case 114:
6025		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6026	case 115:
6027		return (SH_E0(eax) || JH_E1(eax));
6028	case 116:
6029		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6030	case 117:
6031		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6032	case 118:
6033		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6034		    JH_E6(eax));
6035	case 121:
6036		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6037	case 122:
6038		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6039	case 123:
6040		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6041	case 131:
6042		return (cpi->cpi_family < 0x10);
6043	case 6336786:
6044
6045		/*
6046		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6047		 * if this is a K8 family or newer processor. We're testing for
6048		 * this 'erratum' to determine whether or not we have a constant
6049		 * TSC.
6050		 *
6051		 * Our current fix for this is to disable the C1-Clock ramping.
6052		 * However, this doesn't work on newer processor families nor
6053		 * does it work when virtualized as those devices don't exist.
6054		 */
6055		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6056			return (0);
6057		}
6058
6059		if (CPI_FAMILY(cpi) == 0xf) {
6060			struct cpuid_regs regs;
6061			regs.cp_eax = 0x80000007;
6062			(void) __cpuid_insn(&regs);
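			/*
			 * Bit 8 of the advanced power management leaf
			 * (0x80000007 %edx) is TscInvariant; the erratum
			 * applies when it is clear.
			 */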
6063			return (!(regs.cp_edx & 0x100));
6064		}
6065		return (0);
6066	case 6323525:
6067		/*
6068		 * This erratum (K8 #147) is not present on family 10 and newer.
6069		 */
6070		if (cpi->cpi_family >= 0x10) {
6071			return (0);
6072		}
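		/*
		 * The expression below reconstructs (family << 8) | model
		 * from the base and extended family/model fields of %eax;
		 * only values below 0xf40, i.e. family 0xf parts with
		 * model < 0x40 (pre-revision-F silicon), are affected.
		 */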
6073		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6074		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6075
6076	case 6671130:
6077		/*
6078		 * Check for processors (pre-Shanghai) that do not provide
6079		 * optimal management of 1GB page table entries in their TLB.
6080		 */
6081		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6082
6083	case 298:
6084		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6085		    DR_B2(eax) || RB_C0(eax));
6086
6087	case 721:
6088#if defined(__amd64)
6089		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6090#else
6091		return (0);
6092#endif
6093
6094	default:
6095		return (-1);
6096
6097	}
6098}
6099
6100/*
6101 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6102 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6103 */
6104int
6105osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6106{
6107	struct cpuid_info	*cpi;
6108	uint_t			osvwid;
6109	static int		osvwfeature = -1;
6110	uint64_t		osvwlength;
6111
6112
6113	cpi = cpu->cpu_m.mcpu_cpi;
6114
6115	/* confirm OSVW supported */
6116	if (osvwfeature == -1) {
6117		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6118	} else {
6119		/* assert that osvw feature setting is consistent on all cpus */
6120		ASSERT(osvwfeature ==
6121		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6122	}
6123	if (!osvwfeature)
6124		return (-1);
6125
6126	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6127
6128	switch (erratum) {
6129	case 298:	/* osvwid is 0 */
6130		osvwid = 0;
6131		if (osvwlength <= (uint64_t)osvwid) {
6132			/* osvwid 0 is unknown */
6133			return (-1);
6134		}
6135
6136		/*
6137		 * Check the OSVW STATUS MSR to determine the state
6138		 * of the erratum where:
6139		 *   0 - fixed by HW
6140		 *   1 - BIOS has applied the workaround when a BIOS
6141		 *   workaround is available. (Or, for other errata, an
6142		 *   OS workaround is required.)
6143		 * For a value of 1, caller will confirm that the
6144		 * erratum 298 workaround has indeed been applied by BIOS.
6145		 *
6146		 * A 1 may be set in cpus that have a HW fix
6147		 * in a mixed cpu system. Regarding erratum 298:
6148		 *   In a multiprocessor platform, the workaround above
6149		 *   should be applied to all processors regardless of
6150		 *   silicon revision when an affected processor is
6151		 *   present.
6152		 */
6153
6154		return (rdmsr(MSR_AMD_OSVW_STATUS +
6155		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6156		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6157
6158	default:
6159		return (-1);
6160	}
6161}
6162
6163static const char assoc_str[] = "associativity";
6164static const char line_str[] = "line-size";
6165static const char size_str[] = "size";
6166
6167static void
6168add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6169    uint32_t val)
6170{
6171	char buf[128];
6172
6173	/*
6174	 * ndi_prop_update_int() is used because it is desirable for
6175	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6176	 */
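	/*
	 * Only publish the property when the composed name fit in the
	 * buffer without truncation.
	 */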
6177	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6178		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6179}
6180
6181/*
6182 * Intel-style cache/tlb description
6183 *
6184 * Standard cpuid level 2 gives a randomly ordered
6185 * selection of tags that index into a table that describes
6186 * cache and tlb properties.
6187 */
6188
6189static const char l1_icache_str[] = "l1-icache";
6190static const char l1_dcache_str[] = "l1-dcache";
6191static const char l2_cache_str[] = "l2-cache";
6192static const char l3_cache_str[] = "l3-cache";
6193static const char itlb4k_str[] = "itlb-4K";
6194static const char dtlb4k_str[] = "dtlb-4K";
6195static const char itlb2M_str[] = "itlb-2M";
6196static const char itlb4M_str[] = "itlb-4M";
6197static const char dtlb4M_str[] = "dtlb-4M";
6198static const char dtlb24_str[] = "dtlb0-2M-4M";
6199static const char itlb424_str[] = "itlb-4K-2M-4M";
6200static const char itlb24_str[] = "itlb-2M-4M";
6201static const char dtlb44_str[] = "dtlb-4K-4M";
6202static const char sl1_dcache_str[] = "sectored-l1-dcache";
6203static const char sl2_cache_str[] = "sectored-l2-cache";
6204static const char itrace_str[] = "itrace-cache";
6205static const char sl3_cache_str[] = "sectored-l3-cache";
6206static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6207
6208static const struct cachetab {
6209	uint8_t		ct_code;
6210	uint8_t		ct_assoc;
6211	uint16_t	ct_line_size;
6212	size_t		ct_size;
6213	const char	*ct_label;
6214} intel_ctab[] = {
6215	/*
6216	 * maintain descending order!
6217	 *
6218	 * Codes ignored - Reason
6219	 * ----------------------
6220	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6221	 * f0H/f1H - Currently we do not interpret prefetch size by design
6222	 */
6223	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6224	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6225	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6226	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6227	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6228	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6229	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6230	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6231	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6232	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6233	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6234	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6235	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6236	{ 0xc0, 4, 0, 8, dtlb44_str },
6237	{ 0xba, 4, 0, 64, dtlb4k_str },
6238	{ 0xb4, 4, 0, 256, dtlb4k_str },
6239	{ 0xb3, 4, 0, 128, dtlb4k_str },
6240	{ 0xb2, 4, 0, 64, itlb4k_str },
6241	{ 0xb0, 4, 0, 128, itlb4k_str },
6242	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6243	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6244	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6245	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6246	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6247	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6248	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6249	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6250	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6251	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6252	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6253	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6254	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6255	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6256	{ 0x73, 8, 0, 64*1024, itrace_str},
6257	{ 0x72, 8, 0, 32*1024, itrace_str},
6258	{ 0x71, 8, 0, 16*1024, itrace_str},
6259	{ 0x70, 8, 0, 12*1024, itrace_str},
6260	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6261	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6262	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6263	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6264	{ 0x5d, 0, 0, 256, dtlb44_str},
6265	{ 0x5c, 0, 0, 128, dtlb44_str},
6266	{ 0x5b, 0, 0, 64, dtlb44_str},
6267	{ 0x5a, 4, 0, 32, dtlb24_str},
6268	{ 0x59, 0, 0, 16, dtlb4k_str},
6269	{ 0x57, 4, 0, 16, dtlb4k_str},
6270	{ 0x56, 4, 0, 16, dtlb4M_str},
6271	{ 0x55, 0, 0, 7, itlb24_str},
6272	{ 0x52, 0, 0, 256, itlb424_str},
6273	{ 0x51, 0, 0, 128, itlb424_str},
6274	{ 0x50, 0, 0, 64, itlb424_str},
6275	{ 0x4f, 0, 0, 32, itlb4k_str},
6276	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6277	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6278	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6279	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6280	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6281	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6282	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6283	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6284	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6285	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6286	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
6287	{ 0x43, 4, 32, 512*1024, l2_cache_str},
6288	{ 0x42, 4, 32, 256*1024, l2_cache_str},
6289	{ 0x41, 4, 32, 128*1024, l2_cache_str},
6290	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
6291	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
6292	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
6293	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
6294	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
6295	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
6296	{ 0x30, 8, 64, 32*1024, l1_icache_str},
6297	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
6298	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
6299	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
6300	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
6301	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
6302	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
6303	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
6304	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
6305	{ 0x0b, 4, 0, 4, itlb4M_str},
6306	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
6307	{ 0x08, 4, 32, 16*1024, l1_icache_str},
6308	{ 0x06, 4, 32, 8*1024, l1_icache_str},
6309	{ 0x05, 4, 0, 32, dtlb4M_str},
6310	{ 0x04, 4, 0, 8, dtlb4M_str},
6311	{ 0x03, 4, 0, 64, dtlb4k_str},
6312	{ 0x02, 4, 0, 2, itlb4M_str},
6313	{ 0x01, 4, 0, 32, itlb4k_str},
6314	{ 0 }
6315};
6316
6317static const struct cachetab cyrix_ctab[] = {
6318	{ 0x70, 4, 0, 32, "tlb-4K" },
6319	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6320	{ 0 }
6321};
6322
6323/*
6324 * Search a cache table for a matching entry
6325 */
6326static const struct cachetab *
6327find_cacheent(const struct cachetab *ct, uint_t code)
6328{
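	/*
	 * The descriptor tables are sorted in descending order of ct_code,
	 * so the scan stops at the first entry less than or equal to the
	 * code being looked up; it is a hit only if that entry matches
	 * exactly.
	 */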
6329	if (code != 0) {
6330		for (; ct->ct_code != 0; ct++)
6331			if (ct->ct_code <= code)
6332				break;
6333		if (ct->ct_code == code)
6334			return (ct);
6335	}
6336	return (NULL);
6337}
6338
6339/*
6340 * Populate cachetab entry with L2 or L3 cache-information using
6341 * cpuid function 4. This function is called from intel_walk_cacheinfo()
6342 * when descriptor 0x49 is encountered. It returns 0 if no such cache
6343 * information is found.
6344 */
6345static int
6346intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6347{
6348	uint32_t level, i;
6349	int ret = 0;
6350
6351	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6352		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6353
6354		if (level == 2 || level == 3) {
6355			ct->ct_assoc =
6356			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6357			ct->ct_line_size =
6358			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
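			/*
			 * Total size is ways * partitions * line size *
			 * sets; each field is reported minus one and the
			 * set count is in %ecx.
			 */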
6359			ct->ct_size = ct->ct_assoc *
6360			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6361			    ct->ct_line_size *
6362			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6363
6364			if (level == 2) {
6365				ct->ct_label = l2_cache_str;
6366			} else if (level == 3) {
6367				ct->ct_label = l3_cache_str;
6368			}
6369			ret = 1;
6370		}
6371	}
6372
6373	return (ret);
6374}
6375
6376/*
6377 * Walk the cacheinfo descriptor, applying 'func' to every valid element
6378 * The walk is terminated if the walker returns non-zero.
6379 */
6380static void
6381intel_walk_cacheinfo(struct cpuid_info *cpi,
6382    void *arg, int (*func)(void *, const struct cachetab *))
6383{
6384	const struct cachetab *ct;
6385	struct cachetab des_49_ct, des_b1_ct;
6386	uint8_t *dp;
6387	int i;
6388
6389	if ((dp = cpi->cpi_cacheinfo) == NULL)
6390		return;
6391	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
		/*
		 * For the overloaded descriptor 0x49 we use cpuid
		 * function 4, when the current processor supports it,
		 * to derive the cache information.
		 * For the overloaded descriptor 0xb1 we use the X86_PAE
		 * feature flag to disambiguate the TLB information.
		 */
		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
			ct = &des_49_ct;
6402		} else if (*dp == 0xb1) {
6403			des_b1_ct.ct_code = 0xb1;
6404			des_b1_ct.ct_assoc = 4;
6405			des_b1_ct.ct_line_size = 0;
6406			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6407				des_b1_ct.ct_size = 8;
6408				des_b1_ct.ct_label = itlb2M_str;
6409			} else {
6410				des_b1_ct.ct_size = 4;
6411				des_b1_ct.ct_label = itlb4M_str;
6412			}
6413			ct = &des_b1_ct;
6414		} else {
6415			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6416				continue;
6417			}
6418		}
6419
6420		if (func(arg, ct) != 0) {
6421			break;
6422		}
6423	}
6424}
6425
6426/*
6427 * (Like the Intel one, except for Cyrix CPUs)
6428 */
6429static void
6430cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6431    void *arg, int (*func)(void *, const struct cachetab *))
6432{
6433	const struct cachetab *ct;
6434	uint8_t *dp;
6435	int i;
6436
6437	if ((dp = cpi->cpi_cacheinfo) == NULL)
6438		return;
6439	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6440		/*
6441		 * Search Cyrix-specific descriptor table first ..
6442		 */
6443		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6444			if (func(arg, ct) != 0)
6445				break;
6446			continue;
6447		}
6448		/*
6449		 * .. else fall back to the Intel one
6450		 */
6451		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6452			if (func(arg, ct) != 0)
6453				break;
6454			continue;
6455		}
6456	}
6457}
6458
6459/*
6460 * A cacheinfo walker that adds associativity, line-size, and size properties
6461 * to the devinfo node it is passed as an argument.
6462 */
6463static int
6464add_cacheent_props(void *arg, const struct cachetab *ct)
6465{
6466	dev_info_t *devi = arg;
6467
6468	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6469	if (ct->ct_line_size != 0)
6470		add_cache_prop(devi, ct->ct_label, line_str,
6471		    ct->ct_line_size);
6472	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6473	return (0);
6474}
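
/*
 * A minimal usage sketch (mirroring the call made from
 * cpuid_set_cpu_properties() below): the walker and the per-entry callback
 * compose as
 *
 *	intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
 *
 * where 'cpu_devi' is threaded through unchanged as the walker's 'arg' and
 * a non-zero return from the callback would stop the walk early.
 */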

static const char fully_assoc[] = "fully-associative?";
6478
6479/*
6480 * AMD style cache/tlb description
6481 *
6482 * Extended functions 5 and 6 directly describe properties of
6483 * tlbs and various cache levels.
6484 */
6485static void
6486add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6487{
6488	switch (assoc) {
6489	case 0:	/* reserved; ignore */
6490		break;
6491	default:
6492		add_cache_prop(devi, label, assoc_str, assoc);
6493		break;
6494	case 0xff:
6495		add_cache_prop(devi, label, fully_assoc, 1);
6496		break;
6497	}
6498}
6499
6500static void
6501add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6502{
6503	if (size == 0)
6504		return;
6505	add_cache_prop(devi, label, size_str, size);
6506	add_amd_assoc(devi, label, assoc);
6507}
6508
6509static void
6510add_amd_cache(dev_info_t *devi, const char *label,
6511    uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6512{
6513	if (size == 0 || line_size == 0)
6514		return;
6515	add_amd_assoc(devi, label, assoc);
6516	/*
6517	 * Most AMD parts have a sectored cache. Multiple cache lines are
6518	 * associated with each tag. A sector consists of all cache lines
6519	 * associated with a tag. For example, the AMD K6-III has a sector
6520	 * size of 2 cache lines per tag.
6521	 */
6522	if (lines_per_tag != 0)
6523		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6524	add_cache_prop(devi, label, line_str, line_size);
6525	add_cache_prop(devi, label, size_str, size * 1024);
6526}
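
/*
 * Illustrative example (hypothetical %ecx value from extended function 5,
 * as decoded by amd_cache_info() below): %ecx == 0x40020140 yields
 *
 *	size          = BITX(%ecx, 31, 24) = 0x40 = 64 KB
 *	assoc         = BITX(%ecx, 23, 16) = 0x02 = 2-way
 *	lines-per-tag = BITX(%ecx, 15,  8) = 0x01
 *	line size     = BITX(%ecx,  7,  0) = 0x40 = 64 bytes
 *
 * so add_amd_cache() would export a 2-way, 64-byte-line, 64 KB "l1-dcache",
 * with the size property converted to bytes (64 * 1024).
 */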
6527
6528static void
6529add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6530{
6531	switch (assoc) {
6532	case 0:	/* off */
6533		break;
6534	case 1:
6535	case 2:
6536	case 4:
6537		add_cache_prop(devi, label, assoc_str, assoc);
6538		break;
6539	case 6:
6540		add_cache_prop(devi, label, assoc_str, 8);
6541		break;
6542	case 8:
6543		add_cache_prop(devi, label, assoc_str, 16);
6544		break;
6545	case 0xf:
6546		add_cache_prop(devi, label, fully_assoc, 1);
6547		break;
6548	default: /* reserved; ignore */
6549		break;
6550	}
6551}
6552
6553static void
6554add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6555{
6556	if (size == 0 || assoc == 0)
6557		return;
6558	add_amd_l2_assoc(devi, label, assoc);
6559	add_cache_prop(devi, label, size_str, size);
6560}
6561
6562static void
6563add_amd_l2_cache(dev_info_t *devi, const char *label,
6564    uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6565{
6566	if (size == 0 || assoc == 0 || line_size == 0)
6567		return;
6568	add_amd_l2_assoc(devi, label, assoc);
6569	if (lines_per_tag != 0)
6570		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6571	add_cache_prop(devi, label, line_str, line_size);
6572	add_cache_prop(devi, label, size_str, size * 1024);
6573}
6574
6575static void
6576amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6577{
6578	struct cpuid_regs *cp;
6579
6580	if (cpi->cpi_xmaxeax < 0x80000005)
6581		return;
6582	cp = &cpi->cpi_extd[5];
6583
6584	/*
6585	 * 4M/2M L1 TLB configuration
6586	 *
6587	 * We report the size for 2M pages because AMD uses two
6588	 * TLB entries for one 4M page.
6589	 */
6590	add_amd_tlb(devi, "dtlb-2M",
6591	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6592	add_amd_tlb(devi, "itlb-2M",
6593	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
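
	/*
	 * Illustrative example (hypothetical %eax value): %eax == 0xff20ff40
	 * decodes as a fully associative (0xff), 32-entry 2M/4M dtlb and a
	 * fully associative, 64-entry 2M/4M itlb; a 4M translation would
	 * consume two of those entries.
	 */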
6594
6595	/*
6596	 * 4K L1 TLB configuration
6597	 */
6598
6599	switch (cpi->cpi_vendor) {
6600		uint_t nentries;
6601	case X86_VENDOR_TM:
6602		if (cpi->cpi_family >= 5) {
6603			/*
6604			 * Crusoe processors have 256 TLB entries, but
			 * the cpuid data format constrains them to
			 * reporting only 255 of them.
6607			 */
6608			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6609				nentries = 256;
6610			/*
6611			 * Crusoe processors also have a unified TLB
6612			 */
6613			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6614			    nentries);
6615			break;
6616		}
6617		/*FALLTHROUGH*/
6618	default:
6619		add_amd_tlb(devi, itlb4k_str,
6620		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6621		add_amd_tlb(devi, dtlb4k_str,
6622		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6623		break;
6624	}
6625
6626	/*
6627	 * data L1 cache configuration
6628	 */
6629
6630	add_amd_cache(devi, l1_dcache_str,
6631	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6632	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6633
6634	/*
6635	 * code L1 cache configuration
6636	 */
6637
6638	add_amd_cache(devi, l1_icache_str,
6639	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6640	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6641
6642	if (cpi->cpi_xmaxeax < 0x80000006)
6643		return;
6644	cp = &cpi->cpi_extd[6];
6645
6646	/* Check for a unified L2 TLB for large pages */
6647
6648	if (BITX(cp->cp_eax, 31, 16) == 0)
6649		add_amd_l2_tlb(devi, "l2-tlb-2M",
6650		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6651	else {
6652		add_amd_l2_tlb(devi, "l2-dtlb-2M",
6653		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6654		add_amd_l2_tlb(devi, "l2-itlb-2M",
6655		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6656	}
6657
6658	/* Check for a unified L2 TLB for 4K pages */
6659
	if (BITX(cp->cp_ebx, 31, 16) == 0) {
		add_amd_l2_tlb(devi, "l2-tlb-4K",
		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
	} else {
		add_amd_l2_tlb(devi, "l2-dtlb-4K",
		    BITX(cp->cp_ebx, 31, 28), BITX(cp->cp_ebx, 27, 16));
		add_amd_l2_tlb(devi, "l2-itlb-4K",
		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
	}
6669
6670	add_amd_l2_cache(devi, l2_cache_str,
6671	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6672	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6673}
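
/*
 * Illustrative example (hypothetical %ecx value from extended function 6):
 * %ecx == 0x02008140 decodes as
 *
 *	size          = BITX(%ecx, 31, 16) = 0x0200 = 512 KB
 *	assoc code    = BITX(%ecx, 15, 12) = 0x8 (16-way, per the L2
 *			associativity encoding in add_amd_l2_assoc())
 *	lines-per-tag = BITX(%ecx, 11,  8) = 0x1
 *	line size     = BITX(%ecx,  7,  0) = 0x40 = 64 bytes
 *
 * so add_amd_l2_cache() would export a 16-way, 64-byte-line, 512 KB
 * "l2-cache".
 */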
6674
6675/*
 * There are two basic ways that the x86 world describes its cache
6677 * and tlb architecture - Intel's way and AMD's way.
6678 *
6679 * Return which flavor of cache architecture we should use
6680 */
6681static int
6682x86_which_cacheinfo(struct cpuid_info *cpi)
6683{
6684	switch (cpi->cpi_vendor) {
6685	case X86_VENDOR_Intel:
6686		if (cpi->cpi_maxeax >= 2)
6687			return (X86_VENDOR_Intel);
6688		break;
6689	case X86_VENDOR_AMD:
6690		/*
6691		 * The K5 model 1 was the first part from AMD that reported
6692		 * cache sizes via extended cpuid functions.
6693		 */
6694		if (cpi->cpi_family > 5 ||
6695		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6696			return (X86_VENDOR_AMD);
6697		break;
6698	case X86_VENDOR_TM:
6699		if (cpi->cpi_family >= 5)
6700			return (X86_VENDOR_AMD);
6701		/*FALLTHROUGH*/
6702	default:
6703		/*
6704		 * If they have extended CPU data for 0x80000005
6705		 * then we assume they have AMD-format cache
6706		 * information.
6707		 *
6708		 * If not, and the vendor happens to be Cyrix,
		 * then try our Cyrix-specific handler.
6710		 *
6711		 * If we're not Cyrix, then assume we're using Intel's
6712		 * table-driven format instead.
6713		 */
6714		if (cpi->cpi_xmaxeax >= 0x80000005)
6715			return (X86_VENDOR_AMD);
6716		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6717			return (X86_VENDOR_Cyrix);
6718		else if (cpi->cpi_maxeax >= 2)
6719			return (X86_VENDOR_Intel);
6720		break;
6721	}
6722	return (-1);
6723}
6724
6725void
6726cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6727    struct cpuid_info *cpi)
6728{
6729	dev_info_t *cpu_devi;
6730	int create;
6731
6732	cpu_devi = (dev_info_t *)dip;
6733
6734	/* device_type */
6735	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6736	    "device_type", "cpu");
6737
6738	/* reg */
6739	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6740	    "reg", cpu_id);
6741
6742	/* cpu-mhz, and clock-frequency */
6743	if (cpu_freq > 0) {
6744		long long mul;
6745
6746		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6747		    "cpu-mhz", cpu_freq);
6748		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6749			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6750			    "clock-frequency", (int)mul);
6751	}
6752
6753	if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6754		return;
6755	}
6756
6757	/* vendor-id */
6758	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6759	    "vendor-id", cpi->cpi_vendorstr);
6760
6761	if (cpi->cpi_maxeax == 0) {
6762		return;
6763	}
6764
6765	/*
6766	 * family, model, and step
6767	 */
6768	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6769	    "family", CPI_FAMILY(cpi));
6770	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6771	    "cpu-model", CPI_MODEL(cpi));
6772	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6773	    "stepping-id", CPI_STEP(cpi));
6774
6775	/* type */
6776	switch (cpi->cpi_vendor) {
6777	case X86_VENDOR_Intel:
6778		create = 1;
6779		break;
6780	default:
6781		create = 0;
6782		break;
6783	}
6784	if (create)
6785		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6786		    "type", CPI_TYPE(cpi));
6787
6788	/* ext-family */
6789	switch (cpi->cpi_vendor) {
6790	case X86_VENDOR_Intel:
6791	case X86_VENDOR_AMD:
6792		create = cpi->cpi_family >= 0xf;
6793		break;
6794	default:
6795		create = 0;
6796		break;
6797	}
6798	if (create)
6799		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6800		    "ext-family", CPI_FAMILY_XTD(cpi));
6801
6802	/* ext-model */
6803	switch (cpi->cpi_vendor) {
6804	case X86_VENDOR_Intel:
6805		create = IS_EXTENDED_MODEL_INTEL(cpi);
6806		break;
6807	case X86_VENDOR_AMD:
6808		create = CPI_FAMILY(cpi) == 0xf;
6809		break;
6810	default:
6811		create = 0;
6812		break;
6813	}
6814	if (create)
6815		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6816		    "ext-model", CPI_MODEL_XTD(cpi));
6817
6818	/* generation */
6819	switch (cpi->cpi_vendor) {
6820	case X86_VENDOR_AMD:
6821		/*
6822		 * AMD K5 model 1 was the first part to support this
6823		 */
6824		create = cpi->cpi_xmaxeax >= 0x80000001;
6825		break;
6826	default:
6827		create = 0;
6828		break;
6829	}
6830	if (create)
6831		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6832		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6833
6834	/* brand-id */
6835	switch (cpi->cpi_vendor) {
6836	case X86_VENDOR_Intel:
6837		/*
		 * brand id first appeared on Pentium III Xeon model 8
		 * and Celeron model 8 processors, and on Opteron
6840		 */
6841		create = cpi->cpi_family > 6 ||
6842		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6843		break;
6844	case X86_VENDOR_AMD:
6845		create = cpi->cpi_family >= 0xf;
6846		break;
6847	default:
6848		create = 0;
6849		break;
6850	}
6851	if (create && cpi->cpi_brandid != 0) {
6852		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6853		    "brand-id", cpi->cpi_brandid);
6854	}
6855
6856	/* chunks, and apic-id */
6857	switch (cpi->cpi_vendor) {
6858		/*
6859		 * first available on Pentium IV and Opteron (K8)
6860		 */
6861	case X86_VENDOR_Intel:
6862		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6863		break;
6864	case X86_VENDOR_AMD:
6865		create = cpi->cpi_family >= 0xf;
6866		break;
6867	default:
6868		create = 0;
6869		break;
6870	}
6871	if (create) {
6872		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6873		    "chunks", CPI_CHUNKS(cpi));
6874		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6875		    "apic-id", cpi->cpi_apicid);
6876		if (cpi->cpi_chipid >= 0) {
6877			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6878			    "chip#", cpi->cpi_chipid);
6879			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6880			    "clog#", cpi->cpi_clogid);
6881		}
6882	}
6883
6884	/* cpuid-features */
6885	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6886	    "cpuid-features", CPI_FEATURES_EDX(cpi));

	/* cpuid-features-ecx */
6890	switch (cpi->cpi_vendor) {
6891	case X86_VENDOR_Intel:
6892		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6893		break;
6894	case X86_VENDOR_AMD:
6895		create = cpi->cpi_family >= 0xf;
6896		break;
6897	default:
6898		create = 0;
6899		break;
6900	}
6901	if (create)
6902		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6903		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6904
6905	/* ext-cpuid-features */
6906	switch (cpi->cpi_vendor) {
6907	case X86_VENDOR_Intel:
6908	case X86_VENDOR_AMD:
6909	case X86_VENDOR_Cyrix:
6910	case X86_VENDOR_TM:
6911	case X86_VENDOR_Centaur:
6912		create = cpi->cpi_xmaxeax >= 0x80000001;
6913		break;
6914	default:
6915		create = 0;
6916		break;
6917	}
6918	if (create) {
6919		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6920		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6921		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6922		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6923	}
6924
6925	/*
6926	 * Brand String first appeared in Intel Pentium IV, AMD K5
6927	 * model 1, and Cyrix GXm.  On earlier models we try and
6928	 * simulate something similar .. so this string should always
 * say -something- about the processor, however lame.
6930	 */
6931	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6932	    "brand-string", cpi->cpi_brandstr);
6933
6934	/*
6935	 * Finally, cache and tlb information
6936	 */
6937	switch (x86_which_cacheinfo(cpi)) {
6938	case X86_VENDOR_Intel:
6939		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6940		break;
6941	case X86_VENDOR_Cyrix:
6942		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6943		break;
6944	case X86_VENDOR_AMD:
6945		amd_cache_info(cpi, cpu_devi);
6946		break;
6947	default:
6948		break;
6949	}
6950}
6951
6952struct l2info {
6953	int *l2i_csz;
6954	int *l2i_lsz;
6955	int *l2i_assoc;
6956	int l2i_ret;
6957};
6958
6959/*
6960 * A cacheinfo walker that fetches the size, line-size and associativity
6961 * of the L2 cache
6962 */
6963static int
6964intel_l2cinfo(void *arg, const struct cachetab *ct)
6965{
6966	struct l2info *l2i = arg;
6967	int *ip;
6968
6969	if (ct->ct_label != l2_cache_str &&
6970	    ct->ct_label != sl2_cache_str)
6971		return (0);	/* not an L2 -- keep walking */
6972
6973	if ((ip = l2i->l2i_csz) != NULL)
6974		*ip = ct->ct_size;
6975	if ((ip = l2i->l2i_lsz) != NULL)
6976		*ip = ct->ct_line_size;
6977	if ((ip = l2i->l2i_assoc) != NULL)
6978		*ip = ct->ct_assoc;
6979	l2i->l2i_ret = ct->ct_size;
6980	return (1);		/* was an L2 -- terminate walk */
6981}
6982
6983/*
6984 * AMD L2/L3 Cache and TLB Associativity Field Definition:
6985 *
6986 *	Unlike the associativity for the L1 cache and tlb where the 8 bit
6987 *	value is the associativity, the associativity for the L2 cache and
6988 *	tlb is encoded in the following table. The 4 bit L2 value serves as
6989 *	an index into the amd_afd[] array to determine the associativity.
6990 *	-1 is undefined. 0 is fully associative.
6991 */
6992
6993static int amd_afd[] =
6994	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
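
/*
 * For example, an associativity field of 0x6 indexes amd_afd[6] == 8
 * (8-way), 0xa gives amd_afd[10] == 32 (32-way), and 0xf gives 0, i.e.
 * fully associative.  Reserved encodings map to -1, which
 * amd_l2cacheinfo() below asserts against.
 */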
6995
6996static void
6997amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6998{
6999	struct cpuid_regs *cp;
7000	uint_t size, assoc;
7001	int i;
7002	int *ip;
7003
7004	if (cpi->cpi_xmaxeax < 0x80000006)
7005		return;
7006	cp = &cpi->cpi_extd[6];
7007
7008	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7009	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7010		uint_t cachesz = size * 1024;
7011		assoc = amd_afd[i];
7012
7013		ASSERT(assoc != -1);
7014
7015		if ((ip = l2i->l2i_csz) != NULL)
7016			*ip = cachesz;
7017		if ((ip = l2i->l2i_lsz) != NULL)
7018			*ip = BITX(cp->cp_ecx, 7, 0);
7019		if ((ip = l2i->l2i_assoc) != NULL)
7020			*ip = assoc;
7021		l2i->l2i_ret = cachesz;
7022	}
7023}
7024
7025int
7026getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7027{
7028	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7029	struct l2info __l2info, *l2i = &__l2info;
7030
7031	l2i->l2i_csz = csz;
7032	l2i->l2i_lsz = lsz;
7033	l2i->l2i_assoc = assoc;
7034	l2i->l2i_ret = -1;
7035
7036	switch (x86_which_cacheinfo(cpi)) {
7037	case X86_VENDOR_Intel:
7038		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7039		break;
7040	case X86_VENDOR_Cyrix:
7041		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7042		break;
7043	case X86_VENDOR_AMD:
7044		amd_l2cacheinfo(cpi, l2i);
7045		break;
7046	default:
7047		break;
7048	}
7049	return (l2i->l2i_ret);
7050}
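
/*
 * A minimal, hypothetical usage sketch (the local names are illustrative
 * only):
 *
 *	int csz, lsz, assoc;
 *
 *	if (getl2cacheinfo(CPU, &csz, &lsz, &assoc) > 0)
 *		use csz (bytes), lsz and assoc for this cpu's L2 ...
 *
 * Any of the three output pointers may be NULL if the caller does not need
 * that value; the return value is the L2 size in bytes, or -1 if no L2
 * information could be determined.
 */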
7051
7052#if !defined(__xpv)
7053
7054uint32_t *
7055cpuid_mwait_alloc(cpu_t *cpu)
7056{
7057	uint32_t	*ret;
7058	size_t		mwait_size;
7059
7060	ASSERT(cpuid_checkpass(CPU, 2));
7061
7062	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7063	if (mwait_size == 0)
7064		return (NULL);
7065
7066	/*
7067	 * kmem_alloc() returns cache line size aligned data for mwait_size
7068	 * allocations.  mwait_size is currently cache line sized.  Neither
	 * of these implementation details is guaranteed to be true in the
7070	 * future.
7071	 *
	 * First try allocating mwait_size, as kmem_alloc() currently returns
	 * correctly aligned memory.  If that allocation is not mwait_size
	 * aligned, allocate twice mwait_size and round the result up to a
	 * mwait_size boundary with P2ROUNDUP().
7075	 *
7076	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7077	 * decide to free this memory.
7078	 */
7079	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7080	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7081		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7082		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7083		*ret = MWAIT_RUNNING;
7084		return (ret);
7085	} else {
7086		kmem_free(ret, mwait_size);
7087		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7088		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7089		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7090		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7091		*ret = MWAIT_RUNNING;
7092		return (ret);
7093	}
7094}
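
/*
 * Worked example (hypothetical addresses): with mwait_size == 64, a
 * kmem_zalloc() return value of 0xffffff0012345040 is already 64-byte
 * aligned and is used directly.  Had it returned 0xffffff0012345030
 * instead, that buffer would be freed, 128 bytes allocated in its place,
 * and the returned pointer rounded up with P2ROUNDUP() to the next
 * 64-byte boundary inside the larger buffer.
 */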
7095
7096void
7097cpuid_mwait_free(cpu_t *cpu)
7098{
7099	if (cpu->cpu_m.mcpu_cpi == NULL) {
7100		return;
7101	}
7102
7103	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7104	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7105		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7106		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7107	}
7108
7109	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7110	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7111}
7112
7113void
7114patch_tsc_read(int flag)
7115{
7116	size_t cnt;
7117
7118	switch (flag) {
7119	case TSC_NONE:
7120		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7121		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7122		break;
7123	case TSC_RDTSC_MFENCE:
7124		cnt = &_tsc_mfence_end - &_tsc_mfence_start;
7125		(void) memcpy((void *)tsc_read,
7126		    (void *)&_tsc_mfence_start, cnt);
7127		break;
7128	case TSC_RDTSC_LFENCE:
7129		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7130		(void) memcpy((void *)tsc_read,
7131		    (void *)&_tsc_lfence_start, cnt);
7132		break;
7133	case TSC_TSCP:
7134		cnt = &_tscp_end - &_tscp_start;
7135		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7136		break;
7137	default:
7138		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7140		break;
7141	}
7142	tsc_type = flag;
7143}
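
/*
 * Conceptually (a sketch, not the literal stub bodies), each start/end
 * symbol pair above brackets a tiny routine along the lines of
 *
 *	_tsc_lfence_start:
 *		lfence
 *		rdtsc
 *		(combine %edx:%eax into a 64-bit return value)
 *	_tsc_lfence_end:
 *
 * and patch_tsc_read() copies the selected routine's bytes over tsc_read()
 * so that callers get the variant chosen at boot without an indirect call.
 */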
7144
7145int
7146cpuid_deep_cstates_supported(void)
7147{
7148	struct cpuid_info *cpi;
7149	struct cpuid_regs regs;
7150
7151	ASSERT(cpuid_checkpass(CPU, 1));
7152
7153	cpi = CPU->cpu_m.mcpu_cpi;
7154
7155	if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
7156		return (0);
7157
7158	switch (cpi->cpi_vendor) {
7159	case X86_VENDOR_Intel:
7160		if (cpi->cpi_xmaxeax < 0x80000007)
7161			return (0);
7162
7163		/*
		 * Does the TSC run at a constant rate in all ACPI C-states?
7165		 */
7166		regs.cp_eax = 0x80000007;
7167		(void) __cpuid_insn(&regs);
7168		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7169
7170	default:
7171		return (0);
7172	}
7173}
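
/*
 * For example, if __cpuid_insn() above returns %edx == 0x100 for leaf
 * 0x80000007, the invariant-TSC bit (bit 8 in current vendor documentation,
 * which CPUID_TSC_CSTATE_INVARIANCE is assumed to mask) is set, and a
 * non-zero return here means the TSC keeps counting at a constant rate
 * across deep C-states.
 */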
7174
7175#endif	/* !__xpv */
7176
7177void
7178post_startup_cpu_fixups(void)
7179{
7180#ifndef __xpv
7181	/*
7182	 * Some AMD processors support C1E state. Entering this state will
7183	 * cause the local APIC timer to stop, which we can't deal with at
7184	 * this time.
7185	 */
7186	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
7187		on_trap_data_t otd;
7188		uint64_t reg;
7189
7190		if (!on_trap(&otd, OT_DATA_ACCESS)) {
7191			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
7192			/* Disable C1E state if it is enabled by BIOS */
7193			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7194			    AMD_ACTONCMPHALT_MASK) {
7195				reg &= ~(AMD_ACTONCMPHALT_MASK <<
7196				    AMD_ACTONCMPHALT_SHIFT);
7197				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7198			}
7199		}
7200		no_trap();
7201	}
7202#endif	/* !__xpv */
7203}
7204
7205void
7206enable_pcid(void)
7207{
7208	if (x86_use_pcid == -1)
7209		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7210
7211	if (x86_use_invpcid == -1) {
7212		x86_use_invpcid = is_x86_feature(x86_featureset,
7213		    X86FSET_INVPCID);
7214	}
7215
7216	if (!x86_use_pcid)
7217		return;
7218
7219	/*
	 * Intel says that once PCIDE is set, the CPU immediately starts
	 * using the PCID bits; better make sure there's nothing there.
7222	 */
7223	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7224
7225	setcr4(getcr4() | CR4_PCIDE);
7226}
7227
7228/*
7229 * Setup necessary registers to enable XSAVE feature on this processor.
7230 * This function needs to be called early enough, so that no xsave/xrstor
7231 * ops will execute on the processor before the MSRs are properly set up.
7232 *
7233 * Current implementation has the following assumption:
7234 * - cpuid_pass1() is done, so that X86 features are known.
7235 * - fpu_probe() is done, so that fp_save_mech is chosen.
7236 */
7237void
7238xsave_setup_msr(cpu_t *cpu)
7239{
7240	ASSERT(fp_save_mech == FP_XSAVE);
7241